十 31 2008
用Excel VBA获取并分析网页数据
主要用到了 Microsoft HTML Object Library
用createDocumentFromUrl这个方法可以下载一个网页的内容并且得到一个HTMLDocument对象,
利用HTMLDocument的一些属性就很容易把网页内容分析出来。
例如用 outerHTML 可以得到HTML文本
用 links 可以得到网页里面所有的链接
注:当时没有想到用正则表达式,实现的方法很笨。其实用正则表达式分析网页内容非常方便,在以后我会有实例。
下面是代码:
Option Explicit

Private mintSubCat As Integer
Private mintRowPages As Integer
Private mintRowPrdct As Integer

Public Sub GetCategories()
Dim strUrl As String
Dim objHtml1 As New HTMLDocument
Dim objHtml2 As HTMLDocument
Dim strHtml As String
Dim objLink As HTMLLinkElement
Dim objDic As New Dictionary
Dim intIndex As Integer
On Error Resume Next
mintSubCat = 1
mintRowPages = 1
mintRowPrdct = 2
strUrl = “http://www.hitachi-powertools.com.au/Categories.aspx“
Set objHtml2 = objHtml1.createDocumentFromUrl(strUrl, vbNullString)
While objHtml2.readyState <> “complete“ ‘And objHtml2.readyState <> ”interactive”
DoEvents
Wend
strHtml = objHtml2.documentElement.outerHTML
WriteStrToFile strHtml, ThisWorkbook.Path & “htmlCategories.html“, “UTF-8“
For Each objLink In objHtml2.links
‘Debug.Print objLink.toString
If Left(objLink.toString, 65) = “http://www.hitachi-powertools.com.au/Subcategories.aspx?Category=“ Then
If Not objDic.Exists(objLink.toString) Then
objDic.Add objLink.toString, objLink.toString
End If
End If
Next
For intIndex = 1 To objDic.Count
Sheet1.Cells(intIndex, 1).Value = objDic.Items(intIndex - 1)
Call GetSubCategories(objDic.Items(intIndex - 1))
Next intIndex
objDic.RemoveAll
Set objDic = Nothing
objHtml2.Close
Set objHtml2 = Nothing
objHtml1.Close
Set objHtml1 = Nothing
Set objLink = Nothing
End Sub

Private Sub GetSubCategories(ByVal strUrl As String)
Dim objHtml1 As New HTMLDocument
Dim objHtml2 As HTMLDocument
Dim strHtml As String
Dim strCategory As String
Dim objLink As HTMLLinkElement
Dim objDic As New Dictionary
Dim intIndex As Integer
strCategory = Replace(Mid(strUrl, 38), “?“, “”)
strCategory = Replace(strCategory, “/“, “”)
strCategory = Replace(strCategory, ““, “”)
Set objHtml2 = objHtml1.createDocumentFromUrl(strUrl, vbNullString)
While objHtml2.readyState <> “complete“
DoEvents
Wend
strHtml = objHtml2.documentElement.outerHTML
WriteStrToFile strHtml, ThisWorkbook.Path & “html“ & strCategory & “.html“, “UTF-8“
For Each objLink In objHtml2.links
‘Debug.Print objLink.toString
If UCase(Left(objLink.toString, 65)) = UCase(“http://www.hitachi-powertools.com.au/Subcategories.aspx?CATEGORY=“) _
And InStr(UCase(objLink.toString), UCase(“&SUBCATEGORY=&PAGE=“)) = 0 _
And InStr(UCase(objLink.toString), UCase(“&SUBCATEGORY=“)) > 0 Then
If Not objDic.Exists(UCase(objLink.toString)) Then
objDic.Add UCase(objLink.toString), objLink.toString
End If
End If
Next
For intIndex = 1 To objDic.Count
Sheet2.Cells(mintSubCat, 1).Value = objDic.Items(intIndex - 1)
mintSubCat = mintSubCat + 1
Call GetPages(objDic.Items(intIndex - 1))
Next intIndex
objDic.RemoveAll
Set objDic = Nothing
objHtml2.Close
Set objHtml2 = Nothing
objHtml1.Close
Set objHtml1 = Nothing
Set objLink = Nothing
End Sub




Private Sub GetPages(ByVal strUrl As String)
Dim objHtml1 As New HTMLDocument
Dim objHtml2 As HTMLDocument
Dim strHtml As String
Dim strCategory As String
Dim objLink As HTMLLinkElement
Dim objDic As New Dictionary
Dim intIndex As Integer
strCategory = Replace(Mid(strUrl, 38), “?“, “”)
strCategory = Replace(strCategory, “/“, “”)
strCategory = Replace(strCategory, ““, “”)
Set objHtml2 = objHtml1.createDocumentFromUrl(strUrl, vbNullString)
While objHtml2.readyState <> “complete“
DoEvents
Wend
strHtml = objHtml2.documentElement.outerHTML
WriteStrToFile strHtml, ThisWorkbook.Path & “html“ & strCategory & “.html“, “UTF-8“
For Each objLink In objHtml2.links
‘Debug.Print objLink.toString
‘ If Left(objLink.toString, 65) = ”http://www.hitachi-powertools.com.au/Subcategories.aspx?CATEGORY=” _
‘ And InStr(objLink.toString, ”&SUBCATEGORY=&PAGE=”) > 0 Then
If UCase(Left(objLink.toString, 65)) = UCase(“http://www.hitachi-powertools.com.au/Subcategories.aspx?CATEGORY=“) _
And InStr(UCase(objLink.toString), UCase(“&SUBCATEGORY=&PAGE=“)) = 0 _
And InStr(UCase(objLink.toString), UCase(“&SUBCATEGORY=“)) > 0 _
And InStr(UCase(objLink.toString), UCase(“&PAGE=“)) > 0 Then
If Not objDic.Exists(objLink.toString) Then
objDic.Add objLink.toString, objLink.toString
End If
End If
Next
For intIndex = 1 To objDic.Count
Sheet3.Cells(mintRowPages, 1).Value = objDic.Items(intIndex - 1)
mintRowPages = mintRowPages + 1
Call GetProductUrls(objDic.Items(intIndex - 1))
Next intIndex
objDic.RemoveAll
Set objDic = Nothing
objHtml2.Close
Set objHtml2 = Nothing
objHtml1.Close
Set objHtml1 = Nothing
Set objLink = Nothing
End Sub

Private Sub GetProductUrls(ByVal strUrl As String)
Dim objHtml1 As New HTMLDocument
Dim objHtml2 As HTMLDocument
Dim strHtml As String
Dim strCategory As String
Dim objLink As HTMLLinkElement
Dim objDic As New Dictionary
Dim intIndex As Integer
strCategory = Replace(Mid(strUrl, 38), “?“, “”)
strCategory = Replace(strCategory, “/“, “”)
strCategory = Replace(strCategory, ““, “”)
Set objHtml2 = objHtml1.createDocumentFromUrl(strUrl, vbNullString)
While objHtml2.readyState <> “complete“
DoEvents
Wend
strHtml = objHtml2.documentElement.outerHTML
WriteStrToFile strHtml, ThisWorkbook.Path & “html“ & strCategory & “.html“, “UTF-8“
For Each objLink In objHtml2.links
‘ Debug.Print objLink.toString
If UCase(Left(objLink.toString, 66)) = UCase(“http://www.hitachi-powertools.com.au/ProductDetails.aspx?Category=“) _
And InStr(UCase(objLink.toString), UCase(“&Subcategory=&Product=“)) = 0 _
And InStr(UCase(objLink.toString), UCase(“&SUBCATEGORY=“)) > 0 _
And InStr(UCase(objLink.toString), UCase(“&Product=“)) > 0 Then
If Not objDic.Exists(UCase(objLink.toString)) Then
objDic.Add UCase(objLink.toString), objLink.toString
End If
End If
Next
For intIndex = 1 To objDic.Count
Sheet4.Cells(mintRowPrdct, 1).Value = objDic.Items(intIndex - 1)
Call GetProductDesc(objDic.Items(intIndex - 1))
mintRowPrdct = mintRowPrdct + 1
‘ThisWorkbook.Save
Next intIndex
objDic.RemoveAll
Set objDic = Nothing
objHtml2.Close
Set objHtml2 = Nothing
objHtml1.Close
Set objHtml1 = Nothing
Set objLink = Nothing
End Sub

Sub GetProductDesc(ByVal strUrl As String)
Dim objHtml1 As New HTMLDocument
Dim objHtml2 As HTMLDocument
Dim strHtml As String
Dim strCategory As String
Dim objLink As HTMLLinkElement
Dim intIndex As Integer
Dim strCat1 As String
Dim strCat2 As String
Dim lngS1 As Long
Dim lngS2 As Long
Dim lngS3 As Long
Dim strTmp As String
Dim strProCode As String
Dim strProName As String
Dim strDesc As String
Dim strSpec() As String
strCategory = Replace(Mid(strUrl, 38), “?“, “”)
strCategory = Replace(strCategory, “/“, “”)
strCategory = Replace(strCategory, ““, “”)
Set objHtml2 = objHtml1.createDocumentFromUrl(strUrl, vbNullString)
While objHtml2.readyState <> “complete“
DoEvents
Wend
strHtml = objHtml2.documentElement.outerHTML
WriteStrToFile strHtml, ThisWorkbook.Path & “html“ & strCategory & “.html“, “UTF-8“
strTmp = UCase(strUrl)
lngS1 = InStr(strTmp, UCase(“Category=“)) + 9
lngS2 = InStr(strTmp, UCase(“&Subcategory=“)) + 13
lngS3 = InStr(strTmp, UCase(“&Product=“)) + 9
strCat1 = Mid(strUrl, lngS1, lngS2 - lngS1 - 13)
strCat2 = Mid(strUrl, lngS2, lngS3 - lngS2 - 9)
strProCode = Mid(strUrl, lngS3)
strTmp = UCase(strHtml)
lngS1 = InStr(strTmp, UCase(“<SPAN style=”"FONT-WEIGHT: bold; FONT-SIZE: 10pt; COLOR: #696969″”>“)) + 65
lngS2 = InStr(lngS1, strTmp, UCase(“</span>“))
strProName = Trim(Mid(strHtml, lngS1, lngS2 - lngS1))
strDesc = strGetDesc(objHtml2)
GetSpec objHtml2, strSpec
Sheet5.Cells(mintRowPrdct, “B“).Value = strCat1 & “|“ & strCat2
Sheet5.Cells(mintRowPrdct, “C“).Value = “HITACHI“
Sheet5.Cells(mintRowPrdct, “D“).Value = strProName
Sheet5.Cells(mintRowPrdct, “E“).Value = strProCode
Sheet5.Cells(mintRowPrdct, “J“).Value = strDesc
Dim intCol As Integer
Sheet6.Cells(mintRowPrdct, 1).Value = strProCode
Sheet7.Cells(mintRowPrdct, 1).Value = strProCode
If UBound(strSpec, 1) > 0 Then
For intCol = 1 To UBound(strSpec, 1)
Sheet6.Cells(mintRowPrdct, 2 + intCol).Value = strSpec(intCol, 1)
Sheet7.Cells(mintRowPrdct, 2 + intCol).Value = strSpec(intCol, 2)
Next intCol
End If
objHtml2.Close
Set objHtml2 = Nothing
objHtml1.Close
Set objHtml1 = Nothing
Set objLink = Nothing
End Sub

Private Function strGetDesc(ByRef objHtml As HTMLDocument) As String
Dim objHtmlItem As Object
Dim objHtmlLastItem As Object
Dim objHtmlTable As HTMLTable
For Each objHtmlItem In objHtml.all
If TypeName(objHtmlItem) = “HTMLTable“ Then
If Not objHtmlLastItem Is Nothing Then
If UCase(objHtmlLastItem.innerText) = UCase(“Description:“) Then
Set objHtmlTable = objHtmlItem
Exit For
End If
End If

End If
If TypeName(objHtmlItem) = “HTMLSpanElement“ Then
Set objHtmlLastItem = objHtmlItem
DoEvents
End If
Next
strGetDesc = Replace(objHtmlTable.innerText, vbCrLf, vbLf)
Set objHtmlItem = Nothing
Set objHtmlTable = Nothing
Set objHtmlLastItem = Nothing
End Function

Private Sub GetSpec(ByRef objHtml As HTMLDocument, ByRef strSpec() As String)
Dim objHtmlItem As Object
Dim blnFlag As Boolean
Dim objHtmlTable As HTMLTable
Dim intRow As Integer
blnFlag = False
For Each objHtmlItem In objHtml.all
If UCase(objHtmlItem.innerText) = UCase(“Specification:“) Then
blnFlag = True
End If
If blnFlag Then
If TypeName(objHtmlItem) = “HTMLTable“ Then
Set objHtmlTable = objHtmlItem
Exit For
End If
End If

Next
If Not objHtmlTable Is Nothing Then
If objHtmlTable.Cells.Length > 1 Then
ReDim strSpec(1 To objHtmlTable.Cells.Length / 2, 1 To 2)
For intRow = 1 To objHtmlTable.Cells.Length / 2
strSpec(intRow, 1) = objHtmlTable.Cells(intRow * 2 - 2).innerText
strSpec(intRow, 2) = objHtmlTable.Cells(intRow * 2 - 1).innerText
Next intRow
Else
ReDim strSpec(0 To 0, 0 To 0)
End If
Else
ReDim strSpec(0 To 0, 0 To 0)
End If
Set objHtmlItem = Nothing
Set objHtmlTable = Nothing
End Sub

Private Sub WriteStrToFile(ByVal strText As String, ByVal strPath As String, ByVal strCharSet As String)
‘Dim objFso As New FileSystemObject
Dim objText As New ADODB.Stream
objText.Type = adTypeText
objText.Open
objText.Charset = strCharSet
objText.WriteText strText, adWriteChar
objText.SaveToFile strPath, adSaveCreateOverWrite
objText.Close
Set objText = Nothing
End Sub


Function ReadStrFromFile(ByVal strPath As String, ByVal strCharSet As String) As String
‘Dim objFso As New FileSystemObject
Dim objText As New ADODB.Stream
objText.Type = adTypeText
objText.Open
objText.Charset = strCharSet
objText.LoadFromFile strPath
ReadStrFromFile = objText.ReadText
objText.Close
Set objText = Nothing
End Function

Public Sub com1()
Dim intRow As Integer
Dim intCol As Integer
Dim strTmp1 As String
Dim strTmp2 As String
For intRow = 2 To 177
strTmp2 = “”
For intCol = 3 To 15
strTmp1 = Trim(Sheet6.Cells(intRow, intCol) & “ | “ & Sheet7.Cells(intRow, intCol))
If strTmp1 <> “|“ Then
If strTmp2 <> “” Then
strTmp2 = strTmp2 & vbLf & strTmp1
Else
strTmp2 = strTmp1
End If
End If
Sheet4.Cells(intRow, 2).Value = strTmp2
Next intCol
Next intRow
End Sub
Related Blogs
Leave a Reply
You must be logged in to post a comment.
