[英]web scraping with vba using XMLHTTP
我想從網頁http://www.eex.com/en/market-data/power/derivatives-market/phelix-futures獲取一些數據。
如果我使用舊的InternetExplorer對象(下面的第一段代碼),我可以遍歷HTML文檔。 但我想改用XMLHTTP對象(第二段代碼)。
Sub IEZagon()
    ' Opens the EEX Phelix futures page in a visible Internet Explorer window and
    ' prints the inner text of every <a> element, then of every <td> element,
    ' to the Immediate window.
    '
    ' Internet Explorer is created late-bound through CreateObject, so no project
    ' reference to "Microsoft Internet Controls" is needed for this procedure.
    Const PAGE_URL As String = "http://www.eex.com/en/market-data/power/derivatives-market/phelix-futures"
    Const READYSTATE_COMPLETE As Long = 4

    Dim ie As Object
    Dim AnchorLink As Object, AnchorLinks As Object
    Dim TDelement As Object, TDelements As Object

    Set ie = CreateObject("InternetExplorer.Application")
    With ie
        .Visible = True

        ' Pass the bare address: forum-style [URL]...[/URL] markup is not part of a URL.
        .navigate PAGE_URL

        ' Keep the UI responsive while the page loads; check Busy as well as
        ' ReadyState so the loop does not fall through before navigation has started.
        Do While .Busy Or .readyState <> READYSTATE_COMPLETE
            DoEvents
        Loop

        Set AnchorLinks = .document.getElementsByTagName("a")
        Set TDelements = .document.getElementsByTagName("td")

        For Each AnchorLink In AnchorLinks
            Debug.Print AnchorLink.innerText
        Next AnchorLink

        For Each TDelement In TDelements
            Debug.Print TDelement.innerText
        Next TDelement
    End With

    ' Only the VBA reference is released here; the browser window is left open.
    Set ie = Nothing
End Sub
使用XMLHTTP對象的代碼:
Sub FuturesScrap(ByVal URL As String)
    ' Downloads URL with a synchronous HTTP GET, prints the raw response text,
    ' loads it into an in-memory HTMLDocument and prints the inner text of every
    ' <a> element, then of every <td> element, to the Immediate window.
    '
    ' URL: plain address of the page to fetch (no surrounding markup).
    '
    ' The early-bound types MSXML2.XMLHTTP and HTMLDocument must be resolvable
    ' through the project's references.
    Dim oRequest As MSXML2.XMLHTTP
    Dim HTMLDoc As New HTMLDocument
    Dim sResponse As String
    Dim AnchorLinks As Object
    Dim TDelements As Object
    Dim AnchorLink As Object
    Dim TDelement As Object

    Set oRequest = New MSXML2.XMLHTTP

    ' Third argument False = synchronous: send does not return until the response
    ' has been received, so no readyState polling loop is required afterwards.
    oRequest.Open "GET", URL, False
    oRequest.send

    ' Surface HTTP errors (e.g. an error page served with 404) instead of
    ' silently parsing them as if they were the requested page.
    If oRequest.Status <> 200 Then
        Debug.Print "HTTP status " & oRequest.Status & " " & oRequest.statusText
    End If

    sResponse = oRequest.responseText
    Debug.Print sResponse

    HTMLDoc.body.innerHTML = sResponse
    With HTMLDoc.body
        Set AnchorLinks = .getElementsByTagName("a")
        Set TDelements = .getElementsByTagName("td")
    End With

    ' The loop variable and the printed variable must be the same name; the
    ' previous version printed a misspelled, never-assigned variable here.
    For Each AnchorLink In AnchorLinks
        Debug.Print AnchorLink.innerText
    Next AnchorLink

    For Each TDelement In TDelements
        Debug.Print TDelement.innerText
    Next TDelement
End Sub
我只得到基本的HTML:
<html>
<head>
<title>Resource Not found</title>
<link rel= 'stylesheet' type='text/css' href='/blueprint/css/errorpage.css'/>
</head>
<body>
<table class="header">
<tr>
<td class="CMTitle CMHFill"><span class="large">Resource Not found</span></td>
</tr>
</table>
<div class="body">
<p style="font-weight:bold;">The requested resource does Not exist.</p>
</div>
<table class="footer">
<tr>
<td class="CMHFill"> </td>
</tr>
</table>
</body>
</html>
我想遍歷表格及其對應的數據……最后我還想在從年到月的不同時間間隔之間切換:
我真的很感激任何幫助! 謝謝!
我可以確認,在運行你的代碼時(無論是否包含url標記),我得到的HTML與你的相同。 我在這里發現了一個有用的帖子,並使用在那里找到的方法修改了你的代碼,現在它似乎可以下載到正確的信息了。
Sub test()
    ' Driver: runs FuturesScrap1 against the EEX Phelix futures page,
    ' handing it the address as a plain string.
    Const TARGET_URL As String = "http://www.eex.com/en/market-data/power/derivatives-market/phelix-futures"

    FuturesScrap1 TARGET_URL
End Sub
我把調用它的sub也一並貼出,因為URL中的url標記似乎會導致MSXML請求出錯。
Sub FuturesScrap1(ByVal URL As String)
    ' Downloads URL with a synchronous HTTP GET, prints the raw response text,
    ' loads it into an in-memory HTMLDocument and prints the inner text of every
    ' <a> element, then of every <td> element, to the Immediate window.
    '
    ' URL: plain address of the page to fetch (no surrounding markup).
    '
    ' If no XMLHTTP object can be created, a message box is shown and the
    ' procedure exits without making a request.
    Dim HTMLDoc As New HTMLDocument
    Dim oHttp As MSXML2.XMLHTTP
    Dim sHTML As String
    Dim AnchorLinks As Object
    Dim TDelements As Object
    Dim TDelement As Object
    Dim AnchorLink As Object

    ' Object creation is attempted with errors suppressed so that failure can be
    ' reported with a message box below rather than a run-time error.
    On Error Resume Next
    Set oHttp = New MSXML2.XMLHTTP
    If Err.Number <> 0 Then
        ' Fall back to late-bound creation. The ProgID has to be a registered one;
        ' "MSXML2.XMLHTTP" is the ProgID of the class used above.
        Err.Clear
        Set oHttp = CreateObject("MSXML2.XMLHTTP")
    End If
    On Error GoTo 0

    If oHttp Is Nothing Then
        MsgBox "For some reason I wasn't able to make a MSXML2.XMLHTTP object"
        Exit Sub
    End If

    ' Third argument False = synchronous: send returns once the response is in.
    oHttp.Open "GET", URL, False
    oHttp.send

    ' Surface HTTP errors instead of silently parsing an error page.
    If oHttp.Status <> 200 Then
        Debug.Print "HTTP status " & oHttp.Status & " " & oHttp.statusText
    End If

    ' Read the response once and reuse it for printing and parsing.
    sHTML = oHttp.responseText
    Debug.Print sHTML

    HTMLDoc.body.innerHTML = sHTML
    With HTMLDoc.body
        Set AnchorLinks = .getElementsByTagName("a")
        Set TDelements = .getElementsByTagName("td")
    End With

    For Each AnchorLink In AnchorLinks
        Debug.Print AnchorLink.innerText
    Next AnchorLink

    For Each TDelement In TDelements
        Debug.Print TDelement.innerText
    Next TDelement
End Sub
編輯以下評論:
我無法使用MSXML2對象找到表格元素,頁面源代碼中似乎不包含它們。 在Firebug中可以看到td標記,因此我認為該表格是由JavaScript代碼生成的。 我不確定MSXML2能否運行JavaScript,所以我把sub改成使用Internet Explorer;這段代碼速度不快,但確實能找到td元素,並且可以單擊選項卡。 我發現td元素可能要過一段時間才可用(可能是因為IE必須先運行JavaScript),所以在下載數據之前加入了幾處讓Excel等待的步驟。
我還添加了一些代碼,把td元素的內容寫入活動工作表;它會覆蓋第1列和第2列中已有的內容,因此在包含有用數據的工作簿中運行時請小心。
Sub FuturesScrap3(ByVal URL As String)
    ' Opens URL in a visible Internet Explorer window, writes the text of every
    ' <td> element into column 1, clicks every element whose trimmed text is
    ' "Month", then writes the <td> texts of the updated page into column 2.
    ' Anchor and cell texts are also printed to the Immediate window.
    '
    ' URL: plain address of the page to open.
    '
    ' Existing values in columns 1 and 2 of the target sheet are overwritten.
    ' The browser window is left open when the procedure finishes.
    Dim oIE As InternetExplorer
    Dim oElement As Object

    Set oIE = New InternetExplorer
    oIE.navigate URL
    oIE.Visible = True

    ' First pass: the page as initially rendered.
    DumpIEPageToColumn oIE, 1

    'Clicking the Month tab
    For Each oElement In oIE.document.all
        If Trim(oElement.innerText) = "Month" Then
            oElement.Focus
            oElement.Click
        End If
    Next oElement

    ' Second pass: the page after the tab click.
    DumpIEPageToColumn oIE, 2
End Sub

Private Sub DumpIEPageToColumn(ByVal oIE As InternetExplorer, ByVal lColumn As Long)
    ' Waits for oIE to finish loading, snapshots its body HTML, prints all <a> and
    ' <td> texts, and writes the <td> texts to column lColumn starting at row 1.
    Const SCRIPT_WAIT As String = "0:01:00"

    Dim HTMLDoc As New HTMLDocument
    Dim AnchorLink As Object
    Dim tdElement As Object
    Dim lRow As Long

    Do Until (oIE.readyState = 4 And Not oIE.Busy)
        DoEvents
    Loop

    ' Fixed pause after load, intended to give page scripts time to build the
    ' table before the DOM is copied.
    Application.Wait (Now + TimeValue(SCRIPT_WAIT))

    ' Work on a copy of the markup so later changes in the live page do not
    ' affect the elements being enumerated.
    HTMLDoc.body.innerHTML = oIE.document.body.innerHTML

    For Each AnchorLink In HTMLDoc.body.getElementsByTagName("a")
        Debug.Print AnchorLink.innerText
    Next AnchorLink

    lRow = 1
    For Each tdElement In HTMLDoc.body.getElementsByTagName("td")
        Debug.Print tdElement.innerText
        ' Unqualified Cells, as in the original code.
        Cells(lRow, lColumn).Value = tdElement.innerText
        lRow = lRow + 1
    Next tdElement
End Sub
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.