简体   繁体   中英

Setting a web table scraper by Msxml2.ServerXMLHTTP.6.0 on Excel VBA

I need to do a web data scraper.

  1. I need to login to the site: user, password, click login button
  2. Click a second button
  3. Wait for page to load, here is the Table in question. The table is a call log and adds new content dynamically, so it is always refreshing.
  4. I want to exclude a form from the table content and limit the rows pasted to Excel.

I make it work by InternetExplorer.Application code but I need to switch to MSXML2.XMLHTTP code because it is very slow.

Working InternetExplorer.Application Version:

 Sub extractTablesData()
 'we define the essential variables

 Dim IE As Object, obj As Object
 Dim r As Integer, c As Integer, t As Integer
 Dim elemCollection As Object


 'add the "Microsoft Internet Controls" reference in your VBA Project indirectly
 Set IE = CreateObject("InternetExplorer.Application")

 With IE
 .Silent = True
 .Visible = True
 .navigate ("https://www.clickphone.ro")

 ' we ensure that the web page downloads completely before we fill the form automatically
 While IE.readyState <> 4
 DoEvents
 Wend
Application.Wait Now + TimeValue("00:00:03")
Set HTMLDoc = IE.document
 HTMLDoc.all.user.Value = "user or email" 'Enter your email/user id here
 HTMLDoc.all.pass.Value = "xXXxXXXxxXXXxx" 'Enter your password here
 'Login Button Click               
 With IE.document

    Set elems = .getElementsByTagName("a")
    For Each e In elems

        If (e.getAttribute("class") = "orange_button") Then
            e.Click
            Exit For
        End If

    Next e

End With

 'Needed Table page Button Click https://www.clickphone.ro/account/istoric_apel_in.html
 While IE.readyState <> 4
 DoEvents
 Wend
Set iedoc = IE.document

Set elems = iedoc.getElementsByClassName("black")(12)
    elems.Click

 ' again ensuring that the web page loads completely before we start scraping data
 While IE.readyState <> 4
 DoEvents
 Wend
 Application.Wait Now + TimeValue("00:00:05")
 Set iedoc = IE.document

'Clearing any unnecessary or old data in Sheet1

 ThisWorkbook.Sheets("Sheet1").Range("A1:K1000").ClearContents

'Scrapping Data and past to Sheet1
 Set elemCollection = IE.document.getElementsByTagName("table")

    For t = 0 To (elemCollection.Length - 1)
        For r = 0 To (elemCollection(t).Rows.Length - 1)
            For c = 0 To (elemCollection(t).Rows(r).Cells.Length - 1)
                ThisWorkbook.Worksheets(1).Cells(r + 1, c + 1) = elemCollection(t).Rows(r).Cells(c).innerText
            Next c
        Next r
    Next t

 End With

 ' cleaning up memory
 Set IE = Nothing

 End Sub

This is my attempt of MSXMLHTTP:

Option Explicit
 'reference to Microsoft Internet Controls
 'reference to Microsoft HTML Object Library

Sub Web_Table_Option_One()
Dim xml    As Object
Dim html   As Object
Dim objTable As Object
Dim result As String
Dim lRow As Long
Dim lngTable As Long
Dim lngRow As Long
Dim lngCol As Long
Dim ActRw As Long

Set xml = CreateObject("MSXML2.XMLHTTP.6.0")

Set html = CreateObject("htmlFile")

With xml
.Open "POST", "https://www.clickphone.ro/login.html", False
.setRequestHeader "Content-type", "application/x-www-form-urlencoded"
.send "userName=USER&password=XXXXxxxxXxxxxXXX"
.Open "GET", "https://www.clickphone.ro/account/istoric_apel_in.html", False 
.setRequestHeader "Content-type", "text/xml"
.send
End With

html.body.innerHTML = xml.responseText

Set objTable = html.getElementsByTagName("table")
 For lngTable = 0 To objTable.Length - 1
        For lngRow = 0 To objTable(lngTable).Rows.Length - 1
            For lngCol = 0 To objTable(lngTable).Rows(lngRow).Cells.Length - 1
                ThisWorkbook.Sheets("Sheet2").Cells(ActRw + lngRow + 1, lngCol + 1) = objTable(lngTable).Rows(lngRow).Cells(lngCol).innerText
            Next lngCol
        Next lngRow
        ActRw = ActRw + objTable(lngTable).Rows.Length + 1
    Next lngTable
End Sub

HTML source code:

For user,pass,login button:

<form action="/login.html" id="toploginform" name="toploginform" method="post">
                                                                      <script>
                                            function processLoginForm(){
                                                with (document.toploginform) {
                                                    if (user.value=="Email"){alert('Email/Parola incorecte!'); return false}
                                                    document.getElementById('toploginform').submit();
                                                }
                                            }
                                        </script>

                                                                            <fieldset>
                                            <input name="userlogin" type="hidden" id="userlogin" value="true" />
                                            <span class="text">
                                            <input name="user" type="text" onFocus="if(this.value=='Email'){this.value=''}" onBlur="if(this.value==''){this.value='Email'}" value="Email">
                                            </span> <span class="text">
                                            <input name="pass" type="password" onFocus="if(this.value=='Password'){this.value=''}" onBlur="if(this.value==''){this.value='Password'}" value="Password">
                                            </span> 
                                            <input name="authcode" type="hidden" id="authcode" value="false" />
                                            <span><a href="#" class="orange_button" onClick="return processLoginForm()">Login</a></span>
                                             <span class="links"><a href="/login~parola.html">Am uitat parola</a><br/>
                                                <input class="css-checkbox" id="checkbox2" type="checkbox" name="rememberpass" value="da" />
                                                <label for="checkbox2" name="checkbox2_lbl" class="css-label lite-orange-check">Retin datele?</label>
                                        </span>
                                        </fieldset>                         
                                                                </form>

Table page button:

<br />&nbsp;<img src="/images/sageata_orange.gif" width="7" height="8" />&nbsp;<a class="black" href="/account/istoric_apel_in.html">Apeluri primite</a>

Table source code:

<table class="TabelDate" cellspacing="0">
  <thead>
    <tr>
      <th width="130">Data</th>
      <th>Sursa</th>
      <th>Destinatie</th>
      <th>Durata</th>
      <th class="ultima">Status</th>
    </tr>
  </thead>
  <tr class="u">    <td class="prima">19-03-2017 17:31:16</td><td><font color="green"><form name="form24-1489937476.41719" method="post" action="">0720145931 <a class="TipFB" href="#"><span class="tip"><span class="tipTitle">Suna inapoi la 0720145931</span><span class="tipBody">Click si serverul te va suna gratuit pe numarul tau 0371780434.<br />Dupa ce raspunzi se formeaza automat numarul 0720145931.</span><span class="tipArrow"></span></span><input type="image" name="btn_opentextbox" src="/images/phone_small.gif" value="Submit" /></a>    <input name="numartel" type="hidden" id="numartel" value="0720145931" /></form></font></td><td align="center"><font color="green">0371780444</font></td><td align="center"><font color="green">00:00:07</font></td>
            <td class="ultima" align="center"><font color="green">Apel preluat</font></td></tr>  <tr class="gri">    <td class="prima">19-03-2017 17:30:48</td><td><font color="green"><form name="form24-1489937448.41715" method="post" action="">0728409617 <a class="TipFB" href="#"><span class="tip"><span class="tipTitle">Suna inapoi la 0728409617</span><span class="tipBody">Click si serverul te va suna gratuit pe numarul tau 0371780434.<br />Dupa ce raspunzi se formeaza automat numarul 0728409617.</span><span class="tipArrow"></span></span><input type="image" name="btn_opentextbox" src="/images/phone_small.gif" value="Submit" /></a>    <input name="numartel" type="hidden" id="numartel" value="0728409617" /></form></font></td><td align="center"><font color="green">0371780655</font></td><td align="center"><font color="green">00:00:07</font></td>

I manage to partially resolve my problem. Now i can login and retrieve the table i need with XmlHttp. I'l post the working code here so every one can use it (i don't take any credits for this code, i did it with help from different forums)

Option Explicit
 'reference to Microsoft Internet Controls
 'reference to Microsoft HTML Object Library

Sub CallLog()
Dim xml    As Object
Dim html   As Object
Dim objTable As Object
Dim result As String
Dim lRow As Long
Dim lngTable As Long
Dim lngRow As Long
Dim lngCol As Long
Dim ActRw As Long

Set xml = CreateObject("MSXML2.XMLHTTP.6.0")

Set html = CreateObject("htmlFile")

With xml
.Open "POST", "https://www.XXXXXX.xx/login.html", False
.setRequestHeader "Content-type", "application/x-www-form-urlencoded" 'send appropriate Headers
.send "userlogin=true&user=USERNAME&pass=PASSWORD&authcode=false" ' send login info
'MsgBox xml.responseText
.Open "GET", "https://www.XXXXXX.xx/account/callLog.html", False
.setRequestHeader "Content-type", "text/xml"
.send
End With

html.body.innerHTML = xml.responseText

Set objTable = html.getElementsByTagName("table")
    For lngTable = 0 To objTable.Length - 1
        For lngRow = 0 To objTable(lngTable).Rows.Length - 1
            For lngCol = 0 To objTable(lngTable).Rows(lngRow).Cells.Length - 1
                ThisWorkbook.Sheets("Sheet2").Cells(ActRw + lngRow + 1, lngCol + 1) = objTable(lngTable).Rows(lngRow).Cells(lngCol).innerText
            Next lngCol
        Next lngRow
        ActRw = ActRw + objTable(lngTable).Rows.Length + 1
    Next lngTable
End Sub

Now i'm left with the two problems... How can i get the children "table" from the parent "table" (the table i'm after is in a bigger table, see below source code) and i want to get only the first row, but excluding a "form" from the Row (it's a href link) Source Code

How i can get this continuously (this table is dynamic, it's updating every time some one call me, this first Row, is updating continuously)

Version 2.0 of my working code:

Option Explicit
 'reference to Microsoft Internet Controls
 'reference to Microsoft HTML Object Library

Sub CallLog()
Dim xml    As Object
Dim html   As Object
Dim objTable As Object
Dim result As String
Dim lRow As Long
Dim lngTable As Long
Dim lngRow As Long
Dim lngCol As Long
Dim ActRw As Long

Set xml = CreateObject("MSXML2.XMLHTTP.6.0")

Set html = CreateObject("htmlFile")

With xml
.Open "POST", "https://www.XXXXXX.xx/login.html", False
.setRequestHeader "Content-type", "application/x-www-form-urlencoded" 'send appropriate Headers
.send "userlogin=true&user=USERNAME&pass=PASSWORD&authcode=false" ' send login info
'MsgBox xml.responseText
.Open "GET", "https://www.XXXXXX.xx/account/callLog.html", False
.setRequestHeader "Content-type", "text/xml"
.send
End With

html.body.innerHTML = xml.responseText

Set objTable = html.getElementsByTagName("table")
ThisWorkbook.Sheets("LogClickPhone").Range("A2") = objTable(1).Rows(1).Cells(0).innerText
ThisWorkbook.Sheets("LogClickPhone").Range("B2") = objTable(1).Rows(1).Cells(1).innerText
ThisWorkbook.Sheets("LogClickPhone").Range("C2") = objTable(1).Rows(1).Cells(2).innerText
ThisWorkbook.Sheets("LogClickPhone").Range("D2") = objTable(1).Rows(1).Cells(3).innerText
ThisWorkbook.Sheets("LogClickPhone").Range("E2") = objTable(1).Rows(1).Cells(4).innerText
End Sub

I manage to get only the row i need but it's very slow, it takes 38.5 Sec to complete. I think I will better to use MSXML2.DOMDocument.6.0 structure for getting the text i need. But i don't know how to do that. Question: How i can automate this code so it's running every 60 sec or so? Tx

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM