简体   繁体   中英

Scrape Information from HTML using VBA

<div class="game_content">

                <div class="game_table">
                    <article class="league_name">
                        <span class="flags flags-category flags--uefa-europa-league"></span>
                        <h6 class="hide-on-small-only hide-on-med-only hide_ipadLand"><a title="International">International</a></h6>
                        <h3><a href="/en/league-games/1/uefa-europa-league" title="UEFA Europa League">UEFA Europa League</a></h3>
                    </article>                    <div class="game_detailsCol">
                        <div class="col s2 m1 l1">
                            <span class="red-text"><span class="green-text bold">F</span><span class="grey-text">21:45</span></span>
                        </div>
                        <div class="col s3 m4 l5">
                            <div>
                                <p class="yellow_card">3</p>
                            </div>
                            <a href="/en/teams/Marseille" title="Marseille livescore">Marseille</a>
                        </div>

                                                            <div class="col s1 m1 l1">
                                    <a href="/en/game-info/467162/marseille-atletico-madrid" title="Marseille-Atletico Madrid livescore" class="">
                                        0-3                                    </a>
                                </div>



                        <div class="col s3 m4 l5">
                            <a href="/en/teams/Atletico+Madrid" title="Atletico Madrid livescore">Atletico Madrid</a>
                            <div>
                                <span class="grey-text">0-1</span>
                                <p class="yellow_card">2</p>                            </div>
                        </div>
                        <div class="col s2 m1 l1">
                                                        <i class="icon icon-star " game_id="467162"></i>
                            <a class="dropdown-button hide-on-small-only hide-on-med-only hide_ipadLand" data-activates="dropdown467162">
                                <i class="icon icon-more_vert"></i>
                            </a><ul id="dropdown467162" class="dropdown-content">
                                <li class="close_dropMenu"><strong>x</strong></li>
                                                                <li><a href="/en/h2h/467162/marseille-atletico-madrid" title="Head To Head">Head To Head</a></li>
                            </ul>

                        </div>
                    </div>

Guys,

I'm new in scraping information from websites via VBA so help is needed. i want to loop through div class="game_content" and in each div class="game_table" scrap the below info:

FROM title="UEFA Europa League" GET "UEFA Europa League"
FROM title="Marseille livescore">Marseille GET "Marseille"
FROM title="Marseille-Atletico Madrid livescore" class="">0-3 GET 0-3
FROM title="Atletico Madrid livescore">Atletico Madrid GET Atletico Madrid
FROM <span class="grey-text">0-1</span> GET 0-1

My post includes the FIRST class="game_table FROM class="game_content"

My piece of code:

Sub InternetConnection()

    Dim IE As InternetExplorer
    Dim HTMLDocument As HTMLDocument
    Dim GameList As IHTMLElement
    Dim Games As IHTMLElementCollection
    Dim Game As IHTMLElement

    'Open Internet Explorer
    Set IE = New InternetExplorer
        IE.Visible = True
        IE.Navigate ("https://azscore.com/en/yesterday")            
        Do While IE.readyState <> READYSTATE_COMPLETE   
            Application.StatusBar = "Loading..."
        Loop

   Set HTMLDocument = IE.document   
   Set GameList = HTMLDocument.getElementsByClassName("game_content")(0)
   Set Games = GameList.Children       

    For Each Game In Games
        If Game.className = "game_table" Then
        End If
    Next

End Sub

Any help will be appreciate.

Required references via tools > references

  1. HTML Object Library
  2. MS Internet Controls.

Note:

Different output as now run on a different day against the website.


① Here is a version that parses out just those items:

Option Explicit

Public Sub PrintMatchInfo()
    Application.ScreenUpdating = False
    Dim IE As InternetExplorer, doc As HTMLDocument

    Set IE = New InternetExplorer

    With IE
        .Visible = True
        .Navigate2 "https://azscore.com/en/yesterday"

        While .Busy Or .readyState < 4: DoEvents: Wend

        Set doc = .document
    End With

    Dim games As Object, i As Long, j As Long, counter As Long
    counter = 1

    Set games = doc.getElementsByClassName("game_table")

    For i = 0 To games.Length - 1

        For j = 0 To games(i).getElementsByClassName("game_detailsCol").Length - 1
           With ThisWorkbook.Worksheets("Sheet2")
               .Cells(counter, 1) = Split(games(i).innerText, Chr$(10))(3)
               Dim tempArr() As String
               tempArr() = Split(GetKeyDetails(ReplacedString(games(i).getElementsByClassName("game_detailsCol")(j).innerText)), Chr$(126))
               .Cells(counter, 2).Resize(1, UBound(tempArr) + 1) = tempArr
               counter = counter + 1
            End With
        Next j

    Next i

    Application.ScreenUpdating = True

End Sub
Public Function ReplacedString(ByVal str As String) As String
    Dim regex As Object
    Set regex = CreateObject("VBScript.RegExp")

    With regex
        .Global = True
        .MultiLine = True
        .IgnoreCase = False
        .Pattern = "\s{2,}"
    End With
    ReplacedString = regex.Replace(str, Chr$(126)) '<== or other unexpected character
End Function


Public Function GetKeyDetails(ByVal inputString As String) As String
    Dim regex As Object, tempString As String
    Set regex = CreateObject("VBScript.RegExp")

    With regex
        .Global = True
        .MultiLine = True
        .IgnoreCase = False
        .Pattern = "(^[A-Z](?:\d|[01]\d|2[0-3]):[0-5]\d~)(\d?~?)+" '"^[A-Z](?:\d|[01]\d|2[0-3]):[0-5]\d~(\d~)?(\d~)?|x\~Head To Head"
    End With

    tempString = regex.Replace(inputString, vbNullString)

    Dim tempArr() As String

    tempArr = Split(tempString, Chr$(126))

    Dim keyDetails(), outputString As String, detail As Long
    keyDetails = Array(0, 1, 2, 3)

    For detail = LBound(keyDetails) To UBound(keyDetails)
        outputString = outputString & Chr$(126) & tempArr(keyDetails(detail))
    Next detail

    GetKeyDetails = Trim$(outputString)

End Function

Output:

产量

② Version with all the info:

Code:

Option Explicit
Public Sub PrintAllMatchInfo()
    Application.ScreenUpdating = False
    Dim IE As InternetExplorer, doc As HTMLDocument

    Set IE = New InternetExplorer

    With IE
        .Visible = True
        .Navigate2 "https://azscore.com/en/yesterday"

        While .Busy Or .readyState < 4: DoEvents: Wend

        Set doc = .document
    End With

    Dim games As Object, i As Long, j As Long, counter As Long
    counter = 1

    Set games = doc.getElementsByClassName("game_table")

    For i = 0 To games.Length - 1

        For j = 0 To games(i).getElementsByClassName("game_detailsCol").Length - 1
           With ThisWorkbook.Worksheets("Sheet1")
               .Cells(counter, 1) = Split(games(i).innerText, Chr$(10))(3)
               .Cells(counter, 2) = ReplacedString(games(i).getElementsByClassName("game_detailsCol")(j).innerText)
               counter = counter + 1
            End With
        Next j

    Next i

    Application.ScreenUpdating = True

End Sub
Public Function ReplacedString(ByVal str As String) As String
    Dim regex As Object
    Set regex = CreateObject("VBScript.RegExp")

    With regex
        .Global = True
        .MultiLine = True
        .IgnoreCase = False
        .Pattern = "\s{2,}"
    End With
    ReplacedString = regex.Replace(str, " ") '<== or other unexpected character
End Function

Output:

产量

Pattern for your follow-up question: "^\\w+\\/(\\d+~)+"

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM