Excel VBA web scraping for data table

前端 未结 2 1357
臣服心动
臣服心动 2020-12-20 10:51

I am trying to get data from Roster Resource, here\'s an example of a webpage (https://www.rosterresource.com/mlb-arizona-diamondbacks). At the very minimum, I want to get t

相关标签:
2条回答
  • 2020-12-20 10:56

    Matt: You can use PowerQuery (in either Excel or PowerBI) to do this...even if the data isn't stored in an HTML table (which is the case here). There is a very good tutorial at https://datachant.com/2017/03/30/web-scraping-power-bi-excel-power-query/

    I'm currently in the middle of my own web scraping challenge, but if you decide to use PowerQuery and get stuck, yell out and I'll see if I can assist further.

    0 讨论(0)
  • 2020-12-20 11:04

    If you navigate the webpage https://www.rosterresource.com/mlb-arizona-diamondbacks and choose Inspect element from context menu on the table, you will see in browser developer tools that the whole table is located within a frame:

    <iframe id="pageswitcher-content" frameborder="0" marginheight="0" marginwidth="0" src="https://docs.google.com/spreadsheets/d/e/2PACX-1vSe6YBd7UW_ijhVHdRsM132Z3aUXUIzGuHcoqqdsr_nUXIYHbvRDFY0XCwGVndXJnWRaWVYhbeDbo5W/pubhtml/sheet?headers=false&amp;gid=1569103012" style="display: block; width: 100%; height: 100%;"></iframe>
    

    So actually you need to retrieve the data from that Google Spreadsheet document. That could be done with XHR and Regex, as shown in the below code:

    Option Explicit
    
    Sub Test()
    
        Dim sContent As String
        Dim i As Long
        Dim j As Long
        Dim k As Long
        Dim aTables()
        Dim aHeader() As String
        Dim aRows() As String
    
        ' Retrieve HTML content via XHR
        With CreateObject("MSXML2.XMLHTTP")
            .Open "GET", "https://www.rosterresource.com/mlb-arizona-diamondbacks", False
            .Send
            sContent = .ResponseText
        End With
        ' Cut all before iframe URL
        sContent = Split(sContent, "<iframe src=""", 2)(1)
        ' Cut all after ? sign within URL
        sContent = Split(sContent, "?", 2)(0)
        ' Download google spreadsheet by extracted URL
        ' e. g. https://docs.google.com/spreadsheets/d/e/2PACX-1vQngsjnOpqkD8FQIOLn4cFayZTe4dl5VJZLNjMzji2Iq0dVXan7nj20Pq6oKnVS_HFla9e5GUtCyYl_/pubhtml
        ' e. g. https://docs.google.com/spreadsheets/d/e/2PACX-1vSe6YBd7UW_ijhVHdRsM132Z3aUXUIzGuHcoqqdsr_nUXIYHbvRDFY0XCwGVndXJnWRaWVYhbeDbo5W/pubhtml
        ' Retrieve HTML content via XHR
        With CreateObject("MSXML2.XMLHTTP")
            .Open "GET", sContent, False
            .Send
            sContent = .ResponseText
        End With
        ' Parse with RegEx
        With CreateObject("VBScript.RegExp")
            .Global = True
            .MultiLine = True
            .IgnoreCase = True
            ' Process all tables within iframe content
            .Pattern = "<table\b[\s\S]*?>([\s\S]*?)</table>"
            With .Execute(sContent)
                ReDim aTables(0 To .Count - 1)
                For i = 0 To .Count - 1
                    aTables(i) = .Item(i).SubMatches(0)
                Next
            End With
            For k = 0 To UBound(aTables)
                ' Minor HTML simplification
                sContent = aTables(k)
                ' Remove all tags except table formatting
                .Pattern = "<(?!/td|/tr|/th|(?:td|tr|th)\b)[^>]*>|\r|\n|\t"
                sContent = .Replace(sContent, "")
                ' Remove tags attributes
                .Pattern = "<(\w+)\b[^>]+>"
                sContent = .Replace(sContent, "<$1>")
                ' Replace th with td
                .Pattern = "<(/?)th>"
                sContent = .Replace(sContent, "<$1td>")
                ' Replace HTML entities &name; &#number; with chars
                .Pattern = "&(?:\w+|#\d+);"
                .Global = False
                Do
                    With .Execute(sContent)
                        If .Count = 0 Then Exit Do
                        sContent = Replace(sContent, .Item(0), DecodeHTMLEntities(.Item(0)))
                    End With
                Loop
                .Global = True
               ' Extract rows
                .Pattern = "<tr>((?:<td>.*?</td>)+)</tr>"
                With .Execute(sContent)
                    ReDim aRows(0 To .Count - 1, 0)
                    For i = 0 To .Count - 1
                        aRows(i, 0) = .Item(i).SubMatches(0)
                    Next
                End With
                ' Extract cells
                .Pattern = "<td>(.*?)</td>"
                For i = 0 To UBound(aRows, 1)
                    With .Execute(aRows(i, 0))
                        For j = 0 To .Count - 1
                            If UBound(aRows, 2) < j Then ReDim Preserve aRows(UBound(aRows, 1), j)
                            aRows(i, j) = Trim(.Item(j).SubMatches(0))
                            DoEvents
                        Next
                    End With
                Next
                aTables(k) = aRows
            Next
        End With
        ' Output
        With ThisWorkbook
            ' Remove all existing worksheets
            Application.DisplayAlerts = False
            .Sheets.Add , .Sheets(.Sheets.Count)
            Do While .Sheets.Count > 1
                .Sheets(1).Delete
            Loop
            Application.DisplayAlerts = True
            ' Output each table to separate worksheet
            For k = 0 To UBound(aTables)
                If .Sheets.Count < (k + 1) Then .Sheets.Add , .Sheets(.Sheets.Count)
                With .Sheets(k + 1)
                    .Cells.Delete
                    Output2DArray .Cells(1, 1), aTables(k)
                    .Columns.AutoFit
                End With
            Next
        End With
    
    End Sub
    
    Function DecodeHTMLEntities(sText As String) As String
    
        Static oHtmlfile As Object
        Static oDiv As Object
    
        If oHtmlfile Is Nothing Then
            Set oHtmlfile = CreateObject("htmlfile")
            oHtmlfile.Open
            Set oDiv = oHtmlfile.createElement("div")
        End If
        oDiv.innerHTML = sText
        DecodeHTMLEntities = oDiv.innerText
    
    End Function
    
    Sub Output2DArray(oDstRng As Range, aCells As Variant)
    
        With oDstRng
            .Parent.Select
            With .Resize( _
                    UBound(aCells, 1) - LBound(aCells, 1) + 1, _
                    UBound(aCells, 2) - LBound(aCells, 2) + 1)
                .NumberFormat = "@"
                .Value = aCells
            End With
        End With
    
    End Sub
    

    Generally RegEx's aren't recommended for HTML parsing, so there is disclaimer. Data being processed in this case is quite simple that is why it is parsed with RegEx. About RegEx: introduction (especially syntax), introduction JS, VB flavor. Simplification makes HTML code suitable for parsing in some degree. BTW there is one more answer using the same approach.

    0 讨论(0)
提交回复
热议问题