I am trying to get data from Roster Resource, here\'s an example of a webpage (https://www.rosterresource.com/mlb-arizona-diamondbacks). At the very minimum, I want to get t
Matt: You can use PowerQuery (in either Excel or PowerBI) to do this...even if the data isn't stored in an HTML table (which is the case here). There is a very good tutorial at https://datachant.com/2017/03/30/web-scraping-power-bi-excel-power-query/
I'm currently in the middle of my own web scraping challenge, but if you decide to use PowerQuery and get stuck, yell out and I'll see if I can assist further.
If you navigate the webpage https://www.rosterresource.com/mlb-arizona-diamondbacks and choose Inspect element from context menu on the table, you will see in browser developer tools that the whole table is located within a frame:
<iframe id="pageswitcher-content" frameborder="0" marginheight="0" marginwidth="0" src="https://docs.google.com/spreadsheets/d/e/2PACX-1vSe6YBd7UW_ijhVHdRsM132Z3aUXUIzGuHcoqqdsr_nUXIYHbvRDFY0XCwGVndXJnWRaWVYhbeDbo5W/pubhtml/sheet?headers=false&gid=1569103012" style="display: block; width: 100%; height: 100%;"></iframe>
So actually you need to retrieve the data from that Google Spreadsheet document. That could be done with XHR and Regex, as shown in the below code:
Option Explicit
Sub Test()
Dim sContent As String
Dim i As Long
Dim j As Long
Dim k As Long
Dim aTables()
Dim aHeader() As String
Dim aRows() As String
' Retrieve HTML content via XHR
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.rosterresource.com/mlb-arizona-diamondbacks", False
.Send
sContent = .ResponseText
End With
' Cut all before iframe URL
sContent = Split(sContent, "<iframe src=""", 2)(1)
' Cut all after ? sign within URL
sContent = Split(sContent, "?", 2)(0)
' Download google spreadsheet by extracted URL
' e. g. https://docs.google.com/spreadsheets/d/e/2PACX-1vQngsjnOpqkD8FQIOLn4cFayZTe4dl5VJZLNjMzji2Iq0dVXan7nj20Pq6oKnVS_HFla9e5GUtCyYl_/pubhtml
' e. g. https://docs.google.com/spreadsheets/d/e/2PACX-1vSe6YBd7UW_ijhVHdRsM132Z3aUXUIzGuHcoqqdsr_nUXIYHbvRDFY0XCwGVndXJnWRaWVYhbeDbo5W/pubhtml
' Retrieve HTML content via XHR
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", sContent, False
.Send
sContent = .ResponseText
End With
' Parse with RegEx
With CreateObject("VBScript.RegExp")
.Global = True
.MultiLine = True
.IgnoreCase = True
' Process all tables within iframe content
.Pattern = "<table\b[\s\S]*?>([\s\S]*?)</table>"
With .Execute(sContent)
ReDim aTables(0 To .Count - 1)
For i = 0 To .Count - 1
aTables(i) = .Item(i).SubMatches(0)
Next
End With
For k = 0 To UBound(aTables)
' Minor HTML simplification
sContent = aTables(k)
' Remove all tags except table formatting
.Pattern = "<(?!/td|/tr|/th|(?:td|tr|th)\b)[^>]*>|\r|\n|\t"
sContent = .Replace(sContent, "")
' Remove tags attributes
.Pattern = "<(\w+)\b[^>]+>"
sContent = .Replace(sContent, "<$1>")
' Replace th with td
.Pattern = "<(/?)th>"
sContent = .Replace(sContent, "<$1td>")
' Replace HTML entities &name; &#number; with chars
.Pattern = "&(?:\w+|#\d+);"
.Global = False
Do
With .Execute(sContent)
If .Count = 0 Then Exit Do
sContent = Replace(sContent, .Item(0), DecodeHTMLEntities(.Item(0)))
End With
Loop
.Global = True
' Extract rows
.Pattern = "<tr>((?:<td>.*?</td>)+)</tr>"
With .Execute(sContent)
ReDim aRows(0 To .Count - 1, 0)
For i = 0 To .Count - 1
aRows(i, 0) = .Item(i).SubMatches(0)
Next
End With
' Extract cells
.Pattern = "<td>(.*?)</td>"
For i = 0 To UBound(aRows, 1)
With .Execute(aRows(i, 0))
For j = 0 To .Count - 1
If UBound(aRows, 2) < j Then ReDim Preserve aRows(UBound(aRows, 1), j)
aRows(i, j) = Trim(.Item(j).SubMatches(0))
DoEvents
Next
End With
Next
aTables(k) = aRows
Next
End With
' Output
With ThisWorkbook
' Remove all existing worksheets
Application.DisplayAlerts = False
.Sheets.Add , .Sheets(.Sheets.Count)
Do While .Sheets.Count > 1
.Sheets(1).Delete
Loop
Application.DisplayAlerts = True
' Output each table to separate worksheet
For k = 0 To UBound(aTables)
If .Sheets.Count < (k + 1) Then .Sheets.Add , .Sheets(.Sheets.Count)
With .Sheets(k + 1)
.Cells.Delete
Output2DArray .Cells(1, 1), aTables(k)
.Columns.AutoFit
End With
Next
End With
End Sub
Function DecodeHTMLEntities(sText As String) As String
Static oHtmlfile As Object
Static oDiv As Object
If oHtmlfile Is Nothing Then
Set oHtmlfile = CreateObject("htmlfile")
oHtmlfile.Open
Set oDiv = oHtmlfile.createElement("div")
End If
oDiv.innerHTML = sText
DecodeHTMLEntities = oDiv.innerText
End Function
Sub Output2DArray(oDstRng As Range, aCells As Variant)
With oDstRng
.Parent.Select
With .Resize( _
UBound(aCells, 1) - LBound(aCells, 1) + 1, _
UBound(aCells, 2) - LBound(aCells, 2) + 1)
.NumberFormat = "@"
.Value = aCells
End With
End With
End Sub
Generally RegEx's aren't recommended for HTML parsing, so there is disclaimer. Data being processed in this case is quite simple that is why it is parsed with RegEx. About RegEx: introduction (especially syntax), introduction JS, VB flavor. Simplification makes HTML code suitable for parsing in some degree. BTW there is one more answer using the same approach.