My current project consists of retrieving data from HTML source code. Specifically, I am looking at crash cases on this website:
https://crashviewer.nhtsa.dot.gov/nass-c
What you are asking is a pretty big request so I am going to give some pointers and starting code. My code should write out all the tables but you will want to play around to get your desired format. There is certainly enough logic around selecting elements efficiently that this should help. * I haven't tested the use of the class to loop over all retrieved ids due time contraints but have tested the individual case and the retrieval of all ids.
To get the initial case links and ids:
I might use a function returning an array containing the links and ids. If you extract the ids they can be passed the XMLHTTP request I show below.
URL is https://crashviewer.nhtsa.dot.gov/LegacyCDS/Search
Public Function GetLinksAndIds(ByVal URL) As Variant
Dim ie As InternetExplorer, i As Long
Set ie = New InternetExplorer
With ie
.Visible = True
.navigate2 URL
While .Busy Or .readyState < 4: DoEvents: Wend
.document.getElementById("btnSubmit1").Click
While .Busy Or .readyState < 4: DoEvents: Wend
Dim caseLinks As Object, id As String, newURL As String
Set caseLinks = .document.querySelectorAll("[href*='CaseID=']")
Dim linksAndIds()
ReDim linksAndIds(1 To caseLinks.Length, 1 To 2)
For i = 0 To caseLinks.Length - 1
linksAndIds(i + 1, 1) = caseLinks.item(i)
linksAndIds(i + 1, 2) = Replace$(caseLinks.item(i), "https://crashviewer.nhtsa.dot.gov/nass-cds/CaseForm.aspx?xsl=main.xsl&CaseID=", vbNullString)
Next
.Quit
End With
GetLinksAndIds = linksAndIds
End Function
Example return values:
For each case - using XMLHTTP:
I would be tempted to avoid IE and use XMLHTTP request (url encoded query string returning more readable page version using the print option). Although I have parsed using css selectors you can read the response into an MSXML2.DOMDocument60
and query with XPath
for example. You can concatenate caseid into URL.
Option Explicit
Public Sub GetTables()
Dim sResponse As String, html As HTMLDocument, clipboard As Object, ws As Worksheet
Set ws = ThisWorkbook.Worksheets("Sheet1")
Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://crashviewer.nhtsa.dot.gov/nass-cds/CaseForm.aspx?ViewPage&xsl=Case.xsl&tab=Crash&form=CaseForm&baseNode=&vehnum=-1&occnum=-1&pos=-1&pos2=-1&websrc=true&title=Crash%20Overview%20-%20Summary&caseid=112007272&year=&fullimage=false", False '<==concatenate caseid into URL
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
sResponse = .responseText
End With
Set html = New HTMLDocument
html.body.innerHTML = sResponse
Dim tables As Object, i As Long
Set tables = html.querySelectorAll("table")
For i = 0 To tables.Length - 1
clipboard.SetText tables.item(i).outerHTML
clipboard.PutInClipboard
ws.Cells(LastRow(ws) + 2, 1).PasteSpecial
Next
End Sub
'https://www.rondebruin.nl/win/s9/win005.htm '<< Function below modified from here
Public Function LastRow(ByVal sh As Worksheet) As Long
On Error Resume Next
LastRow = sh.Cells.Find(What:="*", _
After:=sh.Range("A1"), _
Lookat:=xlPart, _
LookIn:=xlFormulas, _
SearchOrder:=xlByRows, _
SearchDirection:=xlPrevious, _
MatchCase:=False).Row
On Error GoTo 0
End Function
What might it look like altogether (not tested) using a class to hold the xmlhttp object:
Class clsHTTP:
Option Explicit
Private http As Object
Private Sub Class_Initialize()
Set http = CreateObject("MSXML2.XMLHTTP")
End Sub
Public Function GetString(ByVal URL As String) As String
Dim sResponse As String
With http
.Open "GET", URL, False
.send
sResponse = .responseText
End With
End Function
Standard module 1:
Option Explicit
Public Sub GetTables()
Dim sResponse As String, html As HTMLDocument, clipboard As Object, ws As Worksheet
Dim initialLinksURL As String, http As clsHTTP, i As Long, j As Long, newURL As String
Set http = New clsHTTP
Set ws = ThisWorkbook.Worksheets("Sheet1")
Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
Set html = New HTMLDocument
initialLinksURL = "https://crashviewer.nhtsa.dot.gov/LegacyCDS/Search"
Dim linksAndIds()
linksAndIds = GetLinksAndIds(initialLinksURL)
For i = LBound(linksAndIds, 2) To UBound(linksAndIds, 2)
newURL = "https://crashviewer.nhtsa.dot.gov/nass-cds/CaseForm.aspx?ViewPage&xsl=Case.xsl&tab=Crash&form=CaseForm&baseNode=&vehnum=-1&occnum=-1&pos=-1&pos2=-1&websrc=true&title=Crash%20Overview%20-%20Summary&caseid=" & linksAndIds(i, 2) & "&year=&fullimage=false"
html.body.innerHTML = http.GetString(newURL)
Dim tables As Object
Set tables = html.querySelectorAll("table")
For j = 0 To tables.Length - 1
clipboard.SetText tables.item(j).outerHTML
clipboard.PutInClipboard
ws.Cells(LastRow(ws) + 2, 1).PasteSpecial
Next
Next
End Sub
'https://www.rondebruin.nl/win/s9/win005.htm
Public Function LastRow(ByVal sh As Worksheet) As Long
On Error Resume Next
LastRow = sh.Cells.Find(What:="*", _
After:=sh.Range("A1"), _
Lookat:=xlPart, _
LookIn:=xlFormulas, _
SearchOrder:=xlByRows, _
SearchDirection:=xlPrevious, _
MatchCase:=False).Row
On Error GoTo 0
End Function
Public Function GetLinksAndIds(ByVal URL) As Variant
Dim ie As InternetExplorer, i As Long
Set ie = New InternetExplorer
With ie
.Visible = True
.navigate URL
While .Busy Or .readyState < 4: DoEvents: Wend
.document.getElementById("btnSubmit1").Click
While .Busy Or .readyState < 4: DoEvents: Wend
Dim caseLinks As Object, id As String, newURL As String
Set caseLinks = .document.querySelectorAll("[href*='CaseID=']")
Dim linksAndIds()
ReDim linksAndIds(1 To caseLinks.Length, 1 To 2)
For i = 0 To caseLinks.Length - 1
linksAndIds(i + 1, 1) = caseLinks.item(i)
linksAndIds(i + 1, 2) = Replace$(caseLinks.item(i), "https://crashviewer.nhtsa.dot.gov/nass-cds/CaseForm.aspx?xsl=main.xsl&CaseID=", vbNullString)
Next
.Quit
End With
GetLinksAndIds = linksAndIds
End Function
All Internet Explorer option:
Option Explicit
Public Sub GetTables()
Dim sResponse As String, html As HTMLDocument, clipboard As Object, ws As Worksheet
Dim initialLinksURL As String, i As Long, j As Long, newURL As String
Set ws = ThisWorkbook.Worksheets("Sheet1")
Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
Set html = New HTMLDocument
initialLinksURL = "https://crashviewer.nhtsa.dot.gov/LegacyCDS/Search"
Dim ie As InternetExplorer, caseLinks As Object
Set ie = New InternetExplorer
With ie
.Visible = True
.Navigate2 initialLinksURL
While .Busy Or .readyState < 4: DoEvents: Wend
.document.getElementById("btnSubmit1").Click
While .Busy Or .readyState < 4: DoEvents: Wend
Set caseLinks = .document.querySelectorAll("[href*='CaseID=']")
Dim linksAndIds()
ReDim linksAndIds(1 To caseLinks.Length, 1 To 2)
For i = 0 To caseLinks.Length - 1
linksAndIds(i + 1, 1) = caseLinks.item(i)
linksAndIds(i + 1, 2) = Replace$(caseLinks.item(i), "https://crashviewer.nhtsa.dot.gov/nass-cds/CaseForm.aspx?xsl=main.xsl&CaseID=", vbNullString)
Next
For i = LBound(linksAndIds, 2) To 2 ' UBound(linksAndIds, 2)
newURL = "https://crashviewer.nhtsa.dot.gov/nass-cds/CaseForm.aspx?ViewPage&xsl=Case.xsl&tab=Crash&form=CaseForm&baseNode=&vehnum=-1&occnum=-1&pos=-1&pos2=-1&websrc=true&title=Crash%20Overview%20-%20Summary&caseid=" & linksAndIds(i, 2) & "&year=&fullimage=false"
.Navigate2 newURL
While .Busy Or .readyState < 4: DoEvents: Wend
Dim tables As Object
Set tables = .document.querySelectorAll("table")
For j = 0 To tables.Length - 1
clipboard.SetText tables.item(j).outerHTML
clipboard.PutInClipboard
ws.Cells(LastRow(ws) + 2, 1).PasteSpecial
Next
Next
.Quit
End With
End Sub
'https://www.rondebruin.nl/win/s9/win005.htm
Public Function LastRow(ByVal sh As Worksheet) As Long
On Error Resume Next
LastRow = sh.Cells.Find(What:="*", _
After:=sh.Range("A1"), _
Lookat:=xlPart, _
LookIn:=xlFormulas, _
SearchOrder:=xlByRows, _
SearchDirection:=xlPrevious, _
MatchCase:=False).Row
On Error GoTo 0
End Function