I created this script for a friend that cycles through a real estate website and snags email address for her (for promotion). The site offers them freely, but it\'s inconve
Although not complete, not optimal, not bugfree, this could help:
' VB Script Document
option explicit
Dim strResult: strResult = Wscript.ScriptName
Dim numResult: numResult = 0
Dim ii, IE, pageText, fso, ts, xLink, Links
set fso = createobject("scripting.filesystemobject")
set ts = fso.opentextfile("d:\bat\files\28384650_webdump.txt",8,true)
set IE = createobject("internetexplorer.application")
'read first page
IE.navigate "https://netforum.avectra.com/eweb/DynamicPage.aspx?Site=NEFAR&WebCode=IndResult&FromSearchControl=Yes&FromSearchControl=Yes"
IE.Visible = True
For ii = 1 to 3 '239
ts.writeLine "-----------------" & ii
strResult = strResult & vbNewLine & ii
While IE.Busy
Wscript.Sleep 100
Wend
While IE.ReadyState <> 4
Wscript.Sleep 100
Wend
While IE.document.readystate <> "complete"
wscript.sleep 100
Wend
WScript.Sleep 100
pageText = IE.document.body.innertext
ts.writeLine pageText
' get sublinks and collect them in the 'strResult' variable
Set Links = IE.document.getElementsByTagName("a")
For Each xLink In Links
If InStr(1, xLink.href, "WebCode=PrimaryContactInfo" _
, vbTextCompare) > 0 Then
If InStr(1, strResult, xLink.href, vbTextCompare) > 0 Then
Else
numResult = numResult + 1
strResult = strResult & vbNewLine & xLink.href
End If
End If
Next
' read a page of the 'ii' index
IE.Navigate "javascript:window.__doPostBack('JumpToPage','" & ii+1 & "');"
IE.Visible = True
Next
ts.writeLine "===========" & numResult & vbTab & strResult
ts.close
Wscript.Echo "All site data copied! " _
& numResult & vbNewline & strResult
Wscript.Quit
Explanation:
http
(s) addressii+1
index) with javascript
... __doPostBack
call (the same as if one fulfill Jump to Page field and click the GO
button)not bugfree:
ii+1
th page, so fails on the last one. Here is true jedi approach :) uses only XMLHttpRequests
, there aren't IE disadvantages or dependencies from it. Output window created dynamically via mshta
without temp files. Processing speed can be improved by implementing async requests or multiprocess environment. The only way to stop the script at the moment unfortunately is wscript.exe
process termination.
Option Explicit
Dim oDisplay, sUrl, sRespHeaders, sRespText, arrSetHeaders, sEventTarget, arrFormData, lPage, lMember, i, arrFormStrings, sFormData, arrMembers, arrMemeber, sUrlEmail, sRespTextEmail, sEmail
Set oDisplay = New OutputWindow
sUrl = "https://netforum.avectra.com/eweb/DynamicPage.aspx?Site=NEFAR&WebCode=IndResult&FromSearchControl=Yes"
lPage = 0
lMember = 0
' Initial webpage request
oDisplay.Write("Connecting " & vbCrLf & sUrl)
XmlHttpRequest "GET", sUrl, Array(), "", sRespHeaders, sRespText
' Loop through all pages
Do
' Get cookies, form data, listctrl
oDisplay.Write("Processing page #" & (lPage + 1))
sEventTarget = ParseFragm("__doPostBack\('(ListControl_[\s\S]*?)',", sRespText)
ParseResponse "^Set-(Cookie): ([\S]*?=[\S]*?);[\s\S]*?$", sRespHeaders, arrSetHeaders
ParseResponse "<input type=""hidden"" name=""([\S]*?)""[\s\S]*?value=""([\s\S]*?)"" />", sRespText, arrFormData
' Update form params
For i = 0 To UBound(arrFormData)
Select Case arrFormData(i)(0)
Case "__POSTBACKCONTROL"
arrFormData(i)(1) = "JumpToPage"
Case "__EVENTTARGET"
arrFormData(i)(1) = sEventTarget
Case "__EVENTARGUMENT"
arrFormData(i)(1) = CStr(lPage)
End Select
Next
' Jump to page #lPage
arrFormStrings = Array()
ReDim arrFormStrings(UBound(arrFormData))
For i = 0 To UBound(arrFormData)
arrFormStrings(i) = EncodeUriComponent(arrFormData(i)(0)) & "=" & EncodeUriComponent(arrFormData(i)(1))
Next
sFormData = Join(arrFormStrings, "&")
PushItem arrSetHeaders, Array("Content-Type", "application/x-www-form-urlencoded")
PushItem arrSetHeaders, Array("Content-Length", CStr(Len(sFormData)))
' New page POST request
XmlHttpRequest "POST", sUrl, arrSetHeaders, sFormData, sRespHeaders, sRespText
' Parse members from new page
ParseMembers sRespText, arrMembers
' Parse members emails, and output
For Each arrMemeber in arrMembers
lMember = lMember + 1
sUrlEmail = "https://netforum.avectra.com/eweb/DynamicPage.aspx?Site=NEFAR&WebCode=PrimaryContactInfo&ind_cst_key=" & arrMemeber(0)
XmlHttpRequest "GET", sUrlEmail, Array(), "", "", sRespTextEmail
sEmail = ParseFragm("""mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6})""", sRespTextEmail)
oDisplay.WriteTable(Array(CStr(lMember), sEmail, arrMemeber(0), arrMemeber(1)))
Next
lPage = lPage + 1
Loop
Sub ParseResponse(sPattern, sResponse, arrData)
Dim oMatch
arrData = Array()
With CreateObject("VBScript.RegExp")
.Global = True
.MultiLine = True
.Pattern = sPattern
For Each oMatch In .Execute(sResponse)
PushItem arrData, Array(oMatch.SubMatches(0), oMatch.SubMatches(1))
Next
End With
End Sub
Function ParseFragm(sPattern, sResponse)
Dim oMatches
With CreateObject("VBScript.RegExp")
.Global = True
.MultiLine = True
.Pattern = sPattern
Set oMatches = .Execute(sResponse)
If oMatches.Count > 0 Then ParseFragm = oMatches(0).SubMatches(0)
End With
End Function
Sub ParseMembers(sRespText, arrMembers)
Dim oMatch
arrMembers = Array()
With CreateObject("VBScript.RegExp")
.Global = True
.MultiLine = True
.Pattern = "<td class[\s\S]*?>([\s\S]*?<[\s\S]*?Key=([\s\S]*?)&[\s\S]*?)</td>"
For Each oMatch In .Execute(sRespText)
PushItem arrMembers, Array(oMatch.SubMatches(1), GetInnerText(oMatch.SubMatches(0)))
Next
End With
End Sub
Sub PushItem(arrList, varItem)
ReDim Preserve arrList(UBound(arrList) + 1)
arrList(UBound(arrList)) = varItem
End Sub
Function EncodeUriComponent(sText)
With CreateObject("htmlfile")
.Write ("<script language='JScript'></script>")
EncodeUriComponent = .DocumentElement.Document.Script.EncodeUriComponent(sText)
End With
End Function
Function GetInnerText(sText)
With CreateObject("htmlfile")
.Write ("<body>" & sText & "</body>")
GetInnerText = .DocumentElement.Document.GetElementsByTagName("body")(0).InnerText
End With
End Function
Sub XmlHttpRequest(sMethod, sUrl, arrSetHeaders, sFormData, sRespHeaders, sRespText)
Dim arrHeader
With CreateObject("Msxml2.ServerXMLHTTP.3.0")
.SetOption 2, 13056 ' SXH_SERVER_CERT_IGNORE_ALL_SERVER_ERRORS
.Open sMethod, sUrl, False
For Each arrHeader In arrSetHeaders
.SetRequestHeader arrHeader(0), arrHeader(1)
Next
.Send sFormData
sRespHeaders = .GetAllResponseHeaders
sRespText = .ResponseText
End With
End Sub
Class OutputWindow
Dim oWnd, oDoc, oOutDiv, oCursorDiv, oOutTBody, sSignature, lCols
Private Sub Class_Initialize()
sSignature = "OutputWindow"
ProvideWindow()
End Sub
Private Sub ProvideWindow()
' source http://forum.script-coding.com/viewtopic.php?pid=75356#p75356
Dim lWidth, lHeight
GetWindow()
If oWnd Is Nothing Then
CreateWindow()
With oWnd
With .Document
.GetElementsByTagName("head")(0).AppendChild .CreateElement("style")
.stylesheets(0).cssText = "body, td, #output {font-family: consolas, courier new; font-size: 9pt;} #cursor {margin: 3px;} body {background-color: buttonface;} #output {height: 100%; width: 100%; overflow: scroll; background: #FFF;} div.hline {height: 1px; width: 100%; background-color: #000; overflow: hidden;} table {width: 100%; TEXT-ALIGN: center; border-COLLAPSE: collapse; background: transparent; margin-top: 1px;} td {border: black 1px solid;}"
.Title = "Output Window"
.Body.InnerHtml = "<div id='output'><div id='cursor'><img src='' /></div></div>"
End With
lWidth = CInt(.Screen.AvailWidth * 0.75)
lHeight = CInt(.Screen.AvailHeight * 0.75)
.ResizeTo .Screen.AvailWidth, .Screen.AvailHeight
.ResizeTo lWidth + .Screen.AvailWidth - .Document.Body.OffsetWidth, lHeight + .Screen.AvailHeight - .Document.Body.OffsetHeight
.MoveTo CInt((.Screen.AvailWidth - lWidth) / 2), CInt((.Screen.AvailHeight - lHeight) / 2)
End With
End If
Set oDoc = oWnd.Document
Set oOutDiv = oWnd.output
Set oCursorDiv = oWnd.cursor
lCols = -1
End Sub
Private Sub GetWindow()
Dim oShellWnd
On Error Resume Next
For Each oShellWnd In CreateObject("Shell.Application").Windows
Set oWnd = oShellWnd.GetProperty(sSignature)
If Err.Number = 0 Then Exit Sub
Err.Clear
Next
Set oWnd = Nothing
End Sub
Private Sub CreateWindow()
Dim oProc
Do
Set oProc = CreateObject("WScript.Shell").exec("mshta ""about:<head><script>moveTo(-32000,-32000);window.document.title=' ';</script><hta:application id=app border=dialog minimizebutton=no maximizebutton=no scroll=no showintaskbar=yes contextmenu=yes selection=yes innerborder=no /><object id='shellwindow' classid='clsid:8856F961-340A-11D0-A96B-00C04FD705A2'><param name=RegisterAsBrowser value=1></object><script>shellwindow.putproperty('" & sSignature & "',window);</script></head>""")
Do
If oProc.Status > 0 Then Exit Do
GetWindow()
If Not (oWnd Is Nothing) Then Exit Sub
Loop
Loop
End Sub
Private Sub ChkDoc()
On Error Resume Next
If TypeName(oDoc) <> "HTMLDocument" Then ProvideWindow()
End Sub
Public Sub Write(sText)
Dim oDiv
ChkDoc()
On Error Resume Next
Set oDiv = oDoc.CreateElement("div")
oDiv.InnerHtml = EscapeHtml(sText) & "<div class='hline'></div>"
oOutDiv.AppendChild oDiv
oOutDiv.AppendChild oCursorDiv
oOutDiv.ScrollTop = oOutDiv.ScrollHeight
lCols = -1
End Sub
Public Sub WriteTable(arrCells)
Dim sInner, oTable, oRow, oTr, oCell, n
ChkDoc()
On Error Resume Next
If UBound(arrCells) <> lCols Then
Set oTable = oDoc.CreateElement("table")
oOutDiv.AppendChild oTable
Set oOutTBody = oDoc.CreateElement("tbody")
oTable.AppendChild oOutTBody
lCols = UBound(arrCells)
End If
Set oTr = oDoc.CreateElement("tr")
oOutTBody.AppendChild oTr
For n = 0 To lCols
Set oCell = oTr.InsertCell(n)
oCell.InnerHtml = EscapeHtml(arrCells(n))
Next
oOutDiv.AppendChild oCursorDiv
oOutDiv.ScrollTop = oOutDiv.ScrollHeight
End Sub
Public Sub BreakTable()
lCols = -1
End Sub
Private Function EscapeHtml(sCnt)
Dim n
sCnt = Replace(sCnt, "&", "&")
sCnt = Replace(sCnt, """", """)
sCnt = Replace(sCnt, "<", "<")
sCnt = Replace(sCnt, ">", ">")
sCnt = Replace(sCnt, "'", "'")
sCnt = Replace(sCnt, vbCrLf, "<br>")
sCnt = Replace(sCnt, Chr(9), " ")
sCnt = Replace(sCnt, " ", " ")
sCnt = Replace(sCnt, " ", " ")
For n = 0 To 31
sCnt = Replace(sCnt, Chr(n), "¶")
Next
EscapeHtml = sCnt
End Function
Private Sub Class_Terminate()
' oWnd.close
End Sub
End Class