Cycle through webpages and copy data

前端 未结 2 1411
我在风中等你
我在风中等你 2021-01-20 00:26

I created this script for a friend that cycles through a real estate website and snags email address for her (for promotion). The site offers them freely, but it\'s inconve

相关标签:
2条回答
  • 2021-01-20 00:55

    Although not complete, not optimal, not bugfree, this could help:

    ' VB Script Document
    option explicit
    
    Dim strResult: strResult = Wscript.ScriptName
    Dim numResult: numResult = 0
    Dim ii, IE, pageText, fso, ts, xLink, Links
    
      set fso = createobject("scripting.filesystemobject") 
      set ts = fso.opentextfile("d:\bat\files\28384650_webdump.txt",8,true) 
    
      set IE = createobject("internetexplorer.application") 
    
      'read first page
      IE.navigate "https://netforum.avectra.com/eweb/DynamicPage.aspx?Site=NEFAR&WebCode=IndResult&FromSearchControl=Yes&FromSearchControl=Yes"
      IE.Visible = True
    
    For ii = 1 to 3 '239
      ts.writeLine "-----------------" & ii
      strResult = strResult & vbNewLine & ii
    
      While IE.Busy
        Wscript.Sleep 100
      Wend
      While IE.ReadyState <> 4
        Wscript.Sleep 100
      Wend
      While IE.document.readystate <> "complete" 
          wscript.sleep 100
      Wend
      WScript.Sleep 100
    
      pageText = IE.document.body.innertext
      ts.writeLine pageText
    
      ' get sublinks and collect them in the 'strResult' variable
      Set Links = IE.document.getElementsByTagName("a")
      For Each xLink In Links
        If InStr(1, xLink.href, "WebCode=PrimaryContactInfo" _
          , vbTextCompare) > 0 Then
          If InStr(1, strResult, xLink.href, vbTextCompare) > 0 Then
          Else
            numResult = numResult + 1
            strResult = strResult & vbNewLine & xLink.href
          End If
        End If
      Next
    
      ' read a page of the 'ii' index
      IE.Navigate "javascript:window.__doPostBack('JumpToPage','" & ii+1 & "');"
      IE.Visible = True
    Next
    
      ts.writeLine "===========" & numResult & vbTab & strResult
      ts.close 
    
    Wscript.Echo "All site data copied! " _
        & numResult & vbNewline & strResult
    Wscript.Quit
    

    Explanation:

    • navigates to first page with usual http(s) address
    • navigates to next pages (of the ii+1 index) with javascript ... __doPostBack call (the same as if one fulfill Jump to Page field and click the GO button)
    • not complete: collects (indirect) links to Primary Contact Info webpages where e-mail addresses could be found without getting them
    • not optimal: keeps collecting text of pages visited
    • not bugfree:

      • works fine with freshly cleared MSIE temporary files, history and cookies; otherwise starts at an odd (last visited?) page of netforum.avectra.com
      • navigates to ii+1th page, so fails on the last one.
    0 讨论(0)
  • 2021-01-20 01:03

    Here is true jedi approach :) uses only XMLHttpRequests, there aren't IE disadvantages or dependencies from it. Output window created dynamically via mshta without temp files. Processing speed can be improved by implementing async requests or multiprocess environment. The only way to stop the script at the moment unfortunately is wscript.exe process termination.

    Option Explicit
    
    Dim oDisplay, sUrl, sRespHeaders, sRespText, arrSetHeaders, sEventTarget, arrFormData, lPage, lMember, i, arrFormStrings, sFormData, arrMembers, arrMemeber, sUrlEmail, sRespTextEmail, sEmail
    
    Set oDisplay = New OutputWindow
    sUrl = "https://netforum.avectra.com/eweb/DynamicPage.aspx?Site=NEFAR&WebCode=IndResult&FromSearchControl=Yes"
    lPage = 0
    lMember = 0
    
    ' Initial webpage request
    oDisplay.Write("Connecting " & vbCrLf & sUrl)
    XmlHttpRequest "GET", sUrl, Array(), "", sRespHeaders, sRespText
    
    ' Loop through all pages
    Do
        ' Get cookies, form data, listctrl
        oDisplay.Write("Processing page #" & (lPage + 1))
        sEventTarget = ParseFragm("__doPostBack\('(ListControl_[\s\S]*?)',", sRespText)
        ParseResponse "^Set-(Cookie): ([\S]*?=[\S]*?);[\s\S]*?$", sRespHeaders, arrSetHeaders
        ParseResponse "<input type=""hidden"" name=""([\S]*?)""[\s\S]*?value=""([\s\S]*?)"" />", sRespText, arrFormData
    
        ' Update form params
        For i = 0 To UBound(arrFormData)
            Select Case arrFormData(i)(0)
            Case "__POSTBACKCONTROL"
                arrFormData(i)(1) = "JumpToPage"
            Case "__EVENTTARGET"
                arrFormData(i)(1) = sEventTarget
            Case "__EVENTARGUMENT"
                arrFormData(i)(1) = CStr(lPage)
            End Select
        Next
    
        ' Jump to page #lPage
        arrFormStrings = Array()
        ReDim arrFormStrings(UBound(arrFormData))
        For i = 0 To UBound(arrFormData)
            arrFormStrings(i) = EncodeUriComponent(arrFormData(i)(0)) & "=" & EncodeUriComponent(arrFormData(i)(1))
        Next
        sFormData = Join(arrFormStrings, "&")
        PushItem arrSetHeaders, Array("Content-Type", "application/x-www-form-urlencoded")
        PushItem arrSetHeaders, Array("Content-Length", CStr(Len(sFormData)))
    
        ' New page POST request
        XmlHttpRequest "POST", sUrl, arrSetHeaders, sFormData, sRespHeaders, sRespText
    
        ' Parse members from new page
        ParseMembers sRespText, arrMembers
    
        ' Parse members emails, and output 
        For Each arrMemeber in arrMembers
            lMember = lMember + 1
            sUrlEmail = "https://netforum.avectra.com/eweb/DynamicPage.aspx?Site=NEFAR&WebCode=PrimaryContactInfo&ind_cst_key=" & arrMemeber(0)
            XmlHttpRequest "GET", sUrlEmail, Array(), "", "", sRespTextEmail
            sEmail = ParseFragm("""mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6})""", sRespTextEmail)
            oDisplay.WriteTable(Array(CStr(lMember), sEmail, arrMemeber(0), arrMemeber(1)))
        Next
    
        lPage = lPage + 1
    Loop
    
    
    Sub ParseResponse(sPattern, sResponse, arrData)
        Dim oMatch
        arrData = Array()
        With CreateObject("VBScript.RegExp")
            .Global = True
            .MultiLine = True
            .Pattern = sPattern
            For Each oMatch In .Execute(sResponse)
                PushItem arrData, Array(oMatch.SubMatches(0), oMatch.SubMatches(1))
            Next
        End With
    End Sub
    
    Function ParseFragm(sPattern, sResponse)
        Dim oMatches
        With CreateObject("VBScript.RegExp")
            .Global = True
            .MultiLine = True
            .Pattern = sPattern
            Set oMatches = .Execute(sResponse)
            If oMatches.Count > 0 Then ParseFragm = oMatches(0).SubMatches(0)
        End With
    End Function
    
    Sub ParseMembers(sRespText, arrMembers)
        Dim oMatch
        arrMembers = Array()
        With CreateObject("VBScript.RegExp")
            .Global = True
            .MultiLine = True
            .Pattern = "<td class[\s\S]*?>([\s\S]*?<[\s\S]*?Key=([\s\S]*?)&[\s\S]*?)</td>"
            For Each oMatch In .Execute(sRespText)
                PushItem arrMembers, Array(oMatch.SubMatches(1), GetInnerText(oMatch.SubMatches(0)))
            Next
        End With
    End Sub
    
    Sub PushItem(arrList, varItem)
        ReDim Preserve arrList(UBound(arrList) + 1)
        arrList(UBound(arrList)) = varItem
    End Sub
    
    Function EncodeUriComponent(sText)
        With CreateObject("htmlfile")
            .Write ("<script language='JScript'></script>")
            EncodeUriComponent = .DocumentElement.Document.Script.EncodeUriComponent(sText)
        End With
    End Function
    
    Function GetInnerText(sText)
        With CreateObject("htmlfile")
            .Write ("<body>" & sText & "</body>")
            GetInnerText = .DocumentElement.Document.GetElementsByTagName("body")(0).InnerText
        End With
    End Function
    
    Sub XmlHttpRequest(sMethod, sUrl, arrSetHeaders, sFormData, sRespHeaders, sRespText)
        Dim arrHeader
        With CreateObject("Msxml2.ServerXMLHTTP.3.0")
            .SetOption 2, 13056 ' SXH_SERVER_CERT_IGNORE_ALL_SERVER_ERRORS
            .Open sMethod, sUrl, False
            For Each arrHeader In arrSetHeaders
                .SetRequestHeader arrHeader(0), arrHeader(1)
            Next
            .Send sFormData
            sRespHeaders = .GetAllResponseHeaders
            sRespText = .ResponseText
        End With
    End Sub
    
    Class OutputWindow
    
        Dim oWnd, oDoc, oOutDiv, oCursorDiv, oOutTBody, sSignature, lCols
    
        Private Sub Class_Initialize()
            sSignature = "OutputWindow"
            ProvideWindow()
        End Sub
    
        Private Sub ProvideWindow()
            ' source http://forum.script-coding.com/viewtopic.php?pid=75356#p75356
            Dim lWidth, lHeight
            GetWindow()
            If oWnd Is Nothing Then
                CreateWindow()
                With oWnd
                    With .Document
                        .GetElementsByTagName("head")(0).AppendChild .CreateElement("style")
                        .stylesheets(0).cssText = "body, td, #output {font-family: consolas, courier new; font-size: 9pt;} #cursor {margin: 3px;} body {background-color: buttonface;} #output {height: 100%; width: 100%; overflow: scroll; background: #FFF;} div.hline {height: 1px; width: 100%; background-color: #000; overflow: hidden;} table {width: 100%; TEXT-ALIGN: center; border-COLLAPSE: collapse; background: transparent; margin-top: 1px;} td {border: black 1px solid;}"
                        .Title = "Output Window"
                        .Body.InnerHtml = "<div id='output'><div id='cursor'><img src='' /></div></div>"
                    End With
                    lWidth = CInt(.Screen.AvailWidth * 0.75)
                    lHeight = CInt(.Screen.AvailHeight * 0.75)
                    .ResizeTo .Screen.AvailWidth, .Screen.AvailHeight
                    .ResizeTo lWidth + .Screen.AvailWidth - .Document.Body.OffsetWidth, lHeight + .Screen.AvailHeight - .Document.Body.OffsetHeight
                    .MoveTo CInt((.Screen.AvailWidth - lWidth) / 2), CInt((.Screen.AvailHeight - lHeight) / 2)
                End With
            End If
            Set oDoc = oWnd.Document
            Set oOutDiv = oWnd.output
            Set oCursorDiv = oWnd.cursor
            lCols = -1
        End Sub
    
        Private Sub GetWindow()
            Dim oShellWnd
            On Error Resume Next
            For Each oShellWnd In CreateObject("Shell.Application").Windows
                Set oWnd = oShellWnd.GetProperty(sSignature)
                If Err.Number = 0 Then Exit Sub
                Err.Clear
            Next
            Set oWnd = Nothing
        End Sub
    
        Private Sub CreateWindow()
            Dim oProc
            Do
                Set oProc = CreateObject("WScript.Shell").exec("mshta ""about:<head><script>moveTo(-32000,-32000);window.document.title=' ';</script><hta:application id=app border=dialog minimizebutton=no maximizebutton=no scroll=no showintaskbar=yes contextmenu=yes selection=yes innerborder=no /><object id='shellwindow' classid='clsid:8856F961-340A-11D0-A96B-00C04FD705A2'><param name=RegisterAsBrowser value=1></object><script>shellwindow.putproperty('" & sSignature & "',window);</script></head>""")
                Do
                    If oProc.Status > 0 Then Exit Do
                    GetWindow()
                    If Not (oWnd Is Nothing) Then Exit Sub
                Loop
            Loop
        End Sub
    
        Private Sub ChkDoc()
            On Error Resume Next
            If TypeName(oDoc) <> "HTMLDocument" Then ProvideWindow()
        End Sub
    
        Public Sub Write(sText)
            Dim oDiv
            ChkDoc()
            On Error Resume Next
            Set oDiv = oDoc.CreateElement("div")
            oDiv.InnerHtml = EscapeHtml(sText) & "<div class='hline'></div>"
            oOutDiv.AppendChild oDiv
            oOutDiv.AppendChild oCursorDiv
            oOutDiv.ScrollTop = oOutDiv.ScrollHeight
            lCols = -1
        End Sub
    
        Public Sub WriteTable(arrCells)
            Dim sInner, oTable, oRow, oTr, oCell, n
            ChkDoc()
            On Error Resume Next
            If UBound(arrCells) <> lCols Then
                Set oTable = oDoc.CreateElement("table")
                oOutDiv.AppendChild oTable
                Set oOutTBody = oDoc.CreateElement("tbody")
                oTable.AppendChild oOutTBody
                lCols = UBound(arrCells)
            End If
            Set oTr = oDoc.CreateElement("tr")
            oOutTBody.AppendChild oTr
            For n = 0 To lCols
                Set oCell = oTr.InsertCell(n)
                oCell.InnerHtml = EscapeHtml(arrCells(n))
            Next
            oOutDiv.AppendChild oCursorDiv
            oOutDiv.ScrollTop = oOutDiv.ScrollHeight
        End Sub
    
        Public Sub BreakTable()
            lCols = -1
        End Sub
    
        Private Function EscapeHtml(sCnt)
            Dim n
            sCnt = Replace(sCnt, "&", "&amp;")
            sCnt = Replace(sCnt, """", "&quot;")
            sCnt = Replace(sCnt, "<", "&lt;")
            sCnt = Replace(sCnt, ">", "&gt;")
            sCnt = Replace(sCnt, "'", "&#39;")
            sCnt = Replace(sCnt, vbCrLf, "<br>")
            sCnt = Replace(sCnt, Chr(9), "&nbsp;&nbsp;&nbsp;&nbsp;")
            sCnt = Replace(sCnt, "  ", " &nbsp;")
            sCnt = Replace(sCnt, "&nbsp; ", "&nbsp;&nbsp;")
            For n = 0 To 31
                sCnt = Replace(sCnt, Chr(n), "¶")
            Next
            EscapeHtml = sCnt
        End Function
    
        Private Sub Class_Terminate()
            ' oWnd.close
        End Sub
    
    End Class
    
    0 讨论(0)
提交回复
热议问题