Using post data and xlmlhttp

前端 未结 1 1120
鱼传尺愫
鱼传尺愫 2020-12-17 07:25

I am still trying to learn more about scraping and I could devise a code that enables me to get the desired results.

Here\'s the code



        
相关标签:
1条回答
  • 2020-12-17 08:18

    Take a look at the below example:

    Option Explicit
    
    Sub Test()
    
        Dim sState As String
        Dim sCourt As String
        Dim dtFrom As Date
        Dim dtTill As Date
        Dim sSubject As String
        Dim sOrder As String
        Dim oStates As Object
        Dim oCourts As Object
        Dim oSubjects As Object
        Dim oOrders As Object
        Dim sStateCode As String
        Dim sCourtId As String
        Dim sSubjectVal As String
        Dim sOrderVal As String
        Dim aData
    
        ' Set query data
        sState = ""
        sCourt = ""
        dtFrom = DateSerial(2018, 2, 11)
        dtTill = DateSerial(2018, 2, 11)
        sSubject = ""
        sOrder = "Aktenzeichen"
    
        ' Retrieve options
        GetOptions oStates, oCourts, oSubjects, oOrders
    
        ' Validate query parameters
        If Not oStates.Exists(sState) Then MsgBox "State valid values:" & vbCrLf & vbCrLf & """" & Join(oStates.Keys(), """" & vbCrLf & """") & """": Exit Sub
        If Not oCourts(oStates(sState)).Exists(sCourt) Then MsgBox "Court valid values:" & vbCrLf & vbCrLf & """" & Join(oCourts(oStates(sState)).Keys(), """" & vbCrLf & """") & """": Exit Sub
        If Not oSubjects.Exists(sSubject) Then MsgBox "Subject valid values:" & vbCrLf & vbCrLf & """" & Join(oSubjects.Keys(), """" & vbCrLf & """") & """": Exit Sub
        If Not oOrders.Exists(sOrder) Then MsgBox "Order valid values:" & vbCrLf & vbCrLf & """" & Join(oOrders.Keys(), """" & vbCrLf & """") & """": Exit Sub
    
        ' Request data
        sStateCode = oStates(sState)
        sCourtId = oCourts(sStateCode)(sCourt)
        sSubjectVal = oSubjects(sSubject)
        sOrderVal = oOrders(sOrder)
        GetData sStateCode, sCourt, sCourtId, dtFrom, dtTill, sSubjectVal, sOrderVal, aData
    
        ' Rebuild nested arrays to 2d array for output
        aData = Denestify(aData)
        ' Output
        With ThisWorkbook.Sheets(1)
            .Cells.Delete
            Output2DArray .Cells(1, 1), aData
            .Columns.AutoFit
        End With
    
        MsgBox "Completed"
    
    End Sub
    
    Sub GetOptions(oStates As Object, oCourts As Object, oSubjects As Object, oOrders As Object)
    
        Dim sContent As String
        Dim aTmp0
        Dim aTmp1
        Dim vItem
        Dim oTmp
        Dim i As Long
    
        ' Retrieve request options from search page
        XmlHttpRequest "GET", "https://www.handelsregisterbekanntmachungen.de/?aktion=suche", "", "", "", sContent
    
        ' Get each state and code
        ExtractOptions sContent, "land", oStates
    
        ' Get courts with courts ids for each federal state
        Set oCourts = CreateObject("Scripting.Dictionary")
        For Each vItem In oStates.Items()
            ' Put courts and ids into temp dictionary
            Set oTmp = CreateObject("Scripting.Dictionary")
            If vItem <> "" Then
                ' Extract the whole JS array statement with courts names
                ParseResponse "BundeslandArray\['" & vItem & "'\]=new Array\(('[^']*'(?:,'[^']*')*)\);", sContent, aTmp0, False
                ' Extract each court name into temp array
                ParseResponse "'([^']*)'", (aTmp0(0)), aTmp0, False
                ' Extract the whole JS array statement with courts ids
                ParseResponse "BundeslandArrayId\['" & vItem & "'\]=new Array\(('[^']*'(?:,'[^']*')*)\);", sContent, aTmp1, False
                ' Extract each court id into temp array
                ParseResponse "'([^']*)'", (aTmp1(0)), aTmp1, False
                For i = 0 To UBound(aTmp0)
                    oTmp(DecodeHTMLEntities((aTmp0(i)))) = DecodeHTMLEntities((aTmp1(i)))
                Next
            End If
            ' Add dummy item
            oTmp("") = ""
            ' Put courts-ids for the state code into dictionary
            Set oCourts(vItem) = oTmp
        Next
        ' Add dummy item
        oStates("") = ""
    
        ' Get subjects
        ExtractOptions sContent, "gegenstand", oSubjects
        ' Add dummy item
        oSubjects("") = "0"
    
        ' Get sort order types
        ExtractOptions sContent, "order", oOrders
    
    End Sub
    
    Sub GetData(sStateCode As String, sCourt As String, sCourtId As String, dtFrom As Date, dtTill As Date, sSubjectVal As String, sOrderVal As String, aData)
    
        Dim i As Long
        Dim oQuery As Object
        Dim sQuery As String
        Dim sContent As String
    
        ' Set query parameters
        Set oQuery = CreateObject("Scripting.Dictionary")
        With oQuery
            .Add "suchart", "uneingeschr"
            .Add "button", "Start search"
            .Add "land", sStateCode
            .Add "gericht", sCourtId
            .Add "gericht_name", sCourt
            .Add "seite", ""
            .Add "l", ""
            .Add "r", ""
            .Add "all", "false"
            .Add "vt", Day(dtFrom)
            .Add "vm", Month(dtFrom)
            .Add "vj", Year(dtFrom)
            .Add "bt", Day(dtTill)
            .Add "bm", Month(dtTill)
            .Add "bj", Year(dtTill)
            .Add "fname", ""
            .Add "fsitz", ""
            .Add "rubrik", ""
            .Add "az", ""
            .Add "gegenstand", sSubjectVal
            .Add "anzv", "alle"
            .Add "order", sOrderVal
        End With
        sQuery = EncodeQueryParams(oQuery)
    
        ' Retrieve search results
        XmlHttpRequest "POST", _
            "https://www.handelsregisterbekanntmachungen.de/de/index.php?aktion=suche", _
            Array( _
                Array("Content-Type", "application/x-www-form-urlencoded"), _
                Array("Content-Length", Len(sQuery) _
                ) _
            ), _
            sQuery, _
            "", _
            sContent
    
        ' Parse response
        sContent = Replace(sContent, "<br>", vbCrLf)
        ParseResponse "<li[^>]*><a[^>]*?href=""javascript:NeuFenster\('([^']*)'\)""[^>]*>([^<]*)<ul[^>]*>([\s\S]*?)</ul>", sContent, aData, False
        For i = 0 To UBound(aData, 1)
            aData(i)(0) = "http://www.handelsregisterbekanntmachungen.de/en/skripte/hrb.php?" & aData(i)(0)
        Next
    
    End Sub
    
    Sub ExtractOptions(sContent As String, sName As String, oOptions As Object)
    
        Dim aTmp0
        Dim vItem
    
        ' Extract the whole <select> for parameter
        ParseResponse "<select[^>]* name=" & sName & "[^>]*>[^<]*((?:<option[^>]*>[^<]*</option>[^<]*)+)[^<]*</[^>]*>", sContent, aTmp0, False
        ' Extract each parameter <option>
        ParseResponse "<option[^>]*value=(""[^""]*""|[^\s>]*)[^>]*>([^<]*)</option>", (aTmp0(0)), aTmp0, False
        ' Put each parameter and value into dictionary
        Set oOptions = CreateObject("Scripting.Dictionary")
        For Each vItem In aTmp0
            oOptions(DecodeHTMLEntities((vItem(1)))) = DecodeHTMLEntities(Replace(vItem(0), """", ""))
        Next
    
    End Sub
    
    Sub XmlHttpRequest(sMethod, sUrl, aSetHeaders, sFormData, sRespHeaders, sRespText)
    
        Dim aHeader
    
        ' With CreateObject("MSXML2.ServerXMLHTTP")
            ' .SetOption 2, 13056 ' SXH_SERVER_CERT_IGNORE_ALL_SERVER_ERRORS
        With CreateObject("MSXML2.XMLHTTP")
            .Open sMethod, sUrl, False
            If IsArray(aSetHeaders) Then
                For Each aHeader In aSetHeaders
                    .SetRequestHeader aHeader(0), aHeader(1)
                Next
            End If
            .Send (sFormData)
            sRespHeaders = .GetAllResponseHeaders
            sRespText = .ResponseText
        End With
    
    End Sub
    
    Sub ParseResponse(sPattern, sResponse, aData, Optional bAppend As Boolean = True, Optional bGlobal = True, Optional bMultiLine = True, Optional bIgnoreCase = True)
    
        Dim oMatch
        Dim aTmp0()
        Dim sSubMatch
    
        If Not (IsArray(aData) And bAppend) Then aData = Array()
        With CreateObject("VBScript.RegExp")
            .Global = bGlobal
            .MultiLine = bMultiLine
            .IgnoreCase = bIgnoreCase
            .Pattern = sPattern
            For Each oMatch In .Execute(sResponse)
                If oMatch.SubMatches.Count = 1 Then
                    PushItem aData, oMatch.SubMatches(0)
                Else
                    aTmp0 = Array()
                    For Each sSubMatch In oMatch.SubMatches
                        PushItem aTmp0, sSubMatch
                    Next
                    PushItem aData, aTmp0
                End If
            Next
        End With
    
    End Sub
    
    Sub PushItem(aData, vItem, Optional bAppend As Boolean = True)
    
        If Not (IsArray(aData) And bAppend) Then aData = Array()
        ReDim Preserve aData(UBound(aData) + 1)
        aData(UBound(aData)) = vItem
    
    End Sub
    
    Function DecodeHTMLEntities(sText As String) As String
    
        Static oHtmlfile As Object
        Static oDiv As Object
    
        If oHtmlfile Is Nothing Then
            Set oHtmlfile = CreateObject("htmlfile")
            oHtmlfile.Open
            Set oDiv = oHtmlfile.createElement("div")
        End If
        oDiv.innerHTML = sText
        DecodeHTMLEntities = oDiv.innerText
    
    End Function
    
    Function EncodeQueryParams(oParams As Object) As String
    
        Dim aParams
        Dim i As Long
    
        aParams = oParams.Keys()
        For i = 0 To UBound(aParams)
            aParams(i) = EncodeUriComponent((aParams(i))) & "=" & EncodeUriComponent((oParams(aParams(i))))
        Next
        EncodeQueryParams = Join(aParams, "&")
    
    End Function
    
    Function EncodeUriComponent(strText As String) As String
    
        Static objHtmlfile As Object
    
        If objHtmlfile Is Nothing Then
            Set objHtmlfile = CreateObject("htmlfile")
            objHtmlfile.parentWindow.execScript "function encode(s) {return encodeURIComponent(s)}", "jscript"
        End If
        EncodeUriComponent = objHtmlfile.parentWindow.encode(strText)
    
    End Function
    
    Function Denestify(aRows)
    
        Dim aData()
        Dim aItems()
        Dim i As Long
        Dim j As Long
    
        If UBound(aRows) = -1 Then Exit Function
        ReDim aData(1 To UBound(aRows) + 1, 1 To 1)
        For j = 0 To UBound(aRows)
            If IsArray(aRows(j)) Then
                aItems = aRows(j)
                For i = 0 To UBound(aItems)
                    If i + 1 > UBound(aData, 2) Then ReDim Preserve aData(1 To UBound(aRows) + 1, 1 To i + 1)
                    aData(j + 1, i + 1) = aItems(i)
                Next
            Else
                aData(j + 1, 1) = aRows(j)
            End If
        Next
        Denestify = aData
    
    End Function
    
    Sub Output2DArray(oDstRng As Range, aCells As Variant)
    
        With oDstRng
            .Parent.Select
            With .Resize( _
                    UBound(aCells, 1) - LBound(aCells, 1) + 1, _
                    UBound(aCells, 2) - LBound(aCells, 2) + 1)
                .NumberFormat = "@"
                .Value = aCells
            End With
        End With
    
    End Sub
    
    0 讨论(0)
提交回复
热议问题