How to suppress cookie request

前端 未结 1 857
庸人自扰
庸人自扰 2020-12-19 18:39

I am using vba inside of Excel 2013 to scrape data off of Yahoo Option Contract and while I do get the data, I also get multiple requests to accept a cookie (see dialog belo

相关标签:
1条回答
  • 2020-12-19 19:06

    Try VBA code below to retrieve HTML content of the page via XHR, parse it with RegEx and output to worksheet:

    Option Explicit
    
    Sub Scrape_Yahoo_Option_Contract()
    
        Dim sUrl As String
        Dim aHeaders
        Dim sResp As String
        Dim sContent
        Dim oTables As Object
        Dim oRows As Object
        Dim aData()
        Dim i As Long
    
        ' Get data
        sUrl = "https://finance.yahoo.com/quote/AAPL"
        aHeaders = Array( _
            Array("user-agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36") _
        )
        XmlHttpRequest "GET", sUrl, aHeaders, "", "", sResp
        ' Parse tables
        ParseToDict "(<table class=""[^""]*?W\(100%\)[^>]*>)([\s\S]*?)</table>", sResp, oTables
        ' Parse rows
        For Each sContent In oTables.Items
            ParseToDict "<tr><td>(.*?)</td><td>(.*?)</td></tr>", HtmlSimplify(sContent), oRows
        Next
        ' Populate 2d array
        ReDim aData(1 To oRows.Count, 1 To 2)
        i = 1
        For Each sContent In oRows
            aData(i, 1) = GetInnerText(sContent)
            aData(i, 2) = GetInnerText(oRows(sContent))
            i = i + 1
        Next
        ' Output array to worksheet 1
        With ThisWorkbook.Sheets(1)
            .Cells.Delete
            Output2DArray .Cells(1, 1), aData
            .Cells.EntireColumn.AutoFit
        End With
    
    End Sub
    
    Sub Output2DArray(oDstRng As Range, aCells As Variant)
    
        With oDstRng
            .Parent.Select
            With .Resize( _
                    UBound(aCells, 1) - LBound(aCells, 1) + 1, _
                    UBound(aCells, 2) - LBound(aCells, 2) + 1)
                .NumberFormat = "@"
                .Value = aCells
            End With
        End With
    
    End Sub
    
    Sub XmlHttpRequest(sMethod As String, sUrl As String, arrSetHeaders, sFormData, sRespHeaders As String, sContent As String)
    
        Dim arrHeader
    
        'With CreateObject("Msxml2.ServerXMLHTTP.3.0")
        '    .SetOption 2, 13056 ' SXH_SERVER_CERT_IGNORE_ALL_SERVER_ERRORS
        With CreateObject("Msxml2.XMLHTTP")
            .Open sMethod, sUrl, False
            If IsArray(arrSetHeaders) Then
                For Each arrHeader In arrSetHeaders
                    .SetRequestHeader arrHeader(0), arrHeader(1)
                Next
            End If
            .Send sFormData
            sRespHeaders = .GetAllResponseHeaders
            sContent = .ResponseText
        End With
    
    End Sub
    
    Function HtmlSimplify(ByVal sCont)
    
        With CreateObject("VBScript.RegExp")
            .Global = True
            .MultiLine = True
            .IgnoreCase = True
            .Pattern = "(<[\w\/^<]*)[\s\S]*?>"
            sCont = .Replace(sCont, "$1>")
            .Pattern = "(?:<span>|</span>)"
            sCont = .Replace(sCont, "")
            .Pattern = "(?:<small>|</small>)"
            sCont = .Replace(sCont, "")
            .Pattern = "&nbsp;"
            sCont = .Replace(sCont, " ")
            .Pattern = "[\f\n\r\t\v]"
            sCont = .Replace(sCont, "")
            .Pattern = " +"
            sCont = .Replace(sCont, " ")
            .Pattern = "> <"
            sCont = .Replace(sCont, "><")
        End With
        HtmlSimplify = sCont
    
    End Function
    
    Sub ParseToDict(sPattern As String, sResponse As String, oDict As Object)
    
        Dim oMatch
    
        If oDict Is Nothing Then Set oDict = CreateObject("Scripting.Dictionary")
        With CreateObject("VBScript.RegExp")
            .Global = True
            .MultiLine = True
            .IgnoreCase = True
            .Pattern = sPattern
            For Each oMatch In .Execute(sResponse)
                If Trim(oMatch.SubMatches(0)) <> "" Then oDict(oMatch.SubMatches(0)) = oMatch.SubMatches(1)
            Next
        End With
    
    End Sub
    
    Function GetInnerText(ByVal sHtml As String) As String
    
        Static oHtmlfile As Object
    
        If oHtmlfile Is Nothing Then ' init
            Set oHtmlfile = CreateObject("htmlfile")
            oHtmlfile.Open
            oHtmlfile.Write "<body></body>"
        End If
        ' Convert
        On Error Resume Next
        oHtmlfile.body.innerHTML = sHtml
        GetInnerText = oHtmlfile.body.innerText
    
    End Function
    
    0 讨论(0)
提交回复
热议问题