Scraper throws errors instead of quitting the browser when everything is done

后端 未结 2 358
一整个雨季
一整个雨季 2021-01-11 15:49

I\'ve written a scraper to parse movie information from a torrent site. I used IE and queryselector.

My code does parse everything. It thro

相关标签:
2条回答
  • 2021-01-11 16:08

    The website has an API. Check e. g. result from the URL https://yts.am/api/v2/list_movies.json?page=1&limit=50, which actually represents 50 movies from first page of latest movies category, in JSON format.

    Take a look at the below example. Import JSON.bas module into the VBA project for JSON processing.

    Option Explicit
    
    Sub Test()
    
        Dim sJSONString As String
        Dim vJSON
        Dim sState As String
        Dim lPage As Long
        Dim aRes()
        Dim i As Long
        Dim aData()
        Dim aHeader()
    
        With Sheets(1)
            .Cells.Delete
            .Cells.WrapText = False
        End With
        lPage = 1
        aRes = Array()
        Do
            With CreateObject("MSXML2.XMLHTTP")
                .Open "GET", "https://yts.am/api/v2/list_movies.json?page=" & lPage & "&limit=50", False
                .send
                sJSONString = .responseText
            End With
            JSON.Parse sJSONString, vJSON, sState
            If Not vJSON("data").Exists("movies") Then Exit Do
            vJSON = vJSON("data")("movies")
            ReDim Preserve aRes(UBound(aRes) + UBound(vJSON) + 1)
            For i = 0 To UBound(vJSON)
                Set aRes(UBound(aRes) - UBound(vJSON) + i) = vJSON(i)
            Next
            lPage = lPage + 1
            Debug.Print "Parsed " & (UBound(aRes) + 1)
            DoEvents
        Loop
        JSON.ToArray aRes, aData, aHeader
        With Sheets(1)
            OutputArray .Cells(1, 1), aHeader
            Output2DArray .Cells(2, 1), aData
            .Columns.AutoFit
        End With
        MsgBox "Completed"
    
    End Sub
    
    Sub OutputArray(oDstRng As Range, aCells As Variant)
    
        With oDstRng
            .Parent.Select
            With .Resize(1, UBound(aCells) - LBound(aCells) + 1)
                .NumberFormat = "@"
                .Value = aCells
            End With
        End With
    
    End Sub
    
    Sub Output2DArray(oDstRng As Range, aCells As Variant)
    
        With oDstRng
            .Parent.Select
            With .Resize( _
                    UBound(aCells, 1) - LBound(aCells, 1) + 1, _
                    UBound(aCells, 2) - LBound(aCells, 2) + 1)
                .NumberFormat = "@"
                .Value = aCells
            End With
        End With
    
    End Sub
    

    The output for me as follows, at the moment there are 7182 movies total:

    BTW, the similar approach applied in other answers.

    0 讨论(0)
  • 2021-01-11 16:10

    Ok, so there is something seriously unfriendly about that webpage. It kept crashing for me. So I have resorted to running a javascript program within scripting engine/scripting control and it works.

    I hope you can follow it. The logic is in the javascript added to the ScriptEngine. I get two lists of nodes, one list of films and one list of years; then I step through each array in sync and add them as key value pair to a Microsoft Scripting Dictionary.

    Option Explicit
    
    '*Tools->References
    '*    Microsoft Scripting Runtime
    '*    Microsoft Scripting Control
    '*    Microsoft Internet Controls
    '*    Microsoft HTML Object Library
    
    Sub Torrent_Data()
        Dim row As Long
        Dim IE As New InternetExplorer, html As HTMLDocument
        Dim post As Object
    
        With IE
            .Visible = True
            .navigate "https://yts.am/browse-movies"
            Do While .readyState <> READYSTATE_COMPLETE:
                DoEvents
            Loop
            Set html = .document
        End With
    
        Dim dicFilms As Scripting.Dictionary
        Set dicFilms = New Scripting.Dictionary
    
        Call GetScriptEngine.Run("getMovies", html, dicFilms)
    
        Dim vFilms As Variant
        vFilms = dicFilms.Keys
    
        Dim vYears As Variant
        vYears = dicFilms.Items
    
        Dim lRowLoop As Long
        For lRowLoop = 0 To dicFilms.Count - 1
    
            Cells(lRowLoop + 1, 1) = vFilms(lRowLoop)
            Cells(lRowLoop + 1, 2) = vYears(lRowLoop)
    
        Next lRowLoop
    
        Stop
    
        IE.Quit
    End Sub
    
    Private Function GetScriptEngine() As ScriptControl
        '* see code from this SO Q & A
        ' https://stackoverflow.com/questions/37711073/in-excel-vba-on-windows-how-to-get-stringified-json-respresentation-instead-of
        Static soScriptEngine As ScriptControl
        If soScriptEngine Is Nothing Then
            Set soScriptEngine = New ScriptControl
            soScriptEngine.Language = "JScript"
    
            soScriptEngine.AddCode "function getMovies(htmlDocument, microsoftDict) { " & _
                                        "var titles = htmlDocument.querySelectorAll('a.browse-movie-title'), i;" & _
                                        "var years = htmlDocument.querySelectorAll('div.browse-movie-year'), j;" & _
                                        "if ( years.length === years.length) {" & _
                                        "for (i=0; i< years.length; ++i) {" & _
                                        "   var film = titles[i].innerText;" & _
                                        "   var year = years[i].innerText;" & _
                                        "   microsoftDict.Add(film, year);" & _
                                        "}}}"
    
        End If
        Set GetScriptEngine = soScriptEngine
    End Function
    
    0 讨论(0)
提交回复
热议问题