Get Wikipedia page urls from an Excel list

↘锁芯ラ 提交于 2019-12-25 03:12:05

问题


I have an issue where I am not making any progress.

I am working on my master thesis at the moment. For that I have a list of Actors and need to check which of them has a own (German) Wikipedia page. (approximately 20,000 actors) Since i am not very experienced in vba programming, I looked for a solution here in the forum. I found a code with which you can search for urls via google and get the first result copied into excel.

Using VBA in Excel to Google Search in IE and return the hyperlink of the first result

I tried to restrict the search to the german wikipedia by having google search for german pages only. E.g. "site:de.wikipedia.org intitle:johnny depp"

This works out fine for the known actors, but I get an error code when I search for an actor that does not have his own page. "Error 91: Object variable or with block variable not set"

So can you maybe help me to build a work-around into the code that skips the actor when he/she has no own page and insead continues with the next in the list?

Sorry for the noobie question, but that would be great! :) Or maybe you even have a way simpler solution.

Thank you so much!

Sample File

Sub XMLHTTP()
Dim url As String, lastRow As Long
Dim XMLHTTP As Object, html As Object, objResultDiv As Object, objH3 As Object, link As Object
Dim start_time As Date
Dim end_time As Date

lastRow = Range("A" & Rows.Count).End(xlUp).Row

Dim cookie As String
Dim result_cookie As String

start_time = Time
Debug.Print "start_time:" & start_time

For i = 2 To lastRow

    url = "https://www.google.de/search?q=" & Cells(i, 1) & "&rnd=" & WorksheetFunction.RandBetween(1, 10000)

    Set XMLHTTP = CreateObject("MSXML2.serverXMLHTTP")
    XMLHTTP.Open "GET", url, False
    XMLHTTP.setRequestHeader "Content-Type", "text/xml"
    XMLHTTP.setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:25.0) Gecko/20100101 Firefox/25.0"
    XMLHTTP.send

        Set html = CreateObject("htmlfile")
    html.body.innerHTML = XMLHTTP.ResponseText
    Set objResultDiv = html.getelementbyid("rso")
    Set objH3 = objResultDiv.getelementsbytagname("H3")(0)
    Set link = objH3.getelementsbytagname("a")(0)


    str_text = Replace(link.innerHTML, "<EM>", "")
    str_text = Replace(str_text, "</EM>", "")

    Cells(i, 2) = str_text
    Cells(i, 3) = link.href
    DoEvents
Next

end_time = Time
Debug.Print "end_time:" & end_time

Debug.Print "done" & "Time taken : " & DateDiff("n", start_time, end_time)
MsgBox "done" & "Time taken : " & DateDiff("n", start_time, end_time)
End Sub

回答1:


Check if objResultDiv element is found and if it is found, proceed further else write "Not Found" to the cells.

You may try something like this...

Sub XMLHTTP()
Dim url As String, lastRow As Long
Dim XMLHTTP As Object, html As Object, objResultDiv As Object, objH3 As Object, link As Object
Dim start_time As Date
Dim end_time As Date
Dim i As Long
Dim str_text As String

lastRow = Range("A" & Rows.Count).End(xlUp).Row

Dim cookie As String
Dim result_cookie As String

start_time = Time
Debug.Print "start_time:" & start_time

For i = 2 To lastRow

    url = "https://www.google.de/search?q=" & Cells(i, 1) & "&rnd=" & WorksheetFunction.RandBetween(1, 10000)

    Set XMLHTTP = CreateObject("MSXML2.serverXMLHTTP")
    XMLHTTP.Open "GET", url, False
    XMLHTTP.setRequestHeader "Content-Type", "text/xml"
    XMLHTTP.setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:25.0) Gecko/20100101 Firefox/25.0"
    XMLHTTP.send

    If XMLHTTP.Status = 200 Then
        Set html = CreateObject("htmlfile")
        html.body.innerHTML = XMLHTTP.ResponseText
        Set objResultDiv = html.getelementbyid("rso")

        If Not objResultDiv Is Nothing Then
            Set objH3 = objResultDiv.getelementsbytagname("H3")(0)
            Set link = objH3.getelementsbytagname("a")(0)


            str_text = Replace(link.innerHTML, "<EM>", "")
            str_text = Replace(str_text, "</EM>", "")

            Cells(i, 2) = str_text
            Cells(i, 3) = link.href
            DoEvents
        Else
            Cells(i, 2) = "Not Found"
            Cells(i, 3) = "Not Found"
        End If
    Else
        Cells(i, 2) = "Not Found"
        Cells(i, 3) = "Not Found"
    End If
Next

end_time = Time
Debug.Print "end_time:" & end_time

Debug.Print "done" & "Time taken : " & DateDiff("n", start_time, end_time)
MsgBox "done" & "Time taken : " & DateDiff("n", start_time, end_time)
End Sub



回答2:


It's hard sometimes to scrape information from google using xmlhttp, serverxmlhttp or winhttp request. Even if you try with proxy, google can easily detect you as a bot so it will lead you to a captcha page and your attempt will be miserably failed. However, the safer approach in this case is to pilot IE. Try the below way instead. if you have IE9 then the .querySelector() defined within the scraper will rock.

Sub ScrapeGoogle()
    Dim IE As New InternetExplorer, HTML As HTMLDocument
    Dim cel As Range, URL$, post As Object

    For Each cel In Range("A2:A" & Cells(Rows.Count, 1).End(xlUp).row)
        URL = "https://www.google.de/search?q=" & Replace(cel.Value, " ", "%20")

        With IE
            .Visible = True
            .navigate URL
            While .Busy = True Or .readyState <> 4: DoEvents: Wend
            Set HTML = .document

            If Not HTML.querySelector(".rc h3.r a") Is Nothing Then
                Set post = HTML.querySelector(".rc h3.r a")
                cel(1, 2) = post.innerText
                cel(1, 3) = post.getAttribute("href")
            Else
                cel(1, 2) = "Nothing found"
                cel(1, 3) = "Sorry dear"
            End If
        End With
    Next cel
    IE.Quit
End Sub

Reference to add to the library:

Microsoft Internet Controls
Microsoft HTML Object Library


来源:https://stackoverflow.com/questions/50079366/get-wikipedia-page-urls-from-an-excel-list

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!