Scrape multiple urls using QWebPage

问题

I'm using Qt's QWebPage to render a page that uses javascript to update its content dynamically - so a library that just downloads a static version of the page (such as urllib2) won't work.

My problem is, when I render a second page, about 99% of the time the program just crashes. At other times, it will work three times before crashing. I've also gotten a few segfaults, but it is all very random.

My guess is the object I'm using to render isn't getting deleted properly, so trying to reuse it is possibly causing some problems for myself. I've looked all over and no one really seems to be having this same issue.

Here's the code I'm using. The program downloads web pages from steam's community market so I can create a database of all the items. I need to call the getItemsFromPage function multiple times to get all of the items, as they are broken up into pages (showing results 1-10 out of X amount).

import csv
import re
import sys
from string import replace
from bs4 import BeautifulSoup
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *

class Item:
    __slots__ = ("name", "count", "price", "game")

    def __repr__(self):
        return self.name + "(" + str(self.count) + ")"

    def __str__(self):
        return self.name + ", " + str(self.count) + ", $" + str(self.price)

class Render(QWebPage):  
    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        self.app.exec_()

    def _loadFinished(self, result):
        self.frame = self.mainFrame()
        self.app.quit()
        self.deleteLater()

def getItemsFromPage(appid, page=1):

    r = Render("http://steamcommunity.com/market/search?q=appid:" + str(appid) + "#p" + str(page))

    soup = BeautifulSoup(str(r.frame.toHtml().toUtf8()))

    itemLst = soup.find_all("div", "market_listing_row market_recent_listing_row")

    items = []

    for k in itemLst:
        i = Item()

        i.name = k.find("span", "market_listing_item_name").string
        i.count = int(replace(k.find("span", "market_listing_num_listings_qty").string, ",", ""))
        i.price = float(re.search(r'\$([0-9]+\.[0-9]+)', str(k)).group(1))
        i.game = appid

        items.append(i)

    return items

if __name__ == "__main__":

    print "Updating market items to dota2.csv ..."

    i = 1

    with open("dota2.csv", "w") as f:
        writer = csv.writer(f)

        r = None

        while True:
            print "Page " + str(i)

            items = getItemsFromPage(570)

            if len(items) == 0:
                print "No items found, stopping..."
                break

            for k in items:
                writer.writerow((k.name, k.count, k.price, k.game))

            i += 1

    print "Done."

Calling getItemsFromPage once works fine. Subsequent calls give me my problem. The output of the program is typically

Updating market items to dota2.csv ...
Page 1
Page 2

and then it crashes. It should go on for over 700 pages.

回答1:

The problem with your program is that you are attempting to create a new QApplication with every url you fetch.

Instead, you should create one QApplication, and handle all the loading and processing of web pages within the WebPage class itself. The key concept is to use the loadFinished signal to create a loop by fetching a new url after the current one has been loaded and processed.

The two demo scripts below (for PyQt4 and PyQt5) are simplified examples that show how to structure the program. Hopefully, it should be fairly obvious how to adapt them for your own use:

import sys
from PyQt4 import QtCore, QtGui, QtWebKit

class WebPage(QtWebKit.QWebPage):
    def __init__(self):
        super(WebPage, self).__init__()
        self.mainFrame().loadFinished.connect(self.handleLoadFinished)

    def start(self, urls):
        self._urls = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.mainFrame().load(QtCore.QUrl(url))
        return True

    def processCurrentPage(self):
        url = self.mainFrame().url().toString()
        html = self.mainFrame().toHtml()
        # do stuff with html...
        print('loaded: [%d bytes] %s' % (self.bytesReceived(), url))

    def handleLoadFinished(self):
        self.processCurrentPage()
        if not self.fetchNext():
            QtGui.qApp.quit()

if __name__ == '__main__':

    # generate some test urls
    urls = []
    url = 'http://pyqt.sourceforge.net/Docs/PyQt4/%s.html'
    for name in dir(QtWebKit):
        if name.startswith('Q'):
            urls.append(url % name.lower())

    app = QtGui.QApplication(sys.argv)
    webpage = WebPage()
    webpage.start(urls)
    sys.exit(app.exec_())

Here is a PyQt5/QWebEngine version of the above script:

import sys
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets

class WebPage(QtWebEngineWidgets.QWebEnginePage):
    def __init__(self):
        super(WebPage, self).__init__()
        self.loadFinished.connect(self.handleLoadFinished)

    def start(self, urls):
        self._urls = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.load(QtCore.QUrl(url))
        return True

    def processCurrentPage(self, html):
        url = self.url().toString()
        # do stuff with html...
        print('loaded: [%d chars] %s' % (len(html), url))
        if not self.fetchNext():
            QtWidgets.qApp.quit()

    def handleLoadFinished(self):
        self.toHtml(self.processCurrentPage)

if __name__ == '__main__':

    # generate some test urls
    urls = []
    url = 'http://pyqt.sourceforge.net/Docs/PyQt5/%s.html'
    for name in dir(QtWebEngineWidgets):
        if name.startswith('Q'):
            urls.append(url % name.lower())

    app = QtWidgets.QApplication(sys.argv)
    webpage = WebPage()
    webpage.start(urls)
    sys.exit(app.exec_())

来源：https://stackoverflow.com/questions/47381962/pyqt5-web-scraping-not-working-within-for-loop

标签

python

web-scraping

pyqt

pyqt4

qwebpage