I'm using PyQt to fully load a page (including JS) and get its contents using Beautiful Soup. It works fine on the first iteration, but after that it crashes. I don't have a big kn…
The example crashes because the `RenderPage` class attempts to create a new `QApplication` and event loop for every URL it tries to load. Instead, only one `QApplication` should be created, and the `QWebPage` subclass should load a new URL after each page has been processed, rather than using a for-loop.
Here's a rewrite of the example which should do what you want:
import sys, signal
from bs4 import BeautifulSoup
from bs4.dammit import UnicodeDammit
from PyQt4 import QtCore, QtGui, QtWebKit
class WebPage(QtWebKit.QWebPage):
    """Web page that loads a series of URLs one after another.

    Each work item is a ``(url, callback)`` pair.  When the main frame
    reports that loading has finished, ``callback(url, html)`` is invoked
    with the frame's current HTML, and the next item is started.  Once the
    items are exhausted the running application is asked to quit.
    """

    def __init__(self):
        QtWebKit.QWebPage.__init__(self)
        # Per-item state; replaced by process() / fetchNext().
        self._items = iter(())
        self._url = None
        self._func = None
        self.mainFrame().loadFinished.connect(self.handleLoadFinished)

    def process(self, items):
        """Start processing *items*, an iterable of ``(url, callback)`` pairs."""
        self._items = iter(items)
        if not self.fetchNext():
            # Nothing to load, so loadFinished will never fire and nobody
            # would ever call quit().  Defer the quit with a zero-length
            # timer so it also takes effect when process() is called before
            # the event loop has been started.
            print('# processing complete')
            QtCore.QTimer.singleShot(0, QtGui.qApp.quit)

    def fetchNext(self):
        """Begin loading the next item.

        Returns True if a load was started, False if no items remain.
        """
        # Keep the try body minimal: only next() signals exhaustion.
        try:
            self._url, self._func = next(self._items)
        except StopIteration:
            return False
        self.mainFrame().load(QtCore.QUrl(self._url))
        return True

    def handleLoadFinished(self):
        """Pass the loaded page to its callback, then move on or quit."""
        # NOTE(review): the success flag carried by loadFinished is not
        # inspected here, so the callback presumably also runs for pages
        # that failed to load -- confirm whether that is intended.
        self._func(self._url, self.mainFrame().toHtml())
        if not self.fetchNext():
            print('# processing complete')
            QtGui.qApp.quit()
def funcA(url, html):
    """Example page handler: announce which *url* is being processed.

    *html* is the page markup handed over by the caller; this placeholder
    does not use it yet.
    """
    label = '# processing:'
    print(label, url)
    # Placeholder for real work, e.g.:
    #   soup = BeautifulSoup(UnicodeDammit(html).unicode_markup)
    #   ...do stuff with soup...
def funcB(url, html):
    """Second example page handler: announce which *url* is being processed.

    *html* is the page markup handed over by the caller; this placeholder
    does not use it yet.
    """
    label = '# processing:'
    print(label, url)
    # Placeholder for real work, e.g.:
    #   soup = BeautifulSoup(UnicodeDammit(html).unicode_markup)
    #   ...do stuff with soup...
if __name__ == '__main__':
    # Restore the default SIGINT action so Ctrl+C terminates the script.
    signal.signal(signal.SIGINT, signal.SIG_DFL)
    print('Press Ctrl+C to quit\n')

    # (url, callback) pairs, handled in the order listed.
    jobs = [
        ('http://stackoverflow.com', funcA),
        ('http://google.com', funcB),
    ]

    # A single application object serves every page load.
    app = QtGui.QApplication(sys.argv)
    page = WebPage()
    page.process(jobs)

    exit_code = app.exec_()
    sys.exit(exit_code)