问题
I'm referencing a Stack Overflow answer that is similar to my GUI app, but my Scrapy application is a bit different. When executing the app, the user is prompted to enter keywords for Scrapy to search for
looks like this
I'm trying to put this logic into the GUI, but I'm unsure how to do it.
here is what the gui looks like as of now.
I want input fields where a user can enter the information needed before running the Scrapy script.
here is a bit of the scrapy script
my_spider.py
import scrapy
import sys
import random
import csv
from scrape.items import Item
from var_dump import var_dump
# Collect the search term and up to six locations from stdin.
# NOTE(review): these prompts run at module import time (when scrapy loads
# the spider module), which is exactly the behaviour the question wants to
# replace with GUI input fields.
search_item = input("Input The Search Item: ")
location = input("Location:")
second_location = input("Second Location:")
third_location = input("Third Location:")
fourth_location = input("Fourth Location:")
fifth_location = input("Fifth Location:")
sixth_location = input("Sixth Location:")
# Leftover experiment: a random city instead of user-entered locations.
# city = [
# "Los Angeles", "Chicago", "Houston", "Phoenix", "Philadelphia", "San Antonio", "Fort Worth",
# "San Diego", "Dallas", "San Jose", "Austin", "Columbus", "Indianapolis", "Seattle", "St. Paul", "Nashville",
# "Louisville", "Plano"
# ]
# rancity = random.choice(city)
class YellowSpider(scrapy.Spider):
name = "yellow"
# start_urls = [
# "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + location
# # "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + third_location,
# # "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + fourth_location
# ]
def start_requests(self):
    """Issue one Yellow Pages search request per configured location.

    Each location is paired with its own dedicated callback chain
    (parse/parse2/...), matching the rest of the spider.
    """
    base = ("https://www.yellowpages.com/search?search_terms="
            + search_item + "&geo_location_terms=")
    targets = [
        (location, self.parse),
        (second_location, self.parse2),
        (third_location, self.parse3),
        (fourth_location, self.parse4),
        (fifth_location, self.parse5),
        (sixth_location, self.parse6),
    ]
    for loc, handler in targets:
        yield scrapy.Request(base + loc, handler)
    # yield scrapy.Request('http://www.example.com/3.html', self.parse)
def __init__(self, *args, **kwargs):
    """Initialise the per-run deduplication caches.

    BUGFIX: the original signature took no arguments and never called
    ``super().__init__``, so Scrapy spider arguments (``-a name=value``)
    and the base ``Spider`` initialisation were silently dropped.
    Accepting and forwarding ``*args, **kwargs`` is backward compatible.
    """
    super(YellowSpider, self).__init__(*args, **kwargs)
    # Values already emitted during this crawl, used to filter duplicates.
    self.seen_business_names = []
    self.seen_phonenumbers = []
    self.seen_websites = []
    self.seen_emails = []
def parse(self, response):
    """Follow each business profile link, then continue through pagination."""
    profile_links = response.css('div.v-card a.business-name::attr(href)')
    for link in profile_links:
        yield response.follow(link, self.businessprofile)
    page_links = response.css('div.pagination a::attr(href)')
    for link in page_links:
        yield response.follow(link, self.parse)
def parse2(self, response):
    """Second-location variant of parse(): profiles first, then pagination."""
    profile_links = response.css('div.v-card a.business-name::attr(href)')
    for link in profile_links:
        yield response.follow(link, self.businessprofile2)
    page_links = response.css('div.pagination a::attr(href)')
    for link in page_links:
        yield response.follow(link, self.parse2)
def parse3(self, response):
    """Third-location variant of parse(): profiles first, then pagination."""
    profile_links = response.css('div.v-card a.business-name::attr(href)')
    for link in profile_links:
        yield response.follow(link, self.businessprofile3)
    page_links = response.css('div.pagination a::attr(href)')
    for link in page_links:
        yield response.follow(link, self.parse3)
........
here is the GUI
main.py
from functools import partial
from PyQt5 import QtCore, QtGui, QtWidgets
class ScrapyWorker(QtCore.QObject):
    """Runs `scrapy crawl` in a QProcess and relays its merged output."""

    logChanged = QtCore.pyqtSignal(str)  # chunk of merged stdout/stderr text
    started = QtCore.pyqtSignal()
    finished = QtCore.pyqtSignal()

    def __init__(self, parent=None):
        super(ScrapyWorker, self).__init__(parent)
        self._process = QtCore.QProcess(self)
        # Merge stderr into stdout so one signal carries the whole log.
        self._process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
        self._process.readyReadStandardOutput.connect(self.on_readyReadStandardOutput)
        self._process.setProgram('scrapy')
        self._process.started.connect(self.started)
        self._process.finished.connect(self.finished)

    def run(self, project, spider):
        """Start `scrapy crawl <spider>` with the project dir as cwd."""
        self._process.setWorkingDirectory(project)
        self._process.setArguments(['crawl', spider])
        self._process.start()

    @QtCore.pyqtSlot()
    def on_readyReadStandardOutput(self):
        # Forward whatever the crawler printed since the last read.
        data = self._process.readAllStandardOutput().data().decode()
        self.logChanged.emit(data)

    @QtCore.pyqtSlot()
    def stop(self):
        self._process.kill()

    def spiders(self, project):
        """Return spider names from `scrapy list` (blocks via a local loop)."""
        process = QtCore.QProcess()
        process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
        process.setWorkingDirectory(project)
        loop = QtCore.QEventLoop()
        process.finished.connect(loop.quit)
        # BUGFIX: if the 'scrapy' executable cannot be started, `finished`
        # is never emitted and loop.exec_() would spin forever; also quit
        # on process errors (errorOccurred, Qt >= 5.6).
        process.errorOccurred.connect(loop.quit)
        process.start('scrapy', ['list'])
        loop.exec_()
        return process.readAllStandardOutput().data().decode().split()
class MainWindow(QtWidgets.QMainWindow):
# Main window: select a scrapy project (.cfg), pick a spider, start/stop
# the crawl and watch its log.
# NOTE(review): the input/input1..input6 line edits are laid out but never
# read anywhere in this class - they are the fields the question wants to
# feed into the spider.
def __init__(self, parent=None):
super(MainWindow, self).__init__(parent)
# --- widgets ---
self.project_le = QtWidgets.QLineEdit()
self.project_button = QtWidgets.QPushButton('Select Project')
self.spider_combobox = QtWidgets.QComboBox()
self.start_stop_button = QtWidgets.QPushButton("Start", checkable=True)
self.text_edit = QtWidgets.QTextBrowser()
self.input = QtWidgets.QLineEdit()
self.input1 = QtWidgets.QLineEdit()
self.input2 = QtWidgets.QLineEdit()
self.input3 = QtWidgets.QLineEdit()
self.input4 = QtWidgets.QLineEdit()
self.input5 = QtWidgets.QLineEdit()
self.input6 = QtWidgets.QLineEdit()
# --- layout: project row on top, search/location rows, button, log ---
central_widget = QtWidgets.QWidget()
self.setCentralWidget(central_widget)
lay = QtWidgets.QVBoxLayout(central_widget)
hlay = QtWidgets.QHBoxLayout()
hlay.addWidget(self.project_le)
hlay.addWidget(self.project_button)
lay.addLayout(hlay)
hlay2 = QtWidgets.QHBoxLayout()
hlay2.addWidget(QtWidgets.QLabel("Input The Search Item :"))
hlay2.addWidget(self.input, 1)
hlay3 = QtWidgets.QHBoxLayout()
hlay4 = QtWidgets.QHBoxLayout()
hlay5 = QtWidgets.QHBoxLayout()
hlay6 = QtWidgets.QHBoxLayout()
hlay7 = QtWidgets.QHBoxLayout()
hlay8 = QtWidgets.QHBoxLayout()
hlay3.addWidget(QtWidgets.QLabel("Location :"))
hlay3.addWidget(self.input1, 1 )
hlay4.addWidget(QtWidgets.QLabel("Location 2 :"))
hlay4.addWidget(self.input2, 1 )
hlay5.addWidget(QtWidgets.QLabel("Location 3 :"))
hlay5.addWidget(self.input3, 1 )
hlay6.addWidget(QtWidgets.QLabel("Location 4 :"))
hlay6.addWidget(self.input4, 1 )
hlay7.addWidget(QtWidgets.QLabel("Location 5 :"))
hlay7.addWidget(self.input5, 1 )
hlay8.addWidget(QtWidgets.QLabel("Location 6 :"))
hlay8.addWidget(self.input6, 1 )
lay.addLayout(hlay2)
lay.addLayout(hlay3)
lay.addLayout(hlay4)
lay.addLayout(hlay5)
lay.addLayout(hlay6)
lay.addLayout(hlay7)
lay.addLayout(hlay8)
lay.addWidget(self.start_stop_button)
lay.addWidget(self.text_edit)
# --- behaviour ---
# Disabled until a project with at least one spider is selected.
self.start_stop_button.setEnabled(False)
self.scrapy_worker = ScrapyWorker(self)
self.scrapy_worker.logChanged.connect(self.insert_log)
self.scrapy_worker.started.connect(self.text_edit.clear)
# Pop the Start/Stop toggle back out when the crawl finishes.
self.scrapy_worker.finished.connect(partial(self.start_stop_button.setChecked, False))
self.start_stop_button.toggled.connect(self.on_checked)
self.project_button.clicked.connect(self.select_project)
self.resize(640, 480)
@QtCore.pyqtSlot(bool)
def on_checked(self, state):
# Toggle handler: checked -> start the crawl, unchecked -> kill it.
if state:
filename = self.project_le.text()
finfo = QtCore.QFileInfo(filename)
# Run scrapy in the directory that holds the selected scrapy.cfg.
directory = finfo.dir().absolutePath()
self.scrapy_worker.run(directory, self.spider_combobox.currentText())
self.start_stop_button.setText('Stop')
else:
self.start_stop_button.setText('Start')
self.scrapy_worker.stop()
@QtCore.pyqtSlot()
def select_project(self):
# Ask for a scrapy .cfg file, then populate the spider combobox from
# the output of `scrapy list` run in that project directory.
filename, _ = QtWidgets.QFileDialog.getOpenFileName(
self,
"Select .cfg file",
QtCore.QDir.currentPath(),
"Configure File (*.cfg)"
)
if filename:
self.project_le.setText(filename)
finfo = QtCore.QFileInfo(filename)
directory = finfo.dir().absolutePath()
spiders = self.scrapy_worker.spiders(directory)
self.spider_combobox.clear()
self.spider_combobox.addItems(spiders)
self.start_stop_button.setEnabled(True if spiders else False)
@QtCore.pyqtSlot(str)
def insert_log(self, text):
# Append at the end without stealing the user's scroll position/cursor.
prev_cursor = self.text_edit.textCursor()
self.text_edit.moveCursor(QtGui.QTextCursor.End)
self.text_edit.insertPlainText(text)
self.text_edit.setTextCursor(prev_cursor)
if __name__ == '__main__':
    import sys

    # Build the application, apply the Fusion style, and run the event loop.
    application = QtWidgets.QApplication(sys.argv)
    application.setStyle('fusion')
    window = MainWindow()
    window.show()
    sys.exit(application.exec_())
回答1:
First you have to modify your spider to accept its arguments directly from the command line, avoiding use of the input()
method:
yellowpage_spider.py
import json
import scrapy
from scrape.items import Item
class YellowSpider(scrapy.Spider):
    """Yellow Pages spider driven by a JSON spider argument.

    Invoke as::

        scrapy crawl yellow -a parameters='{"search_item": "...", "locations": [...]}'

    Scrapy stores ``-a`` arguments as spider attributes, so the JSON blob
    arrives as the string attribute ``self.parameters``.
    """
    name = "yellow"

    def __init__(self, *args, **kwargs):
        super(YellowSpider, self).__init__(*args, **kwargs)
        # Dedup caches for values already emitted during this crawl.
        self.seen_business_names = []
        self.seen_phonenumbers = []
        self.seen_websites = []
        self.seen_emails = []

    def start_requests(self):
        # Without the 'parameters' argument there is nothing to search for.
        if not hasattr(self, 'parameters'):
            return
        parameters = json.loads(self.parameters)
        search_item = parameters['search_item']
        locations = parameters['locations']
        for location in locations:
            url = "https://www.yellowpages.com/search?search_terms={}&geo_location_terms={}".format(search_item, location)
            # Carry the location through meta so later callbacks can tag items.
            yield scrapy.Request(url=url, callback=self.parse, meta={'location': location})

    def parse(self, response):
        location = response.meta['location']
        for href in response.css('div.v-card a.business-name::attr(href)'):
            yield response.follow(href, self.businessprofile, meta={'location': location})
        for href in response.css('div.pagination a::attr(href)'):
            yield response.follow(href, self.parse, meta={'location': location})

    def businessprofile(self, response):
        """Extract one Item per business header, deduplicated by email."""
        location = response.meta['location']
        for business in response.css('header#main-header'):
            item = Item()
            item['business_name'] = business.css('div.sales-info h1::text').extract()
            w = business.css('a.secondary-btn.website-link::attr(href)').extract()
            item['website'] = str(w).strip('[]')
            item['location'] = location
            s = business.css('a.email-business::attr(href)').extract()
            # Strip the leading 'mailto:' (7 chars) from each href.
            # BUGFIX(clarity): the original comprehension reused the name
            # 'item', shadowing the Item instance inside the comprehension.
            item['email'] = [href[7:] for href in s]
            item['phonenumber'] = business.css('p.phone::text').extract_first()
            for x in item['email']:
                # Flattened if-pyramid; the original `if item['email']`
                # test was redundant inside a loop over item['email'].
                if x not in self.seen_emails and item['phonenumber'] and item['website']:
                    self.seen_emails.append(x)
                    yield item
The code above now expects a spider argument called parameters
:
scrapy crawl yellow -a parameters='{"search_item": "house", "locations": ["usa", "germany", "brazil"]}'
So in the GUI we must now form the entry using the GUI inputs:
gui.py
import os
import json
from functools import partial
from PyQt5 import QtCore, QtGui, QtWidgets
import utils
# Absolute path of the directory containing this script.
dir_path = os.path.dirname(os.path.abspath(__file__))
# Toolbutton icons (add.png / remove.png) are expected under assets/icons.
icons_dir = os.path.join(dir_path, 'assets', 'icons')
class ScrapyWorker(QtCore.QObject):
    """Runs an external crawler process and relays its merged log output."""

    logChanged = QtCore.pyqtSignal(str)  # chunk of merged stdout/stderr text
    started = QtCore.pyqtSignal()
    finished = QtCore.pyqtSignal()

    def __init__(self, parent=None):
        super(ScrapyWorker, self).__init__(parent)
        self._process = QtCore.QProcess(self)
        # Merge stderr into stdout so one signal carries the whole log.
        self._process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
        self._process.readyReadStandardOutput.connect(self.on_readyReadStandardOutput)
        self._process.started.connect(self.started)
        self._process.finished.connect(self.finished)

    def run(self, project, program, arguments):
        """Launch `program arguments...` with `project` as the working dir.

        BUGFIX: the original ignored the `program` parameter and
        hard-coded 'scrapy'; the existing caller (utils.create_arguments)
        passes 'scrapy', so behaviour is unchanged for it.
        """
        self._process.setWorkingDirectory(project)
        self._process.setProgram(program)
        self._process.setArguments(arguments)
        self._process.start()

    @QtCore.pyqtSlot()
    def on_readyReadStandardOutput(self):
        # Forward whatever the crawler printed since the last read.
        data = self._process.readAllStandardOutput().data().decode()
        self.logChanged.emit(data)

    @QtCore.pyqtSlot()
    def stop(self):
        self._process.kill()
class LocationWidget(QtWidgets.QWidget):
    """A growable column of QLineEdit rows for entering locations.

    The last row shows an 'add' button that appends a new row; every
    other row shows a 'remove' button that deletes that row.
    """

    def __init__(self, parent=None):
        super(LocationWidget, self).__init__(parent)
        self.lay = QtWidgets.QVBoxLayout(self)
        self.lay.setContentsMargins(0, 0, 0, 0)
        self.lay.addStretch()
        self.setContentsMargins(0, 0, 0, 0)
        self.widgets = []  # row container widgets, in display order
        self.create_row()

    def create_row(self):
        """Append one lineedit+button row at the bottom of the column."""
        widget = QtWidgets.QWidget()
        widget.setContentsMargins(0, 0, 0, 0)
        hlay = QtWidgets.QHBoxLayout(widget)
        hlay.setContentsMargins(0, 0, 0, 0)
        lineedit = QtWidgets.QLineEdit()
        # BUGFIX: slot name was misspelled 'on_clicled'; it is only
        # connected here, so the rename is internal to this class.
        button = QtWidgets.QToolButton(clicked=self.on_clicked)
        button.setFocusPolicy(QtCore.Qt.NoFocus)
        hlay.addWidget(lineedit)
        hlay.addWidget(button)
        button.setIconSize(QtCore.QSize(24, 24))
        # New rows always start as the last row, hence the 'add' icon.
        button.setIcon(QtGui.QIcon(os.path.join(icons_dir, 'add.png')))
        self.widgets.append(widget)
        self.lay.insertWidget(-1, widget)

    @QtCore.pyqtSlot()
    def on_clicked(self):
        """Add a row (last-row button) or remove the clicked row, then
        refresh icons so only the final row shows 'add'."""
        button = self.sender()
        widget = button.parentWidget()
        if self.lay.indexOf(widget) == (self.lay.count()-1):
            self.create_row()
        else:
            self.lay.removeWidget(widget)
            widget.deleteLater()
            self.widgets.remove(widget)
        for widget in self.widgets:
            button = widget.findChild(QtWidgets.QToolButton)
            button.setIcon(QtGui.QIcon(os.path.join(icons_dir, 'remove.png')))
        self.widgets[-1].findChild(QtWidgets.QToolButton).setIcon(QtGui.QIcon(os.path.join(icons_dir, 'add.png')))

    def get_locations(self):
        """Return the non-empty location strings, top to bottom."""
        locations = []
        for widget in self.widgets:
            le = widget.findChild(QtWidgets.QLineEdit)
            if le.text():
                locations.append(le.text())
        return locations
class YellowWidget(QtWidgets.QMainWindow):
# Main window: a search field, a growable list of locations, a Start/Stop
# toggle, and a read-only log view fed by the ScrapyWorker.
def __init__(self, parent=None):
super(YellowWidget, self).__init__(parent)
self.setWindowTitle('Yellow Pages Scrapper')
self.scrapy_worker = ScrapyWorker(self)
self.search_item_le = QtWidgets.QLineEdit()
self.location_widget = LocationWidget()
self.start_stop_button = QtWidgets.QPushButton("Start", checkable=True)
self.text_edit = QtWidgets.QTextBrowser()
# --- layout: 2-column grid, button and log span both columns ---
central_widget = QtWidgets.QWidget()
self.setCentralWidget(central_widget)
lay = QtWidgets.QGridLayout(central_widget)
lay.addWidget(QtWidgets.QLabel("<b>Search:</b>"), 0, 0)
lay.addWidget(self.search_item_le, 0, 1)
lay.addWidget(QtWidgets.QLabel("<b>Locations:</b>"), 1, 0, alignment=QtCore.Qt.AlignTop|QtCore.Qt.AlignLeft)
lay.addWidget(self.location_widget, 1, 1, alignment=QtCore.Qt.AlignTop)
lay.addWidget(self.start_stop_button, 2, 0, 1, 2)
lay.addWidget(self.text_edit, 3, 0, 1, 2)
self.start_stop_button.toggled.connect(self.on_checked)
self.scrapy_worker.logChanged.connect(self.insert_log)
self.scrapy_worker.started.connect(self.text_edit.clear)
# Pop the Start/Stop toggle back out when the crawl finishes.
self.scrapy_worker.finished.connect(partial(self.start_stop_button.setChecked, False))
@QtCore.pyqtSlot(bool)
def on_checked(self, state):
# Toggle handler: checked -> launch the crawl with the current inputs,
# unchecked -> kill the running process.
if state:
# e.g. scrapy crawl yellow -a parameters='{"search_item": "house", "locations": ["usa", "germany"]}'
search_item = self.search_item_le.text()
locations = self.location_widget.get_locations()
directory, program, args = utils.create_arguments(search_item, locations)
self.scrapy_worker.run(directory, program, args)
self.start_stop_button.setText('Stop')
else:
self.start_stop_button.setText('Start')
self.scrapy_worker.stop()
@QtCore.pyqtSlot(str)
def insert_log(self, text):
# Append at the end without stealing the user's scroll position/cursor.
prev_cursor = self.text_edit.textCursor()
self.text_edit.moveCursor(QtGui.QTextCursor.End)
self.text_edit.insertPlainText(text)
self.text_edit.setTextCursor(prev_cursor)
if __name__ == '__main__':
    import sys

    # Build the application, apply the Fusion style, and run the event loop.
    qt_app = QtWidgets.QApplication(sys.argv)
    qt_app.setStyle('fusion')
    window = YellowWidget()
    window.resize(640, 480)
    window.show()
    sys.exit(qt_app.exec_())
I used a function that is in the utils.py file:
import os
import json
def create_arguments(search_item, locations):
    """Build the (working_dir, program, argv) triple for the spider run.

    The search term and location list are serialised to JSON and passed
    to scrapy as the 'parameters' spider argument (``-a parameters=...``).
    """
    payload = json.dumps({"search_item": search_item, "locations": locations})
    spider_arg = 'parameters={}'.format(payload)
    # The scrapy project lives in the 'scrape' folder next to this file.
    project_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'scrape')
    return project_dir, 'scrapy', ['crawl', 'yellow', "-a", spider_arg]
Obtaining the following:
The complete project is here.
来源:https://stackoverflow.com/questions/55212321/insert-multiple-input-fields-before-running-scrapy