I've written a script in Scrapy to make proxied requests using newly generated proxies from the get_proxies() method. I used the requests module to fetch the proxy list.
The start_requests()
function is just the entry point. On subsequent requests, you would need to resupply this metadata to the Request
object.
Also, errors can occur on two levels: proxy and target server
We need to handle bad response codes from both the proxy and the target server. Proxy errors are returned by the middleware to the errback
function. The target server response can be handled during parsing from the response.status
import scrapy
import random
import requests
from itertools import cycle
from bs4 import BeautifulSoup
from scrapy.crawler import CrawlerProcess
def get_proxies():
    """Scrape a list of HTTPS-capable proxies from us-proxy.org.

    Returns:
        list[str]: proxies as "host:port" strings. May be empty if the
        page layout changed and nothing matched the selectors.

    Raises:
        requests.RequestException: on network failure, timeout, or a
        non-2xx response from the proxy-list site.
    """
    # timeout prevents the request from hanging forever; raise_for_status
    # turns an HTML error page into an explicit exception instead of
    # silently parsing it for proxies.
    response = requests.get("https://www.us-proxy.org/", timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    # Column 1 is the IP, column 2 the port; "yes" in the row text marks
    # HTTPS support on this site's table layout.
    proxy = [':'.join([item.select_one("td").text, item.select_one("td:nth-of-type(2)").text]) for item in
             soup.select("table.table tbody tr") if "yes" in item.text]
    # proxy = ['https://52.0.0.1:8090', 'https://52.0.0.2:8090']
    return proxy
def get_random_proxy(proxy_vault):
    """Return one random "host:port" proxy string from *proxy_vault*.

    The previous shuffle-then-``next(cycle(...))`` dance built a fresh
    ``itertools.cycle`` on every call, so ``next`` simply returned the
    first element of the freshly shuffled list — an O(n) way of picking
    a random element that also mutated the caller's list in place.
    ``random.choice`` makes the same uniform selection without the
    side effect.

    Raises:
        IndexError: if *proxy_vault* is empty.
    """
    return random.choice(proxy_vault)
class ProxySpider(scrapy.Spider):
    """Spider that routes every request through a freshly scraped proxy.

    Failures happen on two levels and are handled separately:
      * proxy-level errors (connection refused, tunnel failures) reach
        ``handle_middleware_errors`` via the request's ``errback``;
      * target-server errors are visible as ``response.status`` inside
        the parse callbacks (the statuses in ``handle_httpstatus_list``
        are allowed through to the callbacks instead of being filtered).
    In both cases the request is re-issued through a different random
    proxy.  NOTE(review): retries are unbounded — consider capping the
    attempt count per URL.
    """

    name = "proxiedscript"
    # Let these statuses reach the callbacks so we can retry them with a
    # new proxy instead of Scrapy dropping the response.
    handle_httpstatus_list = [503, 502, 401, 403]
    check_url = "https://yts.am/browse-movies"
    proxy_vault = get_proxies()

    def handle_middleware_errors(self, failure, **kwargs):
        """Errback for proxy-level failures: retry with another proxy.

        Uses the public ``failure.request.url`` / ``failure.request.meta``
        accessors rather than the private ``_url`` / ``_meta`` attributes.
        The original callback was stashed in the request meta by
        ``make_request``, so the retry keeps the same parsing logic.
        """
        print('Middleware Error')
        yield self.make_request(url=failure.request.url,
                                callback=failure.request.meta['callback'])

    def start_requests(self):
        """Entry point: fetch the movie listing page through a proxy."""
        yield self.make_request(url=self.check_url, callback=self.parse)

    def make_request(self, url, callback, dont_filter=True):
        """Build a proxied Request.

        The callback is duplicated into ``meta`` so that the errback can
        re-schedule the request with the same callback after a proxy
        failure.  ``dont_filter=True`` lets the same URL be retried
        without being dropped by the dupe filter.
        """
        return scrapy.Request(
            url,
            meta={'proxy': f'https://{get_random_proxy(self.proxy_vault)}',
                  'callback': callback},
            callback=callback,
            dont_filter=dont_filter,
            errback=self.handle_middleware_errors)

    def parse(self, response):
        """Parse the listing page; on a bad status, retry via a new proxy.

        The original wrapped this in a bare ``try/except`` and used a
        bare ``raise`` as a goto into the handler — that raised
        ``RuntimeError`` (no active exception) and silently swallowed
        every other error.  An explicit status check with an early
        return expresses the same retry without hiding real bugs.
        """
        print(response.meta)
        if response.status != 200:
            # Target server (or proxy) returned an error page: retry the
            # listing through a different proxy.
            print(f'Status code: {response.status}')
            yield self.make_request(url=self.check_url, callback=self.parse)
            return
        for href in response.css(
                ".browse-movie-wrap a.browse-movie-title::attr(href)").getall():
            yield self.make_request(url=response.urljoin(href),
                                    callback=self.parse_details)

    def parse_details(self, response):
        """Parse one movie detail page; retry via a new proxy on bad status."""
        print(response.meta)
        if response.status != 200:
            print(f'Status code: {response.status}')
            # Retry the same detail URL through a different proxy, using
            # the public response.request.url accessor.
            yield self.make_request(url=response.request.url,
                                    callback=self.parse_details)
            return
        name = response.css("#movie-info h1::text").get()
        yield {"Name": name}
if __name__ == "__main__":
    # Run the spider standalone (without the `scrapy crawl` CLI).
    process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
    process.crawl(ProxySpider)
    process.start()  # blocks until the crawl finishes