How to write Python Scrapy code for extracting the URLs present in the sitemap of a site

后端 未结 1 653
悲&欢浪女
悲&欢浪女 2021-01-15 14:02

I'm trying to use this code to get the list of URLs in a sitemap. When I run it, I see no results on the screen. Could anyone tell me what the problem is, or suggest a better approach?

相关标签:
1条回答
  • 2021-01-15 14:57

    This spider will get all the URLs from a sitemap and save them to a list. You can easily change it to output to a file or the console.

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.spiders import SitemapSpider
    from scrapy.spiders import Spider
    from scrapy.http import Request, XmlResponse
    from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
    from scrapy.utils.gz import gunzip, is_gzipped
    import re
    import requests
    
    class GetpagesfromsitemapSpider(SitemapSpider):
        """Spider that collects the page URLs listed in a site's sitemap.

        On construction the spider probes a hard-coded site for ``/sitemap.xml``
        (falling back to ``/robots.txt``) and uses whichever it finds as its
        ``sitemap_urls``. ``_parse_sitemap`` then walks the sitemap: nested
        sitemap indexes are followed, while the page URLs of a ``urlset`` are
        gathered into a list and printed instead of being requested.
        """

        name = "test"
        # NOTE(review): presumably lets 404 responses reach the callbacks
        # instead of being dropped by Scrapy's HTTP error handling -- confirm.
        handle_httpstatus_list = [404]

        def parse(self, response):
            """Print the URL of a crawled response."""
            # Call form with a single argument prints identically on
            # Python 2 and Python 3.
            print(response.url)

        def _parse_sitemap(self, response):
            """Collect (rather than crawl) the URLs found in a sitemap response.

            For a ``robots.txt`` response, a request is yielded for every
            sitemap URL it declares. For a sitemap index, a request is yielded
            for every nested sitemap whose location matches one of
            ``self._follow``. For a ``urlset``, every location matching one of
            the patterns in ``self._cbs`` is appended to a list, which is
            printed once the sitemap has been processed.

            ``_get_sitemap_body``, ``_follow``, ``_cbs`` and
            ``sitemap_alternate_links`` are not defined in this class; they are
            expected to come from ``SitemapSpider``.
            """
            if response.url.endswith('/robots.txt'):
                # NOTE(review): newer Scrapy releases may expect the decoded
                # text (and a base URL) here rather than the raw body --
                # confirm against the installed Scrapy version.
                for url in sitemap_urls_from_robots(response.body):
                    yield Request(url, callback=self._parse_sitemap)
            else:
                body = self._get_sitemap_body(response)
                if body is None:
                    self.logger.info('Ignoring invalid sitemap: %s', response.url)
                    return

                s = Sitemap(body)
                sites = []
                if s.type == 'sitemapindex':
                    for loc in iterloc(s, self.sitemap_alternate_links):
                        if any(x.search(loc) for x in self._follow):
                            yield Request(loc, callback=self._parse_sitemap)
                elif s.type == 'urlset':
                    for loc in iterloc(s):
                        # The callback half of each pair is unused: matching
                        # URLs are only recorded, never requested.
                        for r, _callback in self._cbs:
                            if r.search(loc):
                                sites.append(loc)
                                break
                # Printed for every sitemap, so a sitemap index shows an
                # empty list.
                print(sites)

        def __init__(self, spider=None, *a, **kw):
            """Locate the site's sitemap and store it in ``sitemap_urls``.

            Args:
                spider: Arbitrary value kept on ``self.spider``; it is not
                    used anywhere else in this class.
                *a: Positional arguments forwarded to ``SitemapSpider``.
                **kw: Keyword arguments forwarded to ``SitemapSpider``.
            """
            super(GetpagesfromsitemapSpider, self).__init__(*a, **kw)
            self.spider = spider
            found = []
            url = "https://channelstore.roku.com"
            # Bound the blocking probes: without a timeout, an unresponsive
            # host would stall spider start-up indefinitely.
            timeout = 10
            resp = requests.head(url + "/sitemap.xml", timeout=timeout)
            if resp.status_code != 404:
                found.append(resp.url)
            else:
                # No sitemap.xml: robots.txt may still declare sitemap URLs.
                resp = requests.head(url + "/robots.txt", timeout=timeout)
                if resp.status_code == 200:
                    found.append(resp.url)
            self.sitemap_urls = found
            print(self.sitemap_urls)
    
    def iterloc(it, alt=False):
        """Yield the ``loc`` value of each entry in *it*.

        When *alt* is true and an entry has an ``alternate`` key, every item
        of that entry's ``alternate`` collection is yielded right after the
        entry's own ``loc``.
        """
        for entry in it:
            yield entry['loc']

            if not alt or 'alternate' not in entry:
                continue

            # Alternate URLs (xhtml:link rel="alternate") follow the main one.
            for alternate_url in entry['alternate']:
                yield alternate_url
    
    0 讨论(0)
提交回复
热议问题