Scrape title by only downloading relevant part of webpage

前端 未结 6 1724
深忆病人
深忆病人 2021-02-05 10:45

I would like to scrape just the title of a webpage using Python. I need to do this for thousands of sites so it has to be fast. I've seen previous questions like retrieving just the title of a webpage…

6条回答
  •  终归单人心
    2021-02-05 11:21

    My code also handles cases where the title tag is split between chunks.

    #!/usr/bin/env python2
    # -*- coding: utf-8 -*-
    """
    Created on Tue May 30 04:21:26 2017
    ====================
    @author: s
    """
    
    import requests
    from string import lower
    from html.parser import HTMLParser
    
    # Optional local proxy for debugging/inspection (e.g. mitmproxy/Burp).
    #proxies = { 'http': 'http://127.0.0.1:8080' }
    # Sample pages to extract titles from (computer-vision tutorials and docs).
    urls = ['http://opencvexamples.blogspot.com/p/learning-opencv-functions-step-by-step.html',
            'http://www.robindavid.fr/opencv-tutorial/chapter2-filters-and-arithmetic.html',
            'http://blog.iank.org/playing-capitals-with-opencv-and-python.html',
            'http://docs.opencv.org/3.2.0/df/d9d/tutorial_py_colorspaces.html',
            'http://scikit-image.org/docs/dev/api/skimage.exposure.html',
            'http://apprize.info/programming/opencv/8.html',
            'http://opencvexamples.blogspot.com/2013/09/find-contour.html',
            'http://docs.opencv.org/2.4/modules/imgproc/doc/geometric_transformations.html',
            'https://github.com/ArunJayan/OpenCV-Python/blob/master/resize.py']
    
    class TitleParser(HTMLParser):
        """Capture the first run of text inside a <title> tag.

        After feeding HTML, ``self.title`` holds the title text
        ('' if no <title> tag was seen).
        """
        def __init__(self):
            HTMLParser.__init__(self)
            # True only while the parser is positioned just after <title>.
            self.match = False
            self.title = ''

        def handle_starttag(self, tag, attributes):
            # Any other opening tag cancels a pending match.
            self.match = (tag == 'title')

        def handle_data(self, data):
            if not self.match:
                return
            self.title = data
            self.match = False
    
    def valid_content( url, proxies=None ):
        """Return True if a HEAD request reports an HTML/XML media type.

        Issues a HEAD request (cheap: no body transferred) and checks the
        Content-Type header against known HTML/XML media types.

        url: URL to probe.
        proxies: optional requests-style proxies dict.
        """
        valid = [ 'text/html',
                  'application/xhtml+xml',
                  'application/xhtml',
                  'application/xml',
                  'text/xml' ]
        r = requests.head(url, proxies=proxies)
        # The header may be missing entirely, and usually carries parameters
        # such as "; charset=utf-8" — compare only the bare media type.
        # (str.lower() replaces the Python-2-only string.lower function.)
        content_type = r.headers.get('Content-Type', '')
        our_type = content_type.split(';')[0].strip().lower()
        if our_type not in valid:
            print('unknown content-type: {} at URL:{}'.format(our_type, url))
            return False
        return True
    
    def range_header_overlapped( chunksize, seg_num=0, overlap=50 ):
        """
        generate overlapping ranges
        (to solve cases when title tag splits between them)
    
        seg_num: segment number we want, 0 based
        overlap: number of overlaping bytes, defaults to 50
        """
        start = chunksize * seg_num
        end = chunksize * (seg_num + 1)
        if seg_num:
            overlap = overlap * seg_num
            start -= overlap
            end -= overlap
        return {'Range': 'bytes={}-{}'.format( start, end )}
    
    def get_title_from_url(url, proxies=None, chunksize=300, max_chunks=5):
        """Fetch a page title by downloading small, overlapping byte ranges.

        Downloads at most ``max_chunks + 1`` chunks of ``chunksize`` bytes
        (segments overlap — see range_header_overlapped) and stops as soon
        as the parser has seen a complete <title>.

        Returns the title string, or False when the content type is not
        HTML/XML or no title was found within the chunk budget.
        """
        if not valid_content(url, proxies=proxies):
            return False
        current_chunk = 0
        myparser = TitleParser()
        while current_chunk <= max_chunks:
            headers = range_header_overlapped( chunksize, current_chunk )
            headers['Accept-Encoding'] = 'deflate'
            # quick fix, as my locally hosted Apache/2.4.25 kept raising
            # ContentDecodingError when using "Content-Encoding: gzip"
            # ContentDecodingError: ('Received response with content-encoding: gzip, but failed to decode it.', 
            #                  error('Error -3 while decompressing: incorrect header check',))
            r = requests.get( url, headers=headers, proxies=proxies )
            # Bug fix: HTMLParser.feed() requires str in Python 3; r.content
            # is bytes and would raise TypeError. r.text is the decoded body.
            myparser.feed(r.text)
            if myparser.title:
                return myparser.title
            current_chunk += 1
        # Bug fix: current_chunk now equals the number of chunks fetched;
        # the original printed current_chunk-1, under-counting by one.
        print('title tag not found within {} chunks ({}b each) at {}'.format(current_chunk, chunksize, url))
        return False
    

提交回复
热议问题