Scrape title by only downloading relevant part of webpage

前端 未结 6 1724
深忆病人
深忆病人 2021-02-05 10:45

I would like to scrape just the title of a webpage using Python. I need to do this for thousands of sites so it has to be fast. I've seen previous questions like retrieving just the title of a webpage…

6条回答
  •  终归单人心
    2021-02-05 11:21

    My code also handles cases where the title tag is split between chunks.

    #!/usr/bin/env python2
    # -*- coding: utf-8 -*-
    """
    Created on Tue May 30 04:21:26 2017
    ====================
    @author: s
    """
    
    import requests
    from string import lower
    from html.parser import HTMLParser
    
    # Optional local proxy for debugging/inspection (e.g. mitmproxy/Burp).
    #proxies = { 'http': 'http://127.0.0.1:8080' }
    # Sample pages to extract titles from (computer-vision tutorials and docs).
    urls = ['http://opencvexamples.blogspot.com/p/learning-opencv-functions-step-by-step.html',
            'http://www.robindavid.fr/opencv-tutorial/chapter2-filters-and-arithmetic.html',
            'http://blog.iank.org/playing-capitals-with-opencv-and-python.html',
            'http://docs.opencv.org/3.2.0/df/d9d/tutorial_py_colorspaces.html',
            'http://scikit-image.org/docs/dev/api/skimage.exposure.html',
            'http://apprize.info/programming/opencv/8.html',
            'http://opencvexamples.blogspot.com/2013/09/find-contour.html',
            'http://docs.opencv.org/2.4/modules/imgproc/doc/geometric_transformations.html',
            'https://github.com/ArunJayan/OpenCV-Python/blob/master/resize.py']
    
    class TitleParser(HTMLParser):
        """Capture the first run of text inside a <title> tag.

        After feeding HTML, ``self.title`` holds the title text
        ('' if no <title> tag was seen).
        """
        def __init__(self):
            HTMLParser.__init__(self)
            # True only while the parser is positioned just after <title>.
            self.match = False
            self.title = ''

        def handle_starttag(self, tag, attributes):
            # Any other opening tag cancels a pending match.
            self.match = (tag == 'title')

        def handle_data(self, data):
            if not self.match:
                return
            self.title = data
            self.match = False
    
    def valid_content( url, proxies=None ):
        """Return True if a HEAD request reports an HTML/XML media type.

        Issues a HEAD request (cheap: no body transferred) and checks the
        Content-Type header against known HTML/XML media types.

        url: URL to probe.
        proxies: optional requests-style proxies dict.
        """
        valid = [ 'text/html',
                  'application/xhtml+xml',
                  'application/xhtml',
                  'application/xml',
                  'text/xml' ]
        r = requests.head(url, proxies=proxies)
        # The header may be missing entirely, and usually carries parameters
        # such as "; charset=utf-8" — compare only the bare media type.
        # (str.lower() replaces the Python-2-only string.lower function.)
        content_type = r.headers.get('Content-Type', '')
        our_type = content_type.split(';')[0].strip().lower()
        if our_type not in valid:
            print('unknown content-type: {} at URL:{}'.format(our_type, url))
            return False
        return True
    
    def range_header_overlapped( chunksize, seg_num=0, overlap=50 ):
        """
        generate overlapping ranges
        (to solve cases when title tag splits between them)
    
        seg_num: segment number we want, 0 based
        overlap: number of overlaping bytes, defaults to 50
        """
        start = chunksize * seg_num
        end = chunksize * (seg_num + 1)
        if seg_num:
            overlap = overlap * seg_num
            start -= overlap
            end -= overlap
        return {'Range': 'bytes={}-{}'.format( start, end )}
    
    def get_title_from_url(url, proxies=None, chunksize=300, max_chunks=5):
        """Fetch a page title by downloading small, overlapping byte ranges.

        Downloads at most ``max_chunks + 1`` chunks of ``chunksize`` bytes
        (segments overlap — see range_header_overlapped) and stops as soon
        as the parser has seen a complete <title>.

        Returns the title string, or False when the content type is not
        HTML/XML or no title was found within the chunk budget.
        """
        if not valid_content(url, proxies=proxies):
            return False
        current_chunk = 0
        myparser = TitleParser()
        while current_chunk <= max_chunks:
            headers = range_header_overlapped( chunksize, current_chunk )
            headers['Accept-Encoding'] = 'deflate'
            # quick fix, as my locally hosted Apache/2.4.25 kept raising
            # ContentDecodingError when using "Content-Encoding: gzip"
            # ContentDecodingError: ('Received response with content-encoding: gzip, but failed to decode it.', 
            #                  error('Error -3 while decompressing: incorrect header check',))
            r = requests.get( url, headers=headers, proxies=proxies )
            # Bug fix: HTMLParser.feed() requires str in Python 3; r.content
            # is bytes and would raise TypeError. r.text is the decoded body.
            myparser.feed(r.text)
            if myparser.title:
                return myparser.title
            current_chunk += 1
        # Bug fix: current_chunk now equals the number of chunks fetched;
        # the original printed current_chunk-1, under-counting by one.
        print('title tag not found within {} chunks ({}b each) at {}'.format(current_chunk, chunksize, url))
        return False
    

提交回复
热议问题