多特

多特瑞商品信息抓取(scrapy爬虫框架)

匿名 (未验证) 提交于 2019-12-02 23:03:14
1、spider目录下爬虫项目(duo_te_rui.py)

# -*- coding: utf-8 -*-
import scrapy
import re
from Duo_te_rui.items import DuoTeRuiItem


class DuoTeRuiSpider(scrapy.Spider):
    # 爬虫项目名
    name = 'duo_te_rui'
    # 爬虫允许的域
    allowed_domains = ['doterra.cn']
    # 爬虫的起始URL
    start_urls = []

    # 替换所有的HTML标签
    def re_html(self, data):
        # 替换抓取数据中的html标签
        try:
            message = str(data)
            re_h = re.compile('</?\w+[^>]*>')  # html标签
            ret1 = re_h.sub('', message)
            ret2 = re.sub(r'\n', '', ret1)
            ret3 = re.sub(r'\u3000', '', ret2)
            return ret3
        except:
            pass

    def start_requests(self):
        for i in range(1, 4):
            yield