问题
I am using Scrapy to scrape data from the New York Times website, but the scraped rows are full of null values I don't want. To clean the extracted data I changed the pipelines.py script, and it works like a charm when I extract only one or two values. But when I extract multiple values — since there is at least one null value in each extracted row — the algorithm ends up deleting almost all my data. Is there a way to stop this from happening?
here is my spider file :
# -*- coding: utf-8 -*-
import scrapy
import re
from config_spider.items import Item
from urllib.parse import urljoin, urlparse
def get_real_url(response, url):
    """Resolve *url* into an absolute URL relative to *response*.

    Three cases are handled:
      * already absolute (starts with http/https) -- returned unchanged;
      * protocol-relative (``//host/path``) -- prefixed with the scheme of
        the current page;
      * anything else -- joined against ``response.url``.
    """
    if re.search(r'^https?', url):
        return url
    if re.search(r'^\/\/', url):
        # Protocol-relative URL: borrow the current page's scheme.
        # BUGFIX: the original returned ``scheme + url`` which yields a
        # malformed value like ``https//cdn...`` -- the ':' was missing.
        scheme = urlparse(response.url).scheme
        return scheme + ':' + url
    return urljoin(response.url, url)
class ConfigSpider(scrapy.Spider):
    """Crawl the NYT front page and emit one Item per <div> element."""

    name = 'config_spider'

    def start_requests(self):
        # Single entry point: the New York Times home page.
        yield scrapy.Request(url='https://www.nytimes.com/', callback=self.parse_list)

    def parse_list(self, response):
        # Values carried over from a parent request, if any.
        carried = response.meta.get('item')
        # Field name -> CSS selector; a selector with no match yields None.
        selectors = {
            'title': 'h2::text',
            'date': 'time::text',
            'excerpt': '.e1n8kpyg0::text',
            'author': '.e1god9m10::text',
            'categories': '.css-kh29da::text',
            'tags': '.css-1oxvs31::text',
            'link': 'a::attr("href")',
            'published_at': 'time::text',
            'source_url': 'img::attr("src")',
            'caption': 'img::attr("alt")',
        }
        for block in response.css('div'):
            item = Item()
            for field, selector in selectors.items():
                item[field] = block.css(selector).extract_first()
            if carried is not None:
                # Carried-over values take precedence over locally scraped ones.
                for key, value in carried.items():
                    item[key] = value
            yield item
here is my pipelines file :
import os

from pymongo import MongoClient
from scrapy.exceptions import DropItem
def _env(name, default=None):
    # Read an environment variable, falling back to *default* when the
    # variable is unset or empty (mirrors the ``get(...) or default`` idiom).
    value = os.environ.get(name)
    return value if value else default


# Connection settings are injected by Crawlab through the environment.
mongo = MongoClient(
    host=_env('CRAWLAB_MONGO_HOST', 'localhost'),
    port=int(_env('CRAWLAB_MONGO_PORT', 27017)),
    username=_env('CRAWLAB_MONGO_USERNAME'),
    password=_env('CRAWLAB_MONGO_PASSWORD'),
    authSource=_env('CRAWLAB_MONGO_AUTHSOURCE', 'admin'),
)
db = mongo[_env('CRAWLAB_MONGO_DB', 'test')]
col = db[_env('CRAWLAB_COLLECTION', 'test')]
task_id = os.environ.get('CRAWLAB_TASK_ID')
class ConfigSpiderPipeline(object):
    """Persist scraped items to MongoDB, stripping null fields.

    This fixes the problem described in the question: the original code
    dropped an entire item when its values failed a null check, so rows
    containing even one missed selector were lost.  Instead we remove
    only the None-valued fields and keep the rest of the row; an item is
    dropped only when *nothing* at all was scraped.
    """

    def process_item(self, item, spider):
        # Tag the row with the Crawlab task that produced it.
        item['task_id'] = task_id
        # Keep only the fields that actually hold a value; a selector
        # that matched nothing yields None and must not kill the row.
        cleaned = {key: value for key, value in item.items() if value is not None}
        # ``task_id`` is metadata, not scraped data -- ignore it when
        # deciding whether the row is empty.
        scraped = {k: v for k, v in cleaned.items() if k != 'task_id'}
        if not scraped:
            # Nothing useful was extracted: only now is dropping correct.
            # (DropItem must be imported from scrapy.exceptions; the
            # original raised an undefined name.)
            raise DropItem('all scraped fields are empty')
        if col is not None:
            # insert_one replaces Collection.save(), which was deprecated
            # and removed in pymongo 4.
            col.insert_one(dict(cleaned))
        return item
here is my items file :
import scrapy
class Item(scrapy.Item):
    # Container for one scraped New York Times article row.
    # _id / task_id / ts are bookkeeping fields: _id is the MongoDB
    # document id, task_id is set by the pipeline from CRAWLAB_TASK_ID,
    # ts is presumably a timestamp -- it is not set anywhere in the
    # visible code (TODO confirm).
    _id = scrapy.Field()
    task_id = scrapy.Field()
    ts = scrapy.Field()
    # Fields populated by ConfigSpider.parse_list via CSS selectors;
    # any of them may be None when a selector matches nothing.
    title = scrapy.Field()
    date = scrapy.Field()
    excerpt = scrapy.Field()
    author = scrapy.Field()
    categories = scrapy.Field()
    tags = scrapy.Field()
    link = scrapy.Field()
    published_at = scrapy.Field()
    source_url = scrapy.Field()
    caption = scrapy.Field()
来源:https://stackoverflow.com/questions/61535398/removing-null-value-from-scraped-data-without-removing-entire