Removing null value from scraped data without removing entire

问题

Am using scrapy to scrape data off the new york times website, but the scraped data are full of null values I don't want so in order to clean my extracted data I have changed the pipeline.py script. and it worked when I extract a single value or two it works like a charm. but when I extract multiple values and since there is at least one null value on each extracted row the algorithm ends up deleting almost all my data. is there a way to stop this from happening ?

here is my spider file :

# -*- coding: utf-8 -*-
import scrapy
import re
from config_spider.items import Item
from urllib.parse import urljoin, urlparse

def get_real_url(response, url):
    if re.search(r'^https?', url):
        return url
    elif re.search(r'^\/\/', url):
        u = urlparse(response.url)
        return u.scheme + url
    return urljoin(response.url, url)

class ConfigSpider(scrapy.Spider):
    name = 'config_spider'

    def start_requests(self):
        yield scrapy.Request(url='https://www.nytimes.com/', callback=self.parse_list)

    def parse_list(self, response):
        prev_item = response.meta.get('item')
        for elem in response.css('div'):
            item = Item()
            item['title'] = elem.css('h2::text').extract_first()
            item['date'] = elem.css('time::text').extract_first()
            item['excerpt'] = elem.css('.e1n8kpyg0::text').extract_first()
            item['author'] = elem.css('.e1god9m10::text').extract_first()
            item['categories'] = elem.css('.css-kh29da::text').extract_first()
            item['tags'] = elem.css('.css-1oxvs31::text').extract_first()
            item['link'] = elem.css('a::attr("href")').extract_first()
            item['published_at'] = elem.css('time::text').extract_first()
            item['source_url'] = elem.css('img::attr("src")').extract_first()
            item['caption'] = elem.css('img::attr("alt")').extract_first()
            if prev_item is not None:
                for key, value in prev_item.items():
                    item[key] = value
            yield item

here is my piplines file :

import os
from pymongo import MongoClient

mongo = MongoClient(
    host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost',
    port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017),
    username=os.environ.get('CRAWLAB_MONGO_USERNAME'),
    password=os.environ.get('CRAWLAB_MONGO_PASSWORD'),
    authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin'
)
db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test']
col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test']
task_id = os.environ.get('CRAWLAB_TASK_ID')

class ConfigSpiderPipeline(object):
    def process_item(self, item, spider):
        item['task_id'] = task_id
        if col is not None:
            if any(item.values()):
                col.save(item)
                return item
            else:
                raise DropItem()

here is my items file :

import scrapy


class Item(scrapy.Item):
    _id = scrapy.Field()
    task_id = scrapy.Field()
    ts = scrapy.Field()
    title = scrapy.Field()
    date = scrapy.Field()
    excerpt = scrapy.Field()
    author = scrapy.Field()
    categories = scrapy.Field()
    tags = scrapy.Field()
    link = scrapy.Field()
    published_at = scrapy.Field()
    source_url = scrapy.Field()
    caption = scrapy.Field()

来源：https://stackoverflow.com/questions/61535398/removing-null-value-from-scraped-data-without-removing-entire

标签

python

scrapy

web-crawler

data-processing