Removing null values from scraped data without removing the entire row

怎甘沉沦 submitted on 2020-06-01 07:38:07

Question


I am using Scrapy to scrape data off the New York Times website, but the scraped data is full of null values I don't want, so in order to clean the extracted data I changed the pipelines.py script. When I extract only one or two values it works like a charm, but when I extract multiple values, since there is at least one null value in each extracted row, the pipeline ends up deleting almost all of my data. Is there a way to stop this from happening?
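
To illustrate what I am after (with made-up values, not real scraped data), I basically want to go from an item that contains some empty fields to the same item with only those fields removed, instead of the whole item being thrown away:

# Illustrative values only
scraped = {'title': 'Some headline', 'date': None, 'excerpt': None, 'link': '/2020/05/some-article.html'}

# Desired result: drop only the None fields, keep the item itself
cleaned = {k: v for k, v in scraped.items() if v is not None}
# -> {'title': 'Some headline', 'link': '/2020/05/some-article.html'}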

Here is my spider file:

# -*- coding: utf-8 -*-
import scrapy
import re
from config_spider.items import Item
from urllib.parse import urljoin, urlparse

def get_real_url(response, url):
    # Normalize relative and protocol-relative URLs to absolute URLs
    if re.search(r'^https?', url):
        return url
    elif re.search(r'^\/\/', url):
        u = urlparse(response.url)
        return u.scheme + ':' + url
    return urljoin(response.url, url)

class ConfigSpider(scrapy.Spider):
    name = 'config_spider'

    def start_requests(self):
        yield scrapy.Request(url='https://www.nytimes.com/', callback=self.parse_list)

    def parse_list(self, response):
        prev_item = response.meta.get('item')
        for elem in response.css('div'):
            item = Item()
            item['title'] = elem.css('h2::text').extract_first()
            item['date'] = elem.css('time::text').extract_first()
            item['excerpt'] = elem.css('.e1n8kpyg0::text').extract_first()
            item['author'] = elem.css('.e1god9m10::text').extract_first()
            item['categories'] = elem.css('.css-kh29da::text').extract_first()
            item['tags'] = elem.css('.css-1oxvs31::text').extract_first()
            item['link'] = elem.css('a::attr("href")').extract_first()
            item['published_at'] = elem.css('time::text').extract_first()
            item['source_url'] = elem.css('img::attr("src")').extract_first()
            item['caption'] = elem.css('img::attr("alt")').extract_first()
            # Carry over any fields passed along from a previous request
            if prev_item is not None:
                for key, value in prev_item.items():
                    item[key] = value
            yield item




Here is my pipelines file:

import os
from pymongo import MongoClient
from scrapy.exceptions import DropItem

mongo = MongoClient(
    host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost',
    port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017),
    username=os.environ.get('CRAWLAB_MONGO_USERNAME'),
    password=os.environ.get('CRAWLAB_MONGO_PASSWORD'),
    authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin'
)
db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test']
col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test']
task_id = os.environ.get('CRAWLAB_TASK_ID')

class ConfigSpiderPipeline(object):
    def process_item(self, item, spider):
        item['task_id'] = task_id
        if col is not None:
            # Keep the item only if at least one field has a value;
            # an item whose fields are all empty is dropped entirely.
            if any(item.values()):
                col.save(item)
                return item
            else:
                raise DropItem()
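
The rough direction I have in mind (untested, just a sketch) is to strip the None fields out inside process_item and only drop an item when every field is empty, something like this:

class ConfigSpiderPipeline(object):
    def process_item(self, item, spider):
        # Sketch: drop the fields whose value is None or empty, keep the rest
        cleaned = {k: v for k, v in item.items() if v not in (None, '')}
        if not cleaned:
            # Only discard the item when every single field is empty
            raise DropItem('all fields are empty')
        cleaned['task_id'] = task_id
        if col is not None:
            col.save(cleaned)
        return item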



Here is my items file:

import scrapy


class Item(scrapy.Item):
    _id = scrapy.Field()
    task_id = scrapy.Field()
    ts = scrapy.Field()
    title = scrapy.Field()
    date = scrapy.Field()
    excerpt = scrapy.Field()
    author = scrapy.Field()
    categories = scrapy.Field()
    tags = scrapy.Field()
    link = scrapy.Field()
    published_at = scrapy.Field()
    source_url = scrapy.Field()
    caption = scrapy.Field()



Source: https://stackoverflow.com/questions/61535398/removing-null-value-from-scraped-data-without-removing-entire
