NTLM authentication with Scrapy for web scraping

Asked by 礼貌的吻别 · 2021-02-03 12:03

I am attempting to scrape data from a website that requires authentication.
I have been able to log in successfully using requests and HttpNtlmAuth.

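A minimal sketch of that login (the URL and credentials here are placeholders, not from the original post):

    import requests
    from requests_ntlm import HttpNtlmAuth

    # Placeholder intranet URL and DOMAIN\user credentials -- substitute your own.
    response = requests.get('https://intranet.example.com/',
                            auth=HttpNtlmAuth('DOMAIN\\user', 'pass'))
    print(response.status_code)  # 200 once the NTLM handshake succeeds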
2 Answers

  •  长情又很酷 · 2021-02-03 12:30

    Thank you @SpaceDog for the comment just above. I faced a similar problem trying to crawl an intranet site that uses NTLM authentication: the crawler only ever saw the first page, because the LinkExtractor inside the CrawlSpider never fired on the unauthenticated responses.

    Here is my working solution, using Scrapy 1.0.5.

    NTLM_Middleware.py

    from scrapy.http import HtmlResponse
    import requests
    from requests_ntlm import HttpNtlmAuth

    class NTLM_Middleware(object):

        def process_request(self, request, spider):
            # Fetch the page with requests + NTLM instead of Scrapy's own
            # downloader, then wrap the result in an HtmlResponse so the
            # CrawlSpider's link extraction sees a normal HTML page.
            url = request.url
            usr = getattr(spider, 'http_usr', '')
            pwd = getattr(spider, 'http_pass', '')
            session = requests.Session()
            response = session.get(url, auth=HttpNtlmAuth(usr, pwd))
            # Keyword arguments and a plain dict keep this working on
            # Python 3, where dict.iteritems() no longer exists.
            return HtmlResponse(url, status=response.status_code,
                                headers=dict(response.headers),
                                body=response.content)
    
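    As written, the middleware opens a fresh requests session for every page, so the NTLM handshake is renegotiated on each request. A variant (my own sketch, not part of the original answer) keeps one authenticated session per middleware instance so the handshake and TCP connections are reused:

    from scrapy.http import HtmlResponse
    import requests
    from requests_ntlm import HttpNtlmAuth

    class NTLM_Middleware(object):

        def __init__(self):
            self.session = None

        def process_request(self, request, spider):
            # Create the session lazily, once, so requests' connection
            # pooling can reuse the already-authenticated connections.
            if self.session is None:
                self.session = requests.Session()
                self.session.auth = HttpNtlmAuth(
                    getattr(spider, 'http_usr', ''),
                    getattr(spider, 'http_pass', ''))
            response = self.session.get(request.url)
            return HtmlResponse(request.url, status=response.status_code,
                                headers=dict(response.headers),
                                body=response.content)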

    settings.py

    import logging
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    USER_AGENT = 'scrapy intranet'
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    CONCURRENT_REQUESTS=16
    
    
    # Enable or disable downloader middlewares
    # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
    # HttpCompressionMiddleware is disabled because requests already returns
    # decompressed bodies; left enabled, Scrapy would try to decompress them
    # a second time based on the Content-Encoding header.
    DOWNLOADER_MIDDLEWARES = {
        'intranet.NTLM_Middleware.NTLM_Middleware': 200,
        'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': None,
    }
    
    # Configure item pipelines
    # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
        # The order value (here an arbitrary 500) is required for Scrapy
        # to load the pipeline; without it this would be a set, not a dict.
        'scrapyelasticsearch.scrapyelasticsearch.ElasticSearchPipeline': 500,
    }
    
    ELASTICSEARCH_SERVER='localhost'
    ELASTICSEARCH_PORT=9200
    ELASTICSEARCH_USERNAME=''
    ELASTICSEARCH_PASSWORD=''
    ELASTICSEARCH_INDEX='intranet'
    ELASTICSEARCH_TYPE='pages_intranet'
    ELASTICSEARCH_UNIQ_KEY='url'
    ELASTICSEARCH_LOG_LEVEL=logging.DEBUG
    

    spiders/intranetspider.py

    # -*- coding: utf-8 -*-
    import scrapy

    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor

    from bs4 import BeautifulSoup

    class PageItem(scrapy.Item):
        body = scrapy.Field()
        title = scrapy.Field()
        url = scrapy.Field()

    class IntranetspiderSpider(CrawlSpider):
        http_usr = 'DOMAIN\\user'
        http_pass = 'pass'
        name = "intranetspider"
        protocol = 'https://'
        allowed_domains = ['intranet.mydomain.ca']
        start_urls = ['https://intranet.mydomain.ca/']
        rules = (Rule(LinkExtractor(), callback="parse_items", follow=True),)

        def parse_items(self, response):
            self.logger.info('Crawling page %s', response.url)
            item = PageItem()

            # Name the parser explicitly; BeautifulSoup warns when none is given
            soup = BeautifulSoup(response.body, 'html.parser')

            # Remove script tags and inline javascript from the content
            for script in soup.findAll('script'):
                script.extract()

            item['title'] = soup.title.get_text(strip=True) if soup.title else ''
            item['body'] = soup.get_text(" ", strip=True)
            item['url'] = response.url

            return item
    
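    Rather than hardcoding http_usr and http_pass in the spider, they can be supplied on the command line; Scrapy turns -a arguments into spider attributes, which the middleware's getattr() calls then pick up:

    scrapy crawl intranetspider -a http_usr='DOMAIN\user' -a http_pass='secret'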
