NTLM authentication with Scrapy for web scraping

前端 未结 2 823
礼貌的吻别
礼貌的吻别 2021-02-03 12:03

I am attempting to scrape data from a website that requires authentication.
I have been able to log in successfully using requests and HttpNtlmAuth with the following:

<
相关标签:
2条回答
  • 2021-02-03 12:30

    Thank you @SpaceDog for the comment just above. I faced a similar problem trying to crawl an intranet website using NTLM authentication. The crawler would only see the first page because the LinkExtractor within the CrawlSpider was never triggered.

    Here's my working solution using scrapy 1.0.5

    NTLM_Middleware.py

    from scrapy.http import Response, HtmlResponse
    import requests
    from requests_ntlm import HttpNtlmAuth
    
    class NTLM_Middleware(object):
        """Downloader middleware that fetches each page via ``requests`` with NTLM auth.

        Scrapy's own downloader is bypassed: ``process_request`` performs the
        HTTP GET itself and hands the result back as an ``HtmlResponse``.
        Credentials are read from the ``http_usr`` / ``http_pass`` attributes
        of the running spider (empty strings when absent).
        """

        def process_request(self, request, spider):
            """Download ``request.url`` with NTLM authentication.

            Args:
                request: the Scrapy request being processed; only its URL is used.
                spider: the running spider, source of ``http_usr`` / ``http_pass``.

            Returns:
                An ``HtmlResponse`` carrying the status code, headers and body
                obtained through ``requests``.
            """
            url = request.url
            usr = getattr(spider, 'http_usr', '')
            pwd = getattr(spider, 'http_pass', '')
            # Use the session as a context manager so its connection pool is
            # released after every request instead of leaking.
            with requests.session() as s:
                response = s.get(url, auth=HttpNtlmAuth(usr, pwd))
                # ``items()`` exists on both Python 2 and 3; ``iteritems()``
                # is Python 2 only and raises AttributeError on Python 3.
                return HtmlResponse(
                    url,
                    status=response.status_code,
                    headers=response.headers.items(),
                    body=response.content,
                )
    

    settings.py

    import logging
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    USER_AGENT = 'scrapy intranet'

    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    CONCURRENT_REQUESTS = 16


    # Enable or disable downloader middlewares
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    DOWNLOADER_MIDDLEWARES = {
        'intranet.NTLM_Middleware.NTLM_Middleware': 200,
        # Disabled (None). NOTE(review): presumably because the NTLM middleware
        # forwards the original response headers together with a body that
        # requests has already decoded - confirm before re-enabling.
        'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': None,
    }

    # Configure item pipelines
    # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    # ITEM_PIPELINES must be a dict mapping the pipeline path to its order
    # value (0-1000, lower runs first); the original set literal had no order.
    ITEM_PIPELINES = {
        'scrapyelasticsearch.scrapyelasticsearch.ElasticSearchPipeline': 500,
    }

    # Settings read by the ElasticSearch pipeline
    ELASTICSEARCH_SERVER = 'localhost'
    ELASTICSEARCH_PORT = 9200
    ELASTICSEARCH_USERNAME = ''
    ELASTICSEARCH_PASSWORD = ''
    ELASTICSEARCH_INDEX = 'intranet'
    ELASTICSEARCH_TYPE = 'pages_intranet'
    ELASTICSEARCH_UNIQ_KEY = 'url'
    ELASTICSEARCH_LOG_LEVEL = logging.DEBUG
    

    spiders/intranetspider.py

    # -*- coding: utf-8 -*-
    import scrapy
    
    #from scrapy import log
    from scrapy.spiders import CrawlSpider, Rule, Spider
    from scrapy.linkextractors import LinkExtractor
    from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
    from scrapy.http import Response
    
    import requests
    import sys
    from bs4 import BeautifulSoup
    
    class PageItem(scrapy.Item):
        """Container for the data collected from a single crawled page."""

        # Text content of the page.
        body = scrapy.Field()
        # Title of the page.
        title = scrapy.Field()
        # Address the page was fetched from.
        url = scrapy.Field()
    
    class IntranetspiderSpider(CrawlSpider):
        """Crawl the intranet site and emit one ``PageItem`` per followed page.

        ``http_usr`` and ``http_pass`` are the credentials picked up by the
        NTLM downloader middleware; the values below are placeholders.
        """

        # Placeholder credentials - replace them, and avoid committing real ones.
        http_usr = 'DOMAIN\\user'
        http_pass = 'pass'
        name = "intranetspider"
        protocol = 'https://'
        allowed_domains = ['intranet.mydomain.ca']
        start_urls = ['https://intranet.mydomain.ca/']
        # Follow every extracted link and run parse_items on each fetched page.
        rules = (Rule(LinkExtractor(), callback="parse_items", follow=True),)

        def parse_items(self, response):
            """Build a ``PageItem`` from a crawled page.

            Args:
                response: the downloaded page.

            Returns:
                A ``PageItem`` holding the page title, its visible text with
                ``<script>`` content removed, and its URL.
            """
            self.logger.info('Crawl de la page %s', response.url)

            # Name the parser explicitly: without it BeautifulSoup warns and
            # picks whichever parser is installed. lxml ships with Scrapy.
            soup = BeautifulSoup(response.body, 'lxml')

            # Remove script tags and javascript from content. A plain loop is
            # used because this is done purely for its side effect.
            for script in soup.find_all('script'):
                script.decompose()

            item = PageItem()
            # PageItem declares a title field; fill it instead of leaving it unset.
            title_tag = soup.title
            item['title'] = title_tag.get_text(strip=True) if title_tag else ''
            item['body'] = soup.get_text(" ", strip=True)
            item['url'] = response.url

            return item
    
    0 讨论(0)
  • 2021-02-03 12:42

    I was able to figure out what was going on.

    1: This is considered a "DOWNLOADER_MIDDLEWARE" not a "SPIDER_MIDDLEWARE".

    # Register the NTLM class as a downloader middleware with order value 400.
    DOWNLOADER_MIDDLEWARES = {
        'test.ntlmauth.NTLM_Middleware': 400,
    }
    

    2: The middleware which I was trying to use needed to be modified significantly. Here is what works for me:

    from scrapy.http import Response
    import requests                                                              
    from requests_ntlm import HttpNtlmAuth
    
    class NTLM_Middleware(object):
        """Downloader middleware that fetches each page via ``requests`` with NTLM auth.

        Credentials are read from the ``http_user`` / ``http_pass`` attributes
        of the running spider (empty strings when absent).
        """

        def process_request(self, request, spider):
            """Download ``request.url`` with NTLM authentication.

            Args:
                request: the Scrapy request being processed; only its URL is used.
                spider: the running spider, source of ``http_user`` / ``http_pass``.

            Returns:
                An ``HtmlResponse`` when the server reports an HTML content
                type, otherwise a plain ``Response`` with empty headers.
            """
            # Local import: the surrounding module only imports the base
            # Response class at file level.
            from scrapy.http import HtmlResponse

            url = request.url
            pwd = getattr(spider, 'http_pass', '')
            usr = getattr(spider, 'http_user', '')
            # Use the session as a context manager so its connection pool is
            # released after every request instead of leaking.
            with requests.session() as s:
                response = s.get(url, auth=HttpNtlmAuth(usr, pwd))
                content_type = response.headers.get('Content-Type', '')
                if 'html' in content_type.lower():
                    # CrawlSpider rules only extract links from HtmlResponse
                    # objects; a base Response stops the crawl at the first page.
                    # Only Content-Type is forwarded, so no Content-Encoding
                    # header accompanies the already-decoded body.
                    return HtmlResponse(
                        url,
                        status=response.status_code,
                        headers={'Content-Type': content_type},
                        body=response.content,
                    )
                return Response(url, response.status_code, {}, response.content)
    

    Within the spider, all you need to do is set these variables:

    # Spider attributes the middleware reads with getattr(spider, 'http_user', '')
    # and getattr(spider, 'http_pass', ''); replace the placeholders with real values.
    http_user = 'DOMAIN\\USER'
    http_pass = 'PASS'
    
    0 讨论(0)
提交回复
热议问题