How to download PDF files from URLs leading to sub-URLs using Python

Submitted by 江枫思渺然 on 2020-08-07 08:13:54

Question


I am trying to download all PDF files from the links in the following URLs:

https://www.adb.org/projects/documents/country/ban/year/2020?terms=education
https://www.adb.org/projects/documents/country/ban/year/2019?terms=education
https://www.adb.org/projects/documents/country/ban/year/2018?terms=education

These URLs contain lists of links that lead to sub-pages containing PDF files. The lists of links on the main pages come from the search results for a country, a year, and a term.

I have tried the following code, modifying it in different ways, but it does not seem to work. Any help would be appreciated. Thanks.

import os
import time
from glob import glob 
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
 
url = ["https://www.adb.org/projects/documents/country/ban/year/2020?terms=education",
      "https://www.adb.org/projects/documents/country/ban/year/2019?terms=education",
      "https://www.adb.org/projects/documents/country/ban/year/2018?terms=education"]

folder = glob("J:/pdfs/*/")

for i, folder_location in zip(url, folder):
    time.sleep(1)
    response = requests.get(i)
    soup = BeautifulSoup(response.text, "lxml")
    for link in soup.select("[href$='.pdf']"):

        filename = os.path.join(folder_location, link['href'].split('/')[-1])
        with open(filename, 'wb') as f:
            f.write(requests.get(urljoin(i,link['href'])).content)
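
For reference, a minimal sketch of the two-level crawl described above, using requests and BeautifulSoup: fetch each listing page, follow each document sub-link, and save any PDF links found on those sub-pages. The folder layout under J:/pdfs and the "a[href*='/documents/']" selector are assumptions about the page structure, not something verified against the live site.

import os
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

urls = [
    "https://www.adb.org/projects/documents/country/ban/year/2020?terms=education",
    "https://www.adb.org/projects/documents/country/ban/year/2019?terms=education",
    "https://www.adb.org/projects/documents/country/ban/year/2018?terms=education",
]

for listing_url in urls:
    # One folder per year, e.g. J:/pdfs/2020 (path layout is an assumption).
    year = listing_url.split("year/")[1].split("?")[0]
    folder = os.path.join("J:/pdfs", year)
    os.makedirs(folder, exist_ok=True)

    listing = BeautifulSoup(requests.get(listing_url).text, "lxml")

    # Links to the document sub-pages; the selector is an assumption
    # about how the listing page is marked up.
    for doc_link in listing.select("a[href*='/documents/']"):
        sub_url = urljoin(listing_url, doc_link["href"])
        time.sleep(1)  # be polite to the server
        sub_page = BeautifulSoup(requests.get(sub_url).text, "lxml")

        # The PDF links live on the sub-pages, not on the listing page.
        for pdf_link in sub_page.select("a[href$='.pdf']"):
            pdf_url = urljoin(sub_url, pdf_link["href"])
            filename = os.path.join(folder, pdf_url.split("/")[-1])
            with open(filename, "wb") as f:
                f.write(requests.get(pdf_url).content)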


Answer 1:


Try this. It will put the files in the pdfs folder.

import os
from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain, utils

class MySpider(Spider):
    name = 'download_pdf'
    allowed_domains = ["www.adb.org"]
    start_urls = [
        "https://www.adb.org/projects/documents/country/ban/year/2020?terms=education",
        "https://www.adb.org/projects/documents/country/ban/year/2019?terms=education",
        "https://www.adb.org/projects/documents/country/ban/year/2018?terms=education"
    ]  # Entry page

    def __init__(self):
        Spider.__init__(self, self.name)  #necessary
        if (not os.path.exists('./pdfs')):
            os.mkdir('./pdfs')

    def afterResponse(self, response, url, error=None, extra=None):
        try:
            path = './pdfs' + url[url.rindex('/'):]
            index = path.find('?')
            if index > 0: path = path[:index]
            flag = utils.saveResponseAsFile(response, path, fileType="pdf")
            if flag:
                return None
            else:  # If it's not a PDF, leave it to the framework
                return Spider.afterResponse(self, response, url, error)
        except Exception as err:
            print(err)

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        lst = doc.selects('div.list >a').contains("documents/", attr="href")
        if not lst:
            lst = doc.selects('div.hidden-md hidden-lg >a')
        urls = []
        for a in lst:
            a["url"] = utils.absoluteUrl(url.url, a["href"])
            urls.append(a)

        return {"Urls": urls}


SimplifiedMain.startThread(MySpider())  # Start download
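
In this spider, extract pulls the document sub-links out of each listing page and returns them for the framework to crawl, while afterResponse intercepts every downloaded response: if utils.saveResponseAsFile recognizes it as a PDF it is written to the pdfs folder, otherwise the response is handed back to the framework so its links can be extracted in turn.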

The following variant downloads the PDFs from each URL into a separate folder per year.

from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain, utils

class MySpider(Spider):
    name = 'download_pdf'
    allowed_domains = ["www.adb.org"]
    start_urls = [
        "https://www.adb.org/projects/documents/country/ban/year/2020?terms=education",
        "https://www.adb.org/projects/documents/country/ban/year/2019?terms=education",
        "https://www.adb.org/projects/documents/country/ban/year/2018?terms=education"
    ]  # Entry page

    def afterResponse(self, response, url, error=None, extra=None):
        if not extra:
            print ("The version of library simplified_scrapy is too old, please update.")
            SimplifiedMain.setRunFlag(False)
            return
        try:
            path = './pdfs'
            # create folder start
            srcUrl = extra.get('srcUrl')
            if srcUrl:
                index = srcUrl.find('year/')
                year = ''
                if index > 0:
                    year = srcUrl[index + 5:]
                    index = year.find('?')
                    if index > 0:
                        path = path + year[:index]
                        utils.createDir(path)
            # create folder end

            path = path + url[url.rindex('/'):]
            index = path.find('?')
            if index > 0: path = path[:index]
            flag = utils.saveResponseAsFile(response, path, fileType="pdf")
            if flag:
                return None
            else:  # If it's not a PDF, leave it to the framework
                return Spider.afterResponse(self, response, url, error, extra)
        except Exception as err:
            print(err)

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        lst = doc.selects('div.list >a').contains("documents/", attr="href")
        if not lst:
            lst = doc.selects('div.hidden-md hidden-lg >a')
        urls = []
        for a in lst:
            a["url"] = utils.absoluteUrl(url.url, a["href"])
            # Set root url start
            a["srcUrl"] = url.get('srcUrl')
            if not a['srcUrl']:
                a["srcUrl"] = url.url
            # Set root url end
            urls.append(a)

        return {"Urls": urls}

    # Re-crawl by resetting the URLs. Call this when you want to download everything again.
    def resetUrl(self):
        Spider.clearUrl(self)
        Spider.resetUrlsTest(self)

SimplifiedMain.startThread(MySpider())  # Start download
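
If the library is missing, it can presumably be installed from PyPI with pip install simplified_scrapy. As the version check in afterResponse suggests, the per-folder variant relies on the extra argument that only newer releases of the library pass in, so update the package if you see the "too old" message.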


Source: https://stackoverflow.com/questions/62633192/how-to-download-pdf-files-from-urls-leading-to-sub-urls-using-python
