How to download PDF files from URLs leading to sub-URLs using Python

Submitted by 江枫思渺然 on 2020-08-07 08:13:54

Question


I am trying to download all PDF files from the links in the following URLs:

https://www.adb.org/projects/documents/country/ban/year/2020?terms=education
https://www.adb.org/projects/documents/country/ban/year/2019?terms=education
https://www.adb.org/projects/documents/country/ban/year/2018?terms=education

These URLs contain lists of links that lead to sub-pages containing PDF files. The lists of links on the main pages come from the search results for a country, a year, and a term.

I have tried the following code, modifying it in different ways, but it does not seem to work. Any help would be appreciated. Thanks.

import os
import time
from glob import glob 
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
 
url = ["https://www.adb.org/projects/documents/country/ban/year/2020?terms=education",
      "https://www.adb.org/projects/documents/country/ban/year/2019?terms=education",
      "https://www.adb.org/projects/documents/country/ban/year/2018?terms=education"]

folder = glob("J:/pdfs/*/")

for i, folder_location in zip(url, folder):
    time.sleep(1)
    response = requests.get(i)
    soup = BeautifulSoup(response.text, "lxml")
    for link in soup.select("[href$='.pdf']"):

        filename = os.path.join(folder_location, link['href'].split('/')[-1])
        with open(filename, 'wb') as f:
            f.write(requests.get(urljoin(i,link['href'])).content)
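
For reference, a minimal sketch of the two-level crawl described above, using requests and BeautifulSoup: fetch each listing page, follow each document sub-link, and save any PDF links found on those sub-pages. The folder layout under J:/pdfs and the "a[href*='/documents/']" selector are assumptions about the page structure, not something verified against the live site.

import os
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

urls = [
    "https://www.adb.org/projects/documents/country/ban/year/2020?terms=education",
    "https://www.adb.org/projects/documents/country/ban/year/2019?terms=education",
    "https://www.adb.org/projects/documents/country/ban/year/2018?terms=education",
]

for listing_url in urls:
    # One folder per year, e.g. J:/pdfs/2020 (path layout is an assumption).
    year = listing_url.split("year/")[1].split("?")[0]
    folder = os.path.join("J:/pdfs", year)
    os.makedirs(folder, exist_ok=True)

    listing = BeautifulSoup(requests.get(listing_url).text, "lxml")

    # Links to the document sub-pages; the selector is an assumption
    # about how the listing page is marked up.
    for doc_link in listing.select("a[href*='/documents/']"):
        sub_url = urljoin(listing_url, doc_link["href"])
        time.sleep(1)  # be polite to the server
        sub_page = BeautifulSoup(requests.get(sub_url).text, "lxml")

        # The PDF links live on the sub-pages, not on the listing page.
        for pdf_link in sub_page.select("a[href$='.pdf']"):
            pdf_url = urljoin(sub_url, pdf_link["href"])
            filename = os.path.join(folder, pdf_url.split("/")[-1])
            with open(filename, "wb") as f:
                f.write(requests.get(pdf_url).content)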


Answer 1:


Try this. It will put the files in the pdfs folder.

import os
from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain, utils

class MySpider(Spider):
    name = 'download_pdf'
    allowed_domains = ["www.adb.org"]
    start_urls = [
        "https://www.adb.org/projects/documents/country/ban/year/2020?terms=education",
        "https://www.adb.org/projects/documents/country/ban/year/2019?terms=education",
        "https://www.adb.org/projects/documents/country/ban/year/2018?terms=education"
    ]  # Entry page

    def __init__(self):
        Spider.__init__(self, self.name)  #necessary
        if (not os.path.exists('./pdfs')):
            os.mkdir('./pdfs')

    def afterResponse(self, response, url, error=None, extra=None):
        try:
            path = './pdfs' + url[url.rindex('/'):]
            index = path.find('?')
            if index > 0: path = path[:index]
            flag = utils.saveResponseAsFile(response, path, fileType="pdf")
            if flag:
                return None
            else:  # If it's not a PDF, leave it to the framework
                return Spider.afterResponse(self, response, url, error)
        except Exception as err:
            print(err)

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        lst = doc.selects('div.list >a').contains("documents/", attr="href")
        if not lst:
            lst = doc.selects('div.hidden-md hidden-lg >a')
        urls = []
        for a in lst:
            a["url"] = utils.absoluteUrl(url.url, a["href"])
            urls.append(a)

        return {"Urls": urls}


SimplifiedMain.startThread(MySpider())  # Start download
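
In this spider, extract pulls the document sub-links out of each listing page and returns them for the framework to crawl, while afterResponse intercepts every downloaded response: if utils.saveResponseAsFile recognizes it as a PDF it is written to the pdfs folder, otherwise the response is handed back to the framework so its links can be extracted in turn.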

The following variant downloads the PDFs from each URL into a separate folder per year.

from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain, utils

class MySpider(Spider):
    name = 'download_pdf'
    allowed_domains = ["www.adb.org"]
    start_urls = [
        "https://www.adb.org/projects/documents/country/ban/year/2020?terms=education",
        "https://www.adb.org/projects/documents/country/ban/year/2019?terms=education",
        "https://www.adb.org/projects/documents/country/ban/year/2018?terms=education"
    ]  # Entry page

    def afterResponse(self, response, url, error=None, extra=None):
        if not extra:
            print ("The version of library simplified_scrapy is too old, please update.")
            SimplifiedMain.setRunFlag(False)
            return
        try:
            path = './pdfs'
            # create folder start
            srcUrl = extra.get('srcUrl')
            if srcUrl:
                index = srcUrl.find('year/')
                year = ''
                if index > 0:
                    year = srcUrl[index + 5:]
                    index = year.find('?')
                    if index > 0:
                        path = path + year[:index]
                        utils.createDir(path)
            # create folder end

            path = path + url[url.rindex('/'):]
            index = path.find('?')
            if index > 0: path = path[:index]
            flag = utils.saveResponseAsFile(response, path, fileType="pdf")
            if flag:
                return None
            else:  # If it's not a PDF, leave it to the framework
                return Spider.afterResponse(self, response, url, error, extra)
        except Exception as err:
            print(err)

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        lst = doc.selects('div.list >a').contains("documents/", attr="href")
        if not lst:
            lst = doc.selects('div.hidden-md hidden-lg >a')
        urls = []
        for a in lst:
            a["url"] = utils.absoluteUrl(url.url, a["href"])
            # Set root url start
            a["srcUrl"] = url.get('srcUrl')
            if not a['srcUrl']:
                a["srcUrl"] = url.url
            # Set root url end
            urls.append(a)

        return {"Urls": urls}

    # Re-crawl by resetting the URLs. Call this when you want to download everything again.
    def resetUrl(self):
        Spider.clearUrl(self)
        Spider.resetUrlsTest(self)

SimplifiedMain.startThread(MySpider())  # Start download
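
If the library is missing, it can presumably be installed from PyPI with pip install simplified_scrapy. As the version check in afterResponse suggests, the per-folder variant relies on the extra argument that only newer releases of the library pass in, so update the package if you see the "too old" message.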


Source: https://stackoverflow.com/questions/62633192/how-to-download-pdf-files-from-urls-leading-to-sub-urls-using-python
