Question
I want to download all the tender documents from this URL: 'http://www.ha.org.hk/haho/ho/bssd/T18G014Pc.htm'.
I'm using Selenium to go through each tender link and download the attached files.
However, my scraper can't handle the Excel download behavior; it currently handles PDF files well.
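For reference, Chrome's save-versus-open behavior is normally controlled through profile preferences handed to the driver rather than through the scraping code itself. Below is a minimal sketch of that idea, assuming Chrome and reusing the driver path and download folder from the question's code; it is an illustration, not part of the original question:

from selenium import webdriver

# sketch: the preference keys are standard Chrome prefs;
# the paths mirror the ones used in the question's code
options = webdriver.ChromeOptions()
options.add_experimental_option("prefs", {
    # save downloads to a fixed folder without prompting
    "download.default_directory": r"C:\Users\user1\Desktop\tender_documents",
    "download.prompt_for_download": False,
    # fetch PDFs as files instead of rendering them in the built-in viewer
    "plugins.always_open_pdf_externally": True,
})
driver = webdriver.Chrome(executable_path='chromedriver_win32/chromedriver.exe',
                          options=options)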
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import pandas as pd
from bs4 import BeautifulSoup
import os
from urllib.request import urlretrieve
driver = webdriver.Chrome(executable_path='chromedriver_win32/chromedriver.exe')

# open url in browser
driver.get('http://www.ha.org.hk/haho/ho/bssd/TN_236490_000952a.htm')

# get html file source
html = driver.page_source
soup = BeautifulSoup(html, "lxml")

# extract table
table_body = soup.find('tbody')

# extract all tender links
table_url = soup.find_all('a')

for url in table_url:
    print("Opening url:", url['href'])
    print("Subject matter:", url.getText().strip())
    driver.get(url['href'])

    # get html file source
    html = driver.page_source
    soup = BeautifulSoup(html, "lxml")

    # look for url links which may contain downloadable documents
    doc_urls = soup.find_all('a')

    if doc_urls[0].has_attr('href'):  # some a tags don't have an href, so we skip those
        driver.get(doc_urls[0]['href'])
        tender_document = driver.current_url
        print(doc_urls[0].getText().strip(), '.pdf', sep='')

    # loop through all urls
    for doc_url in doc_urls:
        if doc_url.has_attr('href'):  # some a tags don't have an href, so we skip those
            # open the doc url
            driver.get(doc_url['href'])

            # get the tender pdf file path
            tender_document = driver.current_url

            # download file
            folder_location = 'C:\\Users\\user1\\Desktop\\tender_documents'
            filename = doc_url.getText().strip() + '.pdf'
            print(filename)
            fullfilename = os.path.join(folder_location, filename)
            urlretrieve(tender_document, fullfilename)
Answer 1:
Try requests and BeautifulSoup to download all the documents:
import requests
from bs4 import BeautifulSoup
import re

base_url = "http://www.ha.org.hk"
tender = "T18G014Pc"

with requests.Session() as session:
    r = session.get(f"{base_url}/haho/ho/bssd/{tender}.htm")

    # get all document links
    docs = BeautifulSoup(r.text, "html.parser").select("a[href]")

    for doc in docs:
        href = doc.attrs["href"]
        name = doc.text
        print(f"name: {name}, href: {href}")

        # open document page
        r = session.get(href)

        # extract the real file path from the window.open(...) call on the page
        file_path = re.search(r"(?<=window.open\(')(.*)(?=',)", r.text).group(0)
        file_name = file_path.split("/")[-1]

        # get file and save
        r = session.get(f"{base_url}/{file_path}")
        with open(file_name, 'wb') as f:
            f.write(r.content)
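The key step is the regular expression: each tender document page is assumed to embed the real file path in a window.open('...') call, and the lookbehind/lookahead pair pulls out everything between window.open(' and ',. A small self-contained check of that extraction (the sample string below is made up for illustration):

import re

# made-up snippet mimicking what the document pages are assumed to contain
sample = "<script>window.open('haho/ho/bssd/docs/example.xlsx','_self');</script>"

file_path = re.search(r"(?<=window.open\(')(.*)(?=',)", sample).group(0)
print(file_path)                 # haho/ho/bssd/docs/example.xlsx
print(file_path.split("/")[-1])  # example.xlsx

Because this works on the raw HTML, the same code downloads PDF and Excel attachments alike; no browser download handling is involved.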
Source: https://stackoverflow.com/questions/57566692/how-do-i-control-selenium-pdf-and-excel-files-download-behavior