Python Selenium Get All “href” attributes

前端 未结 2 1492
名媛妹妹
名媛妹妹 2020-12-22 10:46

How can I get all the "href" attributes for the "h2" titles on this page?

相关标签:
2条回答
  • 2020-12-22 11:03

    Here is code that gets all books from all pages:

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    
    # Collect the link of every book in the "python" search results by walking
    # through all result pages with a Chrome WebDriver, then print one URL per line.
    driver = webdriver.Chrome()
    try:
        baseUrl = "http://www.allitebooks.com/page/1/?s=python"
        driver.get(baseUrl)

        # Optional explicit wait for the result list; enable it if the links
        # are not yet present when driver.get() returns.
        # wait = WebDriverWait(driver, 5)
        # wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".search-result-list li")))

        # Get last page number (assumes the text of the last pagination link
        # is the total page count).
        lastPage = int(driver.find_element(By.CSS_SELECTOR, ".pagination a:last-child").text)

        # Get all HREFs for the first page and save them in hrefs list
        js = 'return [...document.querySelectorAll(".entry-title a")].map(e=>e.href)'
        hrefs = driver.execute_script(js)

        # Iterate through the remaining pages and get all HREFs of books.
        # range() excludes its stop value, so use lastPage + 1 to make sure
        # the last page itself is visited.
        for i in range(2, lastPage + 1):
            driver.get("http://www.allitebooks.com/page/" + str(i) + "/?s=python")
            hrefs.extend(driver.execute_script(js))
    finally:
        # Always shut the browser down, even if scraping failed part-way.
        driver.quit()

    for href in hrefs:
        print(href)
    
    0 讨论(0)
  • 2020-12-22 11:11

    Selenium might be overkill for what you need; good old BeautifulSoup will do the trick as well.

    import urllib.request, bs4
    # Fetch the first page of the "python" search results and print the href of
    # every link found inside an <h2 class="entry-title"> element.
    request = urllib.request.Request(
        "http://www.allitebooks.com/page/1/?s=python",
        # An explicit User-Agent is sent instead of urllib's default
        # (presumably the site rejects the default one -- verify).
        headers={"User-Agent": "Mozilla"},
    )
    # Use the response as a context manager so the connection is closed promptly.
    with urllib.request.urlopen(request) as response:
        body = response.read().decode("utf-8")
    # Name the parser explicitly: without it BeautifulSoup warns and picks
    # whichever parser happens to be installed, so results can vary by machine.
    soup = bs4.BeautifulSoup(body, "html.parser")
    for element in soup.find_all("h2", class_="entry-title"):
        for link in element.find_all("a"):
            print(link.get("href"))
    
    0 讨论(0)
提交回复
热议问题