Headless endless scroll selenium

前端 未结 1 1115
情书的邮戳
情书的邮戳 2021-02-15 12:30
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from s         


        
1条回答
  •  独厮守ぢ
    2021-02-15 12:32

    Here is a set of things that made it work for me in headless mode:

    • switch to PhantomJS
    • pretend to be a different browser by setting a custom User-Agent string
    • before scrolling into view of the last tweet, scroll to the top of the page (several times to increase reliability)

    The code:

    import time
    
    def return_html_code(url):
        """Load *url* in headless PhantomJS, scroll until no more tweets load, and return the HTML.

        The page is scrolled repeatedly to the last ``li[data-item-id]`` element
        so that the endless-scroll loader keeps fetching more items. Scrolling
        stops when the wait for additional items times out, at which point the
        page source is captured.

        Args:
            url: Address of the page to open.

        Returns:
            The page source (``driver.page_source``) after scrolling finished.

        Raises:
            TimeoutException: If no ``li[data-item-id]`` element becomes visible
                within the initial 30 second wait.

        Note:
            ``wait_for_more_than_n_elements_to_be_present`` is not defined in
            this snippet; it is presumably a custom wait condition that
            succeeds once more than ``n`` elements match the locator -- confirm
            against its definition.
        """
        tweet_locator = (By.CSS_SELECTOR, "li[data-item-id]")

        dcap = dict(webdriver.DesiredCapabilities.PHANTOMJS)
        # pretend to be a regular desktop browser
        dcap["phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36"

        driver = webdriver.PhantomJS(desired_capabilities=dcap)
        try:
            driver.maximize_window()

            driver.get(url)

            # initial wait for the tweets to load
            wait = WebDriverWait(driver, 30)
            wait.until(EC.visibility_of_element_located(tweet_locator))

            # scroll down to the last tweet until no more tweets are loaded
            while True:
                tweets = driver.find_elements(*tweet_locator)
                number_of_tweets = len(tweets)
                print(number_of_tweets)

                # nothing to scroll to (the items vanished between queries)
                if not tweets:
                    break

                # move to the top and then to the bottom 5 times in a row
                for _ in range(5):
                    driver.execute_script("window.scrollTo(0, 0)")
                    driver.execute_script("arguments[0].scrollIntoView(true);", tweets[-1])
                    time.sleep(0.5)

                try:
                    wait.until(wait_for_more_than_n_elements_to_be_present(tweet_locator, number_of_tweets))
                except TimeoutException:
                    # no new tweets appeared within the timeout: end of the feed
                    break

            # capture the HTML before the browser is shut down
            return driver.page_source
        finally:
            # always release the PhantomJS process, even when an error occurred
            driver.quit()
    

    0 讨论(0)
提交回复
热议问题