Optimizing python web scraping script with Selenium


Question


I'm having an issue with my Selenium web-scraping script. Normally, the script runs smoothly.

However, I often get the following error inside the for loop (I believe the script runs too fast, before the elements become visible):

NoSuchElementException                    Traceback (most recent call last)
<ipython-input-6-470748a6674f> in <module>
     66                 item_brand.append(driver.find_element_by_xpath('.//*[@id="brand"]/a/span/bdi').get_attribute('textContent'))
     67                 item_prices.append(driver.find_element_by_css_selector('[id="price"]').text)
---> 68                 item_names.append(driver1.find_element_by_css_selector('[class="nav-product-link-text"] span').text)
     69                 total_rate.append(driver1.find_element_by_class_name('css-i36p8g').text)
     70                 review_contents.append(containers.find_element_by_class_name('review-text').text)
......

"NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"[class="nav-product-link-text"] span"}"

I added driver.implicitly_wait(3) inside the for loop so the script would wait until the elements are visible, but it didn't work.
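If I understand the Selenium docs correctly, implicitly_wait() only sets a session-wide default, so calling it inside the loop adds nothing; an explicit wait on the failing selector would look something like this (a sketch, not yet in my script; the selector is the one from the traceback):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for the element from the traceback to appear
wait1 = WebDriverWait(driver1, 10)
item_name = wait1.until(
    EC.presence_of_element_located(
        (By.CSS_SELECTOR, '[class="nav-product-link-text"] span')
    )
).text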

Please check my script below:

import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome(chrome_path)   # chrome_path = path to the chromedriver executable
driver1 = webdriver.Chrome(chrome_path)  # second browser instance for the review pages

# Create lists for the dataframe:
item_names = []
item_description = []
item_brand = []
review_titles = []
review_contents = []
product_helpful = []
product_not_helpful = []
member_rating = []
total_rate = []
item_prices = []
item_images = []

URL = "https://ca.iherb.com/c/Vitamins?sr=2&noi=48&p="


for n in range(1, 2):
    driver.get(f"{URL}{n}")  # adjust the range to scrape more listing pages
    wait = WebDriverWait(driver, 10)

    # Store all the links in a list
    item_links = [item.get_attribute("href") for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,".absolute-link-wrapper > a.product-link")))]

    # Iterate over the links
    for item_link in item_links:
        driver.get(item_link)
    
        # Locate the `View All Reviews` link and grab its href
        all_reviews_link = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "span.all-reviews-link > a")))
        time.sleep(2)

        all_reviews_url = all_reviews_link.get_attribute("href")

        MAX_PAGE_NUM = 60  # scrape at most 60 pages of the review section

        for i in range(1, MAX_PAGE_NUM + 1):
            url = f"{all_reviews_url}?&p={i}"
            print(url)
            driver1.get(url)
            review_containers = driver1.find_elements_by_class_name('review-row')

            for containers in review_containers:
                driver.implicitly_wait(3)  # wait for the browser to see the page elements
                elements = ', '.join([item.text for item in driver.find_elements_by_css_selector("[itemprop='description'] > ul:nth-of-type(1) > li")])
                item_description.append(elements)
                item_images.append(driver.find_element_by_xpath('//*[@id="product-image"]/div[1]/a').get_attribute('href'))
                item_brand.append(driver.find_element_by_xpath('.//*[@id="brand"]/a/span/bdi').get_attribute('textContent'))
                item_prices.append(driver.find_element_by_css_selector('[id="price"]').text)
                item_names.append(driver1.find_element_by_css_selector('[class="nav-product-link-text"] span').text)
                total_rate.append(driver1.find_element_by_class_name('css-i36p8g').text)       
                review_contents.append(containers.find_element_by_class_name('review-text').text)
                product_helpful.append(containers.find_element_by_css_selector('[title="Helpful"] span').text)
                product_not_helpful.append(containers.find_element_by_css_selector('[title="Unhelpful"] span').text)
                # Count the filled (non-transparent) stars to derive the member's rating
                stars = containers.find_elements_by_class_name("css-172co2l")
                rating = 0
                for star in stars:
                    star_color = star.find_element_by_tag_name("path").get_attribute("fill")
                    if star_color != "transparent":
                        rating += 1
                member_rating.append(rating)

            time.sleep(2) # Slow the script down

driver.quit()
driver1.quit()
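I also considered wrapping each lookup in a small helper so that one missing element does not abort the whole loop. This is a hypothetical helper (safe_text is my own name for it, not part of the script above):

from selenium.common.exceptions import NoSuchElementException

def safe_text(context, by, selector, default=""):
    # Return the element's text, or `default` if the element is missing
    try:
        return context.find_element(by, selector).text
    except NoSuchElementException:
        return default

# Usage, e.g.: total_rate.append(safe_text(driver1, By.CLASS_NAME, 'css-i36p8g'))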

Please help me look into this issue. I really appreciate it.

Source: https://stackoverflow.com/questions/66073388/optimizing-python-web-scraping-script-with-selenium
