selenium.common.exceptions.NoSuchElementException: Message: Web element reference not seen before using GeckoDriver Firefox and Selenium with Python

问题

I'm trying to get the data for several different tests from a test prep site. There are different subjects, each of which has a specialization, each of which has a practice-test, each of which has several questions.

subject <--- specialization <---- practice-test *------ question

Here's my code:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pathlib
import time
import json
import os

driver=webdriver.Firefox(executable_path="../../../geckodriver.exe")
wait = WebDriverWait(driver, 15)
data=[]

def setup():

   driver.get('https://www.varsitytutors.com/practice-tests')
   try:
      go_away_1= driver.find_element_by_class_name("ub-emb-iframe")
      driver.execute_script("arguments[0].style.visibility='hidden'", go_away_1)
      go_away_2= driver.find_element_by_class_name("ub-emb-iframe-wrapper")
      driver.execute_script("arguments[0].style.visibility='hidden'", go_away_2)
      go_away_3= driver.find_element_by_class_name("ub-emb-visible")
      driver.execute_script("arguments[0].style.visibility='hidden'", go_away_3)
   except:
      pass

def get_subjects(subs=[]):
   subject_clickables_xpath="/html/body/div[3]/div[9]/div/*/div[@data-subject]/div[1]"
   subject_clickables=driver.find_elements_by_xpath(subject_clickables_xpath)
   subject_names=map(lambda x : x.find_element_by_xpath('..').get_attribute('data-subject'), subject_clickables)
   subject_pairs=zip(subject_names, subject_clickables)
   return subject_pairs

def get_specializations(subject):

   specialization_clickables_xpath="//div//div[@data-subject='"+subject+"']/following-sibling::div//div[@class='public_problem_set']//a[contains(.,'Practice Tests')]"
   specialization_names_xpath="//div//div[@data-subject='"+subject+"']/following-sibling::div//div[@class='public_problem_set']//a[contains(.,'Practice Tests')]/../.."
   specialization_names=map(lambda x : x.get_attribute('data-subject'), driver.find_elements_by_xpath(specialization_names_xpath))
   specialization_clickables = driver.find_elements_by_xpath(specialization_clickables_xpath)
   specialization_pairs=zip(specialization_names, specialization_clickables)
   return specialization_pairs

def get_practices(subject, specialization):
   practice_clickables_xpath="/html/body/div[3]/div[8]/div[3]/*/div[1]/a[1]"
   practice_names_xpath="//*/h3[@class='subject_header']"
   lengths_xpath="/html/body/div[3]/div[8]/div[3]/*/div[2]"
   lengths=map(lambda x : x.text, driver.find_elements_by_xpath(lengths_xpath))
   print(lengths)
   practice_names=map(lambda x : x.text, driver.find_elements_by_xpath(practice_names_xpath))
   practice_clickables = driver.find_elements_by_xpath(practice_clickables_xpath)
   practice_pairs=zip(practice_names, practice_clickables)
   return practice_pairs

def remove_popup():
   try:

      button=wait.until(EC.element_to_be_clickable((By.XPATH,"//button[contains(.,'No Thanks')]")))
      button.location_once_scrolled_into_view
      button.click()
   except:
      print('could not find the popup')

def get_questions(subject, specialization, practice):
   remove_popup()
   questions=[]
   current_question=None
   while True:
      question={}
      try:
         WebDriverWait(driver,5).until(EC.presence_of_element_located((By.XPATH,"/html/body/div[3]/div[7]/div[1]/div[2]/div[2]/table/tbody/tr/td[1]")))
         question_number=driver.find_element_by_xpath('/html/body/div[3]/div[7]/div[1]/div[2]/div[2]/table/tbody/tr/td[1]').text.replace('.','')
         question_pre=driver.find_element_by_class_name('question_pre')
         question_body=driver.find_element_by_xpath('/html/body/div[3]/div[7]/div[1]/div[2]/div[2]/table/tbody/tr/td[2]/p')
         answer_choices=driver.find_elements_by_class_name('question_row')
         answers=map(lambda x : x.text, answer_choices)
         question['id']=question_number
         question['pre']=question_pre.text
         question['body']=question_body.text
         question['answers']=list(answers)
         questions.append(question)
         choice=WebDriverWait(driver,20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,"input.test_button")))
         driver.execute_script("arguments[0].click();", choice[3])
         time.sleep(3)
      except Exception as e:
         if 'results' in driver.current_url:
            driver.get(driver.current_url.replace('http://', 'https://'))
            # last question has been answered; record results
            remove_popup()
            pathlib.Path('data/'+subject+'/'+specialization).mkdir(parents=True, exist_ok=True)
            with open('data/'+subject+'/'+specialization+'/questions.json', 'w') as outfile:
               json.dump(list(questions), outfile)
               break
         else:
            driver.get(driver.current_url.replace('http://', 'https://'))
   return questions


def scrape():
   setup()
   subjects=get_subjects()
   for subject_name, subject_clickable in subjects:
      subject={}
      subject['name']=subject_name
      subject['specializations']=[]
      subject_clickable.click()
      subject_url=driver.current_url.replace('http://', 'https://')
      specializations=get_specializations(subject_name)
      for specialization_name, specialization_clickable in specializations:
         specialization={}
         specialization['name']=specialization_name
         specialization['practices']=[]
         specialization_clickable.click()
         specialization_url=driver.current_url.replace('http://', 'https://')
         practices=get_practices(subject_name, specialization_name)
         for practice_name, practice_clickable in practices:
            practice={}
            practice['name']=practice_name
            practice_clickable.click()
            questions=get_questions(subject_name, specialization_name, practice_name)
            practice['questions']=questions
            driver.get(specialization_url)
         driver.get(subject_url)
      data.append(subject)
   print(data)
scrape()

Running this produces the error message:

Traceback (most recent call last):
  File "scrape.py", line 141, in <module>
    scrape()
  File "scrape.py", line 126, in scrape
    for practice_name, practice_clickable in practices:
  File "scrape.py", line 49, in <lambda>
    practice_names=map(lambda x : x.text, driver.find_elements_by_xpath(practice_names_xpath))
  File "C:\Users\Joseph\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\webelement.py", line 76, in text
    return self._execute(Command.GET_ELEMENT_TEXT)['value']
  File "C:\Users\Joseph\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\webelement.py", line 628, in _execute
    return self._parent.execute(command, params)
  File "C:\Users\Joseph\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 312, in execute
    self.error_handler.check_response(response)
  File "C:\Users\Joseph\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 237, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.NoSuchElementException: Message: Web element reference not seen before: 980e5c29-e3af-4b13-979f-0f2bb58b3480

After getting the questions from one practice-test, the driver needs to return to specialization page where the next practice-test can be found. Hence these lines (where the problem is):

for practice_name, practice_clickable in practices:
            practice={}
            practice['name']=practice_name
            practice_clickable.click()
            questions=get_questions(subject_name, specialization_name, practice_name)
            practice['questions']=questions
            driver.get(specialization_url)

Apparently, the next practice clickable is no longer found on the page. Why not?

Also, I'm not sure if this is worthy of its own question, but I couldn't get this program to work on my Ubuntu computer -- the get_questions function stops on the last question and won't go to the results page.

Here's a tentative suggestion I'm trying by Guy:

def scrape():
   setup()
   subjects=get_subjects()
   for subject_name, subject_clickable in subjects:
      subject={}
      subject['name']=subject_name
      subject['specializations']=[]
      subject_clickable.click()
      subject_url=driver.current_url.replace('http://', 'https://')
      specializations=get_specializations(subject_name)
      for specialization_name, specialization_clickable in specializations:
         specialization={}
         specialization['name']=specialization_name
         specialization['practices']=[]
         specialization_clickable.click()
         specialization_url=driver.current_url.replace('http://', 'https://')
         practices=get_practices(subject_name, specialization_name)

         practices_len = len(list(get_practices(subject_name, specialization_name)))
         for i in range(practices_len):
            practices_list = list(get_practices(subject_name, specialization_name))
            practice = {}
            practice['name'] = practices_list[i][0]
            practices_list[i][1].click()
#         for practice_name, practice_clickable in practices:
#            practice={}
#            practice['name']=practice_name
#            practice_clickable.click()
#            questions=get_questions(subject_name, specialization_name, practice_name)
#            practice['questions']=questions
            driver.get(specialization_url)
         driver.get(subject_url)
      data.append(subject)
   print(data)
scrape()

Edit: As suggested by Hubert, I tried the following out:

 practices = get_practices(subject_name, specialization_name)
 practices = [item[0] for item in practices]
 for index, practice_name in enumerate(practices):
    practice={}
    practice['name'] = practice_name
    practice_row = driver.find_element_by_xpath('//*[text()="'+practice_name+'"]/..')
    practice_clickable_n = practice_row.find_element_by_link_text('Begin')
    print('old:', practice_clickable[index])
    print('new:', practice_clickable_n)
    practice_clickable_n.click()
    questions=get_questions(subject_name, specialization_name, practice_name)

And this was the result:

<map object at 0x7fabc0129860>
<map object at 0x7fabc0129898>
Traceback (most recent call last):
  File "scrape.py", line 140, in <module>
    scrape()
  File "scrape.py", line 131, in scrape
    print('old:', practice_clickable[index])
IndexError: list index out of range

回答1:

This error message...

selenium.common.exceptions.NoSuchElementException: Message: Web element reference not seen before: 980e5c29-e3af-4b13-979f-0f2bb58b3480

...implies that the GeckoDriver was unable identify the WebElement.

This error is coming out from get(webEl, win) within the Marionette source code:

get(webEl, win) {
  if (!(webEl instanceof WebElement)) {
    throw new TypeError(pprint`Expected web element, got: ${webEl}`);
  }
  if (!this.has(webEl)) {
    throw new NoSuchElementError(
      "Web element reference not seen before: " + webEl.uuid
    );
  }

@fc's comment in the discussion 'Element reference not seen before: undefined' using geckodriver, waitForElementVisible fails explains the actual issue:

However, the core issue was discussed in Intermittent test_navigation.py TestRefresh.test_basic | NoSuchElementException: Failed to trigger opening a new tab: Web element reference not seen before and was subsequently and was addressed through the changeset

Solution

Using the latest version of the binaries will solve the issue in terms of:

Selenium v3.141.59
GeckoDriver v0.26.0
Firefox v70.0

回答2:

The problem is the iteration over practices. It holds WebElements, but thier reference is lost when you are navigating to a new page, even if it actually the same page Selenium treats it as a new one.

You can solve it by iterating by index. To do it with zip you can do something like this

practices_len = len(list(get_practices(subject_name, specialization_name)))
for i in range(practices_len):
    practices_list = list(get_practices(subject_name, specialization_name))
    practice = {}
    practice['name'] = practices_list[i][0]
    practices_list[i][1].click()

回答3:

Guy is right. The next time you load the specialization_url it is a new page with new elements but practices contains the web elements of the old page as web elements.

To only change the part where it happens the code below first creates a list of the practices and practice_clickables. It then searches for a fresh clickable whenever it returns to the new specialization_url page and prints the ID of the old and the current practice_clickable. With that it is clearly visible that the element in the same row now is a different one than it was the first time the page was loaded.

In addition the map or zip function seems to create a generator so that even the iteration over the practices fails because in that step webdriver code is executed on old objects. That is why I first create lists and iterate over a list.

Changed snippet:

practices = get_practices(subject_name, specialization_name)
practice_clickable = [item[1] for item in practices]
practices = get_practices(subject_name, specialization_name)
practices = [item[0] for item in practices]
for index, practice_name in enumerate(practices):
    practice={}
    practice['name'] = practice_name
    practice_row = driver.find_element_by_xpath(f'//*[text()="{practice_name}"]/..')
    practice_clickable_n = practice_row.find_element_by_link_text('Begin')
    print('old:', practice_clickable[index])
    print('new:', practice_clickable_n)
    practice_clickable_n.click()
    questions=get_questions(subject_name, specialization_name, practice_name)

Complete scrape function:

def scrape():
    setup()
    subjects=get_subjects()
    for subject_name, subject_clickable in subjects:
        subject={}
        subject['name']=subject_name
        subject['specializations']=[]
        subject_clickable.click()
        if ('http://') in driver.current_url:
            subject_url=driver.current_url.replace('http://', 'https://')
        else:
            subject_url=driver.current_url
        specializations=get_specializations(subject_name)
        for specialization_name, specialization_clickable in specializations:
            specialization={}
            specialization['name']=specialization_name
            specialization['practices']=[]
            specialization_clickable.click()
            if 'http://' in driver.current_url:
                specialization_url=driver.current_url.replace('http://', 'https://')
            else:
                specialization_url=driver.current_url
            practices = get_practices(subject_name, specialization_name)
            practice_clickable = [item[1] for item in practices]
            practices = get_practices(subject_name, specialization_name)
            practices = [item[0] for item in practices]
            for index, practice_name in enumerate(practices):
                practice={}
                practice['name'] = practice_name
                practice_row = driver.find_element_by_xpath(f'//*[text()="{practice_name}"]/..')
                practice_clickable_n = practice_row.find_element_by_link_text('Begin')
                print('old:', practice_clickable[index])
                print('new:', practice_clickable_n)
                practice_clickable_n.click()
                questions=get_questions(subject_name, specialization_name, practice_name)
                practice['questions']=questions
                driver.get(specialization_url)
            driver.get(subject_url)
        data.append(subject)
    print(data)

来源：https://stackoverflow.com/questions/59070019/selenium-common-exceptions-nosuchelementexception-message-web-element-referenc

标签

python

selenium

firefox

geckodriver

selenium-firefoxdriver