问题
I'm trying to get the data for several different tests from a test prep site. There are different subjects, each of which has a specialization, each of which has a practice-test, each of which has several questions.
subject <--- specialization <---- practice-test *------ question
Here's my code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pathlib
import time
import json
import os
driver=webdriver.Firefox(executable_path="../../../geckodriver.exe")
wait = WebDriverWait(driver, 15)
data=[]
def setup():
driver.get('https://www.varsitytutors.com/practice-tests')
try:
go_away_1= driver.find_element_by_class_name("ub-emb-iframe")
driver.execute_script("arguments[0].style.visibility='hidden'", go_away_1)
go_away_2= driver.find_element_by_class_name("ub-emb-iframe-wrapper")
driver.execute_script("arguments[0].style.visibility='hidden'", go_away_2)
go_away_3= driver.find_element_by_class_name("ub-emb-visible")
driver.execute_script("arguments[0].style.visibility='hidden'", go_away_3)
except:
pass
def get_subjects(subs=[]):
subject_clickables_xpath="/html/body/div[3]/div[9]/div/*/div[@data-subject]/div[1]"
subject_clickables=driver.find_elements_by_xpath(subject_clickables_xpath)
subject_names=map(lambda x : x.find_element_by_xpath('..').get_attribute('data-subject'), subject_clickables)
subject_pairs=zip(subject_names, subject_clickables)
return subject_pairs
def get_specializations(subject):
specialization_clickables_xpath="//div//div[@data-subject='"+subject+"']/following-sibling::div//div[@class='public_problem_set']//a[contains(.,'Practice Tests')]"
specialization_names_xpath="//div//div[@data-subject='"+subject+"']/following-sibling::div//div[@class='public_problem_set']//a[contains(.,'Practice Tests')]/../.."
specialization_names=map(lambda x : x.get_attribute('data-subject'), driver.find_elements_by_xpath(specialization_names_xpath))
specialization_clickables = driver.find_elements_by_xpath(specialization_clickables_xpath)
specialization_pairs=zip(specialization_names, specialization_clickables)
return specialization_pairs
def get_practices(subject, specialization):
practice_clickables_xpath="/html/body/div[3]/div[8]/div[3]/*/div[1]/a[1]"
practice_names_xpath="//*/h3[@class='subject_header']"
lengths_xpath="/html/body/div[3]/div[8]/div[3]/*/div[2]"
lengths=map(lambda x : x.text, driver.find_elements_by_xpath(lengths_xpath))
print(lengths)
practice_names=map(lambda x : x.text, driver.find_elements_by_xpath(practice_names_xpath))
practice_clickables = driver.find_elements_by_xpath(practice_clickables_xpath)
practice_pairs=zip(practice_names, practice_clickables)
return practice_pairs
def remove_popup():
try:
button=wait.until(EC.element_to_be_clickable((By.XPATH,"//button[contains(.,'No Thanks')]")))
button.location_once_scrolled_into_view
button.click()
except:
print('could not find the popup')
def get_questions(subject, specialization, practice):
remove_popup()
questions=[]
current_question=None
while True:
question={}
try:
WebDriverWait(driver,5).until(EC.presence_of_element_located((By.XPATH,"/html/body/div[3]/div[7]/div[1]/div[2]/div[2]/table/tbody/tr/td[1]")))
question_number=driver.find_element_by_xpath('/html/body/div[3]/div[7]/div[1]/div[2]/div[2]/table/tbody/tr/td[1]').text.replace('.','')
question_pre=driver.find_element_by_class_name('question_pre')
question_body=driver.find_element_by_xpath('/html/body/div[3]/div[7]/div[1]/div[2]/div[2]/table/tbody/tr/td[2]/p')
answer_choices=driver.find_elements_by_class_name('question_row')
answers=map(lambda x : x.text, answer_choices)
question['id']=question_number
question['pre']=question_pre.text
question['body']=question_body.text
question['answers']=list(answers)
questions.append(question)
choice=WebDriverWait(driver,20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,"input.test_button")))
driver.execute_script("arguments[0].click();", choice[3])
time.sleep(3)
except Exception as e:
if 'results' in driver.current_url:
driver.get(driver.current_url.replace('http://', 'https://'))
# last question has been answered; record results
remove_popup()
pathlib.Path('data/'+subject+'/'+specialization).mkdir(parents=True, exist_ok=True)
with open('data/'+subject+'/'+specialization+'/questions.json', 'w') as outfile:
json.dump(list(questions), outfile)
break
else:
driver.get(driver.current_url.replace('http://', 'https://'))
return questions
def scrape():
setup()
subjects=get_subjects()
for subject_name, subject_clickable in subjects:
subject={}
subject['name']=subject_name
subject['specializations']=[]
subject_clickable.click()
subject_url=driver.current_url.replace('http://', 'https://')
specializations=get_specializations(subject_name)
for specialization_name, specialization_clickable in specializations:
specialization={}
specialization['name']=specialization_name
specialization['practices']=[]
specialization_clickable.click()
specialization_url=driver.current_url.replace('http://', 'https://')
practices=get_practices(subject_name, specialization_name)
for practice_name, practice_clickable in practices:
practice={}
practice['name']=practice_name
practice_clickable.click()
questions=get_questions(subject_name, specialization_name, practice_name)
practice['questions']=questions
driver.get(specialization_url)
driver.get(subject_url)
data.append(subject)
print(data)
scrape()
Running this produces the error message:
Traceback (most recent call last):
File "scrape.py", line 141, in <module>
scrape()
File "scrape.py", line 126, in scrape
for practice_name, practice_clickable in practices:
File "scrape.py", line 49, in <lambda>
practice_names=map(lambda x : x.text, driver.find_elements_by_xpath(practice_names_xpath))
File "C:\Users\Joseph\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\webelement.py", line 76, in text
return self._execute(Command.GET_ELEMENT_TEXT)['value']
File "C:\Users\Joseph\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\webelement.py", line 628, in _execute
return self._parent.execute(command, params)
File "C:\Users\Joseph\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 312, in execute
self.error_handler.check_response(response)
File "C:\Users\Joseph\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 237, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.NoSuchElementException: Message: Web element reference not seen before: 980e5c29-e3af-4b13-979f-0f2bb58b3480
After getting the questions from one practice-test, the driver needs to return to specialization page where the next practice-test can be found. Hence these lines (where the problem is):
for practice_name, practice_clickable in practices:
practice={}
practice['name']=practice_name
practice_clickable.click()
questions=get_questions(subject_name, specialization_name, practice_name)
practice['questions']=questions
driver.get(specialization_url)
Apparently, the next practice clickable is no longer found on the page. Why not?
Also, I'm not sure if this is worthy of its own question, but I couldn't get this program to work on my Ubuntu computer -- the get_questions
function stops on the last question and won't go to the results page.
Here's a tentative suggestion I'm trying by Guy:
def scrape():
setup()
subjects=get_subjects()
for subject_name, subject_clickable in subjects:
subject={}
subject['name']=subject_name
subject['specializations']=[]
subject_clickable.click()
subject_url=driver.current_url.replace('http://', 'https://')
specializations=get_specializations(subject_name)
for specialization_name, specialization_clickable in specializations:
specialization={}
specialization['name']=specialization_name
specialization['practices']=[]
specialization_clickable.click()
specialization_url=driver.current_url.replace('http://', 'https://')
practices=get_practices(subject_name, specialization_name)
practices_len = len(list(get_practices(subject_name, specialization_name)))
for i in range(practices_len):
practices_list = list(get_practices(subject_name, specialization_name))
practice = {}
practice['name'] = practices_list[i][0]
practices_list[i][1].click()
# for practice_name, practice_clickable in practices:
# practice={}
# practice['name']=practice_name
# practice_clickable.click()
# questions=get_questions(subject_name, specialization_name, practice_name)
# practice['questions']=questions
driver.get(specialization_url)
driver.get(subject_url)
data.append(subject)
print(data)
scrape()
Edit: As suggested by Hubert, I tried the following out:
practices = get_practices(subject_name, specialization_name)
practices = [item[0] for item in practices]
for index, practice_name in enumerate(practices):
practice={}
practice['name'] = practice_name
practice_row = driver.find_element_by_xpath('//*[text()="'+practice_name+'"]/..')
practice_clickable_n = practice_row.find_element_by_link_text('Begin')
print('old:', practice_clickable[index])
print('new:', practice_clickable_n)
practice_clickable_n.click()
questions=get_questions(subject_name, specialization_name, practice_name)
And this was the result:
<map object at 0x7fabc0129860>
<map object at 0x7fabc0129898>
Traceback (most recent call last):
File "scrape.py", line 140, in <module>
scrape()
File "scrape.py", line 131, in scrape
print('old:', practice_clickable[index])
IndexError: list index out of range
回答1:
This error message...
selenium.common.exceptions.NoSuchElementException: Message: Web element reference not seen before: 980e5c29-e3af-4b13-979f-0f2bb58b3480
...implies that the GeckoDriver was unable identify the WebElement.
This error is coming out from get(webEl, win) within the Marionette source code:
get(webEl, win) {
if (!(webEl instanceof WebElement)) {
throw new TypeError(pprint`Expected web element, got: ${webEl}`);
}
if (!this.has(webEl)) {
throw new NoSuchElementError(
"Web element reference not seen before: " + webEl.uuid
);
}
@fc's comment in the discussion 'Element reference not seen before: undefined' using geckodriver, waitForElementVisible fails explains the actual issue:
However, the core issue was discussed in Intermittent test_navigation.py TestRefresh.test_basic | NoSuchElementException: Failed to trigger opening a new tab: Web element reference not seen before and was subsequently and was addressed through the changeset
Solution
Using the latest version of the binaries will solve the issue in terms of:
- Selenium v3.141.59
- GeckoDriver v0.26.0
- Firefox v70.0
回答2:
The problem is the iteration over practices
. It holds WebElement
s, but thier reference is lost when you are navigating to a new page, even if it actually the same page Selenium treats it as a new one.
You can solve it by iterating by index. To do it with zip
you can do something like this
practices_len = len(list(get_practices(subject_name, specialization_name)))
for i in range(practices_len):
practices_list = list(get_practices(subject_name, specialization_name))
practice = {}
practice['name'] = practices_list[i][0]
practices_list[i][1].click()
回答3:
Guy is right. The next time you load the specialization_url it is a new page with new elements but practices contains the web elements of the old page as web elements.
To only change the part where it happens the code below first creates a list of the practices and practice_clickables. It then searches for a fresh clickable whenever it returns to the new specialization_url page and prints the ID of the old and the current practice_clickable. With that it is clearly visible that the element in the same row now is a different one than it was the first time the page was loaded.
In addition the map or zip function seems to create a generator so that even the iteration over the practices fails because in that step webdriver code is executed on old objects. That is why I first create lists and iterate over a list.
Changed snippet:
practices = get_practices(subject_name, specialization_name)
practice_clickable = [item[1] for item in practices]
practices = get_practices(subject_name, specialization_name)
practices = [item[0] for item in practices]
for index, practice_name in enumerate(practices):
practice={}
practice['name'] = practice_name
practice_row = driver.find_element_by_xpath(f'//*[text()="{practice_name}"]/..')
practice_clickable_n = practice_row.find_element_by_link_text('Begin')
print('old:', practice_clickable[index])
print('new:', practice_clickable_n)
practice_clickable_n.click()
questions=get_questions(subject_name, specialization_name, practice_name)
Complete scrape function:
def scrape():
setup()
subjects=get_subjects()
for subject_name, subject_clickable in subjects:
subject={}
subject['name']=subject_name
subject['specializations']=[]
subject_clickable.click()
if ('http://') in driver.current_url:
subject_url=driver.current_url.replace('http://', 'https://')
else:
subject_url=driver.current_url
specializations=get_specializations(subject_name)
for specialization_name, specialization_clickable in specializations:
specialization={}
specialization['name']=specialization_name
specialization['practices']=[]
specialization_clickable.click()
if 'http://' in driver.current_url:
specialization_url=driver.current_url.replace('http://', 'https://')
else:
specialization_url=driver.current_url
practices = get_practices(subject_name, specialization_name)
practice_clickable = [item[1] for item in practices]
practices = get_practices(subject_name, specialization_name)
practices = [item[0] for item in practices]
for index, practice_name in enumerate(practices):
practice={}
practice['name'] = practice_name
practice_row = driver.find_element_by_xpath(f'//*[text()="{practice_name}"]/..')
practice_clickable_n = practice_row.find_element_by_link_text('Begin')
print('old:', practice_clickable[index])
print('new:', practice_clickable_n)
practice_clickable_n.click()
questions=get_questions(subject_name, specialization_name, practice_name)
practice['questions']=questions
driver.get(specialization_url)
driver.get(subject_url)
data.append(subject)
print(data)
来源:https://stackoverflow.com/questions/59070019/selenium-common-exceptions-nosuchelementexception-message-web-element-referenc