Question
I'm doing web scraping with Python and Selenium. I used to scrape data for one location and year at a time by creating 1800 .py files (600 places * 3 years = 1800), opening them in batches of 10, and waiting for each batch to complete. That was time-consuming, so I decided to use multiprocessing.
I changed my code to read the place data from a text file and iterate over it. The text file looks like this:
Aandimadam
Aathur_Dindugal
Aathur_Salem East
Abiramam
Acchirapakkam
Adayar
Adhiramapattinam
Alandur
Alanganallur
My code without multiprocessing works fine: it loops over register numbers and extracts data from the results table to a CSV. Here it is, with a single place name hard-coded:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from multiprocessing import Pool
from distributed import Client
import os, glob, codecs, csv, re, time, numpy

options = Options()
options.headless = False
options.add_argument("--window-size=1920,1200")

print('loading...')

def myScraper(loc):
    print(" running function...")
    ########################### SETUP ###########################
    startNo = 1     # <
    location = loc  # < ENTER HERE
    year = 2015     # <
    ########################### SETUP ###########################
    fileName = f'RECORDS_HINMAR_{location}_{year}.csv'
    driver = webdriver.Chrome(options=options, executable_path='/chromedriver')
    driver.get('https://tnreginet.gov.in/portal/')
    time.sleep(5)
    print('navigating to form')
    driver.find_element_by_xpath('//*[@id="fontSelection"]').click()  # switch site to English
    # navigate to the form via the hover menu
    more = driver.find_element_by_xpath('//*[@id="1195002"]/a')
    search1 = driver.find_element_by_xpath('//*[@id="8500020"]/a')
    hovmarr = driver.find_element_by_xpath('//*[@id="90000403"]/a')
    Hover = ActionChains(driver).move_to_element(more).move_to_element(search1).move_to_element(hovmarr)
    Hover.click().perform()
    time.sleep(3)
    try:
        for x in range(startNo, 2000):  # loop over each reg number
            print('__________START__________')
            # fill the form
            driver.find_element_by_xpath('//*[@id="cmb_marrType"]').send_keys("Hindu Marriage")
            searchBy = driver.find_element_by_xpath('//*[@id="Search_Criteria_Two"]')
            Hover = ActionChains(driver).move_to_element(searchBy)
            Hover.click().perform()
            driver.find_element_by_xpath('//*[@id="cmb_sub_registrar_office"]').send_keys(location)
            driver.find_element_by_xpath('//*[@id="RegNO1"]').send_keys(x)
            driver.find_element_by_xpath('//*[@id="Year"]').send_keys(year)
            submit = driver.find_element_by_xpath('//*[@id="CopyOfMarriageSearch"]/div[2]/div/div[18]/input')
            Hover = ActionChains(driver).move_to_element(submit)
            Hover.click().perform()  # click submit
            print('Loading reg no: ', x)
            time.sleep(6)
            # EXTRACT DATA FROM TABLE
            print('Extracting data')
            Res_re = driver.find_element_by_xpath('//*[@id="MarriageMstListDisp"]/tbody/tr/td[1]').text
            Res_hus = driver.find_element_by_xpath('//*[@id="MarriageMstListDisp"]/tbody/tr/td[2]').text
            Res_wife = driver.find_element_by_xpath('//*[@id="MarriageMstListDisp"]/tbody/tr/td[3]').text
            Res_wPar = driver.find_element_by_xpath('//*[@id="MarriageMstListDisp"]/tbody/tr/td[8]').text
            print('-----------------------')
            print(f'| {location} | {Res_wife}')
            print('-----------------------')
            print('start csv write...')
            # append the row to the CSV file
            with codecs.open(fileName, mode='a', encoding='utf-8') as RECORDS_file:
                employee_writer = csv.writer(RECORDS_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                employee_writer.writerow([x, Res_re, Res_hus, Res_wife, Res_wPar])
            print('Write to CSV success !')
            print('**********END**********')
    except:
        # an invalid reg number (not present, or max value reached) raises here; close the driver
        print('\n \n \n')
        print('+++++++++++++ REQUIRES ATTENTION +++++++++++++\n')
        print('\n \n \n')
        print('error in --->', x)
        print('\n')
        time.sleep(1)
        driver.quit()
        # print('opening', fileName)
        # path = os.path.normpath(f'C:\\Users\\Shyam\\Documents\\{fileName}')
        # os.startfile(path)

y = 'Aandimadam'
myScraper(y)
But when I tried to use it with Pool, it loads only the first reg number, fails to write to the CSV, and crashes; then it opens another window and does the same for the second place name. Here's the code I'm trying to run. I set the pool size to 1 so it doesn't open multiple windows and freeze my PC while debugging:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from multiprocessing import Pool
from distributed import Client
import os, glob, codecs, csv, re, time, numpy

options = Options()
options.headless = False
options.add_argument("--window-size=1920,1200")

print('loading...')

# placeListRaw = glob.glob(os.path.join("C:\\Users\\Shyam\\Documents\\Special Marriage\\placeList.txt"))
with open('placeList.txt', 'r', encoding="utf-8") as f:
    tempList = f.readlines()  # load place names from the text file into a list
# print(tempList)
print('\n \n \n')

def myScraper(loc):
    print(" running function...")
    ########################### SETUP ###########################
    startNo = 1     # <
    location = loc  # < ENTER HERE
    year = 2015     # <
    ########################### SETUP ###########################
    fileName = f'RECORDS_HINMAR_{location}_{year}.csv'
    driver = webdriver.Chrome(options=options, executable_path='/chromedriver')
    driver.get('https://tnreginet.gov.in/portal/')
    time.sleep(5)
    print('navigating to form')
    driver.find_element_by_xpath('//*[@id="fontSelection"]').click()  # switch site to English
    # navigate to the form via the hover menu
    more = driver.find_element_by_xpath('//*[@id="1195002"]/a')
    search1 = driver.find_element_by_xpath('//*[@id="8500020"]/a')
    hovmarr = driver.find_element_by_xpath('//*[@id="90000403"]/a')
    Hover = ActionChains(driver).move_to_element(more).move_to_element(search1).move_to_element(hovmarr)
    Hover.click().perform()
    time.sleep(3)
    try:
        for x in range(startNo, 2000):  # loop over each reg number
            print('__________START__________')
            # fill the form
            driver.find_element_by_xpath('//*[@id="cmb_marrType"]').send_keys("Hindu Marriage")
            searchBy = driver.find_element_by_xpath('//*[@id="Search_Criteria_Two"]')
            Hover = ActionChains(driver).move_to_element(searchBy)
            Hover.click().perform()
            driver.find_element_by_xpath('//*[@id="cmb_sub_registrar_office"]').send_keys(location)
            driver.find_element_by_xpath('//*[@id="RegNO1"]').send_keys(x)
            driver.find_element_by_xpath('//*[@id="Year"]').send_keys(year)
            submit = driver.find_element_by_xpath('//*[@id="CopyOfMarriageSearch"]/div[2]/div/div[18]/input')
            Hover = ActionChains(driver).move_to_element(submit)
            Hover.click().perform()  # click submit
            print('Loading reg no: ', x)
            time.sleep(6)
            # EXTRACT DATA FROM TABLE
            print('Extracting data')
            Res_re = driver.find_element_by_xpath('//*[@id="MarriageMstListDisp"]/tbody/tr/td[1]').text
            Res_hus = driver.find_element_by_xpath('//*[@id="MarriageMstListDisp"]/tbody/tr/td[2]').text
            Res_wife = driver.find_element_by_xpath('//*[@id="MarriageMstListDisp"]/tbody/tr/td[3]').text
            Res_wPar = driver.find_element_by_xpath('//*[@id="MarriageMstListDisp"]/tbody/tr/td[8]').text
            print('-----------------------')
            print(f'| {location} | {Res_wife}')
            print('-----------------------')
            print('start csv write...')
            # append the row to the CSV file
            with codecs.open(fileName, mode='a', encoding='utf-8') as RECORDS_file:
                employee_writer = csv.writer(RECORDS_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                employee_writer.writerow([x, Res_re, Res_hus, Res_wife, Res_wPar])
            print('Write to CSV success !')
            print('**********END**********')
    except:
        # an invalid reg number (not present, or max value reached) raises here; close the driver
        print('\n \n \n')
        print('+++++++++++++ REQUIRES ATTENTION +++++++++++++\n')
        print('\n \n \n')
        print('error in --->', x)
        print('\n')
        time.sleep(1)
        driver.quit()
        # print('opening', fileName)
        # path = os.path.normpath(f'C:\\Users\\Shyam\\Documents\\{fileName}')
        # os.startfile(path)

if __name__ == "__main__":
    # freeze_support()
    p = Pool(1)
    p.map(myScraper, tempList)
    p.terminate()
    p.join()
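One thing I'm not sure about (just a guess on my part, not verified): readlines() keeps the trailing '\n' on every place name, so the newline ends up in both the form field and the CSV filename, and p.terminate() kills the workers instead of letting them finish. A minimal sketch of how the driver part could be cleaned up under that assumption (myScraper as defined above):

from multiprocessing import Pool

with open('placeList.txt', 'r', encoding='utf-8') as f:
    tempList = [line.strip() for line in f if line.strip()]  # drop '\n' and blank lines

if __name__ == "__main__":
    p = Pool(1)
    p.map(myScraper, tempList)  # map() blocks until every place has been processed
    p.close()                   # let workers finish; terminate() would kill them mid-task
    p.join()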
P.S. Sorry for my bad English. I'm also just learning to code; this is only the fourth program I've written myself.
Answer 1:
I finally did it, with the help of a person on Reddit who told me to use concurrent.futures instead of Pool.
I retyped my code (more or less) from scratch in PyCharm, which was easier than using Sublime Text. I also added an explicit wait for the loading GIF to disappear (why didn't I think of this before?), plus an extra wait behind an if statement to give JavaScript time to render the page. Without that extra wait it produced duplicate data: stale rows from the previous page were scraped as new data, and then it crashed.
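Before the full code, a minimal sketch of that wait pattern (wait_for_fresh_row is just an illustrative helper name, not in the real script): first wait for the loading GIF to disappear, then wait until the first table cell no longer equals the previous iteration's value, so a stale JavaScript-rendered row is never scraped as new data.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_fresh_row(driver, old_reg_no, delay=60):
    # the loading gif (id 'statusbar') must disappear first
    WebDriverWait(driver, delay).until(
        EC.invisibility_of_element_located((By.ID, 'statusbar')))
    # then the table must show a reg number different from the one scraped last time,
    # otherwise we'd read the previous page's row that javascript hasn't replaced yet
    WebDriverWait(driver, delay).until(
        lambda d: d.find_element_by_xpath(
            '//*[@id="MarriageMstListDisp"]/tbody/tr/td[1]').text != old_reg_no)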
Here's my full code.
main.py
# scraper used to scrape marriage data from tnreginet.gov.in, for educational purposes. created by @rajaspidey
import concurrent.futures
import csv
from itertools import repeat

####### SETUP #######
from myScraper import myscraper

mar_type = 'TMR1'  # 'TMR1' or 'HMR'
mar_year = 2015
####### SETUP #######

project_path = 'C:\\Users\\Shyam\\PycharmProjects\\TNregi_Scrape\\'
logs_path = f'{project_path}logs\\'

# LOAD the place list from file
with open('placeList.txt', 'r', encoding="utf-8") as f:
    for line in csv.reader(f):
        print(' \n file reading done ! \n')
        place_list = line
        print(line)
print('\n PLACE list successfully loaded..!! \n')
print('\n \n \n')

if __name__ == '__main__':
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        print('creating ThreadPoolExecutor')
        start_scrape = executor.map(myscraper, repeat(mar_type), place_list, repeat(mar_year))
        # signatures, for reference:
        # def myscraper(mar_type, mar_place, mar_year):
        # def check_file(mar_type, mar_place, mar_year) -> int:
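Once one worker runs cleanly end to end, scaling out should only be a matter of raising max_workers; each call to myscraper opens its own Chrome instance, so memory is the practical limit. For example (the same call as above, just more workers):

import concurrent.futures
from itertools import repeat

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    executor.map(myscraper, repeat(mar_type), place_list, repeat(mar_year))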
check_file.py
import codecs
import pathlib
import re

project_path = 'C:\\Users\\Shyam\\PycharmProjects\\TNregi_Scrape\\'
logs_path = f'{project_path}logs\\'

def check_file(mar_type, mar_place, mar_year) -> int:
    place_name = mar_place
    file_path = f'{project_path}{mar_type}\\{mar_year}\\'
    new_file_name = f'{file_path}RECORDS_{mar_type}_{place_name}_{mar_year}.csv'
    if pathlib.Path(new_file_name).is_file():
        print('file exist')
        print('extracting last lines...')
        # temporarily used this to check whether the last line in the files is blank, and fixed them based on the error
        # all_files = glob.glob(os.path.join(yr_path, f"RECORDS_*_.csv"))  # change here
        # print(all_files)
        # for file in all_files:
        #     print(file)
        #     matched_place_name = re.findall("(?<=\\w+_\\w+_)[A-Za-z0-9 ()_.]+(?=_\\d+)", file)
        #     print(f'Regex result -------- >{matched_place_name}')
        with open(new_file_name, 'r', encoding="utf-8") as f:
            temp_last_line = f.readlines()[-1]
            print(f'temp last line {temp_last_line}')
            last_line = re.findall("(?<=.../)[A-Za-z0-9 \\-()_.]+/\\d+/\\d+(?=,)", temp_last_line)
            print(f"this is {last_line}")
            with codecs.open(f"{logs_path}LastLines_{mar_year}.txt", mode='a') as lastLine_file:
                final_out = last_line[0].split("/", 3)
                lastLine_file.write(f"{last_line[0]}\n")
                print('Write last line to file success !')
                print(f' {final_out[0]} has details till {final_out[1]} ')
                return int(final_out[1]) + 1
                # verify(matched_place_name[0], final_out[1], final_out[2])
    else:
        start_no = 1
        print(f'{place_name} is fresh run')
        return 1

# print(check_file('TMR1', 'ambattur', 2015))
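Since myscraper writes the loop number x as the first CSV column, the resume point could also be read straight from that column instead of regex-parsing the registration string. A simpler sketch (resume_point is a hypothetical name, not part of my project):

import csv
import pathlib

def resume_point(csv_path) -> int:
    p = pathlib.Path(csv_path)
    if not p.is_file():
        return 1  # no file yet -> fresh run, start at reg number 1
    with p.open('r', encoding='utf-8') as f:
        rows = [row for row in csv.reader(f) if row]  # skip blank trailing lines
    return int(rows[-1][0]) + 1  # continue right after the last saved loop number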
And the main scraper function, myScraper.py:
import codecs
import csv
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from check_file import check_file

project_path = 'C:\\Users\\Shyam\\PycharmProjects\\TNregi_Scrape\\'
logs_path = f'{project_path}logs\\'

def myscraper(mar_type, mar_place, mar_year):
    options = Options()
    options.headless = True
    options.add_argument("--window-size=1920,1200")
    print('loading...\n')
    print(f'--------------> {mar_place}\n')
    print("running myScraper...")
    start_no = check_file(mar_type, mar_place, mar_year)  # resume from the last saved reg number
    location = mar_place
    year = mar_year
    file_path = f'{project_path}{mar_type}\\{mar_year}\\'
    file_name = f'{file_path}RECORDS_{mar_type}_{location}_{year}.csv'  # FILENAME HERE <----
    print(f'data will be saved to > {file_name}\n')
    driver = webdriver.Chrome(options=options, executable_path='/chromedriver')
    driver.get('https://tnreginet.gov.in/portal/')
    delay = 60  # seconds
    time.sleep(3)
    print('navigating to form...')
    driver.find_element_by_xpath('//*[@id="fontSelection"]').click()  # switch site to English
    # navigate to the form via the hover menu
    more = driver.find_element_by_xpath('//*[@id="1195002"]/a')
    search1 = driver.find_element_by_xpath('//*[@id="8500020"]/a')
    hov_mar = driver.find_element_by_xpath('//*[@id="90000403"]/a')
    hover = ActionChains(driver).move_to_element(more).move_to_element(search1).move_to_element(hov_mar)
    hover.click().perform()
    time.sleep(0.5)  # wait till load
    try:
        WebDriverWait(driver, delay).until(EC.invisibility_of_element_located((By.ID, 'statusbar')))
        print("Page is ready!")
    except:
        print("navigating took too much time!")
        driver.quit()
    res_reg_no = ''  # previous iteration's reg number, used to detect stale table data
    try:
        for x in range(start_no, 2000):  # loop over each reg number
            print('__________START LOOP__________')
            # fill the form
            driver.find_element_by_xpath('//*[@id="cmb_marrType"]').click()
            if mar_type == 'TMR1':
                driver.find_element_by_xpath('//*[@id="cmb_marrType"]/option[3]').click()  # TN MAR FORM I
            elif mar_type == 'HMR':
                driver.find_element_by_xpath('//*[@id="cmb_marrType"]/option[2]').click()  # HINDU MARRIAGE
            search_by = driver.find_element_by_xpath('//*[@id="Search_Criteria_Two"]')
            ActionChains(driver).move_to_element(search_by).click().perform()
            driver.find_element_by_xpath('//*[@id="cmb_sub_registrar_office"]').send_keys(location)
            driver.find_element_by_xpath('//*[@id="RegNO1"]').send_keys(x)
            driver.find_element_by_xpath('//*[@id="Year"]').send_keys(year)
            submit = driver.find_element_by_xpath('//*[@id="CopyOfMarriageSearch"]/div[2]/div/div[18]/input')
            ActionChains(driver).move_to_element(submit).click().perform()  # click submit
            print(f'Loading reg no: {x} in {location}')
            ###### WAIT till page load ######
            time.sleep(0.5)
            try:
                # wait till the loading gif disappears
                WebDriverWait(driver, delay).until(EC.invisibility_of_element_located((By.ID, 'statusbar')))
                time.sleep(0.5)
                new_reg_no = driver.find_element_by_xpath('//*[@id="MarriageMstListDisp"]/tbody/tr/td[1]').text
                if res_reg_no == new_reg_no:  # stale data from the previous page is still rendered
                    time.sleep(3)             # give javascript extra time to replace it
                print("table is ready!")
            except:
                print("Loading table took too much time!")
            ###### EXTRACT DATA FROM TABLE ######
            print('Searching for table to extract data...')
            res_reg_no = driver.find_element_by_xpath('//*[@id="MarriageMstListDisp"]/tbody/tr/td[1]').text
            res_hus = driver.find_element_by_xpath('//*[@id="MarriageMstListDisp"]/tbody/tr/td[2]').text
            res_wife = driver.find_element_by_xpath('//*[@id="MarriageMstListDisp"]/tbody/tr/td[3]').text
            res_w_par = driver.find_element_by_xpath('//*[@id="MarriageMstListDisp"]/tbody/tr/td[8]').text
            print('Table Found')
            print('-----------------------')
            print(f'| {location} | {res_wife}')
            print('-----------------------')
            print('start csv write...')
            ##### write to CSV FILE #####
            with codecs.open(file_name, mode='a', encoding='utf-8') as RECORDS_file:
                employee_writer = csv.writer(RECORDS_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                employee_writer.writerow([x, res_reg_no, res_hus, res_wife, res_w_par])
            print('Write to CSV success !')
            print('**********END**********')
        driver.quit()
    except:
        # an invalid reg number (not present, or max value reached) raises here; log progress and close the driver
        print('\n \n \n')
        print('+++++++++++++ REQUIRES ATTENTION +++++++++++++\n')
        print('error in --->', x)
        with codecs.open(f"{logs_path}Completed_{mar_type}_{year}.txt", mode='a+', encoding='utf-8') as completed_file:
            completed_file.write(f' Ended at ----->>>>> Place: {mar_place} | loop no: {x} | RegNo: {res_reg_no} | year: {year} \n')
            print(f" Ended at ----->>>>> Place: {mar_place} | loop no: {x} | RegNo: {res_reg_no} | year: {year} \n")
        time.sleep(0.5)
        driver.quit()

# myscraper('TMR1', 'ADAYAR', 2015)  # test run, REMOVE AT END
Source: https://stackoverflow.com/questions/65768906/webscraping-using-python-and-selenium-and-tried-to-use-multiprocessing-but-code