Question
I'm doing web scraping with Python and Selenium. I used to scrape data for one location and year at a time by creating 1800 .py files (600 places * 3 years = 1800), opening them in batches of 10, and waiting for each batch to complete. That was time-consuming, so I decided to use multiprocessing.
I changed my code to read the place data from a text file and iterate over it. The text file looks like this:
Aandimadam
Aathur_Dindugal
Aathur_Salem East
Abiramam
Acchirapakkam
Adayar
Adhiramapattinam
Alandur
Alanganallur
My code without multiprocessing works fine: it loops over register numbers and extracts data from the results table to a CSV. Here it is, with a single place name hard-coded:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from multiprocessing import Pool
from distributed import Client
import os, glob, codecs, csv, re, time, numpy

options = Options()
options.headless = False
options.add_argument("--window-size=1920,1200")

print('loading...')

def myScraper(loc):
    print(" running function...")
    ########################### SETUP ###########################
    startNo = 1     # <
    location = loc  # < ENTER HERE
    year = 2015     # <
    ########################### SETUP ###########################
    fileName = f'RECORDS_HINMAR_{location}_{year}.csv'
    driver = webdriver.Chrome(options=options, executable_path='/chromedriver')
    driver.get('https://tnreginet.gov.in/portal/')
    time.sleep(5)
    print('navigating to form')
    driver.find_element_by_xpath('//*[@id="fontSelection"]').click()  # switch site to English
    # navigate to the form via the hover menu
    more = driver.find_element_by_xpath('//*[@id="1195002"]/a')
    search1 = driver.find_element_by_xpath('//*[@id="8500020"]/a')
    hovmarr = driver.find_element_by_xpath('//*[@id="90000403"]/a')
    Hover = ActionChains(driver).move_to_element(more).move_to_element(search1).move_to_element(hovmarr)
    Hover.click().perform()
    time.sleep(3)
    try:
        for x in range(startNo, 2000):  # loop over each reg number
            print('__________START__________')
            # fill the form
            driver.find_element_by_xpath('//*[@id="cmb_marrType"]').send_keys("Hindu Marriage")
            searchBy = driver.find_element_by_xpath('//*[@id="Search_Criteria_Two"]')
            Hover = ActionChains(driver).move_to_element(searchBy)
            Hover.click().perform()
            driver.find_element_by_xpath('//*[@id="cmb_sub_registrar_office"]').send_keys(location)
            driver.find_element_by_xpath('//*[@id="RegNO1"]').send_keys(x)
            driver.find_element_by_xpath('//*[@id="Year"]').send_keys(year)
            submit = driver.find_element_by_xpath('//*[@id="CopyOfMarriageSearch"]/div[2]/div/div[18]/input')
            Hover = ActionChains(driver).move_to_element(submit)
            Hover.click().perform()  # click submit
            print('Loading reg no: ', x)
            time.sleep(6)
            # EXTRACT DATA FROM TABLE
            print('Extracting data')
            Res_re = driver.find_element_by_xpath('//*[@id="MarriageMstListDisp"]/tbody/tr/td[1]').text
            Res_hus = driver.find_element_by_xpath('//*[@id="MarriageMstListDisp"]/tbody/tr/td[2]').text
            Res_wife = driver.find_element_by_xpath('//*[@id="MarriageMstListDisp"]/tbody/tr/td[3]').text
            Res_wPar = driver.find_element_by_xpath('//*[@id="MarriageMstListDisp"]/tbody/tr/td[8]').text
            print('-----------------------')
            print(f'| {location} | {Res_wife}')
            print('-----------------------')
            print('start csv write...')
            # append the row to the CSV file
            with codecs.open(fileName, mode='a', encoding='utf-8') as RECORDS_file:
                employee_writer = csv.writer(RECORDS_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                employee_writer.writerow([x, Res_re, Res_hus, Res_wife, Res_wPar])
            print('Write to CSV success !')
            print('**********END**********')
    except:
        # an invalid reg number (not present, or max value reached) raises here; close the driver
        print('\n \n \n')
        print('+++++++++++++ REQUIRES ATTENTION +++++++++++++\n')
        print('\n \n \n')
        print('error in --->', x)
        print('\n')
        time.sleep(1)
        driver.quit()
        # print('opening', fileName)
        # path = os.path.normpath(f'C:\\Users\\Shyam\\Documents\\{fileName}')
        # os.startfile(path)

y = 'Aandimadam'
myScraper(y)
But when I tried to use it with Pool, it loads only the first reg number, fails to write to the CSV, and crashes; then it opens another window and does the same for the second place name. Here's the code I'm trying to run. I set the pool size to 1 so it doesn't open multiple windows and freeze my PC while debugging:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from multiprocessing import Pool
from distributed import Client
import os, glob, codecs, csv, re, time, numpy

options = Options()
options.headless = False
options.add_argument("--window-size=1920,1200")

print('loading...')

# placeListRaw = glob.glob(os.path.join("C:\\Users\\Shyam\\Documents\\Special Marriage\\placeList.txt"))
with open('placeList.txt', 'r', encoding="utf-8") as f:
    tempList = f.readlines()  # load place names from the text file into a list
# print(tempList)
print('\n \n \n')

def myScraper(loc):
    print(" running function...")
    ########################### SETUP ###########################
    startNo = 1     # <
    location = loc  # < ENTER HERE
    year = 2015     # <
    ########################### SETUP ###########################
    fileName = f'RECORDS_HINMAR_{location}_{year}.csv'
    driver = webdriver.Chrome(options=options, executable_path='/chromedriver')
    driver.get('https://tnreginet.gov.in/portal/')
    time.sleep(5)
    print('navigating to form')
    driver.find_element_by_xpath('//*[@id="fontSelection"]').click()  # switch site to English
    # navigate to the form via the hover menu
    more = driver.find_element_by_xpath('//*[@id="1195002"]/a')
    search1 = driver.find_element_by_xpath('//*[@id="8500020"]/a')
    hovmarr = driver.find_element_by_xpath('//*[@id="90000403"]/a')
    Hover = ActionChains(driver).move_to_element(more).move_to_element(search1).move_to_element(hovmarr)
    Hover.click().perform()
    time.sleep(3)
    try:
        for x in range(startNo, 2000):  # loop over each reg number
            print('__________START__________')
            # fill the form
            driver.find_element_by_xpath('//*[@id="cmb_marrType"]').send_keys("Hindu Marriage")
            searchBy = driver.find_element_by_xpath('//*[@id="Search_Criteria_Two"]')
            Hover = ActionChains(driver).move_to_element(searchBy)
            Hover.click().perform()
            driver.find_element_by_xpath('//*[@id="cmb_sub_registrar_office"]').send_keys(location)
            driver.find_element_by_xpath('//*[@id="RegNO1"]').send_keys(x)
            driver.find_element_by_xpath('//*[@id="Year"]').send_keys(year)
            submit = driver.find_element_by_xpath('//*[@id="CopyOfMarriageSearch"]/div[2]/div/div[18]/input')
            Hover = ActionChains(driver).move_to_element(submit)
            Hover.click().perform()  # click submit
            print('Loading reg no: ', x)
            time.sleep(6)
            # EXTRACT DATA FROM TABLE
            print('Extracting data')
            Res_re = driver.find_element_by_xpath('//*[@id="MarriageMstListDisp"]/tbody/tr/td[1]').text
            Res_hus = driver.find_element_by_xpath('//*[@id="MarriageMstListDisp"]/tbody/tr/td[2]').text
            Res_wife = driver.find_element_by_xpath('//*[@id="MarriageMstListDisp"]/tbody/tr/td[3]').text
            Res_wPar = driver.find_element_by_xpath('//*[@id="MarriageMstListDisp"]/tbody/tr/td[8]').text
            print('-----------------------')
            print(f'| {location} | {Res_wife}')
            print('-----------------------')
            print('start csv write...')
            # append the row to the CSV file
            with codecs.open(fileName, mode='a', encoding='utf-8') as RECORDS_file:
                employee_writer = csv.writer(RECORDS_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                employee_writer.writerow([x, Res_re, Res_hus, Res_wife, Res_wPar])
            print('Write to CSV success !')
            print('**********END**********')
    except:
        # an invalid reg number (not present, or max value reached) raises here; close the driver
        print('\n \n \n')
        print('+++++++++++++ REQUIRES ATTENTION +++++++++++++\n')
        print('\n \n \n')
        print('error in --->', x)
        print('\n')
        time.sleep(1)
        driver.quit()
        # print('opening', fileName)
        # path = os.path.normpath(f'C:\\Users\\Shyam\\Documents\\{fileName}')
        # os.startfile(path)

if __name__ == "__main__":
    # freeze_support()
    p = Pool(1)
    p.map(myScraper, tempList)
    p.terminate()
    p.join()
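One thing I'm not sure about (just a guess on my part, not verified): readlines() keeps the trailing '\n' on every place name, so the newline ends up in both the form field and the CSV filename, and p.terminate() kills the workers instead of letting them finish. A minimal sketch of how the driver part could be cleaned up under that assumption (myScraper as defined above):

from multiprocessing import Pool

with open('placeList.txt', 'r', encoding='utf-8') as f:
    tempList = [line.strip() for line in f if line.strip()]  # drop '\n' and blank lines

if __name__ == "__main__":
    p = Pool(1)
    p.map(myScraper, tempList)  # map() blocks until every place has been processed
    p.close()                   # let workers finish; terminate() would kill them mid-task
    p.join()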
P.S. Sorry for my bad English. I'm also just learning to code; this is only the fourth program I've written myself.
Answer 1:
I finally did it, with the help of a person on Reddit who told me to use concurrent.futures instead of Pool.
I retyped my code (more or less) from scratch in PyCharm, which was easier than using Sublime Text. I also added an explicit wait for the loading GIF to disappear (why didn't I think of this before?), plus an extra wait behind an if statement to give JavaScript time to render the page. Without that extra wait it produced duplicate data: stale rows from the previous page were scraped as new data, and then it crashed.
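Before the full code, a minimal sketch of that wait pattern (wait_for_fresh_row is just an illustrative helper name, not in the real script): first wait for the loading GIF to disappear, then wait until the first table cell no longer equals the previous iteration's value, so a stale JavaScript-rendered row is never scraped as new data.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_fresh_row(driver, old_reg_no, delay=60):
    # the loading gif (id 'statusbar') must disappear first
    WebDriverWait(driver, delay).until(
        EC.invisibility_of_element_located((By.ID, 'statusbar')))
    # then the table must show a reg number different from the one scraped last time,
    # otherwise we'd read the previous page's row that javascript hasn't replaced yet
    WebDriverWait(driver, delay).until(
        lambda d: d.find_element_by_xpath(
            '//*[@id="MarriageMstListDisp"]/tbody/tr/td[1]').text != old_reg_no)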
Here's my full code.
main.py
# scraper used to scrape marriage data from tnreginet.gov.in, for educational purposes. created by @rajaspidey
import concurrent.futures
import csv
from itertools import repeat

####### SETUP #######
from myScraper import myscraper

mar_type = 'TMR1'  # 'TMR1' or 'HMR'
mar_year = 2015
####### SETUP #######

project_path = 'C:\\Users\\Shyam\\PycharmProjects\\TNregi_Scrape\\'
logs_path = f'{project_path}logs\\'

# LOAD the place list from file
with open('placeList.txt', 'r', encoding="utf-8") as f:
    for line in csv.reader(f):
        print(' \n file reading done ! \n')
        place_list = line
        print(line)
print('\n PLACE list successfully loaded..!! \n')
print('\n \n \n')

if __name__ == '__main__':
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        print('creating ThreadPoolExecutor')
        start_scrape = executor.map(myscraper, repeat(mar_type), place_list, repeat(mar_year))
        # signatures, for reference:
        # def myscraper(mar_type, mar_place, mar_year):
        # def check_file(mar_type, mar_place, mar_year) -> int:
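Once one worker runs cleanly end to end, scaling out should only be a matter of raising max_workers; each call to myscraper opens its own Chrome instance, so memory is the practical limit. For example (the same call as above, just more workers):

import concurrent.futures
from itertools import repeat

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    executor.map(myscraper, repeat(mar_type), place_list, repeat(mar_year))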
check_file.py
import codecs
import pathlib
import re

project_path = 'C:\\Users\\Shyam\\PycharmProjects\\TNregi_Scrape\\'
logs_path = f'{project_path}logs\\'

def check_file(mar_type, mar_place, mar_year) -> int:
    place_name = mar_place
    file_path = f'{project_path}{mar_type}\\{mar_year}\\'
    new_file_name = f'{file_path}RECORDS_{mar_type}_{place_name}_{mar_year}.csv'
    if pathlib.Path(new_file_name).is_file():
        print('file exist')
        print('extracting last lines...')
        # temporarily used this to check whether the last line in the files is blank, and fixed them based on the error
        # all_files = glob.glob(os.path.join(yr_path, f"RECORDS_*_.csv"))  # change here
        # print(all_files)
        # for file in all_files:
        #     print(file)
        #     matched_place_name = re.findall("(?<=\\w+_\\w+_)[A-Za-z0-9 ()_.]+(?=_\\d+)", file)
        #     print(f'Regex result -------- >{matched_place_name}')
        with open(new_file_name, 'r', encoding="utf-8") as f:
            temp_last_line = f.readlines()[-1]
            print(f'temp last line {temp_last_line}')
            last_line = re.findall("(?<=.../)[A-Za-z0-9 \\-()_.]+/\\d+/\\d+(?=,)", temp_last_line)
            print(f"this is {last_line}")
            with codecs.open(f"{logs_path}LastLines_{mar_year}.txt", mode='a') as lastLine_file:
                final_out = last_line[0].split("/", 3)
                lastLine_file.write(f"{last_line[0]}\n")
                print('Write last line to file success !')
                print(f' {final_out[0]} has details till {final_out[1]} ')
                return int(final_out[1]) + 1
                # verify(matched_place_name[0], final_out[1], final_out[2])
    else:
        start_no = 1
        print(f'{place_name} is fresh run')
        return 1

# print(check_file('TMR1', 'ambattur', 2015))
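Since myscraper writes the loop number x as the first CSV column, the resume point could also be read straight from that column instead of regex-parsing the registration string. A simpler sketch (resume_point is a hypothetical name, not part of my project):

import csv
import pathlib

def resume_point(csv_path) -> int:
    p = pathlib.Path(csv_path)
    if not p.is_file():
        return 1  # no file yet -> fresh run, start at reg number 1
    with p.open('r', encoding='utf-8') as f:
        rows = [row for row in csv.reader(f) if row]  # skip blank trailing lines
    return int(rows[-1][0]) + 1  # continue right after the last saved loop number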
And the main scraper function, myScraper.py:
import codecs
import csv
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from check_file import check_file

project_path = 'C:\\Users\\Shyam\\PycharmProjects\\TNregi_Scrape\\'
logs_path = f'{project_path}logs\\'

def myscraper(mar_type, mar_place, mar_year):
    options = Options()
    options.headless = True
    options.add_argument("--window-size=1920,1200")
    print('loading...\n')
    print(f'--------------> {mar_place}\n')
    print("running myScraper...")
    start_no = check_file(mar_type, mar_place, mar_year)  # resume from the last saved reg number
    location = mar_place
    year = mar_year
    file_path = f'{project_path}{mar_type}\\{mar_year}\\'
    file_name = f'{file_path}RECORDS_{mar_type}_{location}_{year}.csv'  # FILENAME HERE <----
    print(f'data will be saved to > {file_name}\n')
    driver = webdriver.Chrome(options=options, executable_path='/chromedriver')
    driver.get('https://tnreginet.gov.in/portal/')
    delay = 60  # seconds
    time.sleep(3)
    print('navigating to form...')
    driver.find_element_by_xpath('//*[@id="fontSelection"]').click()  # switch site to English
    # navigate to the form via the hover menu
    more = driver.find_element_by_xpath('//*[@id="1195002"]/a')
    search1 = driver.find_element_by_xpath('//*[@id="8500020"]/a')
    hov_mar = driver.find_element_by_xpath('//*[@id="90000403"]/a')
    hover = ActionChains(driver).move_to_element(more).move_to_element(search1).move_to_element(hov_mar)
    hover.click().perform()
    time.sleep(0.5)  # wait till load
    try:
        WebDriverWait(driver, delay).until(EC.invisibility_of_element_located((By.ID, 'statusbar')))
        print("Page is ready!")
    except:
        print("navigating took too much time!")
        driver.quit()
    res_reg_no = ''  # previous iteration's reg number, used to detect stale table data
    try:
        for x in range(start_no, 2000):  # loop over each reg number
            print('__________START LOOP__________')
            # fill the form
            driver.find_element_by_xpath('//*[@id="cmb_marrType"]').click()
            if mar_type == 'TMR1':
                driver.find_element_by_xpath('//*[@id="cmb_marrType"]/option[3]').click()  # TN MAR FORM I
            elif mar_type == 'HMR':
                driver.find_element_by_xpath('//*[@id="cmb_marrType"]/option[2]').click()  # HINDU MARRIAGE
            search_by = driver.find_element_by_xpath('//*[@id="Search_Criteria_Two"]')
            ActionChains(driver).move_to_element(search_by).click().perform()
            driver.find_element_by_xpath('//*[@id="cmb_sub_registrar_office"]').send_keys(location)
            driver.find_element_by_xpath('//*[@id="RegNO1"]').send_keys(x)
            driver.find_element_by_xpath('//*[@id="Year"]').send_keys(year)
            submit = driver.find_element_by_xpath('//*[@id="CopyOfMarriageSearch"]/div[2]/div/div[18]/input')
            ActionChains(driver).move_to_element(submit).click().perform()  # click submit
            print(f'Loading reg no: {x} in {location}')
            ###### WAIT till page load ######
            time.sleep(0.5)
            try:
                # wait till the loading gif disappears
                WebDriverWait(driver, delay).until(EC.invisibility_of_element_located((By.ID, 'statusbar')))
                time.sleep(0.5)
                new_reg_no = driver.find_element_by_xpath('//*[@id="MarriageMstListDisp"]/tbody/tr/td[1]').text
                if res_reg_no == new_reg_no:  # stale data from the previous page is still rendered
                    time.sleep(3)             # give javascript extra time to replace it
                print("table is ready!")
            except:
                print("Loading table took too much time!")
            ###### EXTRACT DATA FROM TABLE ######
            print('Searching for table to extract data...')
            res_reg_no = driver.find_element_by_xpath('//*[@id="MarriageMstListDisp"]/tbody/tr/td[1]').text
            res_hus = driver.find_element_by_xpath('//*[@id="MarriageMstListDisp"]/tbody/tr/td[2]').text
            res_wife = driver.find_element_by_xpath('//*[@id="MarriageMstListDisp"]/tbody/tr/td[3]').text
            res_w_par = driver.find_element_by_xpath('//*[@id="MarriageMstListDisp"]/tbody/tr/td[8]').text
            print('Table Found')
            print('-----------------------')
            print(f'| {location} | {res_wife}')
            print('-----------------------')
            print('start csv write...')
            ##### write to CSV FILE #####
            with codecs.open(file_name, mode='a', encoding='utf-8') as RECORDS_file:
                employee_writer = csv.writer(RECORDS_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                employee_writer.writerow([x, res_reg_no, res_hus, res_wife, res_w_par])
            print('Write to CSV success !')
            print('**********END**********')
        driver.quit()
    except:
        # an invalid reg number (not present, or max value reached) raises here; log progress and close the driver
        print('\n \n \n')
        print('+++++++++++++ REQUIRES ATTENTION +++++++++++++\n')
        print('error in --->', x)
        with codecs.open(f"{logs_path}Completed_{mar_type}_{year}.txt", mode='a+', encoding='utf-8') as completed_file:
            completed_file.write(f' Ended at ----->>>>> Place: {mar_place} | loop no: {x} | RegNo: {res_reg_no} | year: {year} \n')
            print(f" Ended at ----->>>>> Place: {mar_place} | loop no: {x} | RegNo: {res_reg_no} | year: {year} \n")
        time.sleep(0.5)
        driver.quit()

# myscraper('TMR1', 'ADAYAR', 2015)  # test run, REMOVE AT END
Source: https://stackoverflow.com/questions/65768906/webscraping-using-python-and-selenium-and-tried-to-use-multiprocessing-but-code