Python Multithreading missing data

喜夏-厌秋 submitted on 2021-01-29 13:29:11

Question


I am working on a Python script to check whether URLs are working. The script writes each URL and its response code to a log file. To speed up the check, I am using threading and a queue.

The script works well when the number of URLs to check is small, but when I increase it to hundreds, some URLs are simply missing from the log file. Is there anything I need to fix?

My script is:

#!/usr/bin/env python
import Queue
import threading
import urllib2,urllib,sys,cx_Oracle,os
import time
from urllib2 import HTTPError, URLError


queue = Queue.Queue()
##print_queue = Queue.Queue()

class NoRedirectHandler(urllib2.HTTPRedirectHandler):
    def http_error_302(self, req, fp, code, msg, headers):
        infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        infourl.status = code
        infourl.code = code
        return infourl
    http_error_300 = http_error_302
    http_error_301 = http_error_302
    http_error_303 = http_error_302
    http_error_307 = http_error_302

class ThreadUrl(threading.Thread):
    #Threaded Url Grab
##    def __init__(self, queue, print_queue):
    def __init__(self, queue,error_log):    
        threading.Thread.__init__(self)
        self.queue = queue
##        self.print_queue = print_queue
        self.error_log = error_log

    def do_something_with_exception(self,idx,url,error_log):
        exc_type, exc_value = sys.exc_info()[:2]
##        self.print_queue.put([idx,url,exc_type.__name__])
        with open( error_log, 'a') as err_log_f:
            err_log_f.write("{0},{1},{2}\n".format(idx,url,exc_type.__name__))


    def openUrl(self,pair):
        try:
            idx = pair[1]
            url = 'http://'+pair[2]

            opener = urllib2.build_opener(NoRedirectHandler())
            urllib2.install_opener(opener)
            request = urllib2.Request(url)
            request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 5.1; rv:13.0) Gecko/20100101 Firefox/13.0.1')


            #open urls of hosts 
            resp = urllib2.urlopen(request, timeout=10)

##            self.print_queue.put([idx,url,resp.code])
            with open( self.error_log, 'a') as err_log_f:
                err_log_f.write("{0},{1},{2}\n".format(idx,url,resp.code))


        except:
            self.do_something_with_exception(idx,url,self.error_log)    


    def run(self):
        while True:
            #grabs host from queue
            pair = self.queue.get()
            self.openUrl(pair)

            #signals to queue job is done
            self.queue.task_done()

def readUrlFromDB(queue,connect_string,column_name,table_name):
    try:  
        connection = cx_Oracle.Connection(connect_string)
        cursor = cx_Oracle.Cursor(connection)
        query = 'select ' + column_name + ' from ' + table_name
        cursor.execute(query)

        #Count the rows returned by the query
        rows = cursor.fetchall()
        total = cursor.rowcount        

        #Loop through returned urls
        for row in rows:
            #print row[1],row[2]
##            url = 'http://'+row[2]
            queue.put(row)
        cursor.close()
        connection.close()

        return total

    except cx_Oracle.DatabaseError, e:
        print e[0].context
        raise   

def main():   
    start = time.time()
    error_log = "D:\\chkWebsite_Error_Log.txt"

    #Check if error_log file exists
    #If exists then deletes it
    if os.path.isfile(error_log):  
         os.remove(error_log)

    #spawn a pool of threads, and pass them queue instance 
    for i in range(10):
        t = ThreadUrl(queue,error_log)
        t.setDaemon(True)
        t.start()

    connect_string,column_name,table_name = "user/pass@db","*","T_URL_TEST"
    tn = readUrlFromDB(queue,connect_string,column_name,table_name)


    #wait on the queue until everything has been processed
    queue.join()
##    print_queue.join()

    print "Total retrived: {0}".format(tn)
    print "Elapsed Time: %s" % (time.time() - start)

main()

Answer 1:


Python's threading module isn't truly parallel because of the global interpreter lock (http://wiki.python.org/moin/GlobalInterpreterLock), so if you really want to take advantage of multiple cores you should use multiprocessing (http://docs.python.org/library/multiprocessing.html) instead.
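
For illustration, here is a minimal sketch (not the poster's code) of checking URLs with a multiprocessing.Pool instead of threads; check_url and the urls list are hypothetical names, and it is written in Python 2 style to match the question:

import urllib2
from multiprocessing import Pool

def check_url(url):
    try:
        resp = urllib2.urlopen(url, timeout=10)
        return (url, resp.code)
    except Exception as e:
        return (url, type(e).__name__)

if __name__ == '__main__':
    urls = ['http://example.com', 'http://example.org']
    pool = Pool(processes=10)
    # map() farms the URLs out to worker processes and collects
    # the results back in the parent process
    for url, status in pool.map(check_url, urls):
        print "{0},{1}".format(url, status)
    pool.close()
    pool.join()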

Also, you seem to be accessing the same file from several threads at once:

with open( self.error_log, 'a') as err_log_f:
    err_log_f.write("{0},{1},{2}\n".format(idx,url,resp.code))

This is really bad, AFAIK: if two threads try to write to the same file at the same time, or almost at the same time (keep in mind they are not truly running in parallel), the behavior tends to be undefined; imagine one thread writing while another has just closed the file...

Anyway, you would need a third queue and a dedicated writer thread to handle writing to the file; a minimal sketch follows.
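
This sketch assumes a print_queue like the one commented out in the question: worker threads only put log lines on the queue, and a single writer thread owns the file.

import Queue
import threading

print_queue = Queue.Queue()

def log_writer(error_log):
    # This thread is the only one that ever touches the log file.
    with open(error_log, 'a') as f:
        while True:
            line = print_queue.get()
            if line is None:            # sentinel: nothing more to write
                print_queue.task_done()
                break
            f.write(line)
            f.flush()
            print_queue.task_done()

writer = threading.Thread(target=log_writer,
                          args=("D:\\chkWebsite_Error_Log.txt",))
writer.setDaemon(True)
writer.start()

In ThreadUrl.openUrl, instead of opening the file directly, the worker would do print_queue.put("{0},{1},{2}\n".format(idx, url, resp.code)), and main() would finish with print_queue.put(None) followed by print_queue.join() after queue.join().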




Answer 2:


At first glance this looks like a race condition, since many threads are trying to write to the log file at the same time. See this question for some pointers on how to lock a file for writing (so only one thread can access it at a time).
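
Within a single process, a plain threading.Lock around the write is usually enough. A minimal sketch (log_lock and write_log are hypothetical names, not from the question):

import threading

log_lock = threading.Lock()

def write_log(error_log, idx, url, status):
    # Only one thread at a time can hold the lock, so lines from
    # different threads are never interleaved or lost.
    with log_lock:
        with open(error_log, 'a') as f:
            f.write("{0},{1},{2}\n".format(idx, url, status))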



Source: https://stackoverflow.com/questions/11464750/python-multithreading-missing-data
