Question
I have a large list containing binary-encoded strings that I used to process in a single function, like so:
""" just included this to demonstrate the 'data' structure """
data=np.zeros(250,dtype='float32, (250000,2)float32')
def func numpy_array(data, peaks):
rt_counter=0
for x in peaks:
if rt_counter %(len(peaks)/20) == 0:
update_progress()
peak_counter=0
data_buff=base64.b64decode(x)
buff_size=len(data_buff)/4
unpack_format=">%dL" % buff_size
index=0
for y in struct.unpack(unpack_format,data_buff):
buff1=struct.pack("I",y)
buff2=struct.unpack("f",buff1)[0]
if (index % 2 == 0):
data[rt_counter][1][peak_counter][0]=float(buff2)
else:
data[rt_counter][1][peak_counter][1]=float(buff2)
peak_counter+=1
index+=1
rt_counter+=1
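For context, each string in peaks is a base64-encoded run of big-endian 32-bit floats, alternating m/z and intensity values (the mzXML convention). A minimal round-trip sketch with made-up values, showing the encoding that the loop above reverses:

import base64
import struct

# Hypothetical peak: two (m/z, intensity) pairs packed as
# big-endian 32-bit floats, then base64-encoded as in an mzXML file.
values = [400.25, 1500.0, 401.75, 250.0]
encoded = base64.b64encode(struct.pack(">4f", *values))

# Decoding reverses the process in one step; the question's loop
# does the same value by value via an intermediate integer unpack.
decoded = struct.unpack(">4f", base64.b64decode(encoded))
print decoded  # (400.25, 1500.0, 401.75, 250.0)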
I have been reading up on multiprocessing and figured I would try it to see if I could get a big increase in performance. I rewrote my function into two (a helper and a 'caller'), like so:
def numpy_array(data, peaks):
    processors=mp.cpu_count #Might as well throw this directly in the mp.Pool (just for clarity for now)
    pool = mp.Pool(processes=processors)
    chunk_size=len(peaks)/processors
    for i in range(processors):
        counter = i*chunk_size
        chunk=peaks[i*chunk_size:(i+1)*chunk_size-1]
        pool.map(decode(data,chunk,counter))
def decode(data,chunk,counter):
    for x in chunk:
        peak_counter=0
        data_buff=base64.b64decode(x)
        buff_size=len(data_buff)/4
        unpack_format=">%dL" % buff_size
        index=0
        for y in struct.unpack(unpack_format,data_buff):
            buff1=struct.pack("I",y)
            buff2=struct.unpack("f",buff1)[0]
            if (index % 2 == 0):
                data[counter][1][peak_counter][0]=float(buff2)
            else:
                data[counter][1][peak_counter][1]=float(buff2)
                peak_counter+=1
            index+=1
        print data[counter][1][10][0]
        counter+=1
The program runs but only uses 100-110% of CPU (according to top), and once it should be finished it throws TypeError: map() takes at least 3 arguments (2 given) at me. Could anyone with more experience with multiprocessing give me a hint as to what to look out for (that could cause the TypeError)? What might be causing my low CPU usage?
-- Code after incorporating answers --
def decode((data,chunk,counter)):
    print len(chunk), counter
    for x in chunk:
        peak_counter=0
        data_buff=base64.b64decode(x)
        buff_size=len(data_buff)/4
        unpack_format=">%dL" % buff_size
        index=0
        for y in struct.unpack(unpack_format,data_buff):
            buff1=struct.pack("I",y)
            buff2=struct.unpack("f",buff1)[0]
            if (index % 2 == 0):
                data[counter][1][peak_counter][0]=float(buff2)
            else:
                data[counter][1][peak_counter][1]=float(buff2)
                peak_counter+=1
            index+=1
        counter+=1
def numpy_array(data, peaks):
    """Fills the NumPy array 'data' with m/z-intensity values acquired
    from b64 decoding and unpacking the binary string read from the
    mzXML file, which is stored in the list 'peaks'.

    The m/z values are assumed to be ordered without validating this
    assumption.

    Note: This function uses multi-processing.
    """
    processors=mp.cpu_count()
    pool = mp.Pool(processes=processors)
    chunk_size=int(len(peaks)/processors)
    map_parameters=[]
    for i in range(processors):
        counter = i*chunk_size
        chunk=peaks[i*chunk_size:(i+1)*chunk_size-1]
        map_parameters.append((data,chunk,counter))
    pool.map(decode,map_parameters)
This latest version 'works' in the sense that the array gets filled inside the processes (where it does contain values), but once all processes are done, accessing the array yields only zeros, because each process gets a local copy of the array.
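(One common way around that copy problem, sketched here as an assumption rather than taken from the answers below, is to back the array with a multiprocessing.Array in shared memory so every worker writes into the same buffer; the shape below mirrors the question's dimensions but drops the structured dtype for simplicity.)

import multiprocessing as mp
import numpy as np

# Hedged sketch: allocate the peak table once in shared memory.
# lock=False returns a raw ctypes array that NumPy can wrap directly.
shared = mp.Array('f', 250 * 250000 * 2, lock=False)

def as_numpy(buf):
    # View the shared buffer as a NumPy array without copying it.
    return np.frombuffer(buf, dtype='float32').reshape(250, 250000, 2)

def decode_shared((chunk, counter)):
    # Workers inherit 'shared' via fork (POSIX); every process writes
    # into the same memory, so results survive after pool.map returns.
    data = as_numpy(shared)
    # ... fill data[counter] from the decoded chunk, as in decode() ...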
Answer 1:
Something like this should work. Note that pool.map takes a function and a list of parameters for that function for each call. In your original example you are just calling it yourself in the numpy_array function. The function must only have one argument, hence the packing of the arguments into a tuple and the rather odd-looking double brackets in decode (which is called tuple unpacking).
def numpy_array(data, peaks):
    processors=4
    pool = mp.Pool(processes=processors)
    chunk_size=len(data)/processors
    print range(processors)
    map_parameters = [] # new
    for i in range(processors):
        counter = i*chunk_size
        chunk=peaks[i*chunk_size:(i+1)*chunk_size-1]
        map_parameters.append((data,chunk,counter)) # new
    pool.map(decode, map_parameters) # new

def decode((data,chunk,counter)): # changed
    for x in chunk:
        peak_counter=0
        data_buff=base64.b64decode(x)
        buff_size=len(data_buff)/4
        unpack_format=">%dL" % buff_size
        index=0
        for y in struct.unpack(unpack_format,data_buff):
            buff1=struct.pack("I",y)
            buff2=struct.unpack("f",buff1)[0]
            if (index % 2 == 0):
                data[counter][1][peak_counter][0]=float(buff2)
            else:
                data[counter][1][peak_counter][1]=float(buff2)
                peak_counter+=1
            index+=1
        print data[counter][1][10][0]
        counter+=1
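One caveat if this code ever moves to Python 3: tuple parameter unpacking in function signatures, i.e. def decode((data,chunk,counter)), was removed by PEP 3113, so the tuple has to be unpacked inside the function instead:

def decode(args):
    # Python 3 equivalent of "def decode((data, chunk, counter)):"
    data, chunk, counter = args
    # ... body unchanged ...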
Answer 2:
The bug is in your numpy_array function:
for i in range(processors):
    counter = i*chunk_size
    chunk=peaks[i*chunk_size:(i+1)*chunk_size-1]
    pool.map(decode(data,chunk,counter))
The problem is that you're calling map sequentially, so you're only running one process at a time. Also, I don't think you're calling map correctly, as you're doing pool.map(f(*args)) when the signature is map(f, ['list', 'of', 'data']).

I would use a partial so that you don't create copies of data, as I assume that array is quite large or could be larger in the future.

This should be:
import functools

decode_with_data = functools.partial(decode, data)

args = []
for i in range(processors):
    counter = i * chunk_size
    chunk = peaks[i*chunk_size:(i+1)*chunk_size-1]
    args.append((chunk, counter))

pool.map(decode_with_data, args)
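Note that with the partial, pool.map ends up calling decode(data, (chunk, counter)), so decode's signature has to match; a minimal sketch of the assumed matching signature:

def decode(data, (chunk, counter)):
    # 'data' is bound by functools.partial; each mapped item supplies
    # the (chunk, counter) pair for that worker's slice of 'peaks'.
    for x in chunk:
        # ... decode x and write into data[counter], as before ...
        counter += 1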
Source: https://stackoverflow.com/questions/15966157/python-multi-processing