Question
I am trying to perform the operation below using Python:
for n in range(1, 100):
    routedat = xr.open_dataset(route_files[n])
    lsmdat = xr.open_dataset(lsm_files[n])
    routedat = reformat_LIS_output(routedat)
    lsmdat = reformat_LIS_output(lsmdat)
    for i in range(1, len(stations)):
        start_date = stations[i]['Streamflow (cumecs)'].first_valid_index()
        lis_date = routedat['time'][0].values
        gauge_id = valid_stations[i]
        gauge_lat = meta_file.loc[gauge_id, 'Latitude']
        gauge_lon = meta_file.loc[gauge_id, 'Longitude']
        if start_date >= lis_date:
            route_sel = routedat.sel(lat=gauge_lat, lon=gauge_lon, method='nearest')
            lsm_sel = lsmdat.sel(lat=gauge_lat, lon=gauge_lon, method='nearest')
            stations[i].loc[lis_date, 'Precip'] = lsm_sel['TotalPrecip_tavg']
            stations[i].loc[lis_date, 'Evap'] = lsm_sel['Evap_tavg']
            stations[i].loc[lis_date, 'SoilMoist_L1'] = lsm_sel['SoilMoist_tavg'].sel(SoilMoist_profiles=0)
            stations[i].loc[lis_date, 'SoilMoist_L2'] = lsm_sel['SoilMoist_tavg'].sel(SoilMoist_profiles=1)
            stations[i].loc[lis_date, 'SoilMoist_L3'] = lsm_sel['SoilMoist_tavg'].sel(SoilMoist_profiles=2)
            stations[i].loc[lis_date, 'SoilMoist_L4'] = lsm_sel['SoilMoist_tavg'].sel(SoilMoist_profiles=3)
            stations[i].loc[lis_date, 'FloodFraction'] = route_sel['FloodedFrac_tavg']
            stations[i].loc[lis_date, 'RiverDepth'] = route_sel['RiverDepth_tavg']
            stations[i].loc[lis_date, 'SWS'] = route_sel['SWS_tavg']
            stations[i].loc[lis_date, 'Streamflow'] = route_sel['Streamflow_tavg']
I have to extract the information from lsmdat and routedat, using lat/lon, for 300 DataFrames stored in the stations list. Currently, the code takes 8 minutes to write the information into the DataFrames in stations.
Can anyone suggest how I can use Dask to parallelize the second part of the code above? Thank you.
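For reference, xarray also supports vectorized ("pointwise") selection, which can pull every station out of a dataset in a single call instead of looping. A minimal sketch, assuming gauge_lats and gauge_lons are plain lists built from meta_file and aligned with valid_stations (both names are hypothetical):

import xarray as xr

# Indexing with DataArrays that share a new 'station' dimension makes
# .sel() pick one nearest grid cell per station in a single call.
lats = xr.DataArray(gauge_lats, dims='station')
lons = xr.DataArray(gauge_lons, dims='station')
route_pts = routedat.sel(lat=lats, lon=lons, method='nearest')
lsm_pts = lsmdat.sel(lat=lats, lon=lons, method='nearest')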
EDIT: As suggested in the comments, I tried to implement dask.delayed(). I wrote the function below:
def build_data(stations, routedat, lsmdat, valid_stations, meta_file):
    # note: i is not a parameter here; it is picked up from the enclosing loop
    start_date = stations[i]['Streamflow (cumecs)'].first_valid_index()
    lis_date = routedat['time'][0].values
    gauge_id = valid_stations[i]
    gauge_lat = meta_file.loc[gauge_id, 'Latitude']
    gauge_lon = meta_file.loc[gauge_id, 'Longitude']
    route_sel = routedat.sel(lat=gauge_lat, lon=gauge_lon, method='nearest')
    lsm_sel = lsmdat.sel(lat=gauge_lat, lon=gauge_lon, method='nearest')
    stations[i].loc[lis_date, 'Precip'] = lsm_sel['TotalPrecip_tavg']
    stations[i].loc[lis_date, 'Evap'] = lsm_sel['Evap_tavg']
    stations[i].loc[lis_date, 'SoilMoist_L1'] = lsm_sel['SoilMoist_tavg'].sel(SoilMoist_profiles=0)
    stations[i].loc[lis_date, 'SoilMoist_L2'] = lsm_sel['SoilMoist_tavg'].sel(SoilMoist_profiles=1)
    stations[i].loc[lis_date, 'SoilMoist_L3'] = lsm_sel['SoilMoist_tavg'].sel(SoilMoist_profiles=2)
    stations[i].loc[lis_date, 'SoilMoist_L4'] = lsm_sel['SoilMoist_tavg'].sel(SoilMoist_profiles=3)
    stations[i].loc[lis_date, 'FloodFraction'] = route_sel['FloodedFrac_tavg']
    stations[i].loc[lis_date, 'RiverDepth'] = route_sel['RiverDepth_tavg']
    stations[i].loc[lis_date, 'SWS'] = route_sel['SWS_tavg']
    stations[i].loc[lis_date, 'Streamflow'] = route_sel['Streamflow_tavg']
    return
If I run this function inside the loop that opens the xarray datasets one by one, it works fine and the data is indeed written properly into the elements of stations:
for n in tqdm(range(4112, 4115)):
    routedat = xr.open_dataset(route_files[n])
    lsmdat = xr.open_dataset(lsm_files[n])
    routedat = reformat_LIS_output(routedat)
    lsmdat = reformat_LIS_output(lsmdat)
    for i in range(0, 10):
        build_data(stations, routedat, lsmdat, valid_stations, meta_file)
But when I tried to parallelize the same loop using dask.delayed(), it did not write anything into the stations elements and took considerably more time to complete:
for n in tqdm(range(4112, 4115)):
    routedat = xr.open_dataset(route_files[n])
    lsmdat = xr.open_dataset(lsm_files[n])
    routedat = reformat_LIS_output(routedat)
    lsmdat = reformat_LIS_output(lsmdat)
    build_delayed = []
    for i in range(0, 10):
        task = dask.delayed(build_data)(stations, routedat, lsmdat, valid_stations, meta_file)
        build_delayed.append(task)
    dask.compute(*build_delayed)
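A likely reason nothing is written: each delayed task can receive a serialized copy of stations, so the in-place writes happen on the copy and are discarded when the task finishes. A minimal sketch of a return-based variant, where results come back through dask.compute and the write-back happens in the main process (extract_data is a hypothetical refactor of build_data, shown with only a few of the variables):

import dask

# Pure function: returns the extracted values instead of mutating
# stations, so the result survives the round trip to the workers.
def extract_data(routedat, lsmdat, gauge_lat, gauge_lon):
    route_sel = routedat.sel(lat=gauge_lat, lon=gauge_lon, method='nearest')
    lsm_sel = lsmdat.sel(lat=gauge_lat, lon=gauge_lon, method='nearest')
    return {'Precip': float(lsm_sel['TotalPrecip_tavg']),
            'Evap': float(lsm_sel['Evap_tavg']),
            'Streamflow': float(route_sel['Streamflow_tavg'])}  # etc.

tasks = [dask.delayed(extract_data)(routedat, lsmdat,
                                    meta_file.loc[valid_stations[i], 'Latitude'],
                                    meta_file.loc[valid_stations[i], 'Longitude'])
         for i in range(0, 10)]
results = dask.compute(*tasks)

# Write back in the main process, where the mutation is actually visible.
lis_date = routedat['time'][0].values
for i, vals in enumerate(results):
    for col, val in vals.items():
        stations[i].loc[lis_date, col] = val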
Also, I was able to successfully parallelize another loop on the same local cluster earlier in this script.
Can anyone please help me figure out where I am going wrong?
EDIT 2: I also tried to give each Dask worker independent variables to write to while executing the function, but the output is still nil:
start_date = [None] * len(stations)
gauge_id = [None] * len(stations)
gauge_lon = [None] * len(stations)
gauge_lat = [None] * len(stations)
route_sel = [None] * len(stations)
lsm_sel = [None] * len(stations)

def build_data(stations, routedat, lsmdat, valid_stations, meta_file, i):
    start_date[i] = stations[i]['Streamflow (cumecs)'].first_valid_index()
    lis_date = routedat['time'][0].values
    gauge_id[i] = valid_stations[i]
    gauge_lat[i] = meta_file.loc[gauge_id[i], 'Latitude']
    gauge_lon[i] = meta_file.loc[gauge_id[i], 'Longitude']
    route_sel[i] = routedat.sel(lat=gauge_lat[i], lon=gauge_lon[i], method='nearest')
    lsm_sel[i] = lsmdat.sel(lat=gauge_lat[i], lon=gauge_lon[i], method='nearest')
    # a '!= math.nan' comparison is always True, so test with math.isnan instead
    if not math.isnan(stations[i].loc[lis_date, 'Streamflow (cumecs)']):
        stations[i].loc[lis_date, 'Precip'] = lsm_sel[i]['TotalPrecip_tavg']
        stations[i].loc[lis_date, 'Evap'] = lsm_sel[i]['Evap_tavg']
        stations[i].loc[lis_date, 'SoilMoist_L1'] = lsm_sel[i]['SoilMoist_tavg'].sel(SoilMoist_profiles=0)
        stations[i].loc[lis_date, 'SoilMoist_L2'] = lsm_sel[i]['SoilMoist_tavg'].sel(SoilMoist_profiles=1)
        stations[i].loc[lis_date, 'SoilMoist_L3'] = lsm_sel[i]['SoilMoist_tavg'].sel(SoilMoist_profiles=2)
        stations[i].loc[lis_date, 'SoilMoist_L4'] = lsm_sel[i]['SoilMoist_tavg'].sel(SoilMoist_profiles=3)
        stations[i].loc[lis_date, 'FloodFraction'] = route_sel[i]['FloodedFrac_tavg']
        stations[i].loc[lis_date, 'RiverDepth'] = route_sel[i]['RiverDepth_tavg']
        stations[i].loc[lis_date, 'SWS'] = route_sel[i]['SWS_tavg']
        stations[i].loc[lis_date, 'Streamflow'] = route_sel[i]['Streamflow_tavg']
    return
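Side note on the NaN test above: a plain != comparison against math.nan is always True, because NaN compares unequal to everything, including itself, so the check has to go through math.isnan (or pandas' pd.notna). A quick illustration:

import math

x = float('nan')
print(x != math.nan)   # True  -- NaN is unequal to anything, even another NaN
print(math.isnan(x))   # True  -- the correct scalar NaN test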
Source: https://stackoverflow.com/questions/62971626/parallelizing-generic-python-code-with-dask