Question
I am trying to perform the operation below using Python:
for n in range(1, 100):
    routedat = xr.open_dataset(route_files[n])
    lsmdat = xr.open_dataset(lsm_files[n])
    routedat = reformat_LIS_output(routedat)
    lsmdat = reformat_LIS_output(lsmdat)
    for i in range(1, len(stations)):
        start_date = stations[i]['Streamflow (cumecs)'].first_valid_index()
        lis_date = routedat['time'][0].values
        gauge_id = valid_stations[i]
        gauge_lat = meta_file.loc[gauge_id, 'Latitude']
        gauge_lon = meta_file.loc[gauge_id, 'Longitude']
        if start_date >= lis_date:
            route_sel = routedat.sel(lat=gauge_lat, lon=gauge_lon, method='nearest')
            lsm_sel = lsmdat.sel(lat=gauge_lat, lon=gauge_lon, method='nearest')
            stations[i].loc[lis_date, 'Precip'] = lsm_sel['TotalPrecip_tavg']
            stations[i].loc[lis_date, 'Evap'] = lsm_sel['Evap_tavg']
            stations[i].loc[lis_date, 'SoilMoist_L1'] = lsm_sel['SoilMoist_tavg'].sel(SoilMoist_profiles=0)
            stations[i].loc[lis_date, 'SoilMoist_L2'] = lsm_sel['SoilMoist_tavg'].sel(SoilMoist_profiles=1)
            stations[i].loc[lis_date, 'SoilMoist_L3'] = lsm_sel['SoilMoist_tavg'].sel(SoilMoist_profiles=2)
            stations[i].loc[lis_date, 'SoilMoist_L4'] = lsm_sel['SoilMoist_tavg'].sel(SoilMoist_profiles=3)
            stations[i].loc[lis_date, 'FloodFraction'] = route_sel['FloodedFrac_tavg']
            stations[i].loc[lis_date, 'RiverDepth'] = route_sel['RiverDepth_tavg']
            stations[i].loc[lis_date, 'SWS'] = route_sel['SWS_tavg']
            stations[i].loc[lis_date, 'Streamflow'] = route_sel['Streamflow_tavg']
I have to extract the information from lsmdat and routedat, using lat/lon, for 300 DataFrames stored in the stations list. Currently, the code takes 8 minutes to write the information into the DataFrames in stations.
Can anyone suggest how I can use Dask to parallelize the second part of the code above? Thank you.
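For reference, xarray also supports vectorized ("pointwise") selection, which can pull every station out of a dataset in a single call instead of looping. A minimal sketch, assuming gauge_lats and gauge_lons are plain lists built from meta_file and aligned with valid_stations (both names are hypothetical):

import xarray as xr

# Indexing with DataArrays that share a new 'station' dimension makes
# .sel() pick one nearest grid cell per station in a single call.
lats = xr.DataArray(gauge_lats, dims='station')
lons = xr.DataArray(gauge_lons, dims='station')
route_pts = routedat.sel(lat=lats, lon=lons, method='nearest')
lsm_pts = lsmdat.sel(lat=lats, lon=lons, method='nearest')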
EDIT: As suggested in the comments, I tried to implement dask.delayed(). I wrote the function below:
def build_data(stations, routedat, lsmdat, valid_stations, meta_file):
    # note: i is not a parameter here; it is picked up from the enclosing loop
    start_date = stations[i]['Streamflow (cumecs)'].first_valid_index()
    lis_date = routedat['time'][0].values
    gauge_id = valid_stations[i]
    gauge_lat = meta_file.loc[gauge_id, 'Latitude']
    gauge_lon = meta_file.loc[gauge_id, 'Longitude']
    route_sel = routedat.sel(lat=gauge_lat, lon=gauge_lon, method='nearest')
    lsm_sel = lsmdat.sel(lat=gauge_lat, lon=gauge_lon, method='nearest')
    stations[i].loc[lis_date, 'Precip'] = lsm_sel['TotalPrecip_tavg']
    stations[i].loc[lis_date, 'Evap'] = lsm_sel['Evap_tavg']
    stations[i].loc[lis_date, 'SoilMoist_L1'] = lsm_sel['SoilMoist_tavg'].sel(SoilMoist_profiles=0)
    stations[i].loc[lis_date, 'SoilMoist_L2'] = lsm_sel['SoilMoist_tavg'].sel(SoilMoist_profiles=1)
    stations[i].loc[lis_date, 'SoilMoist_L3'] = lsm_sel['SoilMoist_tavg'].sel(SoilMoist_profiles=2)
    stations[i].loc[lis_date, 'SoilMoist_L4'] = lsm_sel['SoilMoist_tavg'].sel(SoilMoist_profiles=3)
    stations[i].loc[lis_date, 'FloodFraction'] = route_sel['FloodedFrac_tavg']
    stations[i].loc[lis_date, 'RiverDepth'] = route_sel['RiverDepth_tavg']
    stations[i].loc[lis_date, 'SWS'] = route_sel['SWS_tavg']
    stations[i].loc[lis_date, 'Streamflow'] = route_sel['Streamflow_tavg']
    return
If I run this function inside the loop that opens the xarray datasets one by one, it works fine and the data is indeed written properly into the elements of stations:
for n in tqdm(range(4112, 4115)):
    routedat = xr.open_dataset(route_files[n])
    lsmdat = xr.open_dataset(lsm_files[n])
    routedat = reformat_LIS_output(routedat)
    lsmdat = reformat_LIS_output(lsmdat)
    for i in range(0, 10):
        build_data(stations, routedat, lsmdat, valid_stations, meta_file)
But when I tried to parallelize the same loop using dask.delayed(), it did not write anything into the stations elements and took considerably more time to complete:
for n in tqdm(range(4112, 4115)):
    routedat = xr.open_dataset(route_files[n])
    lsmdat = xr.open_dataset(lsm_files[n])
    routedat = reformat_LIS_output(routedat)
    lsmdat = reformat_LIS_output(lsmdat)
    build_delayed = []
    for i in range(0, 10):
        task = dask.delayed(build_data)(stations, routedat, lsmdat, valid_stations, meta_file)
        build_delayed.append(task)
    dask.compute(*build_delayed)
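A likely reason nothing is written: each delayed task can receive a serialized copy of stations, so the in-place writes happen on the copy and are discarded when the task finishes. A minimal sketch of a return-based variant, where results come back through dask.compute and the write-back happens in the main process (extract_data is a hypothetical refactor of build_data, shown with only a few of the variables):

import dask

# Pure function: returns the extracted values instead of mutating
# stations, so the result survives the round trip to the workers.
def extract_data(routedat, lsmdat, gauge_lat, gauge_lon):
    route_sel = routedat.sel(lat=gauge_lat, lon=gauge_lon, method='nearest')
    lsm_sel = lsmdat.sel(lat=gauge_lat, lon=gauge_lon, method='nearest')
    return {'Precip': float(lsm_sel['TotalPrecip_tavg']),
            'Evap': float(lsm_sel['Evap_tavg']),
            'Streamflow': float(route_sel['Streamflow_tavg'])}  # etc.

tasks = [dask.delayed(extract_data)(routedat, lsmdat,
                                    meta_file.loc[valid_stations[i], 'Latitude'],
                                    meta_file.loc[valid_stations[i], 'Longitude'])
         for i in range(0, 10)]
results = dask.compute(*tasks)

# Write back in the main process, where the mutation is actually visible.
lis_date = routedat['time'][0].values
for i, vals in enumerate(results):
    for col, val in vals.items():
        stations[i].loc[lis_date, col] = val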
Also, I was able to successfully parallelize another loop on the same local cluster earlier in this script.
Can anyone please help me figure out where I am going wrong?
EDIT 2: I also tried to give each Dask worker independent variables to write to while executing the function, but the output is still nil:
start_date = [None] * len(stations)
gauge_id = [None] * len(stations)
gauge_lon = [None] * len(stations)
gauge_lat = [None] * len(stations)
route_sel = [None] * len(stations)
lsm_sel = [None] * len(stations)

def build_data(stations, routedat, lsmdat, valid_stations, meta_file, i):
    start_date[i] = stations[i]['Streamflow (cumecs)'].first_valid_index()
    lis_date = routedat['time'][0].values
    gauge_id[i] = valid_stations[i]
    gauge_lat[i] = meta_file.loc[gauge_id[i], 'Latitude']
    gauge_lon[i] = meta_file.loc[gauge_id[i], 'Longitude']
    route_sel[i] = routedat.sel(lat=gauge_lat[i], lon=gauge_lon[i], method='nearest')
    lsm_sel[i] = lsmdat.sel(lat=gauge_lat[i], lon=gauge_lon[i], method='nearest')
    # a '!= math.nan' comparison is always True, so test with math.isnan instead
    if not math.isnan(stations[i].loc[lis_date, 'Streamflow (cumecs)']):
        stations[i].loc[lis_date, 'Precip'] = lsm_sel[i]['TotalPrecip_tavg']
        stations[i].loc[lis_date, 'Evap'] = lsm_sel[i]['Evap_tavg']
        stations[i].loc[lis_date, 'SoilMoist_L1'] = lsm_sel[i]['SoilMoist_tavg'].sel(SoilMoist_profiles=0)
        stations[i].loc[lis_date, 'SoilMoist_L2'] = lsm_sel[i]['SoilMoist_tavg'].sel(SoilMoist_profiles=1)
        stations[i].loc[lis_date, 'SoilMoist_L3'] = lsm_sel[i]['SoilMoist_tavg'].sel(SoilMoist_profiles=2)
        stations[i].loc[lis_date, 'SoilMoist_L4'] = lsm_sel[i]['SoilMoist_tavg'].sel(SoilMoist_profiles=3)
        stations[i].loc[lis_date, 'FloodFraction'] = route_sel[i]['FloodedFrac_tavg']
        stations[i].loc[lis_date, 'RiverDepth'] = route_sel[i]['RiverDepth_tavg']
        stations[i].loc[lis_date, 'SWS'] = route_sel[i]['SWS_tavg']
        stations[i].loc[lis_date, 'Streamflow'] = route_sel[i]['Streamflow_tavg']
    return
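Side note on the NaN test above: a plain != comparison against math.nan is always True, because NaN compares unequal to everything, including itself, so the check has to go through math.isnan (or pandas' pd.notna). A quick illustration:

import math

x = float('nan')
print(x != math.nan)   # True  -- NaN is unequal to anything, even another NaN
print(math.isnan(x))   # True  -- the correct scalar NaN test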
Source: https://stackoverflow.com/questions/62971626/parallelizing-generic-python-code-with-dask