问题
I find xarray to be very slow when saving a dataset to netCDF. I suspect the reason is that the .to_netcdf() command first has to load the data before saving it. For example, I get the following timings:
Example a)
ds.to_netcdf(file_path) # ~6 minutes
Example b)
ds.load() # ~6 minutes
ds.to_netcdf(file_path) # <1 second
It would seem the slowdown comes from loading. Is there any way of circumventing this load or otherwise speeding the process up?
An explicit example of (a) that I am working with is as follows:
import xarray as xr
import numpy as np
from Python import config, read
from Python import array_opperations as ao
class Model(read.Read):
    """One model case whose kinetic-energy gradient terms are written to disk.

    Constructing an instance loads the case's dataset, restricts it to a
    fixed one-year ``TIME`` window and immediately calls :meth:`KE`, which
    writes ``offlineKE.nc`` under ``self.readPath``.
    """

    def __init__(self, case, mean=1):
        """Load the data for ``case`` and compute and write the KE terms.

        Args:
            case: Passed to ``config.readPath`` to obtain the read directory.
            mean: Forwarded to :meth:`KE`; a truthy value selects the
                time-mean / deviation split.
        """
        self.readPath = config.readPath(case)
        # load_dataset is not defined in this class (presumably inherited
        # from read.Read -- confirm); it is expected to populate self.ds.
        self.load_dataset(
            ['Vorticity.nc', 'ssh.nc', 'vels_snap.nc', 'hFac.nc'],
            ['DXC', 'DYC', 'DXG', 'DYG'],
        )
        self.ds = self.ds.sel(TIME=slice('1980-01-08', '1981-01-07'))
        self.KE(mean=mean)

    def KE(self, mean=1):
        """Calculate the kinetic energy term of the momentum budget.

        Follows KEscheme=0. The resulting gradient terms are merged into one
        dataset and written to ``self.readPath + 'offlineKE.nc'``.

        Args:
            mean: If truthy, the terms are computed separately for the
                time-mean velocities and for the deviations from that mean
                (``u_gradKE``/``v_gradKE`` and ``u_gradKERey``/
                ``v_gradKERey``). Otherwise only ``u_gradKE``/``v_gradKE``
                are computed from the full velocities.
        """
        u = self.ds['UVEL']
        v = self.ds['VVEL']

        def calc(u, v):
            """Return ``(KEx, KEy)`` for the given velocity components."""
            u2 = u**2
            v2 = v**2
            # ao.roll / ao.padded_diff are project helpers not shown here;
            # presumably a shift and a finite difference along the named
            # dimension -- confirm against array_opperations.
            KE = 0.25 * (u2 + ao.roll(u2, 'LONGITUDE', -1) +
                         v2 + ao.roll(v2, 'LATITUDE', -1))
            KEx = -ao.padded_diff(KE, 'LONGITUDE') / self.ds['DXC']
            KEy = -ao.padded_diff(KE, 'LATITUDE') / self.ds['DYC']
            return KEx, KEy

        if mean:
            u_mean = u.mean(dim='TIME', skipna=True)
            v_mean = v.mean(dim='TIME', skipna=True)
            u_rey = u - u_mean
            v_rey = v - v_mean
            KEx, KEy = calc(u_mean, v_mean)
            KExRey, KEyRey = calc(u_rey, v_rey)
            KE_terms = [KEx.rename('u_gradKE'), KEy.rename('v_gradKE'),
                        KExRey.rename('u_gradKERey'),
                        KEyRey.rename('v_gradKERey')]
        else:
            KEx, KEy = calc(u, v)
            KE_terms = [KEx.rename('u_gradKE'), KEy.rename('v_gradKE')]

        KE_nc = xr.merge(KE_terms)
        # NOTE(review): if self.ds is dask-backed (opened with ``chunks=``),
        # everything above only builds a lazy graph and the actual
        # computation happens inside this write -- confirm.
        # unlimited_dims is documented as an iterable of dimension names, so
        # pass a list: older xarray releases treated a bare string as a
        # sequence of characters and created spurious dimensions.
        KE_nc.to_netcdf(self.readPath + 'offlineKE.nc',
                        unlimited_dims=['TIME'])
The KE function itself runs very quickly thanks to xarray, but as soon as execution reaches the last line (the .to_netcdf call) it slows down dramatically.
Here is my dataset loading method:
def load_dataset(self, nc_files, data_files):
    """Load all NetCDF files listed and attach the binary grid fields.

    The NetCDF files are opened together as one chunked dataset. Each entry
    of ``data_files`` is then read via ``self.readMeta`` / ``self.readBin``
    and added as a ``(LATITUDE, LONGITUDE)`` variable, followed by a constant
    ``DRF`` variable filled with 5000.0. The result is stored in ``self.ds``.

    Args:
        nc_files: NetCDF file names, relative to ``self.readPath``.
        data_files: Base names (no extension) of ``.meta``/``.data`` pairs;
            each becomes a variable of the same name.
    """
    file_paths = [self.readPath + nc_name for nc_name in nc_files]
    opened = xr.open_mfdataset(
        file_paths, chunks={'LATITUDE': 200, 'LONGITUDE': 200})
    LAT = opened.coords['LATITUDE'].values
    LON = opened.coords['LONGITUDE'].values

    def on_grid(values, name):
        """Wrap a 2-D array as a single-variable dataset on the LAT/LON grid."""
        return xr.DataArray(
            values, coords=[LAT, LON], dims=['LATITUDE', 'LONGITUDE']
        ).to_dataset(name=name)

    pieces = [opened]
    for name in data_files:
        _, x, y, z, _, prec = self.readMeta(self.readPath + name + '.meta')
        # NOTE(review): readMeta is given the full path but readBin only the
        # bare file name -- presumably readBin prepends readPath itself;
        # confirm.
        data = self.readBin(name + '.data', x=x, y=y, z=z, dtype=prec)
        pieces.append(on_grid(data, name))

    # Constant field on the same grid. The shape is taken from the
    # coordinates, which is the only shape the LAT/LON coords accept, so this
    # no longer requires 'DXC' to be among data_files.
    pieces.append(on_grid(np.full((LAT.size, LON.size), 5000.0), 'DRF'))

    # Merge once rather than re-merging the growing dataset for every file.
    self.ds = xr.merge(pieces)
来源:https://stackoverflow.com/questions/47238155/xarray-slow-to-save-netcdf