Python: Running nested loop, 2D moving window, in Parallel

前端未结

关注

 2  482

I work with topographic data. For one particular problem, I have written a function in Python which uses a moving window of a particular size to zip through a matrix (grid o

So far, quite promising, isn't it?

If you @morrismc test your as-is code now, on a [100,100]-matrix, we can already guess the achieved range of the principal speedup, even before running tests are completed.

>>> pass;    import numpy as np
>>> from zmq import Stopwatch; clk = Stopwatch()
>>>
>>> size =  100; demF32 = np.random.random( ( size, size ) ).astype( np.float32 ); resF32 = demF32.copy(); clk.start(); _ = RMSH_det( demF32, 10, resF32 ); t = clk.stop(); print( "{1:>13d} [us]\nNumOf_np.nan-s was {0:d}".format( _, t ) )
      6492192 [us]
NumOf_np.nan-s was 0

>>> size =  200; demF32 = np.random.random( ( size, size ) ).astype( np.float32 ); resF32 = demF32.copy(); clk.start(); _ = RMSH_det( demF32, 10, resF32 ); t = clk.stop(); print( "{1:>13d} [us]\nNumOf_np.nan-s was {0:d}".format( _, t ) )
     35650629 [us]
NumOf_np.nan-s was 0

>>> size = 1000; demF32 = np.random.random( ( size, size ) ).astype( np.float32 ); resF32 = demF32.copy(); clk.start(); _ = RMSH_det( demF32, 10, resF32 ); t = clk.stop(); print( "{1:>13d} [us]\nNumOf_np.nan-s was {0:d}".format( _, t ) )
   1058702889 [us]
NumOf_np.nan-s was 0

All this on scipy 1.2.1, thus without the benefits from 1.3.1 possible further speedups

A `numba.jit()` LLVM-compiled code. Ooops, slower ?

numba.jit()-acceleration has shown about 200 [ms] worse runtime on [100,100] DEM-grid, with signature having been specified ( so no ad-hoc analysis costs were accrued here ) and nogil = True ( '0.43.1+0.g8dabe7abe.dirty' not being the most recent, yet )

Guess there is nothing more to gain here, without moving the game into compiled Cython territories, yet having about tens of minutes instead of tens of hours, the Alea Iacta Est - just the numpy smart-vectorised code rulez!

EPILOGUE :

If the original algorithm was correct (and some doubts were left in the source-code for any further improvement work), any attempt to run some other form of [PARALLEL] code-execution-flow will not help here ( kernel-windows[w,w] are very-small and non-contiguous areas of the DEM-grid memory-layout, memory-I/O costs are dominant part of the run-time budget here, and some nicer indexing may improve the cache-line re-use, yet the overall efforts are well beyond the budget, as the target of going down from ~ 11 [hrs] to about ~ 6 [hrs] was more than successfully met with about ~ 20 [min] runtimes achievable for [1300,1100] float32 DEM-grids

The code was left as-is ( non-PEP-8 ), because of all the add-on didactic value for the [DOC.me], [TEST.me] and [PERF.me] phases of QA, so all kind PEP-isto-evangelisators do bear with the O/P author's view onto a full-scree-width layout left, so as to allow to understand WHY and to improve the code, which with stripped-off-comments would lose her/his way forwards in improving the code performance further on. Thx.

@jit( [ "int32( float32[:,:], int32, float32[:,:] )", ], nogil    = True )                  # numba.__version__ '0.43.1+0.g8dabe7abe.dirty'
def RMSH_det_jit( DEMf32, w, rmsRESULTf32 ):                            # pre-allocate rmsRESULTf32[:,:] externally
    #import numpy as np
    #from scipy import signal
    #
    # [nrows, ncols] = np.shape( DEM )                                  # avoid ~ [ 1355, 1165 ]
    #                                                                   # DEM.dtype = float32 
    #                                                                   #    .flags = C_CONTIGUOUS    : True
    #                                                                   #             F_CONTIGUOUS    : False
    #                                                                   #             OWNDATA         : True
    #                                                                   #             WRITEABLE       : True
    #                                                                   #             ALIGNED         : True
    #                                                                   #             WRITEBACKIFCOPY : False
    #                                                                   #             UPDATEIFCOPY    : False
    #
    rmsRESULTf32[:,:] = np.nan                                          #        .STO[:,:] np.nan-s, using in-place assignment into the by-ref passed, externally pre-allocated np.ndarray
    dtdWIN            = np.ones( ( 2 * w - 1,                           #        .ALLOC once, re-use 1M+ times
                                   2 * w - 1 ) )
    a_div_by_nz_minus1 = 1. / ( dtdWIN.size - 1  )                      #        .SET float CONST with about a ~1M+ re-use
    a_num_of_NaNs      = 0                                              #        .SET i4 bonus value, ret'd as a side-effect of the signature ... 
    # rms = DEM*np.nan                                                  # avoid ( pre-alloc rmsRESULTf32 ) externally create and pass a right-sized, empty array to store all results
    # nw  = ( w * 2 )**2
    # x   = np.arange( 0, nw )

    #                        11..1344
    #or     i in np.arange( w+1,           nrows-w ):                   # w ~ 10 -> [11:1344, 11:1154]
    for     i in np.arange( w+1, DEMf32.shape[0]-w ):                   #         ??? never touches DEM-row/column[0]?? or off-by-one indexing error ???
        fromI = i - w                                                   #        .UPD ALAP
        tillI = i + w - 1                                               #        .UPD ALAP upper bound index excluded ( this is how a code in [ np.arange(...)[0]:np.arange(...)[-1] ] works )
        #                    11..1154
        #or j in np.arange( w+1,           ncols-w ):
        for j in np.arange( w+1, DEMf32.shape[1]-w ):
            fromJ = j - w                                               #        .UPD ALAP
            tillJ = j + w - 1                                           #        .UPD ALAP upper bound index excluded ( this is how a code in [ np.arange(...)[0]:np.arange(...)[-1] ] works )
            #                       1..1334:21..1354                    #         ??? never touches first/last DEM-row/column??
            # d1 = np.int64( np.arange( i-w, i+w ) )                    # AVOID: 1M+ times allocated, yet never consumed, but their edge values
            # d2 = np.int64( np.arange( j-w, j+w ) )                    # AVOID: 1M+ times allocated, yet never consumed, but their edge values

            # win = DEM[ d1[0]:d1[-1],                                  # AVOID: while a .view-only, no need to 1M+ times instantiate a "kernel"-win(dow] ( this will create a np.view into the original DEM, not a copy ! )
            #            d2[0]:d2[-1]                                   # ?.or.?   NOT a .view-only, but a new .copy() instantiated, so as to call .detrend() w/o in-place modifying DEMf32 ???
            #            ]                                              # ?.or.?   NOT a .view-only, but a new .copy() instantiated, so as to call .detrend() w/o in-place modifying DEMf32 ???
            dtdWIN[:,:] = DEMf32[fromI:tillI, fromJ:tillJ]              #          NOT a .view-only, but a     .copy() re-populated into a just once and only once pre-allocated dtdWIN, via an in-place copy
            #f np.max( np.isnan(    win ) ) == 1:                       # AVOID: 1M+ times full-range scan, while any first np.nan decides the game and no need to scan "the rest"
            if np.any( np.isnan( dtdWIN ) ):                            #        "density" of np.nan-s determine, if this is a good idea to pre-store
               a_num_of_NaNs += 1                                       # .INC
               continue                                                 #        .NOP/LOOP from here, already pre-stored np.nan-s for this case
               # rms[i,j] = np.nan                                      # DUP ( already stored in initialisation ... )
            else:
               #in    = signal.detrend(    win, type = 'linear' )       # REALLY?: in-place modification of DEM-matrix ???
               dtdWIN = signal.detrend( dtdWIN, type = 'linear'   )     #    in scipy-v1.3.1+ can mod in-place,   overwrite_data = True ) # REMOVE OLS-fit-linear trend
               dtdWIN = signal.detrend( dtdWIN, type = 'constant' )     #    in scipy-v1.3.1+ can mod in-place,   overwrite_data = True ) # REMOVE mean
               #z  = np.reshape( win, -1 )                              # AVOID:~1M+ re-counting constant value, known from w directly
               #nz = np.size( z )                                       # AVOID:~1M+ re-counting constant value, known from w directly
               #rootms    = np.sqrt( 1 / ( nz - 1 ) * np.sum( ( z - np.mean( z ) )**2 ) )
               #rms[i,j]  = rootms
               rmsRESULTf32[i,j] = np.sqrt( a_div_by_nz_minus1          # .STO a "scaled"
                                          * np.dot(   dtdWIN,
                                                      dtdWIN.T
                                                      ).sum()
                                          # np.sum( ( dtdWIN            #         SUM of
                                          #       # - dtdWIN.mean()     #               mean-removed ( ALREADY done via scipy.signal.detrend( 'const' ) above )
                                          #           )**2              #               SQUARES
                                          #         )
                                            )                           #      ROOT
    return( a_num_of_NaNs )                                             # ret i4

0 讨论(0)

[愿得一人]

2021-01-06 05:22

A solution using Numba

In some cases this is very easy to do, if all functions which you use are supported. In your code win = signal.detrend(win, type = 'linear') is the part you have to implement in Numba, because this function isn't supported.

Implementing detrend in Numba

If you look at the source-code of detrend, and extract the relevant parts for your problem, it may look like this:

@nb.njit()
def detrend(w):
    Npts=w.shape[0]
    A=np.empty((Npts,2),dtype=w.dtype)
    for i in range(Npts):
        A[i,0]=1.*(i+1) / Npts
        A[i,1]=1.

    coef, resids, rank, s = np.linalg.lstsq(A, w.T)
    out=w.T- np.dot(A, coef)
    return out.T

I also implemented a faster solution for np.max(np.isnan(win)) == 1

@nb.njit()
def isnan(win):
    for i in range(win.shape[0]):
        for j in range(win.shape[1]):
            if np.isnan(win[i,j]):
                return True
    return False

Main function

As I used Numba here, the parallelization is very simple, just a prange on the outer loop and

import numpy as np
import numba as nb

@nb.njit(parallel=True)
def RMSH_det_nb(DEM, w):
    [nrows, ncols] = np.shape(DEM)

    #create an empty array to store result
    rms = DEM*np.nan

    for i in nb.prange(w+1,nrows-w):
        for j in range(w+1,ncols-w):
            win = DEM[i-w:i+w-1,j-w:j+w-1]

            if isnan(win):
                rms[i,j] = np.nan
            else:
                win = detrend(win)
                z = win.flatten()
                nz = z.size
                rootms = np.sqrt(1 / (nz - 1) * np.sum((z-np.mean(z))**2))
                rms[i,j] = rootms

    return rms

Timings (small example)

w = 10
DEM=np.random.rand(100, 100).astype(np.float32)

res1=RMSH_det(DEM, w)
res2=RMSH_det_nb(DEM, w)
print(np.allclose(res1,res2,equal_nan=True))
#True

%timeit res1=RMSH_det(DEM, w)
#1.59 s ± 72 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit res2=RMSH_det_nb(DEM, w) #approx. 55 times faster
#29 ms ± 1.85 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

Timings for larger arrays

w = 10
DEM=np.random.rand(1355, 1165).astype(np.float32)
%timeit res2=RMSH_det_nb(DEM, w)
#6.63 s ± 21.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

[Edit] Implemenation using normal Equations

Overdetermined system

This method has a lower numerical precision. Although this solution is quite a lot faster.

@nb.njit()
def isnan(win):
    for i in range(win.shape[0]):
        for j in range(win.shape[1]):
            if win[i,j]==np.nan:
                return True
    return False

@nb.njit()
def detrend(w):
    Npts=w.shape[0]
    A=np.empty((Npts,2),dtype=w.dtype)
    for i in range(Npts):
        A[i,0]=1.*(i+1) / Npts
        A[i,1]=1.

    coef, resids, rank, s = np.linalg.lstsq(A, w.T)
    out=w.T- np.dot(A, coef)
    return out.T

@nb.njit()
def detrend_2(w,T1,A):
    T2=np.dot(A.T,w.T)
    coef=np.linalg.solve(T1,T2)

    out=w.T- np.dot(A, coef)

    return out.T

@nb.njit(parallel=True)
def RMSH_det_nb_normal_eq(DEM,w):
    [nrows, ncols] = np.shape(DEM)

    #create an empty array to store result
    rms = DEM*np.nan

    Npts=w*2-1
    A=np.empty((Npts,2),dtype=DEM.dtype)
    for i in range(Npts):
        A[i,0]=1.*(i+1) / Npts
        A[i,1]=1.

    T1=np.dot(A.T,A)

    nz = Npts**2
    for i in nb.prange(w+1,nrows-w):
        for j in range(w+1,ncols-w):
            win = DEM[i-w:i+w-1,j-w:j+w-1]

            if isnan(win):
                rms[i,j] = np.nan
            else:
                win = detrend_2(win,T1,A)
                rootms = np.sqrt(1 / (nz - 1) * np.sum((win-np.mean(win))**2))
                rms[i,j] = rootms

    return rms

Timings

w = 10
DEM=np.random.rand(100, 100).astype(np.float32)

res1=RMSH_det(DEM, w)
res2=RMSH_det_nb(DEM, w)
print(np.allclose(res1,res2,equal_nan=True))
#True

%timeit res1=RMSH_det(DEM, w)
#1.59 s ± 72 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit res2=RMSH_det_nb_normal_eq(DEM,w)
#7.97 ms ± 89.4 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)

Optimized solution using normal equations

Temporary arrays are reused to avoid costly memory allocations and a custom implementation for matrix multiplication is used. This is only recommendable for very small matrices, in most other cases np.dot (sgeemm) will be a lot faster.

@nb.njit()
def matmult_2(A,B,out):
    for j in range(B.shape[1]):
        acc1=nb.float32(0)
        acc2=nb.float32(0)
        for k in range(B.shape[0]):
            acc1+=A[0,k]*B[k,j]
            acc2+=A[1,k]*B[k,j]
        out[0,j]=acc1
        out[1,j]=acc2
    return out

@nb.njit(fastmath=True)
def matmult_mod(A,B,w,out):
    for j in range(B.shape[1]):
        for i in range(A.shape[0]):
            acc=nb.float32(0)
            acc+=A[i,0]*B[0,j]+A[i,1]*B[1,j]
            out[j,i]=acc-w[j,i]
    return out

@nb.njit()
def detrend_2_opt(w,T1,A,Tempvar_1,Tempvar_2):
    T2=matmult_2(A.T,w.T,Tempvar_1)
    coef=np.linalg.solve(T1,T2)
    return matmult_mod(A, coef,w,Tempvar_2)

@nb.njit(parallel=True)
def RMSH_det_nb_normal_eq_opt(DEM,w):
    [nrows, ncols] = np.shape(DEM)

    #create an empty array to store result
    rms = DEM*np.nan

    Npts=w*2-1
    A=np.empty((Npts,2),dtype=DEM.dtype)
    for i in range(Npts):
        A[i,0]=1.*(i+1) / Npts
        A[i,1]=1.

    T1=np.dot(A.T,A)

    nz = Npts**2
    for i in nb.prange(w+1,nrows-w):
        Tempvar_1=np.empty((2,Npts),dtype=DEM.dtype)
        Tempvar_2=np.empty((Npts,Npts),dtype=DEM.dtype)
        for j in range(w+1,ncols-w):
            win = DEM[i-w:i+w-1,j-w:j+w-1]

            if isnan(win):
                rms[i,j] = np.nan
            else:
                win = detrend_2_opt(win,T1,A,Tempvar_1,Tempvar_2)
                rootms = np.sqrt(1 / (nz - 1) * np.sum((win-np.mean(win))**2))
                rms[i,j] = rootms

    return rms