Get cumulative count per 2d array

后端 未结 3 920
伪装坚强ぢ
伪装坚强ぢ 2021-01-18 03:41

I have general data, e.g. strings:

np.random.seed(343)

arr = np.sort(np.random.randint(5, size=(10, 10)), axis=1).astype(str)
print (arr)
[['0' '1' '1' ...


        
3条回答
  •  抹茶落季
    2021-01-18 04:20

    Applying Divakar's method column-wise is considerably faster than the pandas approach, although there is probably a fully vectorized way as well.

    #function of Divakar
    def grp_range(a):
        """Return a 0-based running index inside each consecutive group.

        Parameters
        ----------
        a : array-like of int
            Length of each group, in order. Zero-length groups are ignored.

        Returns
        -------
        numpy.ndarray
            Integer array of length ``sum(a)``; for example ``[3, 2]`` gives
            ``[0, 1, 2, 0, 1]``. Empty when there is nothing to number.
        """
        a = np.asarray(a)
        # A zero-length group produces no output; if left in, its reset would
        # land on the same index as the next group's and corrupt the numbering.
        a = a[a != 0]
        if a.size == 0:
            # Without this guard idx[-1] below raises IndexError.
            return np.zeros(0, dtype=int)
        idx = a.cumsum()
        # Start with a +1 step everywhere so the final cumsum counts upward ...
        id_arr = np.ones(idx[-1], dtype=int)
        id_arr[0] = 0
        # ... and at the first slot of every later group step back down to 0.
        id_arr[idx[:-1]] = -a[:-1] + 1
        return id_arr.cumsum()
    
    # Run ids per column: a new id starts whenever a value differs from the one
    # above it (equivalent to (df != df.shift()).cumsum(), but faster).
    # The first row always belongs to run 1; its width follows arr instead of a
    # hard-coded 10 so the snippet works for any number of columns.
    arr_sum = np.vstack([np.ones(arr.shape[1]), np.cumsum(arr[1:] != arr[:-1], 0) + 1])
    
    # Apply grp_range column-wise to the run lengths, then shift to 1-based counts.
    arr_result = np.array([grp_range(np.unique(arr_sum[:, i], return_counts=True)[1])
                           for i in range(arr_sum.shape[1])]).T + 1
    

    To check the equality:

    # of the cumsum
    print (((df != df.shift()).cumsum() == 
             np.vstack([np.ones(10), np.cumsum((arr != np.roll(arr, 1, 0))[1:],0)+1]))
             .all().all())
    #True
    
    print ((df.apply(lambda x: x.groupby((x != x.shift()).cumsum()).cumcount() + 1) ==
            np.array([grp_range(np.unique(arr_sum[:,i],return_counts=1)[1]) 
                      for i in range(arr_sum.shape[1])]).T+1)
            .all().all())
    #True
    

    and the speed:

    %timeit df.apply(lambda x: x.groupby((x != x.shift()).cumsum()).cumcount() + 1)
    #19.4 ms ± 2.97 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
    
    %%timeit
    arr_sum = np.vstack([np.ones(10), np.cumsum((arr != np.roll(arr, 1, 0))[1:],0)+1])
    arr_res = np.array([grp_range(np.unique(arr_sum[:,i],return_counts=1)[1]) 
                        for i in range(arr_sum.shape[1])]).T+1
    
    #562 µs ± 82.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
    

    EDIT: with NumPy, you can also combine np.maximum.accumulate with np.arange.

    def accumulate(arr):
        """Return the 1-based cumulative count of consecutive equal values per column.

        Parameters
        ----------
        arr : numpy.ndarray
            2-D array; runs are detected down each column (axis 0).

        Returns
        -------
        numpy.ndarray
            Float array with the same shape as ``arr``. Each entry is the
            position of that element inside its run of equal values, starting
            at 1.
        """
        n, m = arr.shape
        if n == 0:
            # No rows: without this guard a spurious row of ones is returned.
            return np.ones((0, m))
        # 1-based row numbers as a column vector so they broadcast over columns.
        row_no = np.arange(1, n + 1)[:, np.newaxis]
        # changed[i] is True where row i + 1 differs from row i.
        changed = arr[:-1, :] != arr[1:, :]
        # Row number of the most recent row that was followed by a change
        # (0 while no change has occurred yet in that column).
        last_change = np.maximum.accumulate(row_no[:-1] * changed, axis=0)
        # The first row always starts a run; every later row counts its distance
        # from the last change.
        return np.concatenate([np.ones((1, m)), row_no[1:] - last_change], axis=0)
    

    Some TIMING

    arr_100 = np.sort(np.random.randint(50, size=(100000, 100)), axis=1).astype(str)
    

    Solution with np.maximum.accumulate

    %timeit accumulate(arr_100)
    #520 ms ± 72 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
    

    Solution of Divakar

    %timeit grp_range_2drow(arr_100.T, start=1).T
    #1.15 s ± 64.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
    

    Solution with Numba of B. M.

    %timeit numbering(arr_100)
    #228 ms ± 31.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
    

提交回复
热议问题