I have general data, e.g. strings:
# Reproducible sample: a 10x10 grid of random digits 0-4, each row sorted
# in ascending order, stored as strings.
np.random.seed(343)
arr = np.random.randint(0, 5, size=(10, 10))
arr.sort(axis=1)
arr = arr.astype(str)
print(arr)
# [['0' '1' '1' ...   (printed array, output truncated)
Applying Divakar's method column-wise is considerably faster, even though there is probably a fully vectorized way to do it.
#function of Divakar
def grp_range(a):
    """Return concatenated 0-based ranges, one per group length in ``a``.

    For ``a = [3, 2]`` the result is ``[0, 1, 2, 0, 1]``.

    Parameters
    ----------
    a : array_like of int
        Length of each consecutive group. Zero-length groups contribute
        no elements.

    Returns
    -------
    numpy.ndarray
        1-D integer array of length ``sum(a)``; empty when ``a`` is empty
        or contains only zeros.

    Raises
    ------
    ValueError
        If any group length is negative.
    """
    counts = np.asarray(a, dtype=int)
    if (counts < 0).any():
        raise ValueError("group lengths must be non-negative")
    # Zero-length groups would create duplicate reset positions below (and a
    # leading zero would overwrite the initial 0), so drop them up front.
    counts = counts[counts > 0]
    if counts.size == 0:
        return np.zeros(0, dtype=int)
    idx = counts.cumsum()
    # Start from all-ones steps so that a cumulative sum counts upwards ...
    id_arr = np.ones(idx[-1], dtype=int)
    id_arr[0] = 0
    # ... and at the first element of every following group insert a negative
    # step that brings the running total back to 0.
    id_arr[idx[:-1]] = 1 - counts[:-1]
    return id_arr.cumsum()
# Create the equivalent of (df != df.shift()).cumsum(), but faster: every
# column starts at 1 and the label increases by one wherever a value differs
# from the one in the row above. The first row is sized from arr itself
# (not a hard-coded 10) so this works for any number of columns.
arr_sum = np.vstack([np.ones(arr.shape[1]),
                     np.cumsum(arr[1:] != arr[:-1], axis=0) + 1])
# Use grp_range column-wise on arr_sum: the labels in a column never decrease,
# so the counts returned by np.unique are the run lengths in order.
arr_result = np.array([grp_range(np.unique(col, return_counts=True)[1])
                       for col in arr_sum.T]).T + 1
To check the equality:
# of the cumsum: the pandas run labels must match the NumPy ones everywhere.
# The first row is sized from arr (not a hard-coded 10) so the check also
# works when arr has a different number of columns.
print(((df != df.shift()).cumsum() ==
       np.vstack([np.ones(arr.shape[1]),
                  np.cumsum((arr != np.roll(arr, 1, 0))[1:], 0) + 1]))
      .all().all())
#True
# of the final result: the pandas per-run row counter must match the
# column-wise grp_range version everywhere.
pandas_counts = df.apply(
    lambda col: col.groupby((col != col.shift()).cumsum()).cumcount() + 1)
numpy_counts = np.array(
    [grp_range(np.unique(arr_sum[:, i], return_counts=True)[1])
     for i in range(arr_sum.shape[1])]).T + 1
print((pandas_counts == numpy_counts).all().all())
#True
and the speed:
# pandas baseline: per column, label runs of consecutive equal values with
# (x != x.shift()).cumsum() and number the rows inside each run from 1.
%timeit df.apply(lambda x: x.groupby((x != x.shift()).cumsum()).cumcount() + 1)
#19.4 ms ± 2.97 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit
# NumPy version: build the per-column run labels, then number the rows inside
# each run by calling grp_range on the run lengths from np.unique.
# NOTE(review): np.ones(10) hard-codes the column count of arr — presumably
# this should be arr.shape[1]; confirm before reusing with other shapes.
arr_sum = np.vstack([np.ones(10), np.cumsum((arr != np.roll(arr, 1, 0))[1:],0)+1])
arr_res = np.array([grp_range(np.unique(arr_sum[:,i],return_counts=1)[1])
for i in range(arr_sum.shape[1])]).T+1
#562 µs ± 82.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
EDIT: with NumPy, you can also use np.maximum.accumulate with np.arange.
def accumulate(arr):
    """Number the rows inside each run of consecutive equal values, per column.

    Going down every column, the counter starts at 1 and restarts at 1
    whenever a value differs from the one in the row above.

    Parameters
    ----------
    arr : numpy.ndarray
        2-D array of shape ``(n, m)``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, m)`` holding the 1-based position of each
        element within its run. The dtype is float because the first row is
        built with ``np.ones``.
    """
    n, m = arr.shape
    if n <= 1:
        # Nothing to compare: zero rows give an empty result (not a spurious
        # row of ones), and a single row is all ones.
        return np.ones((n, m))
    row_numbers = np.arange(1, n + 1)[:, np.newaxis]
    # Entry i holds the 1-based number of row i where row i + 1 differs from
    # it, and 0 elsewhere.
    change_rows = row_numbers[:-1] * (arr[:-1, :] != arr[1:, :])
    # The running maximum is the row number just before the most recent change
    # (0 if there has been none), so subtracting it from the current row
    # number gives the position within the current run.
    last_change = np.maximum.accumulate(change_rows, axis=0)
    return np.concatenate([np.ones((1, m)), row_numbers[1:] - last_change],
                          axis=0)
Some TIMING
# Benchmark input: 100000 x 100 random integers below 50, each row sorted,
# converted to strings (unseeded, so the data differs between runs).
arr_100 = np.sort(np.random.randint(50, size=(100000, 100)), axis=1).astype(str)
Solution with np.maximum.accumulate
%timeit accumulate(arr_100)
#520 ms ± 72 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Solution of Divakar
%timeit grp_range_2drow(arr_100.T, start=1).T
#1.15 s ± 64.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Solution with Numba of B. M.
%timeit numbering(arr_100)
#228 ms ± 31.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)