Vectorized groupby with NumPy

前端 未结 4 2134
自闭症患者
自闭症患者 2020-12-31 08:11

Pandas has a widely-used groupby facility to split up a DataFrame based on a corresponding mapping, from which you can apply a calculation on each subgroup and recombine the results.

相关标签:
4条回答
  • 2020-12-31 08:44

    There's probably a faster way than this (both of the operands are making copies right now), but:

    # Group-wise sum via bincount: broadcasting `groups` to X.T's shape gives
    # every element of X.T a label, and bincount adds up the weights (the
    # elements of X.T) that share a label.
    # Assumes `groups` holds one non-negative integer label per row of X
    # (X and groups are defined outside this snippet) -- TODO confirm.
    np.bincount(np.broadcast_to(groups, X.T.shape).ravel(), X.T.ravel())
    
    array([ 15.,  30.])
    
    0 讨论(0)
  • 2020-12-31 08:52

    If you want a more flexible implementation of groupby that can group using any of numpy's ufuncs:

    def groupby_np(X, groups, axis = 0, uf = np.add, out = None, minlength = 0, identity = None):
        """Aggregate ``X`` per group label using the ufunc ``uf``.

        Every axis of ``X`` other than ``axis`` is first collapsed with
        ``uf.reduce``; the resulting 1-D values are then folded into the
        output slot selected by the matching entry of ``groups`` via ``uf.at``.

        Parameters
        ----------
        X : ndarray
            Values to aggregate.
        groups : ndarray of int
            Group label for each position along ``axis``; used directly as
            indices into the output.
        axis : int
            Axis of ``X`` that ``groups`` labels. All other axes are reduced.
        uf : numpy.ufunc
            Binary ufunc providing ``reduce``, ``at`` and ``identity``.
        out : ndarray, optional
            Pre-initialised accumulator. When given it is updated in place and
            the function returns ``None``.
        minlength : int
            Minimum length of a newly allocated output; raised to
            ``groups.max() + 1`` when smaller.
        identity : scalar, optional
            Start value for a newly allocated output. Defaults to
            ``uf.identity``.

        Returns
        -------
        ndarray or None
            The freshly allocated result when ``out`` is ``None``; otherwise
            ``None`` (the result lives in ``out``).

        Raises
        ------
        AssertionError
            If no identity is available and some label in
            ``range(minlength)`` does not occur in ``groups``.
        """
        n_labels = groups.max() + 1  # evaluate the reduction over groups once
        if minlength < n_labels:
            minlength = n_labels
        if identity is None:
            identity = uf.identity
        # Axes to collapse: everything except the grouped axis.
        reduce_axes = list(range(X.ndim))
        del reduce_axes[axis]
        reduce_axes = tuple(reduce_axes)
        allocate = out is None
        if allocate:
            if identity is None:
                # No identity (e.g. maximum/minimum): every label must occur so
                # each output slot can be seeded from real data. Raised
                # explicitly so the check is not stripped under ``python -O``.
                if not np.all(np.isin(np.arange(minlength), groups)):
                    raise AssertionError("No valid identity for unassinged groups")
                # Seed each group from its entries in the index-0 slice of the
                # reduced axes. NOTE: those entries are applied again by
                # ``uf.at`` below, which is harmless for idempotent ufuncs
                # such as maximum/minimum.
                first_slice = [slice(None)] * X.ndim
                for ax in reduce_axes:
                    first_slice[ax] = 0
                seed = X[tuple(first_slice)]
                out = np.array([uf.reduce(seed[groups == label]) for label in range(minlength)])
            else:
                out = np.full((minlength,), identity, dtype = X.dtype)
        # Unbuffered in-place accumulation: repeated labels all contribute.
        uf.at(out, groups, uf.reduce(X, reduce_axes))
        if allocate:
            return out
    
    groupby_np(X, groups)
    array([15, 30])
    
    groupby_np(X, groups, uf = np.multiply)
    array([   0, 3024])
    
    groupby_np(X, groups, uf = np.maximum)
    array([5, 9])
    
    groupby_np(X, groups, uf = np.minimum)
    array([0, 6])
    
    0 讨论(0)
  • 2020-12-31 08:52

    How about using a SciPy sparse matrix?

    import numpy as np
    from scipy import sparse
    import time
    
    # Benchmark: per-group sum of X via a Python loop vs. a SciPy COO matrix.
    x_len = 500000  # number of rows in X
    g_len = 100     # number of distinct group labels
    
    X = np.arange(x_len * 2).reshape(x_len, 2)
    groups = np.random.randint(0, g_len, x_len)  # one random label in [0, g_len) per row
    
    # Baseline: boolean-mask each label and sum its rows.
    # NOTE(review): time.time() is a wall-clock timer; time.perf_counter() is
    # the usual choice for short benchmarks.
    s = time.time()
    
    # `a` has one entry per label that actually occurs (np.unique), so it only
    # lines up with the g_len-long results below when every label is present.
    a = np.array([X[groups==i].sum() for i in np.unique(groups)])
    
    print(time.time() - s)
    
    # Sparse variant: place row k's total at (groups[k], k) in a
    # (g_len, x_len) matrix, then sum each matrix row to get the group totals.
    s = time.time()
    
    x_sum = X.sum(axis=1)  # per-row totals of X
    b = np.array(sparse.coo_matrix(
        (
            x_sum,
            (groups, np.arange(len(x_sum)))
        ),
        shape=(g_len, x_len)
    ).sum(axis=1)).ravel()
    
    print(time.time() - s)
    
    # Total absolute difference between the two results; 0 means they agree.
    print(np.abs((a-b)).sum())
    

    Result on my PC:

    0.15915322303771973
    0.012875080108642578
    0
    

    More than 10 times faster.


    Update!

    Let's benchmark the answers of @Paul Panzer and @Daniel F. This is a summation-only benchmark.

    import numpy as np
    from scipy import sparse
    import time
    
    # by @Daniel F
    def groupby_np(X, groups, axis = 0, uf = np.add, out = None, minlength = 0, identity = None):
        """Aggregate ``X`` per group label using the ufunc ``uf``.

        Every axis of ``X`` other than ``axis`` is first collapsed with
        ``uf.reduce``; the resulting 1-D values are then folded into the
        output slot selected by the matching entry of ``groups`` via ``uf.at``.

        Parameters
        ----------
        X : ndarray
            Values to aggregate.
        groups : ndarray of int
            Group label for each position along ``axis``; used directly as
            indices into the output.
        axis : int
            Axis of ``X`` that ``groups`` labels. All other axes are reduced.
        uf : numpy.ufunc
            Binary ufunc providing ``reduce``, ``at`` and ``identity``.
        out : ndarray, optional
            Pre-initialised accumulator. When given it is updated in place and
            the function returns ``None``.
        minlength : int
            Minimum length of a newly allocated output; raised to
            ``groups.max() + 1`` when smaller.
        identity : scalar, optional
            Start value for a newly allocated output. Defaults to
            ``uf.identity``.

        Returns
        -------
        ndarray or None
            The freshly allocated result when ``out`` is ``None``; otherwise
            ``None`` (the result lives in ``out``).

        Raises
        ------
        AssertionError
            If no identity is available and some label in
            ``range(minlength)`` does not occur in ``groups``.
        """
        n_labels = groups.max() + 1  # evaluate the reduction over groups once
        if minlength < n_labels:
            minlength = n_labels
        if identity is None:
            identity = uf.identity
        # Axes to collapse: everything except the grouped axis.
        reduce_axes = list(range(X.ndim))
        del reduce_axes[axis]
        reduce_axes = tuple(reduce_axes)
        allocate = out is None
        if allocate:
            if identity is None:
                # No identity (e.g. maximum/minimum): every label must occur so
                # each output slot can be seeded from real data. Raised
                # explicitly so the check is not stripped under ``python -O``.
                if not np.all(np.isin(np.arange(minlength), groups)):
                    raise AssertionError("No valid identity for unassinged groups")
                # Seed each group from its entries in the index-0 slice of the
                # reduced axes. NOTE: those entries are applied again by
                # ``uf.at`` below, which is harmless for idempotent ufuncs
                # such as maximum/minimum.
                first_slice = [slice(None)] * X.ndim
                for ax in reduce_axes:
                    first_slice[ax] = 0
                seed = X[tuple(first_slice)]
                out = np.array([uf.reduce(seed[groups == label]) for label in range(minlength)])
            else:
                out = np.full((minlength,), identity, dtype = X.dtype)
        # Unbuffered in-place accumulation: repeated labels all contribute.
        uf.at(out, groups, uf.reduce(X, reduce_axes))
        if allocate:
            return out
    
    # Benchmark: per-group sum of X computed five ways (Python loop, COO
    # matrix, CSR matrix, bincount, ufunc-based groupby_np), each timed and
    # compared against the loop result `a`.
    x_len = 500000  # number of rows in X
    g_len = 200     # number of distinct group labels
    
    X = np.arange(x_len * 2).reshape(x_len, 2)
    groups = np.random.randint(0, g_len, x_len)  # one random label in [0, g_len) per row
    
    # Baseline: boolean-mask each label and sum its rows.
    # NOTE(review): time.time() is a wall-clock timer; time.perf_counter() is
    # the usual choice for short benchmarks.
    print("original")
    s = time.time()
    
    # `a` has one entry per label that actually occurs (np.unique), so it only
    # lines up with the g_len-long results below when every label is present.
    a = np.array([X[groups==i].sum() for i in np.unique(groups)])
    
    print(time.time() - s)
    
    # COO variant: row k's total goes to (groups[k], k) in a (g_len, x_len)
    # matrix; summing each matrix row yields the group totals.
    print("use scipy coo matrix")
    s = time.time()
    
    x_sum = X.sum(axis=1)  # per-row totals of X
    b = np.array(sparse.coo_matrix(
        (
            x_sum,
            (groups, np.arange(len(x_sum)))
        ),
        shape=(g_len, x_len)
    ).sum(axis=1)).ravel()
    
    print(time.time() - s)
    
    # Total absolute difference from the baseline; 0 means they agree.
    print(np.abs((a-b)).sum())
    
    
    # CSR variant: indptr = arange(n + 1) gives every matrix row exactly one
    # stored value, in column groups[k]; the column sums are the group totals.
    print("use scipy csr matrix @Daniel F")
    s = time.time()
    x_sum = X.sum(axis=1)
    c = np.array(sparse.csr_matrix(
        (
            x_sum,
            groups,
            np.arange(len(groups)+1)
        ),
        shape=(len(groups), g_len)
    ).sum(axis=0)).ravel()
    
    print(time.time() - s)
    
    # Total absolute difference from the baseline; 0 means they agree.
    print(np.abs((a-c)).sum())
    
    
    # bincount variant: the per-row totals are the weights, accumulated into
    # the bin given by each row's label; g_len is passed as minlength.
    print("use bincount @Paul Panzer @Daniel F")
    s = time.time()
    d = np.bincount(groups, X.sum(axis=1), g_len)
    print(time.time() - s)
    
    # Total absolute difference from the baseline; 0 means they agree.
    print(np.abs((a-d)).sum())
    
    # ufunc variant: groupby_np with its defaults (np.add, axis 0).
    print("use ufunc @Daniel F")
    s = time.time()
    e = groupby_np(X, groups)
    print(time.time() - s)
    
    # Total absolute difference from the baseline; 0 means they agree.
    print(np.abs((a-e)).sum())
    

    STDOUT

    original
    0.2882847785949707
    use scipy coo matrix
    0.012301445007324219
    0
    use scipy csr matrix @Daniel F
    0.01046299934387207
    0
    use bincount @Paul Panzer @Daniel F
    0.007468223571777344
    0.0
    use ufunc @Daniel F
    0.04431319236755371
    0
    

    The winner is the bincount solution. But the csr matrix solution is also very interesting.

    0 讨论(0)
  • 2020-12-31 09:02

    @klim's sparse matrix solution would at first sight appear to be tied to summation. We can, however, use it in the general case by converting between the csr and csc formats:

    Let's look at a small example:

    >>> m, n = 3, 8                                                                                                     
    >>> idx = np.random.randint(0, m, (n,))
    >>> data = np.arange(n)
    >>>                                                                                                                 
    >>> M = sparse.csr_matrix((data, idx, np.arange(n+1)), (n, m))                                                      
    >>>                                                                                                                 
    >>> idx                                                                                                             
    array([0, 2, 2, 1, 1, 2, 2, 0])                                                                                     
    >>> 
    >>> M = M.tocsc()
    >>> 
    >>> M.indptr, M.indices
    (array([0, 2, 4, 8], dtype=int32), array([0, 7, 3, 4, 1, 2, 5, 6], dtype=int32))
    

    As we can see, after conversion the internal representation of the sparse matrix yields the indices grouped and sorted:

    >>> groups = np.split(M.indices, M.indptr[1:-1])
    >>> groups
    [array([0, 7], dtype=int32), array([3, 4], dtype=int32), array([1, 2, 5, 6], dtype=int32)]
    >>> 
    

    We could have obtained the same using a stable argsort:

    >>> np.argsort(idx, kind='mergesort')
    array([0, 7, 3, 4, 1, 2, 5, 6])
    >>> 
    

    But sparse matrices are actually faster, even when we allow argsort to use a faster non-stable algorithm:

    >>> m, n = 1000, 100000
    >>> idx = np.random.randint(0, m, (n,))
    >>> data = np.arange(n)
    >>> 
    >>> timeit('sparse.csr_matrix((data, idx, np.arange(n+1)), (n, m)).tocsc()', **kwds)
    2.250748165184632
    >>> timeit('np.argsort(idx)', **kwds)
    5.783584725111723
    

    If we require argsort to keep groups sorted, the difference is even larger:

    >>> timeit('np.argsort(idx, kind="mergesort")', **kwds)
    10.507467685034499
    
    0 讨论(0)
提交回复
热议问题