Numpy: Finding count of distinct values from associations through binning

后端 未结 1 791
感情败类
感情败类 2021-01-25 17:01

Prerequisite

This question is an extension of this post, so some of the introduction to the problem will be similar to that post.

1条回答
  •  一个人的身影
    2021-01-25 17:40

    Here is one solution

    import numpy as np

    x_mapping = np.array([0, 1, 0, 0, 0, 0, 0, 0])
    y_mapping = np.array([0, 3, 2, 2, 0, 3, 2, 0])
    values = np.array([1.0, 2.0, 1.0, 1.0, 5.0, 6.0, 7.0, 1.0], dtype=np.float32)
    result = np.zeros((4, 2), dtype=np.float32)

    # Collapse every (row, column) pair into one flat cell index; the rows
    # are negated and wrapped around the grid height.
    idx_mapping = np.ravel_multi_index(
        (-y_mapping, x_mapping), result.shape, mode='wrap'
    )
    # Bring samples that land in the same cell next to each other.
    reorder = np.argsort(idx_mapping)
    idx_mapping, values = idx_mapping[reorder], values[reorder]
    # Boolean table: entry (i, j) is True when sample i equals distinct value j.
    val_uniq = np.unique(values)
    val_uniq_hit = np.equal(values[:, np.newaxis], val_uniq)
    # Offsets at which a new run of equal flat indices begins.
    reduce_idx = np.concatenate(
        [[0], np.flatnonzero(idx_mapping[1:] != idx_mapping[:-1]) + 1]
    )
    # OR the rows of each run together: which distinct values occur per cell.
    reduced = np.logical_or.reduceat(val_uniq_hit, reduce_idx)
    # The number of True entries per run is the distinct-value count.
    counts = np.count_nonzero(reduced, axis=1)
    # Write each count into the cell its run belongs to.
    result.flat[idx_mapping[reduce_idx]] = counts

    print(result)
    # [[2. 0.]
    #  [1. 1.]
    #  [2. 0.]
    #  [0. 0.]]
    

    This method takes more memory (O(len(values) * len(np.unique(values)))), but a small benchmark comparing with your original solution shows a significant speedup (although that depends on the actual size of the problem):

    # Benchmark comparing solution_orig() with solution(); both functions are
    # defined in the "Full code of benchmark functions" listing.
    # NOTE: the %timeit lines are IPython magics, so this snippet has to be run
    # in IPython/Jupyter rather than as a plain Python script.
    import numpy as np
    
    # Fixed seed so the random test data is reproducible.
    np.random.seed(100)
    # 400 x 200 grid of float32 zeros used as the base array for both functions.
    result = np.zeros([400, 200], dtype=np.float32)
    # 20000 samples with integer values in [0, 100), stored as float32.
    values = np.random.randint(100, size=(20000,)).astype(np.float32)
    # One random column index and one random row index per sample.
    x_mapping = np.random.randint(result.shape[1], size=values.shape)
    y_mapping = np.random.randint(result.shape[0], size=values.shape)
    
    # Check that both implementations agree before timing them.
    res1 = solution_orig(x_mapping, y_mapping, values, result)
    res2 = solution(x_mapping, y_mapping, values, result)
    print(np.allclose(res1, res2))
    # True
    
    # Original solution
    %timeit solution_orig(x_mapping, y_mapping, values, result)
    # 76.2 ms ± 623 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
    
    # This solution
    %timeit solution(x_mapping, y_mapping, values, result)
    # 13.8 ms ± 51.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
    

    Full code of benchmark functions:

    import numpy as np
    
    def solution(x_mapping, y_mapping, values, result):
        """Count the distinct values that fall into each cell of a 2-D grid.

        Sample ``i`` is assigned to cell ``(-y_mapping[i], x_mapping[i])`` of a
        grid shaped like ``result``; indices outside the grid wrap around
        (``mode='wrap'``).  For every cell that receives at least one sample,
        the number of distinct entries of ``values`` assigned to it is stored.

        The intermediate boolean table has shape
        ``(len(values), number of distinct values)``, so memory use grows with
        the product of the two.

        Args:
            x_mapping: 1-D integer array-like of column indices, one per sample.
            y_mapping: 1-D integer array-like of row indices, one per sample
                (negated before use).
            values: 1-D array-like of sample values, same length as the mappings.
            result: 2-D array providing the output shape, dtype and the content
                of cells that receive no sample.  It is not modified.

        Returns:
            A copy of ``result`` in which every cell hit by at least one sample
            holds its distinct-value count.
        """
        # Work on a copy so the caller's array is left untouched.
        result = np.array(result)
        x_mapping = np.asarray(x_mapping)
        y_mapping = np.asarray(y_mapping)
        values = np.asarray(values)
        if values.size == 0:
            # Nothing to count; reduceat() below would raise IndexError because
            # index 0 is out of bounds for a zero-length axis.
            return result

        # Flat index of the target cell of every sample.
        idx_mapping = np.ravel_multi_index(
            (-y_mapping, x_mapping), result.shape, mode='wrap'
        )
        # Group samples by cell: sort the flat indices, reorder values to match.
        reorder = np.argsort(idx_mapping)
        idx_mapping = idx_mapping[reorder]
        values = values[reorder]
        # Entry (i, j) is True when sample i equals the j-th distinct value.
        val_uniq = np.unique(values)
        val_uniq_hit = values[:, np.newaxis] == val_uniq
        # Start offset of every run of samples sharing the same flat index.
        reduce_idx = np.concatenate([[0], np.nonzero(np.diff(idx_mapping))[0] + 1])
        # OR the rows of each run, then count the distinct values that occurred.
        reduced = np.logical_or.reduceat(val_uniq_hit, reduce_idx)
        counts = np.count_nonzero(reduced, axis=1)
        result.flat[idx_mapping[reduce_idx]] = counts
        return result
    
    def solution_orig(x_mapping, y_mapping, values, result):
        """Count the distinct values per grid cell with one np.unique call per cell.

        Sample ``i`` is assigned to the flat index
        ``((-y_mapping[i]) % m) * n + x_mapping[i]`` of an ``m x n`` grid shaped
        like ``result``.  Only the row index is wrapped; ``x_mapping`` is used
        as given.  For every cell that receives at least one sample, the number
        of distinct entries of ``values`` assigned to it is stored.

        Args:
            x_mapping: 1-D integer array-like of column indices, one per sample.
            y_mapping: 1-D integer array-like of row indices, one per sample
                (negated and wrapped modulo the number of rows).
            values: 1-D array-like of sample values, same length as the mappings.
            result: 2-D array providing the output shape, dtype and the content
                of cells that receive no sample.  It is not modified.

        Returns:
            A copy of ``result`` in which every cell hit by at least one sample
            holds its distinct-value count.
        """
        # Work on a copy so the caller's array is left untouched.
        result = np.array(result)
        x_mapping = np.asarray(x_mapping)
        y_mapping = np.asarray(y_mapping)
        values = np.asarray(values)
        if values.size == 0:
            # No samples: every cell keeps its original content.
            return result

        m, n = result.shape
        lidx = ((-y_mapping) % m) * n + x_mapping

        # Group samples by cell: sort the flat indices, reorder values to match.
        sidx = lidx.argsort()
        idx = lidx[sidx]
        val = values[sidx]

        # A run starts wherever the sorted flat index changes; each run ends
        # where the next one starts, and the last one ends at the end of val.
        starts = np.flatnonzero(np.r_[True, idx[:-1] != idx[1:]])
        stops = np.r_[starts[1:], val.shape[0]]

        counts = [np.unique(val[lo:hi]).size for lo, hi in zip(starts, stops)]
        result.flat[idx[starts]] = counts
        return result
    

    0 讨论(0)
提交回复
热议问题