Numpy: Finding count of distinct values from associations through binning

后端未结

关注

 1  787

Prerequisite

This question is an extension of this post, so some of the introduction to the problem will be similar to that post.

相关标签:

1条回答

一个人的身影

2021-01-25 17:40

Here is one solution

import numpy as np

# Worked example: for every cell of `result`, count how many *distinct*
# entries of `values` are mapped to it by (y_mapping, x_mapping).
x_mapping = np.array([0, 1, 0, 0, 0, 0, 0, 0])
y_mapping = np.array([0, 3, 2, 2, 0, 3, 2, 0])
values = np.array([1., 2., 1., 1., 5., 6., 7., 1.], dtype=np.float32)
result = np.zeros([4, 2], dtype=np.float32)

# Get flat indices (rows are addressed with -y_mapping; mode='wrap' folds the
# negative row numbers back into the valid range)
idx_mapping = np.ravel_multi_index((-y_mapping, x_mapping), result.shape, mode='wrap')
# Sort flat indices and reorder values accordingly, so that entries that fall
# into the same cell become contiguous
reorder = np.argsort(idx_mapping)
idx_mapping = idx_mapping[reorder]
values = values[reorder]
# Get unique values
val_uniq = np.unique(values)
# Find where each unique value appears: boolean matrix of shape
# (len(values), len(val_uniq))
val_uniq_hit = values[:, np.newaxis] == val_uniq
# Find reduction indices (start of each slice with the same flat index)
reduce_idx = np.concatenate([[0], np.nonzero(np.diff(idx_mapping))[0] + 1])
# Reduce slices: one row per cell, True where that unique value occurs in it
reduced = np.logical_or.reduceat(val_uniq_hit, reduce_idx)
# Count distinct values on each slice
counts = np.count_nonzero(reduced, axis=1)
# Put counts in result
result.flat[idx_mapping[reduce_idx]] = counts
print(result)
# [[2. 0.]
#  [1. 1.]
#  [2. 0.]
#  [0. 0.]]

This method takes more memory (O(len(values) * len(np.unique(values)))), but a small benchmark comparing with your original solution shows a significant speedup (although that depends on the actual size of the problem):

import timeit

import numpy as np


def _report(label, func, number, runs=7):
    """Time ``func`` with the stdlib ``timeit`` module and print a summary.

    ``func`` is called ``number`` times per run for ``runs`` runs; the mean and
    standard deviation of the per-call time are printed in milliseconds.
    """
    per_loop = np.array(timeit.repeat(func, repeat=runs, number=number)) / number
    print("{}: {:.1f} ms +- {:.3f} ms per loop (mean +- std. dev. of {} runs, {} loops each)".format(
        label, per_loop.mean() * 1e3, per_loop.std() * 1e3, runs, number))


# NOTE: `solution` and `solution_orig` are the benchmark functions listed
# under "Full code of benchmark functions"; they must be defined first.
np.random.seed(100)
result = np.zeros([400, 200], dtype=np.float32)
values = np.random.randint(100, size=(20000,)).astype(np.float32)
x_mapping = np.random.randint(result.shape[1], size=values.shape)
y_mapping = np.random.randint(result.shape[0], size=values.shape)

# Both implementations must agree before their timings are compared.
res1 = solution_orig(x_mapping, y_mapping, values, result)
res2 = solution(x_mapping, y_mapping, values, result)
print(np.allclose(res1, res2))
# True

# Original solution
_report("solution_orig", lambda: solution_orig(x_mapping, y_mapping, values, result), number=10)
# Reported in the post (measured with IPython's %timeit):
# 76.2 ms ± 623 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

# This solution
_report("solution", lambda: solution(x_mapping, y_mapping, values, result), number=100)
# Reported in the post (measured with IPython's %timeit):
# 13.8 ms ± 51.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

Full code of benchmark functions:

import numpy as np


def solution(x_mapping, y_mapping, values, result):
    """Count the distinct ``values`` mapped to each cell of ``result`` (vectorized).

    Args:
        x_mapping: integer column index for every entry of ``values``.
        y_mapping: integer row index for every entry of ``values``; rows are
            addressed as ``-y_mapping``.
        values: 1-D array of the values to be binned.
        result: 2-D array providing the output shape and initial contents.

    Returns:
        A copy of ``result`` in which every cell that receives at least one
        value holds the number of distinct values mapped to it. Cells that
        receive nothing keep their original content. The input ``result`` is
        not modified.

    Notes:
        Both index arrays are wrapped into range (``mode='wrap'``).
        A boolean matrix of shape ``(len(values), len(np.unique(values)))`` is
        built, so memory grows with the number of distinct values.
    """
    result = np.array(result)  # work on a copy so the caller's array is untouched
    x_mapping = np.asarray(x_mapping)
    y_mapping = np.asarray(y_mapping)
    values = np.asarray(values)
    # Nothing to bin: ``reduceat`` below cannot take index 0 of an empty array.
    if values.size == 0:
        return result
    # Flat index of the target cell of every value.
    idx_mapping = np.ravel_multi_index((-y_mapping, x_mapping), result.shape, mode='wrap')
    # Sort so that values sharing a cell become contiguous.
    reorder = np.argsort(idx_mapping)
    idx_mapping = idx_mapping[reorder]
    values = values[reorder]
    # val_uniq_hit[i, j] is True when values[i] equals the j-th unique value.
    val_uniq = np.unique(values)
    val_uniq_hit = values[:, np.newaxis] == val_uniq
    # Start position of each run of identical flat indices.
    reduce_idx = np.concatenate([[0], np.nonzero(np.diff(idx_mapping))[0] + 1])
    # OR the rows of each run together: one row per cell, one column per unique value.
    reduced = np.logical_or.reduceat(val_uniq_hit, reduce_idx)
    counts = np.count_nonzero(reduced, axis=1)
    result.flat[idx_mapping[reduce_idx]] = counts
    return result


def solution_orig(x_mapping, y_mapping, values, result):
    """Count the distinct ``values`` mapped to each cell of ``result`` (loop-based).

    Reference implementation used as the benchmark baseline: it sorts the
    values by target cell and then calls ``np.unique`` once per occupied cell
    in a Python loop.

    Args:
        x_mapping: integer column index for every entry of ``values``; used
            as-is (not wrapped).
        y_mapping: integer row index for every entry of ``values``; rows are
            addressed as ``(-y_mapping) % n_rows``.
        values: 1-D array of the values to be binned.
        result: 2-D array providing the output shape and initial contents.

    Returns:
        A copy of ``result`` in which every cell that receives at least one
        value holds the number of distinct values mapped to it. The input
        ``result`` is not modified.
    """
    result = np.array(result)  # work on a copy so the caller's array is untouched
    x_mapping = np.asarray(x_mapping)
    y_mapping = np.asarray(y_mapping)
    values = np.asarray(values)
    # Nothing to bin: indexing the empty sorted index array below would fail.
    if values.size == 0:
        return result
    m, n = result.shape
    # Flat (row-major) index of the target cell of every value.
    lidx = ((-y_mapping) % m) * n + x_mapping
    sidx = lidx.argsort()
    idx = lidx[sidx]
    val = values[sidx]
    # Start position of each run of identical flat indices.
    m_idx = np.flatnonzero(np.r_[True, idx[:-1] != idx[1:]])
    unq_ids = idx[m_idx]
    # Run i spans [bounds[i], bounds[i + 1]); starts are strictly increasing,
    # so every run contains at least one element.
    bounds = np.append(m_idx, val.shape[0])
    counts = [
        np.unique(val[start:stop]).size
        for start, stop in zip(bounds[:-1], bounds[1:])
    ]
    result.flat[unq_ids] = counts
    return result

0 讨论(0)

发布评论:

提交评论

加载中...

验证码

看不清?

提交回复