numpy replace groups of elements with integers incrementally

前端 未结 2 994
花落未央
花落未央 2021-01-13 02:53
import numpy as np
data = np.array(['b','b','b','a','a','a','a','c','c','d','d','d'])

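I need to replace each group of strings with an integer, assigned incrementally in order of first appearance (for the data above: 0 0 0 1 1 1 1 2 2 3 3 3).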

相关标签:
2条回答
  • 2021-01-13 03:09

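    EDIT: This doesn't always work, because np.argsort(b) is the permutation that sorts the first-occurrence indices, not the rank of each index (e.g. for ['c','a','b'] it yields [1, 2, 0] instead of [0, 1, 2]):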

    >>> a,b,c = np.unique(data, return_index=True, return_inverse=True)
    >>> c # almost!!!
    array([1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 3, 3])
    >>> np.argsort(b)[c]
    array([0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3], dtype=int64)
    

    But this does work:

    def replace_groups(data):
        """Label each element with the rank of its group's first appearance.

        Equal values in ``data`` get the same integer; the value that appears
        first gets 0, the next new value gets 1, and so on.

        Parameters
        ----------
        data : array_like
            Input values; anything ``np.unique`` accepts.

        Returns
        -------
        numpy.ndarray
            Integer labels, one per element of ``data``.
        """
        # np.unique sorts the distinct values; alongside them it reports the
        # index of each value's first occurrence and, for every element, the
        # slot of its value in the sorted uniques.
        _, first_idx, inverse = np.unique(
            data, return_index=True, return_inverse=True
        )
        # Replace every element by the position at which its value first occurs.
        first_seen = first_idx[inverse]
        # Those positions grow with order of first appearance, so the inverse
        # mapping of a second unique pass is exactly the 0..k-1 labelling.
        return np.unique(first_seen, return_index=False, return_inverse=True)[1]
    

    and is faster than the dictionary replacement approach, about 33% for larger datasets:

    def replace_groups_dict(data):
        """Map each element to an integer id via a lookup dictionary.

        Distinct values are numbered 0, 1, 2, ... in the order in which they
        first appear in ``data``, and every element is replaced by the number
        of its value.

        Parameters
        ----------
        data : numpy.ndarray
            Array of hashable values (it is indexed with an index array and
            its ``size`` attribute is read).

        Returns
        -------
        numpy.ndarray
            One integer id per element of ``data``.
        """
        # Index of the first occurrence of every distinct value; np.unique
        # returns these in sorted-value order, so sort them by position.
        first_pos = np.unique(data, return_index=True)[1]
        first_pos.sort()
        ordered_labels = data[first_pos]
        # zip stops at the shorter input, so only len(ordered_labels) ids are used.
        label_to_id = {
            label: code
            for label, code in zip(ordered_labels, np.arange(data.size))
        }
        return np.array([label_to_id[item] for item in data])
    
    In [7]: %timeit replace_groups_dict(lines100)
    10000 loops, best of 3: 68.8 us per loop
    
    In [8]: %timeit replace_groups_dict(lines200)
    10000 loops, best of 3: 106 us per loop
    
    In [9]: %timeit replace_groups_dict(lines)
    10 loops, best of 3: 32.1 ms per loop
    
    In [10]: %timeit replace_groups(lines100)
    10000 loops, best of 3: 67.1 us per loop
    
    In [11]: %timeit replace_groups(lines200)
    10000 loops, best of 3: 78.4 us per loop
    
    In [12]: %timeit replace_groups(lines)
    10 loops, best of 3: 23.1 ms per loop
    
    0 讨论(0)
  • 2021-01-13 03:16

    Given @DSM's noticing that my original idea doesn't work robustly, the best solution I can think of is a replacement dictionary:

    # Sample labels: runs of equal strings that should become 0, 1, 2, ...
    data = np.array(['b','b','b','a','a','a','a','c','c','d','d','d'])
    # Position of the first occurrence of each distinct label (np.unique
    # reports them in sorted-label order).
    ind = np.unique(data, return_index=True)[1]
    # Sorting the positions puts the labels back in order of first appearance.
    unqs = data[np.sort(ind)]
    # Label -> integer id; zip truncates the id range to the number of labels.
    data_id = {label: code for label, code in zip(unqs, np.arange(data.size))}
    # Look up the id of every element.
    num = np.array([data_id[datum] for datum in data])
    

    for the month data:

    In [5]: f = open('test.txt','r')
    
    In [6]: data = np.array([line.strip() for line in f.readlines()])
    
    In [7]: _, ind = np.unique(data, return_index=True)
    
    In [8]: months = data[np.sort(ind)]
    
    In [9]: month_id = dict(zip(months, np.arange(months.size)))
    
    In [10]: np.array([month_id[datum] for datum in data])
    Out[10]: array([ 0,  0,  0, ..., 41, 41, 41])
    
    0 讨论(0)
提交回复
热议问题