Create new column with incremental values efficiently

前端 未结 4 1222
攒了一身酷
攒了一身酷 2021-02-07 03:23

I am creating a column with incremental values and then prepending a string to each value in that column. On large data this is very slow. Please suggest a faster and more efficient way to do this.

4条回答
  •  天涯浪人
    2021-02-07 03:45

    I'll add two more to the mix

    Numpy

    # Use the public np.char namespace: numpy.core is a private package and
    # importing from it is deprecated in NumPy 2. The name `add` is kept because
    # the pir2 benchmark candidate refers to it.
    add = np.char.add
    
    # Element-wise 'str_' + '<n>' for n = 1..len(df), attached as column `new`.
    df.assign(new=add('str_', np.arange(1, len(df) + 1).astype(str)))
    
       id Field  Value    new
    0   1     A      1  str_1
    1   2     B      0  str_2
    2   3     D      1  str_3
    

    f-string in comprehension

    Python 3.6+
    # Build the labels 'str_1' .. 'str_<len(df)>' with a plain list comprehension
    # and attach them as column `new` (assign returns a new frame; df is untouched).
    df.assign(new=[f'str_{i}' for i in range(1, len(df) + 1)])
    
       id Field  Value    new
    0   1     A      1  str_1
    1   2     B      0  str_2
    2   3     D      1  str_3
    

    Time Test

    Conclusions

    Comprehension wins the day with performance relative to simplicity. Mind you, this was cᴏʟᴅsᴘᴇᴇᴅ's proposed method. I appreciate the upvotes (thank you) but let's give credit where it's due.

    Cythonizing the comprehension didn't seem to help. Nor did f-strings.
    Divakar's numexpr comes out on top for performance over larger data.

    Functions

    %load_ext Cython
    

    %%cython
    def gen_list(l, h):
        """Return ['str_<i>' for every i in range(l, h)]; l is inclusive, h exclusive."""
        # Kept as a %-format comprehension: this is the form the cld2 benchmark compiles.
        return ['str_%s' % i for i in range(l, h)]
    

    # Benchmark candidates. Each takes a DataFrame `d` and returns
    # d.assign(new=<labels>) where the labels number the rows 1..len(d).
    # They are bound to module-level names because the timing loop imports them
    # by name from __main__.

    # pir1: list comprehension using an f-string.
    pir1 = lambda d: d.assign(new=[f'str_{i}' for i in range(1, len(d) + 1)])
    # pir2: NumPy element-wise string concatenation via `add`.
    pir2 = lambda d: d.assign(new=add('str_', np.arange(1, len(d) + 1).astype(str)))
    # cld1: list comprehension using %-formatting.
    cld1 = lambda d: d.assign(new=['str_%s' % i for i in range(1, len(d) + 1)])
    # cld2: the same %-format comprehension, compiled through the %%cython cell.
    cld2 = lambda d: d.assign(new=gen_list(1, len(d) + 1))
    # jez1: pandas string concatenation on a Series aligned to d.index.
    jez1 = lambda d: d.assign(new='str_' + pd.Series(np.arange(1, len(d) + 1), d.index).astype(str))
    # div1: character-code construction in NumPy (numerals are zero-padded to a fixed width).
    div1 = lambda d: d.assign(new=create_inc_pattern(prefix_str='str_', start=1, stop=len(d) + 1))
    # div2: same construction with the digit extraction evaluated by numexpr.
    div2 = lambda d: d.assign(new=create_inc_pattern_numexpr(prefix_str='str_', start=1, stop=len(d) + 1))
    

    Testing

    # Result table: one row per replication factor, one column per candidate.
    # Every cell starts as NaN and is overwritten with a timing below.
    res = pd.DataFrame(
        np.nan, [10, 30, 100, 300, 1000, 3000, 10000, 30000],
        'pir1 pir2 cld1 cld2 jez1 div1 div2'.split()
    )
    
    for i in res.index:
        # Test frame: the sample df stacked i times (so len(d) == i * len(df)).
        d = pd.concat([df] * i)
        for j in res.columns:
            stmt = f'{j}(d)'
            # The timed statement runs in its own namespace, so the candidate and
            # the frame are imported from __main__ by name.
            setp = f'from __main__ import {j}, d'
            # NOTE(review): `timeit` is presumably timeit.timeit, imported outside
            # this snippet; with number=200 that is the total time of 200 calls.
            res.at[i, j] = timeit(stmt, setp, number=200)
    

    Results

    # Plot the raw timings, one line per candidate, with both axes on a log scale.
    res.plot(loglog=True)
    

    # Divide every row by its own minimum, so 1.0 marks the fastest candidate
    # at each size and other cells read as "times slower than the winner".
    res.div(res.min(1), 0)
    
               pir1      pir2      cld1      cld2       jez1      div1      div2
    10     1.243998  1.137877  1.006501  1.000000   1.798684  1.277133  1.427025
    30     1.009771  1.144892  1.012283  1.000000   2.144972  1.210803  1.283230
    100    1.090170  1.567300  1.039085  1.000000   3.134154  1.281968  1.356706
    300    1.061804  2.260091  1.072633  1.000000   4.792343  1.051886  1.305122
    1000   1.135483  3.401408  1.120250  1.033484   7.678876  1.077430  1.000000
    3000   1.310274  5.179131  1.359795  1.362273  13.006764  1.317411  1.000000
    10000  2.110001  7.861251  1.942805  1.696498  17.905551  1.974627  1.000000
    30000  2.188024  8.236724  2.100529  1.872661  18.416222  1.875299  1.000000
    

    More Functions

    def create_inc_pattern(prefix_str, start, stop):
        """Build an array of labels made of ``prefix_str`` plus a zero-padded number.

        The numbers are ``start, start + 1, ..., stop - 1``. All labels have the
        same length: the numeral part is left-padded with ``'0'`` to a common
        width, e.g. ``create_inc_pattern('str_', 1, 11)`` yields
        ``'str_01' ... 'str_10'``.

        Parameters
        ----------
        prefix_str : str
            Text placed in front of every number (any Unicode text).
        start, stop : int
            Half-open range of numbers to render. Digits are extracted
            arithmetically, so negative numbers are not supported.

        Returns
        -------
        numpy.ndarray
            1-D array of dtype ``'U<len(prefix_str) + width>'`` with
            ``stop - start`` entries; empty when ``stop <= start``.
        """
        N = stop - start  # count of numbers
        if N <= 0:
            # Empty range: return an empty string array rather than failing on
            # log/ceil of a non-positive count.
            return np.empty(0, dtype='U' + str(max(len(prefix_str), 1)))

        # Width of the numeral part in the string. It must fit the largest value
        # rendered (stop - 1); taking max with N keeps the width the count-based
        # formula produced for start <= 1, and widens it for start > 1 where that
        # formula dropped leading digits.
        W = len(str(int(max(N, stop - 1))))
        dl = len(prefix_str) + W  # datatype length (characters per label)
        dt = np.uint32  # a 'U' string stores one 4-byte code point per character

        # Template row: code points of the prefix followed by W '0' characters.
        prefix_codes = np.array([ord(ch) for ch in prefix_str], dtype=dt)
        a0 = np.concatenate([prefix_codes, np.full(W, ord('0'), dtype=dt)])

        r = np.arange(start, stop)

        # One column per decimal place, most significant first.
        addn = (r[:, None] // 10 ** np.arange(W - 1, -1, -1)) % 10
        a1 = np.repeat(a0[None], N, axis=0)
        # '0' + digit gives the code point of that digit character.
        a1[:, len(prefix_str):] += addn.astype(dt)

        # Reinterpret each row of dl code points as a single string of length dl.
        return a1.view('U' + str(dl)).reshape(N)
    
    import numexpr as ne
    
    def create_inc_pattern_numexpr(prefix_str, start, stop):
        """Build an array of labels made of ``prefix_str`` plus a zero-padded number.

        Same contract as the pure-NumPy construction, but the per-digit
        arithmetic is evaluated by numexpr (module alias ``ne``).

        The numbers are ``start, start + 1, ..., stop - 1``. All labels have the
        same length: the numeral part is left-padded with ``'0'`` to a common
        width.

        Parameters
        ----------
        prefix_str : str
            Text placed in front of every number (any Unicode text).
        start, stop : int
            Half-open range of numbers to render. Digits are extracted
            arithmetically, so negative numbers are not supported.

        Returns
        -------
        numpy.ndarray
            1-D array of dtype ``'U<len(prefix_str) + width>'`` with
            ``stop - start`` entries; empty when ``stop <= start``.
        """
        N = stop - start  # count of numbers
        if N <= 0:
            # Empty range: return an empty string array rather than failing on
            # log/ceil of a non-positive count.
            return np.empty(0, dtype='U' + str(max(len(prefix_str), 1)))

        # Width of the numeral part in the string. It must fit the largest value
        # rendered (stop - 1); taking max with N keeps the width the count-based
        # formula produced for start <= 1, and widens it for start > 1 where that
        # formula dropped leading digits.
        W = len(str(int(max(N, stop - 1))))
        dl = len(prefix_str) + W  # datatype length (characters per label)
        dt = np.uint32  # a 'U' string stores one 4-byte code point per character

        # Template row: code points of the prefix followed by W '0' characters.
        prefix_codes = np.array([ord(ch) for ch in prefix_str], dtype=dt)
        a0 = np.concatenate([prefix_codes, np.full(W, ord('0'), dtype=dt)])

        r = np.arange(start, stop)

        # numexpr looks up r2D and s by name in this frame, so these two local
        # names must match the expression string.
        r2D = r[:, None]
        s = 10 ** np.arange(W - 1, -1, -1)
        # truediv=False pins '/' to integer division; the 'auto' default depends
        # on whether the calling module has `division` in its globals.
        addn = ne.evaluate('(r2D/s)%10', truediv=False)
        a1 = np.repeat(a0[None], N, axis=0)
        # '0' + digit gives the code point of that digit character.
        a1[:, len(prefix_str):] += addn.astype(dt)

        # Reinterpret each row of dl code points as a single string of length dl.
        return a1.view('U' + str(dl)).reshape(N)
    

提交回复
热议问题