Create new column with incremental values efficiently

前端 未结 4 1218
攒了一身酷
攒了一身酷 2021-02-07 03:23

I am creating a column with incremental values and then prepending a string to each value in the column. When used on large data this is very slow. Please suggest a faster and more efficient approach.

相关标签:
4条回答
  • 2021-02-07 03:45

    I'll add two more in the mix

    Numpy

    # np.char is the public namespace for NumPy's vectorized string helpers;
    # the numpy.core.defchararray import path is private and deprecated in
    # NumPy 2. The name `add` is kept so later snippets that call it still work.
    add = np.char.add
    
    # Element-wise concatenation of the prefix with the 1-based row counter.
    df.assign(new=add('str_', np.arange(1, len(df) + 1).astype(str)))
    
       id Field  Value    new
    0   1     A      1  str_1
    1   2     B      0  str_2
    2   3     D      1  str_3
    

    f-string in comprehension

    Python 3.6+
    df.assign(new=[f'str_{i}' for i in range(1, len(df) + 1)])
    
       id Field  Value    new
    0   1     A      1  str_1
    1   2     B      0  str_2
    2   3     D      1  str_3
    

    Time Test

    Conclusions

    Comprehension wins the day with performance relative to simplicity. Mind you, this was cᴏʟᴅsᴘᴇᴇᴅ's proposed method. I appreciate the upvotes (thank you) but let's give credit where it's due.

    Cythonizing the comprehension didn't seem to help. Nor did f-strings.
    Divakar's numexpr version comes out on top for performance over larger data.

    Functions

    %load_ext Cython
    

    %%cython
    def gen_list(l, h):
        """Return the labels 'str_<i>' for every integer i in range(l, h)."""
        template = 'str_%s'
        return [template % number for number in range(l, h)]
    

    # Benchmark candidates. Each takes a DataFrame ``d`` and returns
    # ``d.assign(new=...)`` where ``new`` holds 'str_'-prefixed counters running
    # from 1 to len(d).
    # pir1: f-string list comprehension.
    pir1 = lambda d: d.assign(new=[f'str_{i}' for i in range(1, len(d) + 1)])
    # pir2: NumPy char-array ``add`` on the counter array cast to str.
    pir2 = lambda d: d.assign(new=add('str_', np.arange(1, len(d) + 1).astype(str)))
    # cld1: %-formatting list comprehension.
    cld1 = lambda d: d.assign(new=['str_%s' % i for i in range(1, len(d) + 1)])
    # cld2: same comprehension, delegated to the gen_list helper from the %%cython cell.
    cld2 = lambda d: d.assign(new=gen_list(1, len(d) + 1))
    # jez1: counter Series built on d's own index, cast to str, then prefixed.
    jez1 = lambda d: d.assign(new='str_' + pd.Series(np.arange(1, len(d) + 1), d.index).astype(str))
    # div1 / div2: code-point array builders (plain NumPy and numexpr variants).
    div1 = lambda d: d.assign(new=create_inc_pattern(prefix_str='str_', start=1, stop=len(d) + 1))
    div2 = lambda d: d.assign(new=create_inc_pattern_numexpr(prefix_str='str_', start=1, stop=len(d) + 1))
    

    Testing

    # Result table, pre-filled with NaN: one row per replication factor applied
    # to ``df``, one column per candidate function name.
    res = pd.DataFrame(
        np.nan, [10, 30, 100, 300, 1000, 3000, 10000, 30000],
        'pir1 pir2 cld1 cld2 jez1 div1 div2'.split()
    )
    
    for i in res.index:
        # Stack i copies of df to get a frame i times as long.
        d = pd.concat([df] * i)
        for j in res.columns:
            # The statement and setup are strings: the setup imports the candidate
            # and the current ``d`` from __main__ so the timed statement can see them.
            stmt = f'{j}(d)'
            setp = f'from __main__ import {j}, d'
            # NOTE(review): presumably ``timeit`` is ``timeit.timeit`` (import not
            # shown here); if so the stored value is the total time of 200 calls.
            res.at[i, j] = timeit(stmt, setp, number=200)
    

    Results

    res.plot(loglog=True)
    

    res.div(res.min(1), 0)
    
               pir1      pir2      cld1      cld2       jez1      div1      div2
    10     1.243998  1.137877  1.006501  1.000000   1.798684  1.277133  1.427025
    30     1.009771  1.144892  1.012283  1.000000   2.144972  1.210803  1.283230
    100    1.090170  1.567300  1.039085  1.000000   3.134154  1.281968  1.356706
    300    1.061804  2.260091  1.072633  1.000000   4.792343  1.051886  1.305122
    1000   1.135483  3.401408  1.120250  1.033484   7.678876  1.077430  1.000000
    3000   1.310274  5.179131  1.359795  1.362273  13.006764  1.317411  1.000000
    10000  2.110001  7.861251  1.942805  1.696498  17.905551  1.974627  1.000000
    30000  2.188024  8.236724  2.100529  1.872661  18.416222  1.875299  1.000000
    

    More Functions

    def create_inc_pattern(prefix_str, start, stop):
        """Return prefix_str + zero-padded counter for every i in range(start, stop).
    
        The strings are assembled as a 2D array of Unicode code points (one row
        per string) and reinterpreted as a fixed-width NumPy unicode array.
    
        Parameters
        ----------
        prefix_str : str
            Text placed in front of every number (any Unicode characters).
        start, stop : int
            Half-open range of non-negative integers to render.
    
        Returns
        -------
        numpy.ndarray
            1D array of dtype 'U<len(prefix_str) + W>' with ``stop - start``
            entries, where W is the digit count of ``stop - 1``.
        """
        N = stop - start  # count of numbers
        # Width must fit the largest value emitted (stop - 1). Deriving it from
        # the count N dropped leading digits whenever start > 1.
        W = len(str(max(stop - 1, 0)))
        P = len(prefix_str)
        dl = P + W  # characters per output string
    
        # uint32 code points in native byte order have the same item layout as
        # the 'U' dtype, so the final view is valid on any endianness.
        codes = np.empty((N, dl), dtype=np.uint32)
        codes[:, :P] = [ord(ch) for ch in prefix_str]
    
        r = np.arange(start, stop, dtype=np.int64)
        place = 10 ** np.arange(W - 1, -1, -1, dtype=np.int64)
        # Decimal digit at each place value, shifted to the code point of '0'..'9'.
        codes[:, P:] = (r[:, None] // place) % 10 + ord('0')
        return codes.view('U' + str(dl)).reshape(-1)
    
    import numexpr as ne
    
    def create_inc_pattern_numexpr(prefix_str, start, stop):
        """Return prefix_str + zero-padded counter for range(start, stop), using numexpr.
    
        Same contract as the plain NumPy builder: a 1D array of dtype
        'U<len(prefix_str) + W>' with ``stop - start`` entries, where W is the
        digit count of ``stop - 1``. Only the digit extraction is handed to
        numexpr.
    
        Parameters
        ----------
        prefix_str : str
            Text placed in front of every number.
        start, stop : int
            Half-open range of non-negative integers to render.
        """
        N = stop - start  # count of numbers
        # Width must fit the largest value emitted (stop - 1). Deriving it from
        # the count N dropped leading digits whenever start > 1.
        W = len(str(max(stop - 1, 0)))
        P = len(prefix_str)
        dl = P + W  # characters per output string
    
        # uint32 code points in native byte order have the same item layout as
        # the 'U' dtype, so the final view is valid on any endianness.
        codes = np.empty((N, dl), dtype=np.uint32)
        codes[:, :P] = [ord(ch) for ch in prefix_str]
    
        # The expression string refers to r2D and s by name, so both must stay
        # local variables of this function.
        r2D = np.arange(start, stop)[:, None]
        s = 10 ** np.arange(W - 1, -1, -1)
        addn = ne.evaluate('(r2D/s)%10')
        # The integer cast drops any fractional part, leaving the decimal digit.
        codes[:, P:] = addn.astype(np.uint32) + ord('0')
        return codes.view('U' + str(dl)).reshape(-1)
    
    0 讨论(0)
  • 2021-02-07 03:46

    When all else fails, use a list comprehension:

    df['NewColumn'] = ['str_%s' %i for i in range(1, len(df) + 1)]
    

    Further speedups are possible if you cythonize your function:

    %load_ext Cython
    
    %%cython
    def gen_list(l, h):
        """Build the list of 'str_<i>' labels for i running over range(l, h)."""
        label = 'str_%s'
        return [label % idx for idx in range(l, h)]
    

    Note, this code is run on Python3.6.0 (IPython6.2.1). Solution improved thanks to @hpaulj in the comments.


    # @jezrael's fastest solution
    
    %%timeit
    df['NewColumn'] = np.arange(len(df['a'])) + 1
    # Read back the column written on the previous line; the earlier
    # 'New_Column' spelling referred to a column this cell never creates.
    df['NewColumn'] = 'str_' + df['NewColumn'].map(str)
    
    547 ms ± 13.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
    

    # in this post - no cython
    
    %timeit df['NewColumn'] = ['str_%s'%i for i in range(n)]
    409 ms ± 9.36 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
    

    # cythonized list comp 
    
    %timeit df['NewColumn'] = gen_list(1, len(df) + 1)
    370 ms ± 9.23 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
    
    0 讨论(0)
  • 2021-02-07 03:47

    One possible solution is to convert the values to strings with map:

    # 1-based running counter, one value per row of column 'a'.
    df['New_Column'] = np.arange(1, len(df['a']) + 1)
    # Stringify each counter with map(str) and prepend the fixed prefix.
    df['New_Column'] = 'str_' + df['New_Column'].map(str)
    
    0 讨论(0)
  • 2021-02-07 04:07

    Proposed approach

    After tinkering quite a bit with the string and numeric dtypes and leveraging the easy interoperability between them, here's what I ended up with to get zero-padded strings, since NumPy handles those well and allows for vectorized operations that way -

    def create_inc_pattern(prefix_str, start, stop):
        """Return prefix + zero-padded counter byte strings for range(start, stop).
    
        Each string is built as a row of ASCII codes in a uint8 array, which is
        then viewed as a fixed-width bytes ('S') array.
    
        Parameters
        ----------
        prefix_str : str or bytes
            ASCII prefix placed in front of every number; a non-ASCII ``str``
            raises UnicodeEncodeError.
        start, stop : int
            Half-open range of non-negative integers to render.
    
        Returns
        -------
        numpy.ndarray
            1D array of dtype 'S<len(prefix) + W>' with ``stop - start`` entries,
            where W = ceil(log10(stop + 1)).
        """
        N = stop - start # count of numbers
        W = int(np.ceil(np.log10(stop+1))) # width of numeral part in string
    
        # np.fromstring's binary mode is deprecated; frombuffer on the encoded
        # prefix yields the same uint8 codes.
        prefix = prefix_str if isinstance(prefix_str, bytes) else prefix_str.encode('ascii')
        padv = np.full(W,48,dtype=np.uint8) # W copies of ord('0')
        a0 = np.r_[np.frombuffer(prefix,dtype=np.uint8), padv]
        a1 = np.repeat(a0[None],N,axis=0) # one template row per output string
    
        r = np.arange(start, stop)
        # Decimal digit of each number at each place value, most significant first.
        addn = (r[:,None] // 10**np.arange(W-1,-1,-1))%10
        a1[:,len(prefix):] += addn.astype(a1.dtype)
        return a1.view('S'+str(a1.shape[1])).ravel()
    

    Bringing in numexpr for a faster broadcasting + modulus operation -

    import numexpr as ne
    
    def create_inc_pattern_numexpr(prefix_str, start, stop):
        """Return prefix + zero-padded counter byte strings, using numexpr for the digits.
    
        Each string is built as a row of ASCII codes in a uint8 array, which is
        then viewed as a fixed-width bytes ('S') array of width
        len(prefix) + W, where W = ceil(log10(stop + 1)).
    
        Parameters
        ----------
        prefix_str : str or bytes
            ASCII prefix placed in front of every number; a non-ASCII ``str``
            raises UnicodeEncodeError.
        start, stop : int
            Half-open range of non-negative integers to render.
        """
        N = stop - start # count of numbers
        W = int(np.ceil(np.log10(stop+1))) # width of numeral part in string
    
        # np.fromstring's binary mode is deprecated; frombuffer on the encoded
        # prefix yields the same uint8 codes.
        prefix = prefix_str if isinstance(prefix_str, bytes) else prefix_str.encode('ascii')
        padv = np.full(W,48,dtype=np.uint8) # W copies of ord('0')
        a0 = np.r_[np.frombuffer(prefix,dtype=np.uint8), padv]
        a1 = np.repeat(a0[None],N,axis=0) # one template row per output string
    
        r = np.arange(start, stop)
        # The expression string refers to r2D and s by name, so both must stay
        # local variables of this function.
        r2D = r[:,None]
        s = 10**np.arange(W-1,-1,-1)
        addn = ne.evaluate('(r2D/s)%10')
        # The integer cast drops any fractional part, leaving the decimal digit.
        a1[:,len(prefix):] += addn.astype(a1.dtype)
        return a1.view('S'+str(a1.shape[1])).ravel()
    

    So, to use as the new column :

    df['New_Column'] = create_inc_pattern(prefix_str='str_', start=1, stop=len(df)+1)
    

    Sample runs -

    In [334]: create_inc_pattern_numexpr(prefix_str='str_', start=1, stop=14)
    Out[334]: 
    array(['str_01', 'str_02', 'str_03', 'str_04', 'str_05', 'str_06',
           'str_07', 'str_08', 'str_09', 'str_10', 'str_11', 'str_12', 'str_13'], 
          dtype='|S6')
    
    In [338]: create_inc_pattern(prefix_str='str_', start=1, stop=124)
    Out[338]: 
    array(['str_001', 'str_002', 'str_003', 'str_004', 'str_005', 'str_006',
           'str_007', 'str_008', 'str_009', 'str_010', 'str_011', 'str_012',..
           'str_115', 'str_116', 'str_117', 'str_118', 'str_119', 'str_120',
           'str_121', 'str_122', 'str_123'], 
          dtype='|S7')
    

    Explanation

    Basic idea and explanation with step-by-step sample run

    The basic idea is creating the ASCII equivalent numeric array, which could be viewed or converted by dtype conversion into a string one. To be more specific, we would create uint8 type numerals. Thus, each string would be represented by a 1D array of numerals. For the list of strings that would translate to a 2D array of numerals with each row (1D array) representing a single string.

    1) Inputs :

    In [22]: prefix_str='str_'
        ...: start=15
        ...: stop=24
    

    2) Parameters :

    In [23]: N = stop - start # count of numbers
        ...: W = int(np.ceil(np.log10(stop+1))) # width of numeral part in string
    
    In [24]: N,W
    Out[24]: (9, 2)
    

    3) Create 1D array of numerals representing the starting string :

    In [25]: padv = np.full(W,48,dtype=np.uint8)
        ...: a0 = np.r_[np.fromstring(prefix_str,dtype='uint8'), padv]
    
    In [27]: a0
    Out[27]: array([115, 116, 114,  95,  48,  48], dtype=uint8)
    

    4) Extend to cover range of strings as 2D array :

    In [33]: a1 = np.repeat(a0[None],N,axis=0)
        ...: r = np.arange(start, stop)
        ...: addn = (r[:,None] // 10**np.arange(W-1,-1,-1))%10
        ...: a1[:,len(prefix_str):] += addn.astype(a1.dtype)
    
    In [34]: a1
    Out[34]: 
    array([[115, 116, 114,  95,  49,  53],
           [115, 116, 114,  95,  49,  54],
           [115, 116, 114,  95,  49,  55],
           [115, 116, 114,  95,  49,  56],
           [115, 116, 114,  95,  49,  57],
           [115, 116, 114,  95,  50,  48],
           [115, 116, 114,  95,  50,  49],
           [115, 116, 114,  95,  50,  50],
           [115, 116, 114,  95,  50,  51]], dtype=uint8)
    

    5) Thus, each row represents the ASCII equivalent of one of the desired output strings. Let's get them with the final step :

    In [35]: a1.view('S'+str(a1.shape[1])).ravel()
    Out[35]: 
    array(['str_15', 'str_16', 'str_17', 'str_18', 'str_19', 'str_20',
           'str_21', 'str_22', 'str_23'], 
          dtype='|S6')
    

    Timings

    Here's a quick timings test against the list comprehension version that seems to be working the best looking at the timings from other posts -

    In [339]: N = 10000
    
    In [340]: %timeit ['str_%s'%i for i in range(N)]
    1000 loops, best of 3: 1.12 ms per loop
    
    In [341]: %timeit create_inc_pattern_numexpr(prefix_str='str_', start=1, stop=N)
    1000 loops, best of 3: 490 µs per loop
    
    In [342]: N = 100000
    
    In [343]: %timeit ['str_%s'%i for i in range(N)]
    100 loops, best of 3: 14 ms per loop
    
    In [344]: %timeit create_inc_pattern_numexpr(prefix_str='str_', start=1, stop=N)
    100 loops, best of 3: 4 ms per loop
    

    Python-3 codes

    On Python-3, to get the string dtype array, we need to pad the intermediate int dtype array with a few more zeros. So, the equivalent versions without and with numexpr for Python-3 ended up becoming something along these lines -

    Method #1 (No numexpr) :

    def create_inc_pattern(prefix_str, start, stop):
        """Return prefix_str + zero-padded counter for every i in range(start, stop).
    
        The strings are assembled as a 2D array of Unicode code points (one row
        per string) and reinterpreted as a fixed-width NumPy unicode array.
    
        Parameters
        ----------
        prefix_str : str
            Text placed in front of every number (any Unicode characters).
        start, stop : int
            Half-open range of non-negative integers to render.
    
        Returns
        -------
        numpy.ndarray
            1D array of dtype 'U<len(prefix_str) + W>' with ``stop - start``
            entries, where W = ceil(log10(stop + 1)).
        """
        N = stop - start # count of numbers
        W = int(np.ceil(np.log10(stop+1))) # width of numeral part in string
        P = len(prefix_str)
        dl = P+W # datatype length
    
        # Store code points as uint32 instead of spreading uint8 values over
        # hand-padded 4-byte groups: uint32 in native byte order has the same
        # item layout as the 'U' dtype, so the view below works on any
        # endianness. Using ord() also avoids np.fromstring's deprecated
        # binary mode.
        codes = np.empty((N, dl), dtype=np.uint32)
        codes[:, :P] = [ord(ch) for ch in prefix_str]
    
        r = np.arange(start, stop, dtype=np.int64)
        place = 10**np.arange(W-1, -1, -1, dtype=np.int64)
        # Decimal digit at each place value, shifted to the code point of '0'..'9'.
        codes[:, P:] = (r[:, None] // place) % 10 + ord('0')
        return codes.view('U'+str(dl)).reshape(-1)
    

    Method #2 (With numexpr) :

    import numexpr as ne
    
    def create_inc_pattern_numexpr(prefix_str, start, stop):
        """Return prefix_str + zero-padded counter for range(start, stop), using numexpr.
    
        Same contract as the plain NumPy builder: a 1D array of dtype
        'U<len(prefix_str) + W>' with ``stop - start`` entries, where
        W = ceil(log10(stop + 1)). Only the digit extraction is handed to
        numexpr.
    
        Parameters
        ----------
        prefix_str : str
            Text placed in front of every number.
        start, stop : int
            Half-open range of non-negative integers to render.
        """
        N = stop - start # count of numbers
        W = int(np.ceil(np.log10(stop+1))) # width of numeral part in string
        P = len(prefix_str)
        dl = P+W # datatype length
    
        # Store code points as uint32 instead of spreading uint8 values over
        # hand-padded 4-byte groups: uint32 in native byte order has the same
        # item layout as the 'U' dtype, so the view below works on any
        # endianness. Using ord() also avoids np.fromstring's deprecated
        # binary mode.
        codes = np.empty((N, dl), dtype=np.uint32)
        codes[:, :P] = [ord(ch) for ch in prefix_str]
    
        # The expression string refers to r2D and s by name, so both must stay
        # local variables of this function.
        r2D = np.arange(start, stop)[:, None]
        s = 10**np.arange(W-1,-1,-1)
        addn = ne.evaluate('(r2D/s)%10')
        # The integer cast drops any fractional part, leaving the decimal digit.
        codes[:, P:] = addn.astype(np.uint32) + ord('0')
        return codes.view('U'+str(dl)).reshape(-1)
    

    Timings -

    In [8]: N = 100000
    
    In [9]: %timeit ['str_%s'%i for i in range(N)]
    100 loops, best of 3: 18.5 ms per loop
    
    In [10]: %timeit create_inc_pattern_numexpr(prefix_str='str_', start=1, stop=N)
    100 loops, best of 3: 6.06 ms per loop
    
    0 讨论(0)
提交回复
热议问题