Create new column with incremental values efficiently

前端未结

关注

 4  1222

攒了一身酷 2021-02-07 03:23

I am creating a column with incremental values and then appending a string at the start of the column. When used on large data this is very slow. Please suggest a faster and eff

4条回答

天涯浪人 (楼主)

2021-02-07 03:45

I'll add two more in the mix

Numpy

from numpy.core.defchararray import add

df.assign(new=add('str_', np.arange(1, len(df) + 1).astype(str)))

   id Field  Value    new
0   1     A      1  str_1
1   2     B      0  str_2
2   3     D      1  str_3

`f-string` in comprehension

Python 3.6+

df.assign(new=[f'str_{i}' for i in range(1, len(df) + 1)])

   id Field  Value    new
0   1     A      1  str_1
1   2     B      0  str_2
2   3     D      1  str_3

Time Test

Conclusions

Comprehension wins the day with performance relative to simplicity. Mind you, this was cᴏʟᴅsᴘᴇᴇᴅ's proposed method. I appreciate the upvotes (thank you) but let's give credit where it's due.

Cythonizing the comprehension didn't seem to help. Nor did f-strings.
Divakar's numexp comes out on top for performance over larger data.

Functions

%load_ext Cython

%%cython
def gen_list(l, h):
    return ['str_%s' % i for i in range(l, h)]

pir1 = lambda d: d.assign(new=[f'str_{i}' for i in range(1, len(d) + 1)])
pir2 = lambda d: d.assign(new=add('str_', np.arange(1, len(d) + 1).astype(str)))
cld1 = lambda d: d.assign(new=['str_%s' % i for i in range(1, len(d) + 1)])
cld2 = lambda d: d.assign(new=gen_list(1, len(d) + 1))
jez1 = lambda d: d.assign(new='str_' + pd.Series(np.arange(1, len(d) + 1), d.index).astype(str))
div1 = lambda d: d.assign(new=create_inc_pattern(prefix_str='str_', start=1, stop=len(d) + 1))
div2 = lambda d: d.assign(new=create_inc_pattern_numexpr(prefix_str='str_', start=1, stop=len(d) + 1))

Testing

res = pd.DataFrame(
    np.nan, [10, 30, 100, 300, 1000, 3000, 10000, 30000],
    'pir1 pir2 cld1 cld2 jez1 div1 div2'.split()
)

for i in res.index:
    d = pd.concat([df] * i)
    for j in res.columns:
        stmt = f'{j}(d)'
        setp = f'from __main__ import {j}, d'
        res.at[i, j] = timeit(stmt, setp, number=200)

Results

res.plot(loglog=True)

res.div(res.min(1), 0)

           pir1      pir2      cld1      cld2       jez1      div1      div2
10     1.243998  1.137877  1.006501  1.000000   1.798684  1.277133  1.427025
30     1.009771  1.144892  1.012283  1.000000   2.144972  1.210803  1.283230
100    1.090170  1.567300  1.039085  1.000000   3.134154  1.281968  1.356706
300    1.061804  2.260091  1.072633  1.000000   4.792343  1.051886  1.305122
1000   1.135483  3.401408  1.120250  1.033484   7.678876  1.077430  1.000000
3000   1.310274  5.179131  1.359795  1.362273  13.006764  1.317411  1.000000
10000  2.110001  7.861251  1.942805  1.696498  17.905551  1.974627  1.000000
30000  2.188024  8.236724  2.100529  1.872661  18.416222  1.875299  1.000000

More Functions

def create_inc_pattern(prefix_str, start, stop):
    N = stop - start # count of numbers
    W = int(np.ceil(np.log10(N+1))) # width of numeral part in string
    dl = len(prefix_str)+W # datatype length
    dt = np.uint8 # int datatype for string to-from conversion 

    padv = np.full(W,48,dtype=np.uint8)
    a0 = np.r_[np.fromstring(prefix_str,dtype='uint8'), padv]

    r = np.arange(start, stop)

    addn = (r[:,None] // 10**np.arange(W-1,-1,-1))%10
    a1 = np.repeat(a0[None],N,axis=0)
    a1[:,len(prefix_str):] += addn.astype(dt)
    a1.shape = (-1)

    a2 = np.zeros((len(a1),4),dtype=dt)
    a2[:,0] = a1
    return np.frombuffer(a2.ravel(), dtype='U'+str(dl))

import numexpr as ne

def create_inc_pattern_numexpr(prefix_str, start, stop):
    N = stop - start # count of numbers
    W = int(np.ceil(np.log10(N+1))) # width of numeral part in string
    dl = len(prefix_str)+W # datatype length
    dt = np.uint8 # int datatype for string to-from conversion 

    padv = np.full(W,48,dtype=np.uint8)
    a0 = np.r_[np.fromstring(prefix_str,dtype='uint8'), padv]

    r = np.arange(start, stop)

    r2D = r[:,None]
    s = 10**np.arange(W-1,-1,-1)
    addn = ne.evaluate('(r2D/s)%10')
    a1 = np.repeat(a0[None],N,axis=0)
    a1[:,len(prefix_str):] += addn.astype(dt)
    a1.shape = (-1)

    a2 = np.zeros((len(a1),4),dtype=dt)
    a2[:,0] = a1
    return np.frombuffer(a2.ravel(), dtype='U'+str(dl))

0 讨论(0)

查看其它4个回答