I am creating a column with incremental values and then appending a string at the start of the column. When used on large data this is very slow. Please suggest a faster and eff
I'll add two more in the mix
from numpy.core.defchararray import add
df.assign(new=add('str_', np.arange(1, len(df) + 1).astype(str)))
id Field Value new
0 1 A 1 str_1
1 2 B 0 str_2
2 3 D 1 str_3
f-string
in comprehensiondf.assign(new=[f'str_{i}' for i in range(1, len(df) + 1)])
id Field Value new
0 1 A 1 str_1
1 2 B 0 str_2
2 3 D 1 str_3
Comprehension wins the day with performance relative to simplicity. Mind you, this was cᴏʟᴅsᴘᴇᴇᴅ's proposed method. I appreciate the upvotes (thank you) but let's give credit where it's due.
Cythonizing the comprehension didn't seem to help. Nor did f-strings.
Divakar's numexp
comes out on top for performance over larger data.
%load_ext Cython
%%cython
def gen_list(l, h):
return ['str_%s' % i for i in range(l, h)]
pir1 = lambda d: d.assign(new=[f'str_{i}' for i in range(1, len(d) + 1)])
pir2 = lambda d: d.assign(new=add('str_', np.arange(1, len(d) + 1).astype(str)))
cld1 = lambda d: d.assign(new=['str_%s' % i for i in range(1, len(d) + 1)])
cld2 = lambda d: d.assign(new=gen_list(1, len(d) + 1))
jez1 = lambda d: d.assign(new='str_' + pd.Series(np.arange(1, len(d) + 1), d.index).astype(str))
div1 = lambda d: d.assign(new=create_inc_pattern(prefix_str='str_', start=1, stop=len(d) + 1))
div2 = lambda d: d.assign(new=create_inc_pattern_numexpr(prefix_str='str_', start=1, stop=len(d) + 1))
res = pd.DataFrame(
np.nan, [10, 30, 100, 300, 1000, 3000, 10000, 30000],
'pir1 pir2 cld1 cld2 jez1 div1 div2'.split()
)
for i in res.index:
d = pd.concat([df] * i)
for j in res.columns:
stmt = f'{j}(d)'
setp = f'from __main__ import {j}, d'
res.at[i, j] = timeit(stmt, setp, number=200)
res.plot(loglog=True)
res.div(res.min(1), 0)
pir1 pir2 cld1 cld2 jez1 div1 div2
10 1.243998 1.137877 1.006501 1.000000 1.798684 1.277133 1.427025
30 1.009771 1.144892 1.012283 1.000000 2.144972 1.210803 1.283230
100 1.090170 1.567300 1.039085 1.000000 3.134154 1.281968 1.356706
300 1.061804 2.260091 1.072633 1.000000 4.792343 1.051886 1.305122
1000 1.135483 3.401408 1.120250 1.033484 7.678876 1.077430 1.000000
3000 1.310274 5.179131 1.359795 1.362273 13.006764 1.317411 1.000000
10000 2.110001 7.861251 1.942805 1.696498 17.905551 1.974627 1.000000
30000 2.188024 8.236724 2.100529 1.872661 18.416222 1.875299 1.000000
def create_inc_pattern(prefix_str, start, stop):
N = stop - start # count of numbers
W = int(np.ceil(np.log10(N+1))) # width of numeral part in string
dl = len(prefix_str)+W # datatype length
dt = np.uint8 # int datatype for string to-from conversion
padv = np.full(W,48,dtype=np.uint8)
a0 = np.r_[np.fromstring(prefix_str,dtype='uint8'), padv]
r = np.arange(start, stop)
addn = (r[:,None] // 10**np.arange(W-1,-1,-1))%10
a1 = np.repeat(a0[None],N,axis=0)
a1[:,len(prefix_str):] += addn.astype(dt)
a1.shape = (-1)
a2 = np.zeros((len(a1),4),dtype=dt)
a2[:,0] = a1
return np.frombuffer(a2.ravel(), dtype='U'+str(dl))
import numexpr as ne
def create_inc_pattern_numexpr(prefix_str, start, stop):
N = stop - start # count of numbers
W = int(np.ceil(np.log10(N+1))) # width of numeral part in string
dl = len(prefix_str)+W # datatype length
dt = np.uint8 # int datatype for string to-from conversion
padv = np.full(W,48,dtype=np.uint8)
a0 = np.r_[np.fromstring(prefix_str,dtype='uint8'), padv]
r = np.arange(start, stop)
r2D = r[:,None]
s = 10**np.arange(W-1,-1,-1)
addn = ne.evaluate('(r2D/s)%10')
a1 = np.repeat(a0[None],N,axis=0)
a1[:,len(prefix_str):] += addn.astype(dt)
a1.shape = (-1)
a2 = np.zeros((len(a1),4),dtype=dt)
a2[:,0] = a1
return np.frombuffer(a2.ravel(), dtype='U'+str(dl))