I am creating a column with incremental values and then appending a string at the start of the column. When used on large data this is very slow. Please suggest a faster and eff
I'll add two more in the mix
# Use the public np.char namespace for the element-wise string concatenation
# helper; numpy.core.defchararray is a private import path that is deprecated
# in NumPy 2.x. The name `add` is kept because later snippets call it.
add = np.char.add

# Build the numbers 1..len(df) as strings, prepend 'str_' element-wise and
# return a copy of df carrying the result in a column called 'new'.
df.assign(new=add('str_', np.arange(1, len(df) + 1).astype(str)))
id Field Value new
0 1 A 1 str_1
1 2 B 0 str_2
2 3 D 1 str_3
f-string in comprehension
df.assign(new=[f'str_{i}' for i in range(1, len(df) + 1)])
id Field Value new
0 1 A 1 str_1
1 2 B 0 str_2
2 3 D 1 str_3
Comprehension wins the day with performance relative to simplicity. Mind you, this was cᴏʟᴅsᴘᴇᴇᴅ's proposed method. I appreciate the upvotes (thank you) but let's give credit where it's due.
Cythonizing the comprehension didn't seem to help. Nor did f-strings.
Divakar's numexpr approach comes out on top for performance over larger data.
%load_ext Cython
%%cython
def gen_list(l, h):
    """Return the labels ``'str_<i>'`` for every integer ``i`` in ``[l, h)``."""
    labels = []
    for number in range(l, h):
        labels.append('str_%s' % number)
    return labels
# Benchmark candidates. Each takes a DataFrame `d` and returns a copy with a
# 1-based, 'str_'-prefixed label column named 'new'. The names are looked up
# by the timing loop, so they must stay exactly as they are.


def pir1(d):
    """f-string inside a list comprehension."""
    row_count = len(d)
    return d.assign(new=[f'str_{k}' for k in range(1, row_count + 1)])


def pir2(d):
    """NumPy element-wise string addition."""
    numbers = np.arange(1, len(d) + 1).astype(str)
    return d.assign(new=add('str_', numbers))


def cld1(d):
    """%-formatting inside a list comprehension."""
    row_count = len(d)
    return d.assign(new=['str_%s' % k for k in range(1, row_count + 1)])


def cld2(d):
    """Cythonized list comprehension (gen_list)."""
    return d.assign(new=gen_list(1, len(d) + 1))


def jez1(d):
    """pandas Series cast to str, then prefixed."""
    numbers = pd.Series(np.arange(1, len(d) + 1), index=d.index)
    return d.assign(new='str_' + numbers.astype(str))


def div1(d):
    """Pure-NumPy zero-padded pattern builder."""
    labels = create_inc_pattern(prefix_str='str_', start=1, stop=len(d) + 1)
    return d.assign(new=labels)


def div2(d):
    """numexpr-accelerated zero-padded pattern builder."""
    labels = create_inc_pattern_numexpr(prefix_str='str_', start=1, stop=len(d) + 1)
    return d.assign(new=labels)
# Timing table: one row per replication factor of the sample frame, one
# column per candidate function; every cell starts as NaN.
res = pd.DataFrame(
    np.nan,
    index=[10, 30, 100, 300, 1000, 3000, 10000, 30000],
    columns='pir1 pir2 cld1 cld2 jez1 div1 div2'.split(),
)

for copies in res.index:
    # `d` must stay a module-level name: the timeit setup string below
    # imports it (and the candidate) from __main__.
    d = pd.concat([df] * copies)
    for name in res.columns:
        res.at[copies, name] = timeit(
            stmt=f'{name}(d)',
            setup=f'from __main__ import {name}, d',
            number=200,
        )

# Log-log plot of the raw timings.
res.plot(loglog=True)
# Normalise every row by its fastest entry, so 1.0 marks the winner.
res.div(res.min(axis=1), axis=0)
pir1 pir2 cld1 cld2 jez1 div1 div2
10 1.243998 1.137877 1.006501 1.000000 1.798684 1.277133 1.427025
30 1.009771 1.144892 1.012283 1.000000 2.144972 1.210803 1.283230
100 1.090170 1.567300 1.039085 1.000000 3.134154 1.281968 1.356706
300 1.061804 2.260091 1.072633 1.000000 4.792343 1.051886 1.305122
1000 1.135483 3.401408 1.120250 1.033484 7.678876 1.077430 1.000000
3000 1.310274 5.179131 1.359795 1.362273 13.006764 1.317411 1.000000
10000 2.110001 7.861251 1.942805 1.696498 17.905551 1.974627 1.000000
30000 2.188024 8.236724 2.100529 1.872661 18.416222 1.875299 1.000000
def create_inc_pattern(prefix_str, start, stop):
    """Build ``prefix_str`` + zero-padded ``i`` for every ``i`` in ``[start, stop)``.

    The strings are assembled as a 2-D array of character codes (one row per
    string) and then reinterpreted as a fixed-width unicode array, so no
    Python-level loop over the numbers is needed.

    Parameters
    ----------
    prefix_str : str
        Text placed in front of every number.
    start, stop : int
        Half-open range of non-negative integers to render.

    Returns
    -------
    numpy.ndarray
        1-D array of dtype ``'U<len(prefix_str) + W>'`` where ``W`` is the
        number of digits of ``stop - 1``.

    Raises
    ------
    ValueError
        If ``start`` is negative (the digit arithmetic has no sign handling).
    """
    if start < 0:
        raise ValueError('start must be non-negative')
    N = stop - start  # count of numbers
    # Width of the numeral part: digits of the largest value emitted. Derived
    # from stop - 1 (not from N) so ranges that do not begin at 1 keep all
    # their digits; integer arithmetic avoids float log10 rounding.
    W = len(str(max(stop - 1, 0)))
    dl = len(prefix_str) + W  # characters per output string
    if N <= 0:
        return np.empty(0, dtype='U' + str(dl))
    # One uint32 per character (same layout as NumPy's 'U' dtype): the prefix
    # code points followed by W copies of '0' (code 48).
    a0 = np.r_[
        np.array([ord(ch) for ch in prefix_str], dtype=np.uint32),
        np.full(W, 48, dtype=np.uint32),
    ]
    r = np.arange(start, stop, dtype=np.int64)
    # Decimal digits of every number, most significant first, shape (N, W).
    addn = (r[:, None] // 10 ** np.arange(W - 1, -1, -1, dtype=np.int64)) % 10
    a1 = np.repeat(a0[None], N, axis=0)
    # '0' + digit gives the character code of that digit.
    a1[:, len(prefix_str):] += addn.astype(np.uint32)
    # Each row of dl uint32 values is exactly one 'U<dl>' item.
    return a1.view('U' + str(dl)).ravel()
import numexpr as ne
def create_inc_pattern_numexpr(prefix_str, start, stop):
    """Build ``prefix_str`` + zero-padded ``i`` for ``i`` in ``[start, stop)`` using numexpr.

    Same construction as the pure-NumPy variant (a 2-D array of character
    codes reinterpreted as fixed-width unicode strings), but the broadcasted
    divide/modulo that extracts the digits is evaluated by numexpr.

    Parameters
    ----------
    prefix_str : str
        Text placed in front of every number.
    start, stop : int
        Half-open range of non-negative integers to render.

    Returns
    -------
    numpy.ndarray
        1-D array of dtype ``'U<len(prefix_str) + W>'`` where ``W`` is the
        number of digits of ``stop - 1``.

    Raises
    ------
    ValueError
        If ``start`` is negative (the digit arithmetic has no sign handling).
    """
    if start < 0:
        raise ValueError('start must be non-negative')
    N = stop - start  # count of numbers
    # Width of the numeral part: digits of the largest value emitted. Derived
    # from stop - 1 (not from N) so ranges that do not begin at 1 keep all
    # their digits; integer arithmetic avoids float log10 rounding.
    W = len(str(max(stop - 1, 0)))
    dl = len(prefix_str) + W  # characters per output string
    if N <= 0:
        return np.empty(0, dtype='U' + str(dl))
    # One uint32 per character (same layout as NumPy's 'U' dtype): the prefix
    # code points followed by W copies of '0' (code 48).
    a0 = np.r_[
        np.array([ord(ch) for ch in prefix_str], dtype=np.uint32),
        np.full(W, 48, dtype=np.uint32),
    ]
    r = np.arange(start, stop, dtype=np.int64)
    # The expression string below refers to these two locals by name, so
    # they must keep the names r2D and s.
    r2D = r[:, None]
    s = 10 ** np.arange(W - 1, -1, -1, dtype=np.int64)
    addn = ne.evaluate('(r2D/s)%10')
    a1 = np.repeat(a0[None], N, axis=0)
    # The cast discards any fractional part, leaving the digit value;
    # '0' + digit is the character code of that digit.
    a1[:, len(prefix_str):] += addn.astype(np.uint32)
    # Each row of dl uint32 values is exactly one 'U<dl>' item.
    return a1.view('U' + str(dl)).ravel()
When all else fails, use a list comprehension:
# Format each 1-based row number with the 'str_' prefix in plain Python and
# assign the resulting list as the new column.
df['NewColumn'] = ['str_%s' %i for i in range(1, len(df) + 1)]
Further speedups are possible if you cythonize your function:
%load_ext Cython
%%cython
def gen_list(l, h):
    """Build the list of ``'str_<i>'`` labels for ``i`` running from ``l`` up to, but excluding, ``h``."""
    template = 'str_%s'
    return [template % value for value in range(l, h)]
Note, this code is run on Python3.6.0 (IPython6.2.1). Solution improved thanks to @hpaulj in the comments.
# @jezrael's fastest solution
%%timeit
# Write the 1-based counter, then overwrite that same column with its
# prefixed string form. Both statements must use the same column name:
# reading a different name would fail or pick up stale data.
df['NewColumn'] = np.arange(len(df['a'])) + 1
df['NewColumn'] = 'str_' + df['NewColumn'].map(str)
547 ms ± 13.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# in this post - no cython
%timeit df['NewColumn'] = ['str_%s'%i for i in range(n)]
409 ms ± 9.36 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# cythonized list comp
%timeit df['NewColumn'] = gen_list(1, len(df) + 1)
370 ms ± 9.23 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
One possible solution is to convert the values to strings with map:
# Step 1: integer counter 1..n, where n is the length of column 'a'.
df['New_Column'] = np.arange(len(df['a']))+1
# Step 2: convert each integer to text with map(str) and prepend the prefix.
df['New_Column'] = 'str_' + df['New_Column'].map(str)
After tinkering quite a bit with the string and numeric dtypes and leveraging the easy interoperability between them, here's something that I ended up with to get zeros padded strings, as NumPy does well and allows for vectorized operations that way -
def create_inc_pattern(prefix_str, start, stop):
    """Build ``prefix_str`` + zero-padded ``i`` for ``i`` in ``[start, stop)`` as a bytes array.

    The strings are assembled as a 2-D uint8 array of byte values (one row
    per string) and then viewed as a fixed-width ``'S'`` array.

    Parameters
    ----------
    prefix_str : str or bytes
        Text placed in front of every number; a ``str`` is UTF-8 encoded.
    start, stop : int
        Half-open range of non-negative integers to render.

    Returns
    -------
    numpy.ndarray
        1-D array of dtype ``'S<prefix bytes + W>'``. ``W`` is the number of
        digits of ``stop`` itself, so a ``stop`` that is an exact power of
        ten yields one extra leading zero.

    Raises
    ------
    ValueError
        If ``start`` is negative (the digit arithmetic has no sign handling).
    """
    if start < 0:
        raise ValueError('start must be non-negative')
    # np.fromstring's binary mode is deprecated; encode explicitly and read
    # the bytes through np.frombuffer instead.
    raw = prefix_str if isinstance(prefix_str, bytes) else prefix_str.encode('utf-8')
    N = stop - start  # count of numbers
    # Width of numeral part; equals ceil(log10(stop + 1)) for stop >= 1 but
    # is computed with integers to avoid float rounding.
    W = len(str(max(stop, 0)))
    if N <= 0:
        return np.empty(0, dtype='S' + str(len(raw) + W))
    # Prefix bytes followed by W copies of ASCII '0' (48).
    a0 = np.r_[np.frombuffer(raw, dtype=np.uint8), np.full(W, 48, dtype=np.uint8)]
    a1 = np.repeat(a0[None], N, axis=0)
    r = np.arange(start, stop, dtype=np.int64)
    # Decimal digits of every number, most significant first, shape (N, W).
    addn = (r[:, None] // 10 ** np.arange(W - 1, -1, -1, dtype=np.int64)) % 10
    # '0' + digit gives the ASCII code of that digit.
    a1[:, len(raw):] += addn.astype(a1.dtype)
    # Each row of bytes is exactly one fixed-width 'S' item.
    return a1.view('S' + str(a1.shape[1])).ravel()
Bringing in numexpr for a faster broadcasting + modulus operation -
import numexpr as ne
def create_inc_pattern_numexpr(prefix_str, start, stop):
    """Build ``prefix_str`` + zero-padded ``i`` for ``i`` in ``[start, stop)`` as a bytes array, using numexpr.

    Same construction as the pure-NumPy variant (a 2-D uint8 array of byte
    values viewed as a fixed-width ``'S'`` array), but the broadcasted
    divide/modulo that extracts the digits is evaluated by numexpr.

    Parameters
    ----------
    prefix_str : str or bytes
        Text placed in front of every number; a ``str`` is UTF-8 encoded.
    start, stop : int
        Half-open range of non-negative integers to render.

    Returns
    -------
    numpy.ndarray
        1-D array of dtype ``'S<prefix bytes + W>'``. ``W`` is the number of
        digits of ``stop`` itself, so a ``stop`` that is an exact power of
        ten yields one extra leading zero.

    Raises
    ------
    ValueError
        If ``start`` is negative (the digit arithmetic has no sign handling).
    """
    if start < 0:
        raise ValueError('start must be non-negative')
    # np.fromstring's binary mode is deprecated; encode explicitly and read
    # the bytes through np.frombuffer instead.
    raw = prefix_str if isinstance(prefix_str, bytes) else prefix_str.encode('utf-8')
    N = stop - start  # count of numbers
    # Width of numeral part; equals ceil(log10(stop + 1)) for stop >= 1 but
    # is computed with integers to avoid float rounding.
    W = len(str(max(stop, 0)))
    if N <= 0:
        return np.empty(0, dtype='S' + str(len(raw) + W))
    # Prefix bytes followed by W copies of ASCII '0' (48).
    a0 = np.r_[np.frombuffer(raw, dtype=np.uint8), np.full(W, 48, dtype=np.uint8)]
    a1 = np.repeat(a0[None], N, axis=0)
    r = np.arange(start, stop, dtype=np.int64)
    # The expression string below refers to these two locals by name, so
    # they must keep the names r2D and s.
    r2D = r[:, None]
    s = 10 ** np.arange(W - 1, -1, -1, dtype=np.int64)
    addn = ne.evaluate('(r2D/s)%10')
    # The cast discards any fractional part, leaving the digit value;
    # '0' + digit is the ASCII code of that digit.
    a1[:, len(raw):] += addn.astype(a1.dtype)
    # Each row of bytes is exactly one fixed-width 'S' item.
    return a1.view('S' + str(a1.shape[1])).ravel()
So, to use as the new column :
# start=1 with stop=len(df)+1 yields exactly one label per row, numbered from 1.
df['New_Column'] = create_inc_pattern(prefix_str='str_', start=1, stop=len(df)+1)
Sample runs -
In [334]: create_inc_pattern_numexpr(prefix_str='str_', start=1, stop=14)
Out[334]:
array(['str_01', 'str_02', 'str_03', 'str_04', 'str_05', 'str_06',
'str_07', 'str_08', 'str_09', 'str_10', 'str_11', 'str_12', 'str_13'],
dtype='|S6')
In [338]: create_inc_pattern(prefix_str='str_', start=1, stop=124)
Out[338]:
array(['str_001', 'str_002', 'str_003', 'str_004', 'str_005', 'str_006',
'str_007', 'str_008', 'str_009', 'str_010', 'str_011', 'str_012',..
'str_115', 'str_116', 'str_117', 'str_118', 'str_119', 'str_120',
'str_121', 'str_122', 'str_123'],
dtype='|S7')
Basic idea and explanation with step-by-step sample run
The basic idea is creating the ASCII equivalent numeric array, which could be viewed or converted by dtype conversion into a string one. To be more specific, we would create uint8 type numerals. Thus, each string would be represented by a 1D array of numerals. For the list of strings that would translate to a 2D array of numerals with each row (1D array) representing a single string.
1) Inputs :
In [22]: prefix_str='str_'
...: start=15
...: stop=24
2) Parameters :
In [23]: N = stop - start # count of numbers
...: W = int(np.ceil(np.log10(stop+1))) # width of numeral part in string
In [24]: N,W
Out[24]: (9, 2)
3) Create 1D array of numerals representing the starting string :
In [25]: padv = np.full(W,48,dtype=np.uint8)
...: a0 = np.r_[np.fromstring(prefix_str,dtype='uint8'), padv]
In [27]: a0
Out[27]: array([115, 116, 114, 95, 48, 48], dtype=uint8)
4) Extend to cover range of strings as 2D array :
In [33]: a1 = np.repeat(a0[None],N,axis=0)
...: r = np.arange(start, stop)
...: addn = (r[:,None] // 10**np.arange(W-1,-1,-1))%10
...: a1[:,len(prefix_str):] += addn.astype(a1.dtype)
In [34]: a1
Out[34]:
array([[115, 116, 114, 95, 49, 53],
[115, 116, 114, 95, 49, 54],
[115, 116, 114, 95, 49, 55],
[115, 116, 114, 95, 49, 56],
[115, 116, 114, 95, 49, 57],
[115, 116, 114, 95, 50, 48],
[115, 116, 114, 95, 50, 49],
[115, 116, 114, 95, 50, 50],
[115, 116, 114, 95, 50, 51]], dtype=uint8)
5) Thus, each row represents the ASCII equivalent of one string of the desired output. Let's get it with the final step :
In [35]: a1.view('S'+str(a1.shape[1])).ravel()
Out[35]:
array(['str_15', 'str_16', 'str_17', 'str_18', 'str_19', 'str_20',
'str_21', 'str_22', 'str_23'],
dtype='|S6')
Here's a quick timings test against the list comprehension version that seems to be working the best looking at the timings from other posts -
In [339]: N = 10000
In [340]: %timeit ['str_%s'%i for i in range(N)]
1000 loops, best of 3: 1.12 ms per loop
In [341]: %timeit create_inc_pattern_numexpr(prefix_str='str_', start=1, stop=N)
1000 loops, best of 3: 490 µs per loop
In [342]: N = 100000
In [343]: %timeit ['str_%s'%i for i in range(N)]
100 loops, best of 3: 14 ms per loop
In [344]: %timeit create_inc_pattern_numexpr(prefix_str='str_', start=1, stop=N)
100 loops, best of 3: 4 ms per loop
On Python 3, to get the string dtype array, we need to pad the intermediate int dtype array with a few more zeros. So, the equivalent versions without and with numexpr for Python 3 ended up being something along these lines -
Method #1 (No numexpr) :
def create_inc_pattern(prefix_str, start, stop):
    """Build ``prefix_str`` + zero-padded ``i`` for every ``i`` in ``[start, stop)``.

    The strings are assembled as a 2-D array of character codes (one row per
    string) and then reinterpreted as a fixed-width unicode array, so no
    Python-level loop over the numbers is needed.

    Parameters
    ----------
    prefix_str : str
        Text placed in front of every number.
    start, stop : int
        Half-open range of non-negative integers to render.

    Returns
    -------
    numpy.ndarray
        1-D array of dtype ``'U<len(prefix_str) + W>'``. ``W`` is the number
        of digits of ``stop`` itself, so a ``stop`` that is an exact power of
        ten yields one extra leading zero.

    Raises
    ------
    ValueError
        If ``start`` is negative (the digit arithmetic has no sign handling).
    """
    if start < 0:
        raise ValueError('start must be non-negative')
    N = stop - start  # count of numbers
    # Width of numeral part; equals ceil(log10(stop + 1)) for stop >= 1 but
    # is computed with integers to avoid float rounding.
    W = len(str(max(stop, 0)))
    dl = len(prefix_str) + W  # characters per output string
    if N <= 0:
        return np.empty(0, dtype='U' + str(dl))
    # One uint32 per character (same layout as NumPy's 'U' dtype): the prefix
    # code points followed by W copies of '0' (code 48). Working in uint32
    # directly avoids np.fromstring and any byte-order-dependent padding.
    a0 = np.r_[
        np.array([ord(ch) for ch in prefix_str], dtype=np.uint32),
        np.full(W, 48, dtype=np.uint32),
    ]
    r = np.arange(start, stop, dtype=np.int64)
    # Decimal digits of every number, most significant first, shape (N, W).
    addn = (r[:, None] // 10 ** np.arange(W - 1, -1, -1, dtype=np.int64)) % 10
    a1 = np.repeat(a0[None], N, axis=0)
    # '0' + digit gives the character code of that digit.
    a1[:, len(prefix_str):] += addn.astype(np.uint32)
    # Each row of dl uint32 values is exactly one 'U<dl>' item.
    return a1.view('U' + str(dl)).ravel()
Method #2 (With numexpr) :
import numexpr as ne
def create_inc_pattern_numexpr(prefix_str, start, stop):
    """Build ``prefix_str`` + zero-padded ``i`` for ``i`` in ``[start, stop)`` using numexpr.

    Same construction as the pure-NumPy variant (a 2-D array of character
    codes reinterpreted as fixed-width unicode strings), but the broadcasted
    divide/modulo that extracts the digits is evaluated by numexpr.

    Parameters
    ----------
    prefix_str : str
        Text placed in front of every number.
    start, stop : int
        Half-open range of non-negative integers to render.

    Returns
    -------
    numpy.ndarray
        1-D array of dtype ``'U<len(prefix_str) + W>'``. ``W`` is the number
        of digits of ``stop`` itself, so a ``stop`` that is an exact power of
        ten yields one extra leading zero.

    Raises
    ------
    ValueError
        If ``start`` is negative (the digit arithmetic has no sign handling).
    """
    if start < 0:
        raise ValueError('start must be non-negative')
    N = stop - start  # count of numbers
    # Width of numeral part; equals ceil(log10(stop + 1)) for stop >= 1 but
    # is computed with integers to avoid float rounding.
    W = len(str(max(stop, 0)))
    dl = len(prefix_str) + W  # characters per output string
    if N <= 0:
        return np.empty(0, dtype='U' + str(dl))
    # One uint32 per character (same layout as NumPy's 'U' dtype): the prefix
    # code points followed by W copies of '0' (code 48). Working in uint32
    # directly avoids np.fromstring and any byte-order-dependent padding.
    a0 = np.r_[
        np.array([ord(ch) for ch in prefix_str], dtype=np.uint32),
        np.full(W, 48, dtype=np.uint32),
    ]
    r = np.arange(start, stop, dtype=np.int64)
    # The expression string below refers to these two locals by name, so
    # they must keep the names r2D and s.
    r2D = r[:, None]
    s = 10 ** np.arange(W - 1, -1, -1, dtype=np.int64)
    addn = ne.evaluate('(r2D/s)%10')
    a1 = np.repeat(a0[None], N, axis=0)
    # The cast discards any fractional part, leaving the digit value;
    # '0' + digit is the character code of that digit.
    a1[:, len(prefix_str):] += addn.astype(np.uint32)
    # Each row of dl uint32 values is exactly one 'U<dl>' item.
    return a1.view('U' + str(dl)).ravel()
Timings -
In [8]: N = 100000
In [9]: %timeit ['str_%s'%i for i in range(N)]
100 loops, best of 3: 18.5 ms per loop
In [10]: %timeit create_inc_pattern_numexpr(prefix_str='str_', start=1, stop=N)
100 loops, best of 3: 6.06 ms per loop