Suppose I have a dataframe as follows:
In [1]: test_dup_df
Out[1]:
exe_price exe_vol flag
2008-03-13 14:41:07 84.5 200 yes
2008-03-13
I like @waitingkuo's answer because it is very clear and readable.
I'm keeping this answer around anyway because it does appear to be faster -- at least with pandas version 0.10.0. The situation may (hopefully) change in the future, so be sure to rerun the benchmark, especially if you are using a different version of pandas.
import pandas as pd
import io
import timeit
# Sample trade ticks. Two rows share the 14:42:10 timestamp; those are the
# duplicates that the functions below collapse into a single row.
data = '''\
date time exe_price exe_vol flag
2008-03-13 14:41:07 84.5 200 yes
2008-03-13 14:41:37 85.0 10000 yes
2008-03-13 14:41:38 84.5 69700 yes
2008-03-13 14:41:39 84.5 1200 yes
2008-03-13 14:42:00 84.5 1000 yes
2008-03-13 14:42:08 84.5 300 yes
2008-03-13 14:42:10 10 88100 yes
2008-03-13 14:42:10 100 11900 yes
2008-03-13 14:42:15 84.5 5000 yes
2008-03-13 14:42:16 84.5 3200 yes'''

# io.StringIO rather than io.BytesIO: `data` is a str, and BytesIO(str)
# raises TypeError on Python 3. The separator is a raw string so that `\s`
# is not treated as an (invalid) string escape.
df = pd.read_table(io.StringIO(data), sep=r'\s+')
# Combine the separate date and time columns into one datetime index by
# hand; the nested-list form parse_dates=[[0, 1]] is deprecated in recent
# pandas releases. The index keeps the name 'date_time'.
df['date_time'] = pd.to_datetime(df.pop('date') + ' ' + df.pop('time'))
df = df.set_index('date_time')
def func(subf):
    """Collapse one group of rows into a single summary row.

    Parameters
    ----------
    subf : pandas.DataFrame
        Rows of one group; must provide 'exe_price' and 'exe_vol' columns.

    Returns
    -------
    pandas.Series
        Indexed by 'exe_price', 'exe_vol' and 'flag':
        the volume-weighted average price, the summed volume, and a
        constant True flag.
    """
    exe_vol = subf['exe_vol'].sum()
    if exe_vol == 0:
        # No volume to weight by: report NaN explicitly instead of dividing
        # by zero (which would emit a RuntimeWarning and yield NaN or inf).
        exe_price = float('nan')
    else:
        exe_price = (subf['exe_price'] * subf['exe_vol']).sum() / exe_vol
    flag = True
    return pd.Series([exe_price, exe_vol, flag],
                     index=['exe_price', 'exe_vol', 'flag'])
def using_apply():
    """Collapse rows of the module-level ``df`` that share an index value.

    Groups ``df`` by its own index and applies ``func`` to every group,
    returning whatever ``groupby(...).apply`` assembles from the results.
    """
    by_timestamp = df.groupby(df.index)
    return by_timestamp.apply(func)
def using_helper_column():
    """Collapse rows of ``df`` that share an index value using plain sums.

    Computes a price*volume helper column, sums it together with the volume
    per index value, and divides the two sums to get the volume-weighted
    average price of each group.

    Returns
    -------
    pandas.DataFrame
        One row per distinct index value with the columns 'exe_vol'
        (summed volume), 'exe_price' (volume-weighted average price) and
        'flag' (constant True), in that order.
    """
    # Build the helper column on a copy: assigning df['weight'] directly
    # would leave a stray column on the shared module-level frame.
    weighted = df.assign(weight=df['exe_price'] * df['exe_vol'])
    result = weighted.groupby(level=0).agg({'weight': 'sum', 'exe_vol': 'sum'})
    result['exe_price'] = result['weight'] / result['exe_vol']
    result['flag'] = True
    # The helper column is only an intermediate; do not expose it.
    return result.drop(['weight'], axis=1)
# Print the collapsed frame produced by each approach.
result = using_apply()
print(result)
result = using_helper_column()
print(result)

# Time 1000 calls of each approach. The setup string imports the running
# script as `m` so the timed statement can reach the functions above.
time_apply = timeit.timeit(
    stmt='m.using_apply()',
    setup='import __main__ as m ',
    number=1000)
time_helper = timeit.timeit(
    stmt='m.using_helper_column()',
    setup='import __main__ as m ',
    number=1000)

print('using_apply: {t}'.format(t=time_apply))
print('using_helper_column: {t}'.format(t=time_helper))
yields
exe_vol exe_price flag
date_time
2008-03-13 14:41:07 200 84.50 True
2008-03-13 14:41:37 10000 85.00 True
2008-03-13 14:41:38 69700 84.50 True
2008-03-13 14:41:39 1200 84.50 True
2008-03-13 14:42:00 1000 84.50 True
2008-03-13 14:42:08 300 84.50 True
2008-03-13 14:42:10 100000 20.71 True
2008-03-13 14:42:15 5000 84.50 True
2008-03-13 14:42:16 3200 84.50 True
with timeit benchmarks of:
using_apply: 3.0081038475
using_helper_column: 1.35300707817