I faced the same issue today and spent a few hours searching Google for a solution. Finally, I came up with a workaround like this:
import numpy as np
import pandas as pd
import time
def foo(text):
    """Return ``text`` as a string with ' is processed' appended.

    Args:
        text: Any value; it is converted with ``str()`` before the suffix
            is added.

    Returns:
        A newly built string. The argument itself is not modified.
    """
    return str(text) + ' is processed'
def func1(data):
    """Row callback that returns the processed text without modifying the row.

    Prints ``run1`` on every call so the number of invocations is visible
    in the program output.

    Args:
        data: An object supporting ``data['text']``; ``test_one`` passes
            this function to ``DataFrame.apply(..., axis=1)``.

    Returns:
        The value of ``foo(data['text'])``.
    """
    print("run1")
    # Only reads from the row; the new string is built inside foo().
    return foo(data['text'])
def func2(data):
    """Row callback that appends ' is processed' to ``data['text']`` in place.

    Prints ``run2`` on every call so the number of invocations is visible
    in the program output.

    Args:
        data: An object supporting item get/set on the key ``'text'``;
            ``test_two`` passes this function to
            ``DataFrame.apply(..., axis=1)``.

    Returns:
        The same ``data`` object that was passed in, after modification.
    """
    print("run2")
    # NOTE: this writes back into the object it was handed, so calling it
    # twice on the same row appends the suffix twice. That behaviour is
    # kept on purpose: this function is the counter-example to func1.
    data['text'] = data['text'] + ' is processed'
    return data
def test_one():
    """Apply ``func1`` row-wise to a three-row frame; print timing and result.

    Builds a DataFrame with a single ``'text'`` column whose three rows all
    hold the string ``'text'``, runs ``apply(func1, axis=1)``, then prints
    the elapsed seconds followed by the result of the apply call.
    """
    data = pd.DataFrame(columns=['text'], index=np.arange(0, 3))
    data['text'] = 'text'
    # perf_counter() is the clock intended for measuring short intervals.
    start = time.perf_counter()
    data = data.apply(func1, axis=1)
    print(time.perf_counter() - start)
    print(data)
def test_two():
    """Apply ``func2`` row-wise to a three-row frame; print timing and result.

    Builds a DataFrame with a single ``'text'`` column whose three rows all
    hold the string ``'text'``, runs ``apply(func2, axis=1)``, then prints
    the elapsed seconds followed by the result of the apply call.
    """
    data = pd.DataFrame(columns=['text'], index=np.arange(0, 3))
    data['text'] = 'text'
    # perf_counter() is the clock intended for measuring short intervals.
    start = time.perf_counter()
    data = data.apply(func2, axis=1)
    print(time.perf_counter() - start)
    print(data)
# Run both demonstrations in order: the func1 variant, then the func2 variant.
for demo in (test_one, test_two):
    demo()
If you run the program, you will see output like this:
run1
run1
run1
0.0029706954956054688
0 text is processed
1 text is processed
2 text is processed
dtype: object
run2
run2
run2
run2
0.0049877166748046875
text
0 text is processed is processed
1 text is processed
2 text is processed
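In the func2 run, the function is called four times for three rows (note the extra `run2`), and because func2 modifies the row in place, the first row ends up with the suffix appended twice. By splitting the function (func2) into func1 and foo, the first row is processed only once.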