I have a 20 x 4000 dataframe in Python using pandas. Two of these columns are named Year
and quarter
. I\'d like to create a variable called p
The method cat() of the .str accessor works really well for this:
>>> import pandas as pd
>>> df = pd.DataFrame([["2014", "q1"],
... ["2015", "q3"]],
... columns=('Year', 'Quarter'))
>>> print(df)
Year Quarter
0 2014 q1
1 2015 q3
>>> df['Period'] = df.Year.str.cat(df.Quarter)
>>> print(df)
Year Quarter Period
0 2014 q1 2014q1
1 2015 q3 2015q3
cat()
even allows you to add a separator so, for example, suppose you only have integers for year and period, you can do this:
>>> import pandas as pd
>>> df = pd.DataFrame([[2014, 1],
... [2015, 3]],
... columns=('Year', 'Quarter'))
>>> print(df)
Year Quarter
0 2014 1
1 2015 3
>>> df['Period'] = df.Year.astype(str).str.cat(df.Quarter.astype(str), sep='q')
>>> print(df)
Year Quarter Period
0 2014 1 2014q1
1 2015 3 2015q3
Joining multiple columns is just a matter of passing either a list of series or a dataframe containing all but the first column as a parameter to str.cat()
invoked on the first column (Series):
>>> df = pd.DataFrame(
... [['USA', 'Nevada', 'Las Vegas'],
... ['Brazil', 'Pernambuco', 'Recife']],
... columns=['Country', 'State', 'City'],
... )
>>> df['AllTogether'] = df['Country'].str.cat(df[['State', 'City']], sep=' - ')
>>> print(df)
Country State City AllTogether
0 USA Nevada Las Vegas USA - Nevada - Las Vegas
1 Brazil Pernambuco Recife Brazil - Pernambuco - Recife
Do note that if your pandas dataframe/series has null values, you need to include the parameter na_rep to replace the NaN values with a string, otherwise the combined column will default to NaN.
my take....
listofcols = ['col1','col2','col3']
df['combined_cols'] = ''
for column in listofcols:
df['combined_cols'] = df['combined_cols'] + ' ' + df[column]
'''
Using zip
could be even quicker:
df["period"] = [''.join(i) for i in zip(df["Year"].map(str),df["quarter"])]
Graph:
import pandas as pd
import numpy as np
import timeit
import matplotlib.pyplot as plt
from collections import defaultdict
df = pd.DataFrame({'Year': ['2014', '2015'], 'quarter': ['q1', 'q2']})
myfuncs = {
"df['Year'].astype(str) + df['quarter']":
lambda: df['Year'].astype(str) + df['quarter'],
"df['Year'].map(str) + df['quarter']":
lambda: df['Year'].map(str) + df['quarter'],
"df.Year.str.cat(df.quarter)":
lambda: df.Year.str.cat(df.quarter),
"df.loc[:, ['Year','quarter']].astype(str).sum(axis=1)":
lambda: df.loc[:, ['Year','quarter']].astype(str).sum(axis=1),
"df[['Year','quarter']].astype(str).sum(axis=1)":
lambda: df[['Year','quarter']].astype(str).sum(axis=1),
"df[['Year','quarter']].apply(lambda x : '{}{}'.format(x[0],x[1]), axis=1)":
lambda: df[['Year','quarter']].apply(lambda x : '{}{}'.format(x[0],x[1]), axis=1),
"[''.join(i) for i in zip(dataframe['Year'].map(str),dataframe['quarter'])]":
lambda: [''.join(i) for i in zip(df["Year"].map(str),df["quarter"])]
}
d = defaultdict(dict)
step = 10
cont = True
while cont:
lendf = len(df); print(lendf)
for k,v in myfuncs.items():
iters = 1
t = 0
while t < 0.2:
ts = timeit.repeat(v, number=iters, repeat=3)
t = min(ts)
iters *= 10
d[k][lendf] = t/iters
if t > 2: cont = False
df = pd.concat([df]*step)
pd.DataFrame(d).plot().legend(loc='upper center', bbox_to_anchor=(0.5, -0.15))
plt.yscale('log'); plt.xscale('log'); plt.ylabel('seconds'); plt.xlabel('df rows')
plt.show()
df = pd.DataFrame({'Year': ['2014', '2015'], 'quarter': ['q1', 'q2']})
df['period'] = df[['Year', 'quarter']].apply(lambda x: ''.join(x), axis=1)
Yields this dataframe
Year quarter period
0 2014 q1 2014q1
1 2015 q2 2015q2
This method generalizes to an arbitrary number of string columns by replacing df[['Year', 'quarter']]
with any column slice of your dataframe, e.g. df.iloc[:,0:2].apply(lambda x: ''.join(x), axis=1)
.
You can check more information about apply() method here
Use of a lamba function this time with string.format().
import pandas as pd
df = pd.DataFrame({'Year': ['2014', '2015'], 'Quarter': ['q1', 'q2']})
print df
df['YearQuarter'] = df[['Year','Quarter']].apply(lambda x : '{}{}'.format(x[0],x[1]), axis=1)
print df
Quarter Year
0 q1 2014
1 q2 2015
Quarter Year YearQuarter
0 q1 2014 2014q1
1 q2 2015 2015q2
This allows you to work with non-strings and reformat values as needed.
import pandas as pd
df = pd.DataFrame({'Year': ['2014', '2015'], 'Quarter': [1, 2]})
print df.dtypes
print df
df['YearQuarter'] = df[['Year','Quarter']].apply(lambda x : '{}q{}'.format(x[0],x[1]), axis=1)
print df
Quarter int64
Year object
dtype: object
Quarter Year
0 1 2014
1 2 2015
Quarter Year YearQuarter
0 1 2014 2014q1
1 2 2015 2015q2
generalising to multiple columns, why not:
columns = ['whatever', 'columns', 'you', 'choose']
df['period'] = df[columns].astype(str).sum(axis=1)