I have a DataFrame, but all of its rows are duplicated, and when I try to plot the graph it contains duplicated columns. I tried to delete them, but then my graph is drawn incorrectly. My CSV is
You can first add a new column sort in function f, then sort the values by the column pair of websites, and finally call drop_duplicates on the columns used_at and sort:
import pandas as pd
import itertools
# Load the raw data. The column at position 2 is parsed as datetimes
# (presumably `used_at`, which is accessed through `.dt` later - confirm
# against the CSV header).
df = pd.read_csv("avito_trend.csv", parse_dates=[2])
def f(df):
    """Count the users shared by every pair of websites, in both name orders.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``address`` column (website name, a string) and an
        ``ID`` column (user identifier).

    Returns
    -------
    pandas.DataFrame
        Two rows per unordered pair of distinct addresses - one labelled
        ``"A and B"`` and one labelled ``"B and A"`` - with the columns
        ``common users`` (number of distinct IDs seen on both addresses),
        ``pair of websites`` and ``sort`` (1-based number of the pair, shared
        by both orderings).  Every row carries the index label 0.  When
        fewer than two distinct addresses are present an empty frame with
        the same columns is returned.
    """
    columns = ['common users', 'pair of websites', 'sort']
    # Pairs follow the order of first appearance in `df`, which may differ
    # between groups; that is why both orderings of each pair are emitted.
    addresses = list(df['address'].unique())
    # Build the ID set of every address once instead of once per pair.
    ids_by_address = {
        address: set(df.loc[df['address'].isin([address]), 'ID'])
        for address in addresses
    }

    rows = []
    for i, (first, second) in enumerate(itertools.combinations(addresses, 2), 1):
        # Set intersection is symmetric, so one count serves both orderings.
        common = len(ids_by_address[first] & ids_by_address[second])
        for pair in ((first, second), (second, first)):
            rows.append({'common users': common,
                         'pair of websites': ' and '.join(pair),
                         'sort': i})

    if not rows:
        # pd.concat([]) used to raise ValueError here.
        return pd.DataFrame(columns=columns)
    # Keep the all-zero index the per-row frames used to produce.
    return pd.DataFrame(rows, columns=columns, index=[0] * len(rows))
# Run f once per year of `used_at`; the per-row index produced by f (level 1)
# is dropped and the year becomes the `used_at` column.
common_users = df.groupby([df['used_at'].dt.year]).apply(f).reset_index(drop=True, level=1).reset_index()
# f emits both name orders of every pair under the same `sort` id.  Sorting by
# pair name first makes drop_duplicates keep the alphabetically first spelling
# for each (year, sort id).
common_users = common_users.sort_values('pair of websites')
common_users = common_users.drop_duplicates(subset=['used_at', 'sort'])

# One row per pair of websites, one column per year.
graph_by_common_users = common_users.pivot(index='pair of websites', columns='used_at', values='common users')
# change order of columns
graph_by_common_users = graph_by_common_users[[2015, 2014]]
graph_by_common_users = graph_by_common_users.sort_values(2014, ascending=False)

ax = graph_by_common_users.plot(kind='barh', width=0.5, figsize=(10, 20))
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(25)

# Annotate every bar with its value.  The values are listed column by column,
# which assumes ax.patches is ordered the same way (all bars of the first
# column, then all bars of the second).
rects = ax.patches
bar_labels = [
    int(round(graph_by_common_users.loc[i, y]))
    for y in graph_by_common_users.columns.tolist()
    for i in graph_by_common_users.index
]
for rect, bar_label in zip(rects, bar_labels):
    ax.text(rect.get_width() + 20, rect.get_y() - 0.25 + rect.get_height(), bar_label, fontsize=8)

# sorting values of legend
handles, labels = ax.get_legend_handles_labels()
# sort both labels and handles by labels
labels, handles = zip(*sorted(zip(labels, handles), key=lambda t: t[0]))
ax.legend(handles, labels)
My graph:
EDIT:
Comment is:
Why did you create c_invert and x[1], x[0] = x[0], x[1]?
Because the combinations for the years 2014 and 2015 were different - 4 values were missing in the first column and 4 in the second:
used_at 2015 2014
pair of websites
avito.ru and drom.ru 1491.0 1716.0
avito.ru and auto.ru 1473.0 1602.0
avito.ru and e1.ru 1153.0 1364.0
drom.ru and auto.ru NaN 874.0
e1.ru and drom.ru 539.0 634.0
avito.ru and irr.ru/cars 403.0 602.0
avito.ru and am.ru 262.0 579.0
e1.ru and auto.ru 451.0 475.0
avito.ru and cars.mail.ru/sale 256.0 424.0
drom.ru and irr.ru/cars 277.0 423.0
auto.ru and irr.ru/cars 288.0 409.0
auto.ru and am.ru 224.0 408.0
drom.ru and am.ru 187.0 394.0
auto.ru and cars.mail.ru/sale 195.0 330.0
avito.ru and avtomarket.ru 205.0 299.0
drom.ru and cars.mail.ru/sale 189.0 292.0
drom.ru and avtomarket.ru 175.0 247.0
auto.ru and avtomarket.ru 162.0 243.0
e1.ru and irr.ru/cars 148.0 235.0
e1.ru and am.ru 99.0 224.0
am.ru and irr.ru/cars NaN 223.0
irr.ru/cars and cars.mail.ru/sale 94.0 197.0
am.ru and cars.mail.ru/sale NaN 166.0
e1.ru and cars.mail.ru/sale 105.0 154.0
e1.ru and avtomarket.ru 105.0 139.0
avtomarket.ru and irr.ru/cars NaN 139.0
avtomarket.ru and am.ru 72.0 133.0
avtomarket.ru and cars.mail.ru/sale 48.0 105.0
auto.ru and drom.ru 799.0 NaN
cars.mail.ru/sale and am.ru 73.0 NaN
irr.ru/cars and am.ru 102.0 NaN
irr.ru/cars and avtomarket.ru 73.0 NaN
Once I created all the inverted combinations, the problem was solved. But why are there NaN values? Why are the combinations different in 2014 and 2015?
I added a print to function f:
def f(df):
print df['address'].unique()
dfs = []
i = 0
for x in [list(x) for x in itertools.combinations((df['address'].unique()), 2)]:
...
...
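and the output was (the first group is printed twice because groupby.apply calls the function twice on the first group, as described in the warning in the pandas documentation):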
['avito.ru' 'e1.ru' 'drom.ru' 'auto.ru' 'avtomarket.ru' 'am.ru'
'irr.ru/cars' 'cars.mail.ru/sale']
['avito.ru' 'e1.ru' 'drom.ru' 'auto.ru' 'avtomarket.ru' 'am.ru'
'irr.ru/cars' 'cars.mail.ru/sale']
['avito.ru' 'e1.ru' 'auto.ru' 'drom.ru' 'irr.ru/cars' 'avtomarket.ru'
'cars.mail.ru/sale' 'am.ru']
So the lists are ordered differently, and therefore the combinations differ too -> I get some NaN values.
The solution is to sort the list of addresses before building the combinations.
def f(df):
#print (sorted(df['address'].unique()))
dfs = []
for x in [list(x) for x in itertools.combinations(sorted(df['address'].unique()), 2)]:
c1 = df.loc[df['address'].isin([x[0]]), 'ID']
...
...
All code is:
import pandas as pd
import itertools
# Load the raw data. The column at position 2 is parsed as datetimes
# (presumably `used_at`, which is accessed through `.dt` later - confirm
# against the CSV header).
df = pd.read_csv("avito_trend.csv", parse_dates=[2])
def f(df):
    """Count the users shared by every pair of websites.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``address`` column (website name, a string) and an
        ``ID`` column (user identifier).

    Returns
    -------
    pandas.DataFrame
        One row per unordered pair of distinct addresses with the columns
        ``common users`` (number of distinct IDs seen on both addresses) and
        ``pair of websites`` (``"A and B"``).  Every row carries the index
        label 0.  When fewer than two distinct addresses are present an
        empty frame with the same columns is returned.
    """
    columns = ['common users', 'pair of websites']
    # Sorting makes the pair labels identical for every group, regardless of
    # the order in which the addresses first appear in `df`.
    addresses = sorted(df['address'].unique())
    # Build the ID set of every address once instead of once per pair.
    ids_by_address = {
        address: set(df.loc[df['address'].isin([address]), 'ID'])
        for address in addresses
    }

    rows = [
        {'common users': len(ids_by_address[first] & ids_by_address[second]),
         'pair of websites': ' and '.join((first, second))}
        for first, second in itertools.combinations(addresses, 2)
    ]

    if not rows:
        # pd.concat([]) used to raise ValueError here.
        return pd.DataFrame(columns=columns)
    # Keep the all-zero index the per-row frames used to produce.
    return pd.DataFrame(rows, columns=columns, index=[0] * len(rows))
# Run f once per year of `used_at`; the per-row index produced by f (level 1)
# is dropped and the year becomes the `used_at` column.
common_users = df.groupby([df['used_at'].dt.year]).apply(f).reset_index(drop=True, level=1).reset_index()

# One row per pair of websites, one column per year.
graph_by_common_users = common_users.pivot(index='pair of websites', columns='used_at', values='common users')
# change order of columns
graph_by_common_users = graph_by_common_users[[2015, 2014]]
graph_by_common_users = graph_by_common_users.sort_values(2014, ascending=False)

ax = graph_by_common_users.plot(kind='barh', width=0.5, figsize=(10, 20))
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(25)

# Annotate every bar with its value.  The values are listed column by column,
# which assumes ax.patches is ordered the same way (all bars of the first
# column, then all bars of the second).
rects = ax.patches
bar_labels = [
    int(round(graph_by_common_users.loc[i, y]))
    for y in graph_by_common_users.columns.tolist()
    for i in graph_by_common_users.index
]
for rect, bar_label in zip(rects, bar_labels):
    ax.text(rect.get_width()+20, rect.get_y() - 0.25 + rect.get_height(), bar_label, fontsize=8)

handles, labels = ax.get_legend_handles_labels()
# sort both labels and handles by labels
labels, handles = zip(*sorted(zip(labels, handles), key=lambda t: t[0]))
ax.legend(handles, labels)
And graph:
It looks like your DataFrame is not structured the way you would like it to be. Your DataFrame contains 2014 and 2015 as column header names, not as row values of the used_at index. Also, used_at is the index name, not the index label of the first row.
You can verify this by executing:
import pandas as pd
from cStringIO import StringIO
text_data = '''
used_at 2014 2015
address
am.ru 621 273
auto.ru 1752 1595
avito.ru 5460 4631
avtomarket.ru 314 215
cars.mail.ru/sale 457 271
drom.ru 1934 1623
e1.ru 1654 1359
irr.ru/cars 619 426
'''
# Read in tabular data with used_at row as header
df = pd.read_table(StringIO(text_data), sep='\s+', index_col=0)
print 'DataFrame created with used_at row as header:'
print df
print
# print df.used_at would cause AttributeError: 'DataFrame' object has no attribute 'used_at'
print 'df columns :', df.columns
print 'df index name :', df.index.name
print
DataFrame created with used_at row as header:
2014 2015
used_at
address NaN NaN
am.ru 621 273
auto.ru 1752 1595
avito.ru 5460 4631
avtomarket.ru 314 215
cars.mail.ru/sale 457 271
drom.ru 1934 1623
e1.ru 1654 1359
irr.ru/cars 619 426
df columns : Index([u'2014', u'2015'], dtype='object')
df index name : used_at