《利用python进行数据分析》之数据处理

三世轮回 提交于 2019-11-30 16:53:51

 

 

 

In [1]:
import pandas as pd
import numpy as np
 

删除缺失值

In [2]:
df = pd.DataFrame(np.random.randn(6,3))
df.iloc[:4,1] = np.NaN
df.iloc[:2,2] = np.NaN
df
Out[2]:
 
 012
0 0.230148 NaN NaN
1 0.967690 NaN NaN
2 0.296245 NaN -0.905187
3 1.204742 NaN 1.765566
4 0.463110 1.423042 1.858276
5 1.272618 -0.033120 -1.269821
In [3]:
#有2个以上的nan才删除
df.dropna(thresh=2)
Out[3]:
 
 012
2 0.296245 NaN -0.905187
3 1.204742 NaN 1.765566
4 0.463110 1.423042 1.858276
5 1.272618 -0.033120 -1.269821
In [4]:
df.dropna(axis=1,thresh=3)
Out[4]:
 
 02
0 0.230148 NaN
1 0.967690 NaN
2 0.296245 -0.905187
3 1.204742 1.765566
4 0.463110 1.858276
5 1.272618 -1.269821
 

补全缺失值

In [5]:
df.fillna(method='bfill',limit=2)
Out[5]:
 
 012
0 0.230148 NaN -0.905187
1 0.967690 NaN -0.905187
2 0.296245 1.423042 -0.905187
3 1.204742 1.423042 1.765566
4 0.463110 1.423042 1.858276
5 1.272618 -0.033120 -1.269821
 

删除重复值

In [6]:
df = pd.DataFrame({'k1':['one','two']*3+['two'],'k2':[1,1,2,3,3,4,4]})
df
Out[6]:
 
 k1k2
0 one 1
1 two 1
2 one 2
3 two 3
4 one 3
5 two 4
6 two 4
In [7]:
df.duplicated()
Out[7]:
0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool
In [8]:
df.drop_duplicates()
Out[8]:
 
 k1k2
0 one 1
1 two 1
2 one 2
3 two 3
4 one 3
5 two 4
In [9]:
df.drop_duplicates(keep='last')
Out[9]:
 
 k1k2
0 one 1
1 two 1
2 one 2
3 two 3
4 one 3
6 two 4
In [10]:
df.drop_duplicates(['k2'])
Out[10]:
 
 k1k2
0 one 1
2 one 2
3 two 3
5 two 4
 

替代值

In [11]:
df.replace({'one':1,'two':2})
Out[11]:
 
 k1k2
0 1 1
1 2 1
2 1 2
3 2 3
4 1 3
5 2 4
6 2 4
 

重命名索引

In [12]:
df.rename(index={6:'repeat'},columns={'k1':'one','k2':'two'})
Out[12]:
 
 onetwo
0 one 1
1 two 1
2 one 2
3 two 3
4 one 3
5 two 4
repeat two 4
 

离散化和分箱

In [13]:
ages = [20,22,25,27,21,23,37,31,61,45,41,32]
bins = [18,25,35,60,100]
cats = pd.cut(ages,bins)
cats
Out[13]:
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
In [14]:
cats.codes #ages的数据标签
Out[14]:
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
In [15]:
cats.categories
Out[15]:
IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')
In [16]:
pd.value_counts(cats)
Out[16]:
(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64
In [17]:
#改为左闭右开
pd.cut(ages,bins,right=False)
Out[17]:
[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]
In [18]:
#自定义箱名
pd.cut(ages,bins,labels=['Youth','YoungAdult','MiddleAged','Senior'])
Out[18]:
[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]
In [19]:
#定义箱子个数
data = np.random.rand(20)
pd.cut(data,4,precision=2)
Out[19]:
[(0.26, 0.5], (0.018, 0.26], (0.26, 0.5], (0.26, 0.5], (0.26, 0.5], ..., (0.5, 0.74], (0.5, 0.74], (0.74, 0.98], (0.018, 0.26], (0.74, 0.98]]
Length: 20
Categories (4, interval[float64]): [(0.018, 0.26] < (0.26, 0.5] < (0.5, 0.74] < (0.74, 0.98]]
In [20]:
#根据样本的分位数进行分箱
data = np.random.randn(1000)
cats = pd.qcut(data,4)
cats
Out[20]:
[(-3.302, -0.702], (-3.302, -0.702], (-0.702, 0.00318], (0.713, 3.516], (-0.702, 0.00318], ..., (-0.702, 0.00318], (0.713, 3.516], (0.00318, 0.713], (0.00318, 0.713], (0.00318, 0.713]]
Length: 1000
Categories (4, interval[float64]): [(-3.302, -0.702] < (-0.702, 0.00318] < (0.00318, 0.713] < (0.713, 3.516]]
In [21]:
pd.value_counts(cats)
Out[21]:
(0.713, 3.516]       250
(0.00318, 0.713]     250
(-0.702, 0.00318]    250
(-3.302, -0.702]     250
dtype: int64
In [22]:
#自定义分位数
pd.qcut(data,[0,0.1,0.5,0.9,1.])
Out[22]:
[(-1.379, 0.00318], (-3.302, -1.379], (-1.379, 0.00318], (0.00318, 1.264], (-1.379, 0.00318], ..., (-1.379, 0.00318], (0.00318, 1.264], (0.00318, 1.264], (0.00318, 1.264], (0.00318, 1.264]]
Length: 1000
Categories (4, interval[float64]): [(-3.302, -1.379] < (-1.379, 0.00318] < (0.00318, 1.264] < (1.264, 3.516]]
 

检测和过滤异常值

In [23]:
data = pd.DataFrame(np.random.randn(1000,4))
#拥有绝对值大于3的行
data[(np.abs(data)>3).any(1)]
Out[23]:
 
 0123
55 -0.378616 0.023351 -0.390768 3.044924
73 2.223267 0.135439 -2.165250 3.190073
96 -0.442352 -0.241786 -3.061948 0.596391
199 -1.548635 -3.035343 0.090680 0.204924
351 3.095982 -1.234948 -1.707739 2.320149
376 -0.202145 -3.108550 0.066009 -0.571813
377 3.428047 -0.972084 -0.128787 0.898637
547 -2.163697 0.421286 -1.506587 -3.123146
638 -4.669923 0.938394 -0.719577 -1.615580
809 0.404150 -0.106034 -0.022086 3.192218
922 0.045469 0.224181 3.349797 -1.413839
972 0.438964 -0.577106 1.240444 3.074443
In [24]:
data.iloc[500] = [4,5,6,-7]
#绝对值都大于3的行
data[(np.abs(data)>3).all(1)]
Out[24]:
 
 0123
500 4.0 5.0 6.0 -7.0
In [25]:
np.sign(data).head()
Out[25]:
 
 0123
0 1.0 -1.0 1.0 1.0
1 -1.0 1.0 1.0 1.0
2 1.0 -1.0 1.0 1.0
3 1.0 -1.0 1.0 -1.0
4 -1.0 -1.0 1.0 1.0
 

随机抽样

In [26]:
data.sample(n=5)
Out[26]:
 
 0123
508 0.945668 -1.054974 -0.485197 0.400421
68 1.679209 -0.545147 -0.329424 -0.140010
356 1.756743 0.583158 0.201458 -0.622671
760 1.532128 -0.677541 -0.095587 -0.489332
194 0.960248 -0.016510 0.116253 0.070162
 

类别统计

In [27]:
df = pd.DataFrame({'key':['b','b','a','c','a','b'],'data':range(6)})
df
Out[27]:
 
 keydata
0 b 0
1 b 1
2 a 2
3 c 3
4 a 4
5 b 5
In [28]:
pd.get_dummies(df['key'])
Out[28]:
 
 abc
0 0 1 0
1 0 1 0
2 1 0 0
3 0 0 1
4 1 0 0
5 0 1 0
In [29]:
#一行属于多个类别时,比较复杂

movies = pd.read_csv('datasets/movielens/movies.dat',sep='::',header= None,names=['movie_id','title','genres'])
movies.head()
 
/Users/finnchan/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:3: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  This is separate from the ipykernel package so we can avoid doing imports until
Out[29]:
 
 movie_idtitlegenres
0 1 Toy Story (1995) Animation|Children's|Comedy
1 2 Jumanji (1995) Adventure|Children's|Fantasy
2 3 Grumpier Old Men (1995) Comedy|Romance
3 4 Waiting to Exhale (1995) Comedy|Drama
4 5 Father of the Bride Part II (1995) Comedy
In [30]:
all_genres = []
for genres in movies['genres']:
    all_genres.extend(genres.split('|'))
genres = pd.unique(all_genres)    
In [31]:
genres
Out[31]:
array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)
In [32]:
count_df = pd.DataFrame(np.zeros(shape=(len(movies),len(genres))),columns=genres)
count_df.head()
Out[32]:
 
 AnimationChildren'sComedyAdventureFantasyRomanceDramaActionCrimeThrillerHorrorSci-FiDocumentaryWarMusicalMysteryFilm-NoirWestern
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
In [33]:
#遍历分类可以避免遍历数据较多的movies,速度较快

for genre in genres:
    count_df.loc[movies['genres'].str.contains(genre),genre] = 1
count_df.head()
Out[33]:
 
 AnimationChildren'sComedyAdventureFantasyRomanceDramaActionCrimeThrillerHorrorSci-FiDocumentaryWarMusicalMysteryFilm-NoirWestern
0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
 

正则表达式

%E5%9B%BE%E7%89%87.png

In [34]:
val = 'a,b,  guido'
val.index(',')
Out[34]:
1
In [35]:
#index找不到时会报错,find找不到时返回-1
val.find(':')
Out[35]:
-1
In [36]:
import re
text = 'foo  bar\t baz  \tqux'
#正则表达式对象
regex = re.compile('\s+')
regex.findall(text)
Out[36]:
['  ', '\t ', '  \t']
In [37]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
#pattern里加上括号返回的是元组
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern=pattern,flags=re.IGNORECASE)
regex.findall(text)
Out[37]:
[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]
In [38]:
print(regex.sub(r'Username:\1,Domain:\2,Suffix:\3',text))
 
Dave Username:dave,Domain:google,Suffix:com
Steve Username:steve,Domain:gmail,Suffix:com
Rob Username:rob,Domain:gmail,Suffix:com
Ryan Username:ryan,Domain:yahoo,Suffix:com

 

数据合并

In [39]:
left = pd.DataFrame({'key':['a','b','a','a','b','c'],'value':range(6)})
left
Out[39]:
 
 keyvalue
0 a 0
1 b 1
2 a 2
3 a 3
4 b 4
5 c 5
In [40]:
right = pd.DataFrame({'group_val':[3.5,7]},index=['a','b'])
right
Out[40]:
 
 group_val
a 3.5
b 7.0
In [41]:
#使用right的行索引作为它的连接键
pd.merge(left,right,left_on='key',right_index=True)
Out[41]:
 
 keyvaluegroup_val
0 a 0 3.5
2 a 2 3.5
3 a 3 3.5
1 b 1 7.0
4 b 4 7.0
In [42]:
#不存在相同索引时,可以考虑concat

df1 = pd.DataFrame(np.arange(6).reshape(3,2),index=list('abc'),columns=['one','two'])
df1
Out[42]:
 
 onetwo
a 0 1
b 2 3
c 4 5
In [43]:
df2 = pd.DataFrame(np.arange(5,9).reshape(2,2),index=list('ac'),columns=['three','four'])
df2
Out[43]:
 
 threefour
a 5 6
c 7 8
In [44]:
pd.concat([df1,df2],keys=['df1','df2'],sort=True)
Out[44]:
 
  fouronethreetwo
df1a NaN 0.0 NaN 1.0
b NaN 2.0 NaN 3.0
c NaN 4.0 NaN 5.0
df2a 6.0 NaN 5.0 NaN
c 8.0 NaN 7.0 NaN
In [45]:
pd.concat([df1,df2],axis=1,keys=['df1','df2'],sort=True)
Out[45]:
 
 df1df2
 onetwothreefour
a 0 1 5.0 6.0
b 2 3 NaN NaN
c 4 5 7.0 8.0
In [46]:
#不保留原来的索引,而是产生新的索引
pd.concat([df1,df2],ignore_index=True,sort=True)
Out[46]:
 
 fouronethreetwo
0 NaN 0.0 NaN 1.0
1 NaN 2.0 NaN 3.0
2 NaN 4.0 NaN 5.0
3 6.0 NaN 5.0 NaN
4 8.0 NaN 7.0 NaN
In [47]:
df1 = pd.DataFrame({'a': [1., np.nan, 5., np.nan],
                    'b': [np.nan, 2., np.nan, 6.],
                    'c': range(2, 18, 4)})
df1
Out[47]:
 
 abc
0 1.0 NaN 2
1 NaN 2.0 6
2 5.0 NaN 10
3 NaN 6.0 14
In [48]:
df2 = pd.DataFrame({'a': [5., 4., np.nan, 3., 7.],
                    'b': [np.nan, 3., 4., 6., 8.]})
df2
Out[48]:
 
 ab
0 5.0 NaN
1 4.0 3.0
2 NaN 4.0
3 3.0 6.0
4 7.0 8.0
In [49]:
#相当于np.where(pd.isnull(a),b,a)
df1.combine_first(df2)
Out[49]:
 
 abc
0 1.0 NaN 2.0
1 4.0 2.0 6.0
2 5.0 4.0 10.0
3 3.0 6.0 14.0
4 7.0 8.0 NaN
 

数据透视

In [50]:
data = pd.read_csv('examples/macrodata.csv')
data.head()
Out[50]:
 
 yearquarterrealgdprealconsrealinvrealgovtrealdpicpim1tbilrateunemppopinflrealint
0 1959.0 1.0 2710.349 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 5.8 177.146 0.00 0.00
1 1959.0 2.0 2778.801 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 5.1 177.830 2.34 0.74
2 1959.0 3.0 2775.488 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 5.3 178.657 2.74 1.09
3 1959.0 4.0 2785.204 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 5.6 179.386 0.27 4.06
4 1960.0 1.0 2847.699 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 5.2 180.007 2.31 1.19
In [51]:
periods = pd.PeriodIndex(year=data.year,quarter=data.quarter,name='date')
periods
Out[51]:
PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1', '1960Q2',
             '1960Q3', '1960Q4', '1961Q1', '1961Q2',
             ...
             '2007Q2', '2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3',
             '2008Q4', '2009Q1', '2009Q2', '2009Q3'],
            dtype='period[Q-DEC]', name='date', length=203, freq='Q-DEC')
In [52]:
columns = pd.Index(['realgdp','infl','unemp'],name='item')
columns
Out[52]:
Index(['realgdp', 'infl', 'unemp'], dtype='object', name='item')
In [53]:
data = data.reindex(columns=columns)
data.head()
Out[53]:
 
itemrealgdpinflunemp
0 2710.349 0.00 5.8
1 2778.801 2.34 5.1
2 2775.488 2.74 5.3
3 2785.204 0.27 5.6
4 2847.699 2.31 5.2
In [54]:
data.index = periods.to_timestamp('D','e')
In [55]:
data.head()
Out[55]:
 
itemrealgdpinflunemp
date   
1959-03-31 23:59:59.999999999 2710.349 0.00 5.8
1959-06-30 23:59:59.999999999 2778.801 2.34 5.1
1959-09-30 23:59:59.999999999 2775.488 2.74 5.3
1959-12-31 23:59:59.999999999 2785.204 0.27 5.6
1960-03-31 23:59:59.999999999 2847.699 2.31 5.2
In [56]:
data = data.stack().reset_index().rename(columns={0:'value'})
data.head()
Out[56]:
 
 dateitemvalue
0 1959-03-31 23:59:59.999999999 realgdp 2710.349
1 1959-03-31 23:59:59.999999999 infl 0.000
2 1959-03-31 23:59:59.999999999 unemp 5.800
3 1959-06-30 23:59:59.999999999 realgdp 2778.801
4 1959-06-30 23:59:59.999999999 infl 2.340
In [57]:
data.pivot('date','item','value')[:5]#行,列,值
Out[57]:
 
iteminflrealgdpunemp
date   
1959-03-31 23:59:59.999999999 0.00 2710.349 5.8
1959-06-30 23:59:59.999999999 2.34 2778.801 5.1
1959-09-30 23:59:59.999999999 2.74 2775.488 5.3
1959-12-31 23:59:59.999999999 0.27 2785.204 5.6
1960-03-31 23:59:59.999999999 2.31 2847.699 5.2
In [58]:
df = pd.DataFrame({'key': ['foo', 'bar', 'baz'],
                   'A': [1, 2, 3],
                   'B': [4, 5, 6],
                   'C': [7, 8, 9]})
df
Out[58]:
 
 keyABC
0 foo 1 4 7
1 bar 2 5 8
2 baz 3 6 9
In [59]:
#pivot的反向操作
pd.melt(df,['key'])
Out[59]:
 
 keyvariablevalue
0 foo A 1
1 bar A 2
2 baz A 3
3 foo B 4
4 bar B 5
5 baz B 6
6 foo C 7
7 bar C 8
8 baz C 9
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!