pandas核心数据结构第二天

a 夏天 提交于 2020-02-08 09:20:03

一、Series

在这里插入图片描述

创建

# 1. Build a Series from five random normal values with explicit labels a-e.
labels = list('abcde')
b = pd.Series(np.random.randn(5), index=labels)
print(b)
# 2. Inspect the index (row labels) of the Series.
print(b.index)
# 3. When no index is passed, an integer index starting at 0 is used.
c = pd.Series(data=np.random.randn(5))
print(c)
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
a    0.168710
b    0.436219
c   -0.236762
d    0.426008
e   -0.951852
dtype: float64
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
0    1.506954
1    0.282373
2    0.963675
3    0.671374
4   -0.936167
dtype: float64

Process finished with exit code 0

# 1. Build a Series from a dict; the explicit index picks the row labels.
d = dict(a=0., b=1., d=3.)
row_labels = ['a', 'b', 'c', 'd']  # 'c' is not a key of d, so its value is NaN
b = pd.Series(d, index=row_labels)
print(b)
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
a    0.0
b    1.0
c    NaN
d    3.0
dtype: float64

Process finished with exit code 0

# 1. Build a Series from a scalar: the value is repeated for every row label.
b = pd.Series(5, index=['a', 'b', 'c', 'd'])
print(b)
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
a    5
b    5
c    5
d    5
dtype: int64

Process finished with exit code 0

支持ndarray数组操作

s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
print(s)
# 1. ndarray-style positional access.
#    The index holds string labels, so integer positions are routed through
#    .iloc; plain s[0] / s[[1, 3, 4]] depends on the positional fallback of
#    Series.__getitem__, which is deprecated in recent pandas releases.
print(s.iloc[0])
print(s.iloc[:3])  # the end position of a slice is excluded
print(s.iloc[2:5])
print(s.iloc[[1, 3, 4]])  # a list of integer positions is supported as well
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
a    1.147490
b   -0.084418
c   -0.777898
d   -0.488580
e   -0.146267
dtype: float64
1.1474904747052694
a    1.147490
b   -0.084418
c   -0.777898
dtype: float64
c   -0.777898
d   -0.488580
e   -0.146267
dtype: float64
b   -0.084418
d   -0.488580
e   -0.146267
dtype: float64

s = pd.Series(np.random.randn(5), index=list('abcde'))
print(s)
# 1. NumPy functions can be applied to a Series directly; the labels are kept.
sine = np.sin(s)
print(sine)
exponential = np.exp(s)
print(exponential)
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
a    2.122820
b   -0.081273
c   -0.144711
d   -1.461435
e    0.341797
dtype: float64
a    0.851465
b   -0.081183
c   -0.144206
d   -0.994026
e    0.335181
dtype: float64
a    8.354667
b    0.921942
c    0.865273
d    0.231903
e    1.407474
dtype: float64



s = pd.Series(np.random.randn(5), index=list('abcde'))
print(s)
# 3. Dict-style access: read and overwrite a value by its label.
print(s.loc['a'])
s.loc['b'] = 3
print(s)
# 4. Assigning to a label that does not exist yet appends a new entry.
s.loc['g'] = 100
print(s)
# 5. Reading a missing label with [] raises an error:
# print(s['f'])
# 6. get() returns None, or the supplied default, when the label is missing.
print(s.get('f'))
print(s.get('f', default=0))

结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
a    0.980320
b    0.547169
c    2.342473
d   -1.520565
e    0.589286
dtype: float64
0.9803196644381337
a    0.980320
b    3.000000
c    2.342473
d   -1.520565
e    0.589286
dtype: float64
a      0.980320
b      3.000000
c      2.342473
d     -1.520565
e      0.589286
g    100.000000
dtype: float64
None
0

Process finished with exit code 0

s1 = pd.Series(np.random.randn(3), index=list('ace'))
s2 = pd.Series(np.random.randn(3), index=list('ade'))
# 1. Print both Series; the two newlines in the template leave one blank line
#    between the end of s1 and the start of s2.
print('{0}\n\n{1}'.format(s1, s2))
# Addition aligns on labels: 'a' is added to 'a', 'e' to 'e'. Labels present
# in only one operand ('c', 'd') produce NaN.
total = s1.add(s2)
print(total)
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
a    0.859558
c    1.547378
e    0.135625
dtype: float64

a    1.705699
d   -0.591336
e   -1.058273
dtype: float64
a    2.565257
c         NaN
d         NaN
e   -0.922649
dtype: float64

Process finished with exit code 0

二、DataFrame

每行每列都可以看成是Series序列

创建

# 1. Build a DataFrame from a dict of Series. The row index is the union of
#    the Series indexes; cells with no value become NaN.
data = pd.DataFrame({
    'one': pd.Series([1, 2, 3], index=list('abc')),
    'two': pd.Series([1, 2, 3, 4], index=list('abcd')),
})
print(data)
d = dict(
    one=pd.Series([1, 2, 3], index=list('abc')),
    two=pd.Series([1, 2, 3, 4], index=list('abcd')),
)
# 2. Ask for only three row labels, in a custom order; values are aligned.
df1 = pd.DataFrame(d, index=['b', 'd', 'a'])
print(df1)
# 3. Choose the column labels: 'two' exists in d, 'three' does not and is
#    therefore filled with NaN.
df2 = pd.DataFrame(d, columns=['two', 'three'])
print(df2)
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4
   one  two
b  2.0    2
d  NaN    4
a  1.0    1
   two three
a    1   NaN
b    2   NaN
c    3   NaN
d    4   NaN

Process finished with exit code 0


# 1. When the dict values are plain lists they must all have the same length,
#    otherwise construction fails; Series values do not have this restriction
#    because they are aligned on their own index.
#    Lists carry no labels, so no per-column index is given here; a custom
#    index would be passed to the DataFrame itself.
columns = {'one': [1, 2, 3, 4], 'two': [21, 22, 23, 24]}
data = pd.DataFrame(columns)
print(data)
# d={'one': pd.Series([1,2,3],index=['a','b','c']),
#                     'two': pd.Series([1,2,3,4],index=['a','b','c','d'])}
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
   one  two
0    1   21
1    2   22
2    3   23
3    4   24

Process finished with exit code 0

# 1. Build a DataFrame from a list of row tuples; both the row labels and the
#    column labels are supplied by the caller.
data = [(1, 2.2, 'Hello'), (2, 3., 'World')]
data1 = pd.DataFrame(data=data, columns=['A', 'B', 'C'], index=['one', 'two'])
print(data1)
结果:
     A    B      C
one  1  2.2  Hello
two  2  3.0  World


# 1. A list of dicts: each dict is one row and its keys name the columns.
#    Row labels may be supplied; explicitly passed column labels are aligned
#    with the dict keys (key 'c' is not requested here, so it is left out).
data = [dict(a=1, b=2), dict(a=5, b=10, c=20)]
data1 = pd.DataFrame(data, columns=['a', 'b'], index=['A', 'B'])
print(data1)
# A requested column with no matching key ('e') is filled with NaN.
# If no row labels were given, integer labels would be assigned.
data2 = pd.DataFrame(data, columns=['a', 'b', 'e'], index=['A', 'B'])
print(data2)
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
   a   b
A  1   2
B  5  10
   a   b   e
A  1   2 NaN
B  5  10 NaN

Process finished with exit code 0
# Build a DataFrame from a dict of dicts whose keys are tuples: outer keys
# become two-level column labels, inner keys become two-level row labels.
cells = [
    (('a', 'b'), [(('A', 'B'), 1), (('A', 'C'), 2)]),
    (('a', 'a'), [(('A', 'C'), 3), (('A', 'B'), 4)]),
    (('a', 'c'), [(('A', 'B'), 5), (('A', 'C'), 6)]),
    (('b', 'a'), [(('A', 'C'), 7), (('A', 'B'), 8)]),
    (('b', 'b'), [(('A', 'D'), 9), (('A', 'B'), 10)]),
]
data = {column: dict(rows) for column, rows in cells}
# Both the rows and the columns of data1 carry two-level labels.
data1 = pd.DataFrame(data)
print(data1)
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
       a              b      
       b    a    c    a     b
A B  1.0  4.0  5.0  8.0  10.0
  C  2.0  3.0  6.0  7.0   NaN
  D  NaN  NaN  NaN  NaN   9.0

Process finished with exit code 0

data3 = pd.Series(np.random.randn(5), index=list('abcde'))
print(data3)
# 1. Build a DataFrame from a Series. Without `columns` the single column is
#    labelled 0; a column label can also be given, as with columns=['A'].
data4 = pd.DataFrame(data3, columns=['A'])
print(data4)
# 2. Per the original author, something like
#    pd.DataFrame(data3, columns=['A', 'B'], index=list('acd')) is not valid,
#    because data3 is one-dimensional and provides a single column only.
# 3. Row labels come from the data; when the data has none, an integer index
#    starting at 0 is generated. If an index is passed explicitly, the rows
#    are aligned with the labels already present in data3.
data5 = pd.DataFrame(data3, index=['a', 'c', 'd'], columns=['A'])
print(data5)
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
a   -0.075319
b   -1.039741
c   -1.072038
d   -0.331250
e    1.267542
dtype: float64
          A
a -0.075319
b -1.039741
c -1.072038
d -0.331250
e  1.267542
          A
a -0.075319
c -1.072038
d -0.331250

Process finished with exit code 0

特性

在这里插入图片描述
在这里插入图片描述

data3 = pd.DataFrame(np.random.randn(6, 4), columns=['one', 'two', 'three', 'four'])
print(data3)
# 1. Select one column; the result is a Series.
print(data3['one'])
# 2. Overwrite column 'three' with the sum of columns 'one' and 'two'.
data3['three'] = data3['one'].add(data3['two'])
print(data3)
# 3. Remove a column in place.
data3.drop(columns='three', inplace=True)
print(data3)
# 4. Add a new boolean column computed from an existing one.
data3['flag'] = data3['one'].gt(0.2)
print(data3)
# 5. Assigning a scalar creates a column filled with that value.
data3['five'] = 5
# 6. pop() removes column 'four' from data3 (and returns it).
data3.pop('four')
print(data3)
# 7. insert() places a new column 'bar' at position 1; its values are the sum
#    of columns 'one' and 'two'.
data3.insert(loc=1, column='bar', value=data3['one'].add(data3['two']))
print(data3)
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
        one       two     three      four
0 -0.503520  0.557812  2.455988  2.099548
1  0.330146 -2.574253  0.359760 -0.062902
2  0.035532 -0.727159 -2.370839  0.889751
3 -0.888395  0.717642 -0.383320 -0.523820
4 -0.474148  0.518265  0.104414 -3.433082
5  1.928498 -0.627430  0.182430  0.347653
0   -0.503520
1    0.330146
2    0.035532
3   -0.888395
4   -0.474148
5    1.928498
Name: one, dtype: float64
        one       two     three      four
0 -0.503520  0.557812  0.054292  2.099548
1  0.330146 -2.574253 -2.244107 -0.062902
2  0.035532 -0.727159 -0.691627  0.889751
3 -0.888395  0.717642 -0.170753 -0.523820
4 -0.474148  0.518265  0.044117 -3.433082
5  1.928498 -0.627430  1.301068  0.347653
        one       two      four
0 -0.503520  0.557812  2.099548
1  0.330146 -2.574253 -0.062902
2  0.035532 -0.727159  0.889751
3 -0.888395  0.717642 -0.523820
4 -0.474148  0.518265 -3.433082
5  1.928498 -0.627430  0.347653
        one       two      four   flag
0 -0.503520  0.557812  2.099548  False
1  0.330146 -2.574253 -0.062902   True
2  0.035532 -0.727159  0.889751  False
3 -0.888395  0.717642 -0.523820  False
4 -0.474148  0.518265 -3.433082  False
5  1.928498 -0.627430  0.347653   True
        one       two   flag  five
0 -0.503520  0.557812  False     5
1  0.330146 -2.574253   True     5
2  0.035532 -0.727159  False     5
3 -0.888395  0.717642  False     5
4 -0.474148  0.518265  False     5
5  1.928498 -0.627430   True     5
        one       bar       two   flag  five
0 -0.503520  0.054292  0.557812  False     5
1  0.330146 -2.244107 -2.574253   True     5
2  0.035532 -0.691627 -0.727159  False     5
3 -0.888395 -0.170753  0.717642  False     5
4 -0.474148  0.044117  0.518265  False     5
5  1.928498  1.301068 -0.627430   True     5

Process finished with exit code 0





data3 = pd.DataFrame(np.random.randn(6, 4), columns=['one', 'two', 'three', 'four'])
print(data3)
# 1. insert() adds column 'bar' (sum of 'one' and 'two') at position 1.
data3.insert(loc=1, column='bar', value=data3['one'].add(data3['two']))
print(data3)
# 2. insert() modifies data3 itself, whereas assign() returns a modified copy
#    and leaves data3 untouched.
data4 = data3.assign(Ratio=data3['one'].div(data3['two']))
print(data3)  # data3 is unchanged
print(data4)  # the new column is appended as the last column
# 3. assign() also accepts a callable, which receives the frame being assigned to.
data5 = data3.assign(Ratio=lambda frame: frame['one'] - frame['two'])
print(data5)
# 4. The first assign() creates ABRatio; a second assign() on that result can
#    then use ABRatio to derive another column.
with_ratio = data3.assign(ABRatio=lambda frame: frame['one'] - frame['two'])
data6 = with_ratio.assign(BarValue=lambda frame: frame['ABRatio'] * frame['bar'])
print(data6)
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
        one       two     three      four
0 -0.903571 -1.887683  1.197445 -0.192508
1  0.727993  0.627388  1.092060 -0.017405
2  1.132262 -1.761343  0.569888 -1.049006
3 -0.241974  1.099350 -1.571476 -0.823272
4 -0.122233  0.330990 -0.102380 -0.324069
5  0.546435 -1.282765  0.332780 -1.871410
        one       bar       two     three      four
0 -0.903571 -2.791254 -1.887683  1.197445 -0.192508
1  0.727993  1.355381  0.627388  1.092060 -0.017405
2  1.132262 -0.629081 -1.761343  0.569888 -1.049006
3 -0.241974  0.857377  1.099350 -1.571476 -0.823272
4 -0.122233  0.208758  0.330990 -0.102380 -0.324069
5  0.546435 -0.736331 -1.282765  0.332780 -1.871410
        one       bar       two     three      four
0 -0.903571 -2.791254 -1.887683  1.197445 -0.192508
1  0.727993  1.355381  0.627388  1.092060 -0.017405
2  1.132262 -0.629081 -1.761343  0.569888 -1.049006
3 -0.241974  0.857377  1.099350 -1.571476 -0.823272
4 -0.122233  0.208758  0.330990 -0.102380 -0.324069
5  0.546435 -0.736331 -1.282765  0.332780 -1.871410
        one       bar       two     three      four     Ratio
0 -0.903571 -2.791254 -1.887683  1.197445 -0.192508  0.478666
1  0.727993  1.355381  0.627388  1.092060 -0.017405  1.160355
2  1.132262 -0.629081 -1.761343  0.569888 -1.049006 -0.642840
3 -0.241974  0.857377  1.099350 -1.571476 -0.823272 -0.220106
4 -0.122233  0.208758  0.330990 -0.102380 -0.324069 -0.369294
5  0.546435 -0.736331 -1.282765  0.332780 -1.871410 -0.425982
        one       bar       two     three      four     Ratio
0 -0.903571 -2.791254 -1.887683  1.197445 -0.192508  0.984113
1  0.727993  1.355381  0.627388  1.092060 -0.017405  0.100605
2  1.132262 -0.629081 -1.761343  0.569888 -1.049006  2.893605
3 -0.241974  0.857377  1.099350 -1.571476 -0.823272 -1.341324
4 -0.122233  0.208758  0.330990 -0.102380 -0.324069 -0.453223
5  0.546435 -0.736331 -1.282765  0.332780 -1.871410  1.829200
        one       bar       two     three      four   ABRatio  BarValue
0 -0.903571 -2.791254 -1.887683  1.197445 -0.192508  0.984113 -2.746909
1  0.727993  1.355381  0.627388  1.092060 -0.017405  0.100605  0.136358
2  1.132262 -0.629081 -1.761343  0.569888 -1.049006  2.893605 -1.820313
3 -0.241974  0.857377  1.099350 -1.571476 -0.823272 -1.341324 -1.150020
4 -0.122233  0.208758  0.330990 -0.102380 -0.324069 -0.453223 -0.094614
5  0.546435 -0.736331 -1.282765  0.332780 -1.871410  1.829200 -1.346896

Process finished with exit code 0



data3 = pd.DataFrame(
    np.random.randint(1, 10, size=(6, 4)),
    index=list('abcdef'),
    columns=list('ABCD'),
)
print(data3)
# Indexing and selection
# 1. Dict-style access selects one column.
print(data3['A'])
# 2. .loc selects by label: a single row ...
print(data3.loc['a'])
print(data3.loc[:, 'A'])  # ... or a single column; same result as data3['A']
# 3. .iloc selects a row by integer position.
print(data3.iloc[1])
# .loc must be given labels. Inside the brackets the row selector comes first
# and the column selector second, separated by a comma, e.g.
# data.loc["b", "B"]. .iloc works the same way but takes integer positions.

# 4. Select several consecutive rows by position.
print(data3.iloc[1:4])  # the end row is excluded
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
   A  B  C  D
a  8  6  2  2
b  5  9  4  3
c  2  8  3  6
d  5  1  6  4
e  2  1  8  9
f  2  2  8  6
a    8
b    5
c    2
d    5
e    2
f    2
Name: A, dtype: int32
A    8
B    6
C    2
D    2
Name: a, dtype: int32
a    8
b    5
c    2
d    5
e    2
f    2
Name: A, dtype: int32
A    5
B    9
C    4
D    3
Name: b, dtype: int32


data3 = pd.DataFrame(
    np.random.randint(1, 10, size=(6, 4)),
    index=list('abcdef'),
    columns=list('ABCD'),
)
print(data3)
# 1. Comparing a column with a scalar yields a boolean Series.
mask = data3['A'] > 4
print(mask)
# 2. Indexing with that boolean Series keeps the rows where column A exceeds 4.
print(data3[mask])
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
   A  B  C  D
a  6  2  3  1
b  9  6  7  5
c  9  8  9  7
d  9  3  9  2
e  5  6  6  7
f  2  6  6  4
a     True
b     True
c     True
d     True
e     True
f    False
Name: A, dtype: bool
   A  B  C  D
a  6  2  3  1
b  9  6  7  5
c  9  8  9  7
d  9  3  9  2
e  5  6  6  7

在这里插入图片描述
在这里插入图片描述

da1 = pd.DataFrame(np.random.randn(10, 4), index=list('abcdefghij'), columns=list('ABCD'))
da2 = pd.DataFrame(np.random.randn(7, 3), index=list('cdefghi'), columns=list('ABC'))
# 1. Arithmetic between two frames aligns on both row and column labels:
#    row 'c' is added to row 'c', and so on. Any row or column that exists in
#    only one of the frames yields NaN.
print(da1.add(da2))
# 2. Subtracting a single row: the first row of da1 is subtracted from every
#    row of da1 (the row is broadcast to the shape of da1).
print(da1)
first_row = da1.iloc[0]
print(da1.sub(first_row))
# 3. DataFrames are compatible with NumPy, so NumPy functions can be called
#    on them directly.
print(np.exp(da2))
print(np.sin(da2))
raw = da2.values
print(raw)
print(type(raw))  # a NumPy n-dimensional array (ndarray)
# np.array and np.asarray both convert structured data to an ndarray. Per the
# original author, the main difference is that when the source already is an
# ndarray, np.array still makes a copy while np.asarray does not.
# All True below means np.asarray(da2) holds the same values as da2.values.
print(np.asarray(da2) == da2.values)
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
          A         B         C   D
a       NaN       NaN       NaN NaN
b       NaN       NaN       NaN NaN
c  3.053139  0.399460  0.902013 NaN
d -0.971126 -1.068590 -0.381331 NaN
e  2.409311 -0.001406 -1.475077 NaN
f  1.184333  0.623498 -0.744652 NaN
g  1.718821  0.607609  1.127204 NaN
h -0.967076 -1.922861 -0.443765 NaN
i -0.291349  0.063443 -0.317229 NaN
j       NaN       NaN       NaN NaN
          A         B         C         D
a -0.506622  0.334452  0.328909 -0.604279
b -0.547917 -0.612315  1.581732 -1.153236
c  1.883545 -0.327115 -0.153999 -0.111412
d -0.210938 -0.578260 -0.457308 -0.249904
e  0.360569  0.984508 -0.105404  0.776660
f  1.907145  1.944524 -1.149351  0.067634
g  1.479150  0.294211  0.284969  1.821444
h -0.591414 -1.086091  0.797395  0.959605
i -0.405419 -0.382432 -0.377807 -0.086269
j  0.773303 -0.080868  1.059053 -0.657401
          A         B         C         D
a  0.000000  0.000000  0.000000  0.000000
b -0.041295 -0.946767  1.252824 -0.548957
c  2.390167 -0.661567 -0.482908  0.492868
d  0.295683 -0.912712 -0.786216  0.354375
e  0.867191  0.650056 -0.434313  1.380940
f  2.413767  1.610072 -1.478260  0.671913
g  1.985772 -0.040241 -0.043940  2.425723
h -0.084792 -1.420543  0.468486  1.563885
i  0.101203 -0.716883 -0.706716  0.518010
j  1.279925 -0.415320  0.730145 -0.053121
          A         B         C
c  3.220685  2.067985  2.874884
d  0.467579  0.612424  1.078937
e  7.758132  0.373098  0.254190
f  0.485385  0.266861  1.498852
g  1.270830  1.368066  2.321551
h  0.686835  0.433107  0.289049
i  1.120831  1.561856  1.062451
          A         B         C
c  0.920592  0.664313  0.870399
d -0.689057 -0.470917  0.075904
e  0.887942 -0.833777 -0.979843
f -0.661497 -0.968969  0.393742
g  0.237382  0.308293  0.746133
h -0.366888 -0.742483 -0.946160
i  0.113823  0.431247  0.060542
[[ 1.1695942   0.72657459  1.05601218]
 [-0.76018733 -0.49032976  0.07597664]
 [ 2.04874157 -0.98591365 -1.36967273]
 [-0.72281253 -1.32102574  0.40469934]
 [ 0.23967026  0.31339829  0.84223529]
 [-0.37566185 -0.83676964 -1.24115999]
 [ 0.11407028  0.44587498  0.06057866]]
<class 'numpy.ndarray'>
[[ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]]

Process finished with exit code 0

三维数组创建

总之不管多少维度数据,4或5维乃至更高维度,都可以转化为用多维标签表示的二维数组即DataFrame,因此今后用DataFrame最多。

# Build a three-dimensional structure from a dict whose keys are strings and
# whose values are DataFrames: each DataFrame is one 2-D slice.
# NOTE(review): the sample output printed for this snippet shows a
# FutureWarning that Panel is deprecated; later pandas releases (0.25+,
# to be confirmed against the installed version) no longer provide pd.Panel.
data={'Item1':pd.DataFrame(np.random.randn(4,3)),
                           'Item2':pd.DataFrame(np.random.randn(4, 2))}# the (4, 2) frame is padded with NaN to match (4, 3)
# 1. Create the 3-D structure.
pn=pd.Panel(data)
print(pn)# the summary lists the first axis (items) as Item1 to Item2
# 2. The panel behaves like two 2-D frames keyed by item name.
# First 2-D frame:
print(pn['Item1'])
# Second 2-D frame:
print(pn['Item2'])
# 3. Index of the first dimension (items):
print(pn.items)
# Index of the second dimension (major axis):
print(pn.major_axis)
# Index of the third dimension (minor axis):
print(pn.minor_axis)
# 4. Select a cross-section by a major-axis label; label 1 is used here.
print(pn.major_xs(1))# the result has the minor axis as rows and the items as columns
# 5. A panel can be converted to a 2-D DataFrame.
print(pn.to_frame())# the frame still encodes three dimensions: (major, minor) row labels plus item columns
# 6. In general, data of any dimensionality (4, 5 or more) can be represented
# as a 2-D DataFrame with multi-level labels, so DataFrame is what is used most.
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
sys:1: FutureWarning: 
Panel is deprecated and will be removed in a future version.
The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
Pandas provides a `.to_xarray()` method to help automate this conversion.

<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 4 (major_axis) x 3 (minor_axis)
Items axis: Item1 to Item2
Major_axis axis: 0 to 3
Minor_axis axis: 0 to 2
          0         1         2
0 -0.947857  0.149131 -0.868937
1 -1.547583 -1.772686 -0.811341
2  0.671214  0.121123  0.448306
3 -1.421754  0.265004  0.372799
          0         1   2
0 -0.803768 -0.255339 NaN
1  1.132508 -0.825563 NaN
2 -0.059555  0.988052 NaN
3 -0.644363  2.509904 NaN
Index(['Item1', 'Item2'], dtype='object')
RangeIndex(start=0, stop=4, step=1)
RangeIndex(start=0, stop=3, step=1)
      Item1     Item2
0 -1.547583  1.132508
1 -1.772686 -0.825563
2 -0.811341       NaN
                Item1     Item2
major minor                    
0     0     -0.947857 -0.803768
      1      0.149131 -0.255339
1     0     -1.547583  1.132508
      1     -1.772686 -0.825563
2     0      0.671214 -0.059555
      1      0.121123  0.988052
3     0     -1.421754 -0.644363
      1      0.265004  2.509904

Process finished with exit code 0

在这里插入图片描述

三、基础运算

在这里插入图片描述

一维数组处理

df1 = pd.Series([1, 3, 5, 6, 8], index=['a', 'c', 'e', 'f', 'h'])
print(df1)
# 1. Show the index of the Series.
print(df1.index)
# 2. Reindex: labels that did not exist before get NaN, existing labels keep
#    their values.
new_labels = list('abcdefgh')
print(df1.reindex(new_labels))
# 3. A default value can be supplied for labels that did not exist before.
print(df1.reindex(new_labels, fill_value=0))
# 4. Missing entries can instead be filled from the preceding value (e.g. a
#    stock that is not traded keeps its last price): 'b' takes the value of
#    'a', and so on.
print(df1.reindex(new_labels, method='ffill'))
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
a    1
c    3
e    5
f    6
h    8
dtype: int64
Index(['a', 'c', 'e', 'f', 'h'], dtype='object')
a    1.0
b    NaN
c    3.0
d    NaN
e    5.0
f    6.0
g    NaN
h    8.0
dtype: float64
a    1
b    0
c    3
d    0
e    5
f    6
g    0
h    8
dtype: int64
a    1
b    1
c    3
d    3
e    5
f    6
g    6
h    8
dtype: int64

Process finished with exit code 0
da1 = pd.DataFrame(np.random.randn(4, 6), index=list('ADFH'),
                   columns=['one', 'two', 'three', 'four', 'five', 'six'])
# 1. A 2-D frame can be reindexed along rows and along columns separately.
da2 = da1.reindex(index=list('ABCDEFGH'))  # reindex the rows
print(da2)
# 2. Fill new rows with a default value instead of NaN. reindex() returns a
#    new object, so the result can be printed directly or bound to a name;
#    da1 itself is not modified.
print(da1.reindex(index=list('ABCDEFGH'), fill_value=0))
print(da1)
# 3. Set row 'A', column 'one' of da1 to 100. Only da1 changes; the earlier
#    reindexed copy da2 keeps its old value.
#    A single .loc[row, col] assignment is used instead of the chained
#    da1.loc['A']['one'] = 100, which may write to a temporary object.
da1.loc['A', 'one'] = 100
print(da1)
print(da2)
# 4. Reindex the columns; a new column is filled with NaN by default.
print(da1.reindex(columns=['one', 'three', 'five', 'seven']))
# 5. Reindex the columns with a default value of 0:
# print(da1.reindex(columns=['one','three','five','seven'],fill_value=0))
# 6. Per the original author, method='ffill' does not work for the columns
#    of this frame:
# print(da1.reindex(columns=['one','three','five','seven'],method='ffill'))
# 7. Forward fill along the rows: B and C take the values of the preceding
#    existing row A.
print(da1.reindex(index=list('ABCDEFGH'), method='ffill'))
# 8. Backward fill: B and C take the values of the following existing row D.
print(da1.reindex(index=list('ABCDEFGH'), method='bfill'))
# 9. drop() removes rows unless an axis is given.
print(da1)
print(da1.drop('A'))
# 10. Drop columns. drop() also works on a copy, so da1 is left unchanged.
print(da1.drop(['two', 'four'], axis=1))
print(da1)
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
        one       two     three      four      five       six
A -1.909367  0.730111  0.574619  0.454327 -0.262506 -0.004040
B       NaN       NaN       NaN       NaN       NaN       NaN
C       NaN       NaN       NaN       NaN       NaN       NaN
D  0.074381  0.015976 -0.759157 -0.162168  1.321225  0.959368
E       NaN       NaN       NaN       NaN       NaN       NaN
F -0.139248  2.523069  1.798354  0.177123 -0.817056  0.071136
G       NaN       NaN       NaN       NaN       NaN       NaN
H  1.514792 -0.068565 -2.648990 -0.756524  0.502933 -1.225695
        one       two     three      four      five       six
A -1.909367  0.730111  0.574619  0.454327 -0.262506 -0.004040
B  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
C  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
D  0.074381  0.015976 -0.759157 -0.162168  1.321225  0.959368
E  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
F -0.139248  2.523069  1.798354  0.177123 -0.817056  0.071136
G  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
H  1.514792 -0.068565 -2.648990 -0.756524  0.502933 -1.225695
        one       two     three      four      five       six
A -1.909367  0.730111  0.574619  0.454327 -0.262506 -0.004040
D  0.074381  0.015976 -0.759157 -0.162168  1.321225  0.959368
F -0.139248  2.523069  1.798354  0.177123 -0.817056  0.071136
H  1.514792 -0.068565 -2.648990 -0.756524  0.502933 -1.225695
          one       two     three      four      five       six
A  100.000000  0.730111  0.574619  0.454327 -0.262506 -0.004040
D    0.074381  0.015976 -0.759157 -0.162168  1.321225  0.959368
F   -0.139248  2.523069  1.798354  0.177123 -0.817056  0.071136
H    1.514792 -0.068565 -2.648990 -0.756524  0.502933 -1.225695
        one       two     three      four      five       six
A -1.909367  0.730111  0.574619  0.454327 -0.262506 -0.004040
B       NaN       NaN       NaN       NaN       NaN       NaN
C       NaN       NaN       NaN       NaN       NaN       NaN
D  0.074381  0.015976 -0.759157 -0.162168  1.321225  0.959368
E       NaN       NaN       NaN       NaN       NaN       NaN
F -0.139248  2.523069  1.798354  0.177123 -0.817056  0.071136
G       NaN       NaN       NaN       NaN       NaN       NaN
H  1.514792 -0.068565 -2.648990 -0.756524  0.502933 -1.225695
          one     three      five  seven
A  100.000000  0.574619 -0.262506    NaN
D    0.074381 -0.759157  1.321225    NaN
F   -0.139248  1.798354 -0.817056    NaN
H    1.514792 -2.648990  0.502933    NaN
          one       two     three      four      five       six
A  100.000000  0.730111  0.574619  0.454327 -0.262506 -0.004040
B  100.000000  0.730111  0.574619  0.454327 -0.262506 -0.004040
C  100.000000  0.730111  0.574619  0.454327 -0.262506 -0.004040
D    0.074381  0.015976 -0.759157 -0.162168  1.321225  0.959368
E    0.074381  0.015976 -0.759157 -0.162168  1.321225  0.959368
F   -0.139248  2.523069  1.798354  0.177123 -0.817056  0.071136
G   -0.139248  2.523069  1.798354  0.177123 -0.817056  0.071136
H    1.514792 -0.068565 -2.648990 -0.756524  0.502933 -1.225695
          one       two     three      four      five       six
A  100.000000  0.730111  0.574619  0.454327 -0.262506 -0.004040
B    0.074381  0.015976 -0.759157 -0.162168  1.321225  0.959368
C    0.074381  0.015976 -0.759157 -0.162168  1.321225  0.959368
D    0.074381  0.015976 -0.759157 -0.162168  1.321225  0.959368
E   -0.139248  2.523069  1.798354  0.177123 -0.817056  0.071136
F   -0.139248  2.523069  1.798354  0.177123 -0.817056  0.071136
G    1.514792 -0.068565 -2.648990 -0.756524  0.502933 -1.225695
H    1.514792 -0.068565 -2.648990 -0.756524  0.502933 -1.225695
          one       two     three      four      five       six
A  100.000000  0.730111  0.574619  0.454327 -0.262506 -0.004040
D    0.074381  0.015976 -0.759157 -0.162168  1.321225  0.959368
F   -0.139248  2.523069  1.798354  0.177123 -0.817056  0.071136
H    1.514792 -0.068565 -2.648990 -0.756524  0.502933 -1.225695
        one       two     three      four      five       six
D  0.074381  0.015976 -0.759157 -0.162168  1.321225  0.959368
F -0.139248  2.523069  1.798354  0.177123 -0.817056  0.071136
H  1.514792 -0.068565 -2.648990 -0.756524  0.502933 -1.225695
          one     three      five       six
A  100.000000  0.574619 -0.262506 -0.004040
D    0.074381 -0.759157  1.321225  0.959368
F   -0.139248  1.798354 -0.817056  0.071136
H    1.514792 -2.648990  0.502933 -1.225695
          one       two     three      four      five       six
A  100.000000  0.730111  0.574619  0.454327 -0.262506 -0.004040
D    0.074381  0.015976 -0.759157 -0.162168  1.321225  0.959368
F   -0.139248  2.523069  1.798354  0.177123 -0.817056  0.071136
H    1.514792 -0.068565 -2.648990 -0.756524  0.502933 -1.225695

Process finished with exit code 0

da1 = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=['one', 'two', 'three', 'four'],
    columns=list('ABC'),
)
print(da1)


def _spread(values):
    """Return the difference between the largest and smallest entry."""
    return values.max() - values.min()


# 1. By default (axis=0) apply() hands each column to the function, so the
#    result contains one value per column.
print(da1.apply(_spread))
# With axis=1 each row is passed instead, giving one scalar per row.
print(da1.apply(_spread, axis=1))
# 2. A function that returns a Series, for use with apply() on columns or rows.
def min_max(x):
    """Return the minimum and maximum of *x* as a Series.

    Parameters:
        x: an object providing ``min()`` and ``max()`` (apply() passes one
           column or one row of the frame).

    Returns:
        A pandas Series with the labels 'min' and 'max', in that order.
    """
    # Values are listed in the same order as their labels. The earlier version
    # paired x.max() with 'min' and x.min() with 'max'; sample output printed
    # from that version shows the two rows/columns swapped.
    return pd.Series([x.min(), x.max()], index=['min', 'max'])
# apply() passes each column to min_max as its argument x, so the function
# needs that one parameter. Every call returns a Series for one column; the
# three Series are combined into a 2-D table.
per_column = da1.apply(min_max)
print(per_column)
# 3. Same function applied to each row instead.
per_row = da1.apply(min_max, axis=1)
print(per_row)
# In the IPython shell, the help for a function can be shown with:  df.apply?
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
       A   B   C
one    0   1   2
two    3   4   5
three  6   7   8
four   9  10  11
A    9
B    9
C    9
dtype: int64
one      2
two      2
three    2
four     2
dtype: int64
     A   B   C
min  9  10  11
max  0   1   2
       min  max
one      2    0
two      5    3
three    8    6
four    11    9

Process finished with exit code 0

da1 = pd.DataFrame(np.random.randn(4, 3), index=['one', 'two', 'three', 'four'], columns=list('ABC'))
print(da1)
# Apply a function to every single element of the frame.
# Variant 1: a lambda using %-formatting
#formater=lambda  x :'%.03f' %x
# Variant 2: the bound str.format method is itself the callable. The 0 before
# the colon refers to positional argument 0 and may be omitted.
formater = '{0:.03f}'.format
# DataFrame.applymap is deprecated in newer pandas in favour of DataFrame.map;
# use map when the installed version has it and fall back to applymap otherwise.
elementwise = da1.map if hasattr(da1, 'map') else da1.applymap
formatted = elementwise(formater)  # formater receives each element in turn
print(formatted)
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
              A         B         C
one   -0.078113 -1.334115 -0.178684
two    1.340760  3.026996 -0.327963
three  0.861242  1.030327 -0.202078
four   0.100948 -2.405198  0.057714
            A       B       C
one    -0.078  -1.334  -0.179
two     1.341   3.027  -0.328
three   0.861   1.030  -0.202
four    0.101  -2.405   0.058

Process finished with exit code 0

排序

da1 = pd.DataFrame(
    np.random.randint(1, 10, size=(4, 3)),
    index=list('ABCD'),
    columns=['one', 'two', 'three'],
)
print(da1)
# 1. Sort the rows by the values of column 'two' (ascending).
ascending_by_two = da1.sort_values('two')
print(ascending_by_two)
# 2. Sort by column 'two' in descending order.
descending_by_two = da1.sort_values('two', ascending=False)
print(descending_by_two)
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
   one  two  three
A    8    1      8
B    9    4      4
C    1    1      8
D    9    3      8
   one  two  three
A    8    1      8
C    1    1      8
D    9    3      8
B    9    4      4
   one  two  three
B    9    4      4
D    9    3      8
A    8    1      8
C    1    1      8

Process finished with exit code 0

排名

# Ranking a one-dimensional Series
s = pd.Series([3, 6, 2, 6, 4])
print(s)
# Three rankings are printed in turn:
# 1. default ranking by value - the entry at index 2 (value 2) gets rank 1;
# 2. method='first' - ties are ranked in order of appearance, which removes
#    the fractional rank 4.5;
# 3. method='average' - the default made explicit: the two 6s occupy places
#    4 and 5 and both receive the mean, 4.5.
for tie_rule in ('average', 'first', 'average'):
    print(s.rank(method=tie_rule))
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
0    3
1    6
2    2
3    6
4    4
dtype: int64
0    2.0
1    4.5
2    1.0
3    4.5
4    3.0
dtype: float64
0    2.0
1    4.0
2    1.0
3    5.0
4    3.0
dtype: float64
0    2.0
1    4.5
2    1.0
3    4.5
4    3.0
dtype: float64

Process finished with exit code 0


# Ranking a two-dimensional frame
da1 = pd.DataFrame(
    np.random.randint(1, 10, size=(4, 3)),
    index=list('ABCD'),
    columns=['one', 'two', 'three'],
)
print(da1)
# By default the values are ranked within each column (i.e. down the rows).
ranks = da1.rank(method='first')
print(ranks)
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
   one  two  three
A    5    5      9
B    9    3      6
C    8    3      3
D    2    8      5
   one  two  three
A  2.0  3.0    4.0
B  4.0  1.0    3.0
C  3.0  2.0    1.0
D  1.0  4.0    2.0

Process finished with exit code 0

s = pd.Series(list('abbcdabacad'))
print(s)
# 1. Count how often each value occurs.
counts = s.value_counts()
print(counts)
# 2. The distinct values of the Series, returned as an array.
distinct = s.unique()
print(distinct)
print(type(distinct))
# 3. Element-wise membership test: True where the element of s is contained
#    in the given list.
print(s.isin(['a', 'c', 'd']))
# 4. unique() holds every value that occurs in s, so the membership test
#    against it is True at every position.
print(s.isin(distinct))
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
0     a
1     b
2     b
3     c
4     d
5     a
6     b
7     a
8     c
9     a
10    d
dtype: object
a    4
b    3
d    2
c    2
dtype: int64
['a' 'b' 'c' 'd']
<class 'numpy.ndarray'>
0      True
1     False
2     False
3      True
4      True
5      True
6     False
7      True
8      True
9      True
10     True
dtype: bool
0     True
1     True
2     True
3     True
4     True
5     True
6     True
7     True
8     True
9     True
10    True
dtype: bool

Process finished with exit code 0

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!