一、Series
创建
(以下所有示例均假定已先执行:import numpy as np 和 import pandas as pd)
#1创建一个序列,index可以是列表等类数组对象,长度必须与数据个数一致
b=pd.Series(np.random.randn(5),index=['a','b','c','d','e'])
print(b)
#2查看索引
print(b.index)
#3默认不指定索引时,为0开始排的整型索引即可
c=pd.Series(np.random.randn(5))
print(c)
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
a 0.168710
b 0.436219
c -0.236762
d 0.426008
e -0.951852
dtype: float64
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
0 1.506954
1 0.282373
2 0.963675
3 0.671374
4 -0.936167
dtype: float64
Process finished with exit code 0
#1通过字典创建一个序列
d={'a':0.,'b':1.,'d':3.}
b=pd.Series(d,index=list('abcd'))#index代表行索引
print(b)
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
a 0.0
b 1.0
c NaN
d 3.0
dtype: float64
Process finished with exit code 0
#1通过标量创建一个序列
b=pd.Series(5,index=list('abcd'))#index代表行索引
print(b)
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
a 5
b 5
c 5
d 5
dtype: int64
Process finished with exit code 0
支持ndarry数组操作
# Build a Series with string labels so that positional access and label
# access are clearly different things.
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
print(s)
# 1. ndarray-style positional indexing.
# Use .iloc for integer positions: on a Series whose index is not integer,
# plain s[0] / s[[1, 3, 4]] only reach positions through a deprecated
# fallback, so .iloc states the intent explicitly and prints the same values.
print(s.iloc[0])  # a single element by position
print(s.iloc[:3])  # the slice end is exclusive: positions 0, 1, 2
print(s.iloc[2:5])  # positions 2, 3, 4
print(s.iloc[[1, 3, 4]])  # a list of integer positions selects those rows
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
a 1.147490
b -0.084418
c -0.777898
d -0.488580
e -0.146267
dtype: float64
1.1474904747052694
a 1.147490
b -0.084418
c -0.777898
dtype: float64
c -0.777898
d -0.488580
e -0.146267
dtype: float64
b -0.084418
d -0.488580
e -0.146267
dtype: float64
s=pd.Series(np.random.randn(5),index=['a','b','c','d','e'])
print(s)
#1支持numpy中的函数操作
print(np.sin(s))
print(np.exp(s))
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
a 2.122820
b -0.081273
c -0.144711
d -1.461435
e 0.341797
dtype: float64
a 0.851465
b -0.081183
c -0.144206
d -0.994026
e 0.335181
dtype: float64
a 8.354667
b 0.921942
c 0.865273
d 0.231903
e 1.407474
dtype: float64
s=pd.Series(np.random.randn(5),index=['a','b','c','d','e'])
print(s)
#3支持字典访问
print(s['a'])
s['b']=3
print(s)
#4用字典方式增加一个键值
s['g']=100
print(s)
#5访问不存在的键会报错
# print(s['f'])
#6可以用get访问,键不存在时指定默认值
print(s.get('f'))
print(s.get('f',0))
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
a 0.980320
b 0.547169
c 2.342473
d -1.520565
e 0.589286
dtype: float64
0.9803196644381337
a 0.980320
b 3.000000
c 2.342473
d -1.520565
e 0.589286
dtype: float64
a 0.980320
b 3.000000
c 2.342473
d -1.520565
e 0.589286
g 100.000000
dtype: float64
None
0
Process finished with exit code 0
s1=pd.Series(np.random.randn(3),index=['a','c','e'])
s2=pd.Series(np.random.randn(3),index=['a','d','e'])
#1支持2个series标签对齐打印,此时2个序列中间多了1个空行,因为打印完s1另起一行,空这一行,就开始打印s2
print('{0}\n\n{1}'.format(s1,s2))
#支持两个序列相加,即a标签加a标签,即自动标签对齐相加,即s2没有的c,所以相加后的c值为NaN.
print(s1+s2)
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
a 0.859558
c 1.547378
e 0.135625
dtype: float64
a 1.705699
d -0.591336
e -1.058273
dtype: float64
a 2.565257
c NaN
d NaN
e -0.922649
dtype: float64
Process finished with exit code 0
二、DataFrame
每行每列都可以看成是Series序列
创建
# 1通过字典创建
data = pd.DataFrame({'one': pd.Series([1,2,3],index=['a','b','c']),
'two': pd.Series([1,2,3,4],index=['a','b','c','d'])})
print(data)
d={'one': pd.Series([1,2,3],index=['a','b','c']),
'two': pd.Series([1,2,3,4],index=['a','b','c','d'])}
#2只取三个索引值,会自动对齐
df1=pd.DataFrame(d,index=['b','d','a'])
print(df1)
#3改变列标签
df2=pd.DataFrame(d,columns=['two','three'])
print(df2)
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
one two
a 1.0 1
b 2.0 2
c 3.0 3
d NaN 4
one two
b 2.0 2
d NaN 4
a 1.0 1
two three
a 1 NaN
b 2 NaN
c 3 NaN
d 4 NaN
Process finished with exit code 0
# 1通过列表作为字典的值时,必须保证个数一致,否则会报错,而series不存在这个问题
data = pd.DataFrame({'one': [1,2,3,4],#因为列表不像Series序列有索引,所以不用指定index了。要想指定索引是指定DataFrame的索引
'two': [21,22,23,24]})
print(data)
# d={'one': pd.Series([1,2,3],index=['a','b','c']),
# 'two': pd.Series([1,2,3,4],index=['a','b','c','d'])}
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
one two
0 1 21
1 2 22
2 3 23
3 4 24
Process finished with exit code 0
# 1通过列表创建DataFrame,行列标签自己可以指定
data=[(1,2.2,'Hello'),(2,3.,'World')]
data1 = pd.DataFrame(data,index=['one','two'],columns=list('ABC'))
print(data1)
结果:
A B C
one 1 2.2 Hello
two 2 3.0 World
# 1大的列表是字典,可以指定行标签,列标签由字典的键自动生成,当自己再定义列标签时,会自动与现有的列标签对齐。
data=[{'a':1,'b':2},{'a':5,'b':10,'c':20}]
data1 = pd.DataFrame(data,index=['A','B'],columns=list('ab'))
print(data1)
#列标签由字典的键自动生成,当自己再定义列标签时,会自动与现有的列标签对齐,且原来没有e标签,所以为空。
#行标签若不指定,会自动分配整型
data2 = pd.DataFrame(data,index=['A','B'],columns=list('abe'))
print(data2)
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
a b
A 1 2
B 5 10
a b e
A 1 2 NaN
B 5 10 NaN
Process finished with exit code 0
# 字典套字典创建DataFrame,
data={('a','b'):{('A','B'):1,('A','C'): 2},
('a','a'):{('A','C'):3,('A','B'): 4},
('a','c'):{('A','B'):5,('A','C'): 6},
('b','a'):{('A','C'):7,('A','B'): 8},
('b','b'):{('A','D'):9,('A','B'): 10}}
#data1的行与列都是二维标签索引了。
data1 = pd.DataFrame(data)
print(data1)
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
a b
b a c a b
A B 1.0 4.0 5.0 8.0 10.0
C 2.0 3.0 6.0 7.0 NaN
D NaN NaN NaN NaN 9.0
Process finished with exit code 0
data3=pd.Series(np.random.randn(5),index=['a','b','c','d','e'])
print(data3)
##1从Series中创建,默认列标签从0开始,也可以指定列标签columns=['A']
data4=pd.DataFrame(data3,columns=['A'])
print(data4)
#2 data4=pd.DataFrame(data3,columns=['A','B'],index=list('acd'))这种写法不对,因为data3是一维序列,只有一列数据,所以指定两列就不对
#且指定的行标签与原来的data3Series序列中的行标签会自动对齐。
#这种可以,#3本质从数据中找出索引,没有的话会自动生成从0开始的索引。
# 或者指定索引,若索引有,会找出与已有的索引数据进行对齐。
data5=pd.DataFrame(data3,columns=['A'],index=list('acd'))
print(data5)
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
a -0.075319
b -1.039741
c -1.072038
d -0.331250
e 1.267542
dtype: float64
A
a -0.075319
b -1.039741
c -1.072038
d -0.331250
e 1.267542
A
a -0.075319
c -1.072038
d -0.331250
Process finished with exit code 0
特性
data3 = pd.DataFrame(np.random.randn(6, 4), columns=['one','two','three','four'])
print(data3)
#1选择一行或一列
print(data3['one'])
#2为three这一列为其他两列相加
data3['three']=data3['one']+data3['two']
print(data3)
#3删除一列
del data3['three']
print(data3)
#4增加新的一列
data3['flag']=data3['one']>0.2
print(data3)
#5直接标量加一列
data3['five']=5
#6将four这一列弹出之后,原数据data3中就没有这一列了
data3.pop('four')
print(data3)
#7用函数插入一列,名字为bar,值是两列相加
data3.insert(1,'bar',data3['one']+data3['two'])
print(data3)
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
one two three four
0 -0.503520 0.557812 2.455988 2.099548
1 0.330146 -2.574253 0.359760 -0.062902
2 0.035532 -0.727159 -2.370839 0.889751
3 -0.888395 0.717642 -0.383320 -0.523820
4 -0.474148 0.518265 0.104414 -3.433082
5 1.928498 -0.627430 0.182430 0.347653
0 -0.503520
1 0.330146
2 0.035532
3 -0.888395
4 -0.474148
5 1.928498
Name: one, dtype: float64
one two three four
0 -0.503520 0.557812 0.054292 2.099548
1 0.330146 -2.574253 -2.244107 -0.062902
2 0.035532 -0.727159 -0.691627 0.889751
3 -0.888395 0.717642 -0.170753 -0.523820
4 -0.474148 0.518265 0.044117 -3.433082
5 1.928498 -0.627430 1.301068 0.347653
one two four
0 -0.503520 0.557812 2.099548
1 0.330146 -2.574253 -0.062902
2 0.035532 -0.727159 0.889751
3 -0.888395 0.717642 -0.523820
4 -0.474148 0.518265 -3.433082
5 1.928498 -0.627430 0.347653
one two four flag
0 -0.503520 0.557812 2.099548 False
1 0.330146 -2.574253 -0.062902 True
2 0.035532 -0.727159 0.889751 False
3 -0.888395 0.717642 -0.523820 False
4 -0.474148 0.518265 -3.433082 False
5 1.928498 -0.627430 0.347653 True
one two flag five
0 -0.503520 0.557812 False 5
1 0.330146 -2.574253 True 5
2 0.035532 -0.727159 False 5
3 -0.888395 0.717642 False 5
4 -0.474148 0.518265 False 5
5 1.928498 -0.627430 True 5
one bar two flag five
0 -0.503520 0.054292 0.557812 False 5
1 0.330146 -2.244107 -2.574253 True 5
2 0.035532 -0.691627 -0.727159 False 5
3 -0.888395 -0.170753 0.717642 False 5
4 -0.474148 0.044117 0.518265 False 5
5 1.928498 1.301068 -0.627430 True 5
Process finished with exit code 0
data3 = pd.DataFrame(np.random.randn(6, 4), columns=['one','two','three','four'])
print(data3)
#1用函数插入一列,名字为bar,值是两列相加
data3.insert(1,'bar',data3['one']+data3['two'])
print(data3)
#2insert插入会直接改变原来数组data3,而assign会copy一份再去改变,原来的data3不变
#用assign插入一列
data4=data3.assign(Ratio=data3['one']/data3['two'])
print(data3)#发现data3不变
print(data4)#在最后边插入了一列
#3assign可以直接将data3作为数据进行函数计算
data5=data3.assign(Ratio=lambda x:x.one-x.two)
print(data5)
#3assign可以直接将data3作为数据进行函数计算第一次生成ABRatio这一列,之后再用一次assign并用这一列继续生成新的一列
data6=data3.assign(ABRatio=lambda x:x.one-x.two).assign(BarValue=lambda x:x.ABRatio*x.bar)
print(data6)
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
one two three four
0 -0.903571 -1.887683 1.197445 -0.192508
1 0.727993 0.627388 1.092060 -0.017405
2 1.132262 -1.761343 0.569888 -1.049006
3 -0.241974 1.099350 -1.571476 -0.823272
4 -0.122233 0.330990 -0.102380 -0.324069
5 0.546435 -1.282765 0.332780 -1.871410
one bar two three four
0 -0.903571 -2.791254 -1.887683 1.197445 -0.192508
1 0.727993 1.355381 0.627388 1.092060 -0.017405
2 1.132262 -0.629081 -1.761343 0.569888 -1.049006
3 -0.241974 0.857377 1.099350 -1.571476 -0.823272
4 -0.122233 0.208758 0.330990 -0.102380 -0.324069
5 0.546435 -0.736331 -1.282765 0.332780 -1.871410
one bar two three four
0 -0.903571 -2.791254 -1.887683 1.197445 -0.192508
1 0.727993 1.355381 0.627388 1.092060 -0.017405
2 1.132262 -0.629081 -1.761343 0.569888 -1.049006
3 -0.241974 0.857377 1.099350 -1.571476 -0.823272
4 -0.122233 0.208758 0.330990 -0.102380 -0.324069
5 0.546435 -0.736331 -1.282765 0.332780 -1.871410
one bar two three four Ratio
0 -0.903571 -2.791254 -1.887683 1.197445 -0.192508 0.478666
1 0.727993 1.355381 0.627388 1.092060 -0.017405 1.160355
2 1.132262 -0.629081 -1.761343 0.569888 -1.049006 -0.642840
3 -0.241974 0.857377 1.099350 -1.571476 -0.823272 -0.220106
4 -0.122233 0.208758 0.330990 -0.102380 -0.324069 -0.369294
5 0.546435 -0.736331 -1.282765 0.332780 -1.871410 -0.425982
one bar two three four Ratio
0 -0.903571 -2.791254 -1.887683 1.197445 -0.192508 0.984113
1 0.727993 1.355381 0.627388 1.092060 -0.017405 0.100605
2 1.132262 -0.629081 -1.761343 0.569888 -1.049006 2.893605
3 -0.241974 0.857377 1.099350 -1.571476 -0.823272 -1.341324
4 -0.122233 0.208758 0.330990 -0.102380 -0.324069 -0.453223
5 0.546435 -0.736331 -1.282765 0.332780 -1.871410 1.829200
one bar two three four ABRatio BarValue
0 -0.903571 -2.791254 -1.887683 1.197445 -0.192508 0.984113 -2.746909
1 0.727993 1.355381 0.627388 1.092060 -0.017405 0.100605 0.136358
2 1.132262 -0.629081 -1.761343 0.569888 -1.049006 2.893605 -1.820313
3 -0.241974 0.857377 1.099350 -1.571476 -0.823272 -1.341324 -1.150020
4 -0.122233 0.208758 0.330990 -0.102380 -0.324069 -0.453223 -0.094614
5 0.546435 -0.736331 -1.282765 0.332780 -1.871410 1.829200 -1.346896
Process finished with exit code 0
data3 = pd.DataFrame(np.random.randint(1,10, (6,4)), index=list('abcdef'),columns=['A','B','C','D'])
print(data3)
#索引与选择操作
#1用类似字典方式选择1列
print(data3['A'])
#2通过方法用标签选择一行
print(data3.loc['a'])
print(data3.loc[:,'A'])#可以选择一列,结果与data3['A']一样
#3用整数位置(第几行,从0开始)选择一行
print(data3.iloc[1])
#loc函数使用必须用标签去索引,.loc[],中括号里面是先行后列,以逗号分割,行和列分别是行标签和列标签,
# 比如data.loc["b","B"],iloc也一样只不过要用具体的第几行数字来表示。
#4用切片方式选择多行,效率更高
print(data3[1:4])#不包括结束行
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
A B C D
a 8 6 2 2
b 5 9 4 3
c 2 8 3 6
d 5 1 6 4
e 2 1 8 9
f 2 2 8 6
a 8
b 5
c 2
d 5
e 2
f 2
Name: A, dtype: int32
A 8
B 6
C 2
D 2
Name: a, dtype: int32
a 8
b 5
c 2
d 5
e 2
f 2
Name: A, dtype: int32
A 5
B 9
C 4
D 3
Name: b, dtype: int32
data3 = pd.DataFrame(np.random.randint(1,10, (6,4)), index=list('abcdef'),columns=['A','B','C','D'])
print(data3)
#1Series的bool变量
print(data3.A>4)
#2选择出A这一列大于4的数据表
print(data3[data3.A>4])
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
A B C D
a 6 2 3 1
b 9 6 7 5
c 9 8 9 7
d 9 3 9 2
e 5 6 6 7
f 2 6 6 4
a True
b True
c True
d True
e True
f False
Name: A, dtype: bool
A B C D
a 6 2 3 1
b 9 6 7 5
c 9 8 9 7
d 9 3 9 2
e 5 6 6 7
da1 = pd.DataFrame(np.random.randn(10,4), index=list('abcdefghij'),columns=['A','B','C','D'])
da2 = pd.DataFrame(np.random.randn(7,3), index=list('cdefghi'),columns=['A','B','C'])
#1进行da1与da2对齐操作,标签自动对齐操作
#相加,即相加时例如标签c对应标签c相加,相加时哪一列没有就为NaN代替即自动对齐相加
print(da1+da2)
#2da1的每一行都减去第一行:第一行(一个Series)按列标签对齐后广播成与da1同样大小,再逐行相减,所以结果的第一行全为0。
print(da1)
print(da1-da1.iloc[0])
#3pandas中的DataFrame与numpy中数组兼容,所以可以直接调用numpy中的函数对其进行计算
print(np.exp(da2))#本质上DataFrame所用数据结构就是numpy中的array
print(np.sin(da2))
print(da2.values)
print(type(da2.values))#类型就是numpy中的n维数组即ndarray
print(np.asarray(da2)==da2.values)#array和asarray都可将结构数据转换为ndarray类型但是主要区别就是当数据源是ndarray时,
#array仍会copy出一个副本,占用新的内存,但asarray不会。
#为true说明da2通过asarray改变为ndarray的类型就是da2.values
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
A B C D
a NaN NaN NaN NaN
b NaN NaN NaN NaN
c 3.053139 0.399460 0.902013 NaN
d -0.971126 -1.068590 -0.381331 NaN
e 2.409311 -0.001406 -1.475077 NaN
f 1.184333 0.623498 -0.744652 NaN
g 1.718821 0.607609 1.127204 NaN
h -0.967076 -1.922861 -0.443765 NaN
i -0.291349 0.063443 -0.317229 NaN
j NaN NaN NaN NaN
A B C D
a -0.506622 0.334452 0.328909 -0.604279
b -0.547917 -0.612315 1.581732 -1.153236
c 1.883545 -0.327115 -0.153999 -0.111412
d -0.210938 -0.578260 -0.457308 -0.249904
e 0.360569 0.984508 -0.105404 0.776660
f 1.907145 1.944524 -1.149351 0.067634
g 1.479150 0.294211 0.284969 1.821444
h -0.591414 -1.086091 0.797395 0.959605
i -0.405419 -0.382432 -0.377807 -0.086269
j 0.773303 -0.080868 1.059053 -0.657401
A B C D
a 0.000000 0.000000 0.000000 0.000000
b -0.041295 -0.946767 1.252824 -0.548957
c 2.390167 -0.661567 -0.482908 0.492868
d 0.295683 -0.912712 -0.786216 0.354375
e 0.867191 0.650056 -0.434313 1.380940
f 2.413767 1.610072 -1.478260 0.671913
g 1.985772 -0.040241 -0.043940 2.425723
h -0.084792 -1.420543 0.468486 1.563885
i 0.101203 -0.716883 -0.706716 0.518010
j 1.279925 -0.415320 0.730145 -0.053121
A B C
c 3.220685 2.067985 2.874884
d 0.467579 0.612424 1.078937
e 7.758132 0.373098 0.254190
f 0.485385 0.266861 1.498852
g 1.270830 1.368066 2.321551
h 0.686835 0.433107 0.289049
i 1.120831 1.561856 1.062451
A B C
c 0.920592 0.664313 0.870399
d -0.689057 -0.470917 0.075904
e 0.887942 -0.833777 -0.979843
f -0.661497 -0.968969 0.393742
g 0.237382 0.308293 0.746133
h -0.366888 -0.742483 -0.946160
i 0.113823 0.431247 0.060542
[[ 1.1695942 0.72657459 1.05601218]
[-0.76018733 -0.49032976 0.07597664]
[ 2.04874157 -0.98591365 -1.36967273]
[-0.72281253 -1.32102574 0.40469934]
[ 0.23967026 0.31339829 0.84223529]
[-0.37566185 -0.83676964 -1.24115999]
[ 0.11407028 0.44587498 0.06057866]]
<class 'numpy.ndarray'>
[[ True True True]
[ True True True]
[ True True True]
[ True True True]
[ True True True]
[ True True True]
[ True True True]]
Process finished with exit code 0
三维数组创建
总之不管多少维度数据,4或5维乃至更高维度,都可以转化为用多维标签表示的二维数组即DataFrame,因此今后用DataFrame最多。
# Build a 3-D structure from a dict whose keys are strings and whose values
# are DataFrames; e.g. a shape of (2, 2, 3) is simply two 2x3 tables stacked.
# NOTE: the recorded run of this snippet emits a FutureWarning saying Panel is
# deprecated; Panel has been removed from later pandas releases, so this
# snippet only runs on old pandas versions (verify against your installation).
data={'Item1':pd.DataFrame(np.random.randn(4,3)),
'Item2':pd.DataFrame(np.random.randn(4, 2))}  # the (4, 2) frame is aligned to (4, 3) and padded with NaN
# 1. Create the 3-D structure.
pn=pd.Panel(data)
# The printed summary shows the first dimension (items) running from Item1 to Item2.
print(pn)
# 2. The 3-D structure is equivalent to two 2-D tables.
# Look at the first 2-D table.
print(pn['Item1'])
# Look at the second 2-D table.
print(pn['Item2'])
# 3. Inspect the index of each dimension, starting with the first (items).
print(pn.items)
# Index of the second dimension (major_axis).
print(pn.major_axis)
# Index of the third dimension (minor_axis).
print(pn.minor_axis)
# 4. Select by a label of the second dimension (major_axis): there are 4 such
# slices, each 3x2 (minor_axis x items); label 1 is the second one.
# The overall shape is (2, 4, 3).
print(pn.major_xs(1))
# 5. A 3-D structure can be converted to a 2-D one. The result still carries
# all three dimensions: two row-index levels (major, minor) plus the item columns.
print(pn.to_frame())
# 6. In short, data of any dimensionality (4-D, 5-D or higher) can be expressed
# as a 2-D DataFrame with multi-level labels,
# which is why DataFrame is the structure used most from here on.
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
sys:1: FutureWarning:
<class 'pandas.core.panel.Panel'>
Panel is deprecated and will be removed in a future version.
The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
Dimensions: 2 (items) x 4 (major_axis) x 3 (minor_axis)
Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
Items axis: Item1 to Item2
Pandas provides a `.to_xarray()` method to help automate this conversion.
Major_axis axis: 0 to 3
Minor_axis axis: 0 to 2
0 1 2
0 -0.947857 0.149131 -0.868937
1 -1.547583 -1.772686 -0.811341
2 0.671214 0.121123 0.448306
3 -1.421754 0.265004 0.372799
0 1 2
0 -0.803768 -0.255339 NaN
1 1.132508 -0.825563 NaN
2 -0.059555 0.988052 NaN
3 -0.644363 2.509904 NaN
Index(['Item1', 'Item2'], dtype='object')
RangeIndex(start=0, stop=4, step=1)
RangeIndex(start=0, stop=3, step=1)
Item1 Item2
0 -1.547583 1.132508
1 -1.772686 -0.825563
2 -0.811341 NaN
Item1 Item2
major minor
0 0 -0.947857 -0.803768
1 0.149131 -0.255339
1 0 -1.547583 1.132508
1 -1.772686 -0.825563
2 0 0.671214 -0.059555
1 0.121123 0.988052
3 0 -1.421754 -0.644363
1 0.265004 2.509904
Process finished with exit code 0
三、基础运算
一维数组处理
df1=pd.Series([1,3,5,6,8],index=list('acefh'))
print(df1)
#1查看序列的索引
print(df1.index)
#2对Series重新索引,重新索引后原来没有的标签对应值为NaN,原来有的会自动对齐的。
print(df1.reindex(list('abcdefgh')))
#3重新索引时,对于原来没有的标签索引,可以默认设置个值
print(df1.reindex(list('abcdefgh'),fill_value=0))
#4重新索引时,有一些数据没有值的,如股票停盘后的时间对应就没有值,所以可以用停盘前的数据对其进行填充即可。
print(df1.reindex(list('abcdefgh'),method='ffill'))#即结果中就是如标签b对应用a值填充,其他依次。
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
a 1
c 3
e 5
f 6
h 8
dtype: int64
Index(['a', 'c', 'e', 'f', 'h'], dtype='object')
a 1.0
b NaN
c 3.0
d NaN
e 5.0
f 6.0
g NaN
h 8.0
dtype: float64
a 1
b 0
c 3
d 0
e 5
f 6
g 0
h 8
dtype: int64
a 1
b 1
c 3
d 3
e 5
f 6
g 6
h 8
dtype: int64
Process finished with exit code 0
# Random 4x6 frame with sparse row labels, so reindexing has gaps to fill.
da1 = pd.DataFrame(np.random.randn(4, 6), index=list('ADFH'),
                   columns=['one', 'two', 'three', 'four', 'five', 'six'])
# 1. Rows and columns of a DataFrame can each be reindexed independently.
da2 = da1.reindex(index=list('ABCDEFGH'))  # reindex the rows
print(da2)
# 2. fill_value supplies a default instead of NaN. reindex returns a new
# object, so the result can be printed directly or bound to a name; da1
# itself (printed next) is not modified.
print(da1.reindex(index=list('ABCDEFGH'), fill_value=0))
print(da1)
# 3. Set row 'A', column 'one' of da1 to 100. Only da1 changes; the earlier
# da2 keeps its old value.
# A single .loc[row, col] call is used instead of the chained
# da1.loc['A']['one'] = 100, which assigns into an intermediate object and is
# not guaranteed to write back into da1.
da1.loc['A', 'one'] = 100
print(da1)
print(da2)
# 4. Reindex the columns; newly introduced columns default to NaN.
print(da1.reindex(columns=['one', 'three', 'five', 'seven']))
# 5. Reindex the columns with a default value of 0.
# print(da1.reindex(columns=['one','three','five','seven'],fill_value=0))
# 6. method='ffill' does not work for this column reindex (presumably because
# the column labels are not monotonic, which `method` requires - confirm).
# print(da1.reindex(columns=['one','three','five','seven'],method='ffill'))
# 7. Forward fill works on rows: B and C take their values from the preceding A.
print(da1.reindex(index=list('ABCDEFGH'), method='ffill'))
# 8. Backward fill: B and C take their values from the following D.
print(da1.reindex(index=list('ABCDEFGH'), method='bfill'))
# 9. When dropping a row or a column, state the axis; without it drop works
# on rows.
print(da1)
print(da1.drop('A'))
# 10. Drop columns. drop also works on a copy, so da1 (printed last) is
# unchanged.
print(da1.drop(['two', 'four'], axis=1))
print(da1)
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
one two three four five six
A -1.909367 0.730111 0.574619 0.454327 -0.262506 -0.004040
B NaN NaN NaN NaN NaN NaN
C NaN NaN NaN NaN NaN NaN
D 0.074381 0.015976 -0.759157 -0.162168 1.321225 0.959368
E NaN NaN NaN NaN NaN NaN
F -0.139248 2.523069 1.798354 0.177123 -0.817056 0.071136
G NaN NaN NaN NaN NaN NaN
H 1.514792 -0.068565 -2.648990 -0.756524 0.502933 -1.225695
one two three four five six
A -1.909367 0.730111 0.574619 0.454327 -0.262506 -0.004040
B 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
C 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
D 0.074381 0.015976 -0.759157 -0.162168 1.321225 0.959368
E 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
F -0.139248 2.523069 1.798354 0.177123 -0.817056 0.071136
G 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
H 1.514792 -0.068565 -2.648990 -0.756524 0.502933 -1.225695
one two three four five six
A -1.909367 0.730111 0.574619 0.454327 -0.262506 -0.004040
D 0.074381 0.015976 -0.759157 -0.162168 1.321225 0.959368
F -0.139248 2.523069 1.798354 0.177123 -0.817056 0.071136
H 1.514792 -0.068565 -2.648990 -0.756524 0.502933 -1.225695
one two three four five six
A 100.000000 0.730111 0.574619 0.454327 -0.262506 -0.004040
D 0.074381 0.015976 -0.759157 -0.162168 1.321225 0.959368
F -0.139248 2.523069 1.798354 0.177123 -0.817056 0.071136
H 1.514792 -0.068565 -2.648990 -0.756524 0.502933 -1.225695
one two three four five six
A -1.909367 0.730111 0.574619 0.454327 -0.262506 -0.004040
B NaN NaN NaN NaN NaN NaN
C NaN NaN NaN NaN NaN NaN
D 0.074381 0.015976 -0.759157 -0.162168 1.321225 0.959368
E NaN NaN NaN NaN NaN NaN
F -0.139248 2.523069 1.798354 0.177123 -0.817056 0.071136
G NaN NaN NaN NaN NaN NaN
H 1.514792 -0.068565 -2.648990 -0.756524 0.502933 -1.225695
one three five seven
A 100.000000 0.574619 -0.262506 NaN
D 0.074381 -0.759157 1.321225 NaN
F -0.139248 1.798354 -0.817056 NaN
H 1.514792 -2.648990 0.502933 NaN
one two three four five six
A 100.000000 0.730111 0.574619 0.454327 -0.262506 -0.004040
B 100.000000 0.730111 0.574619 0.454327 -0.262506 -0.004040
C 100.000000 0.730111 0.574619 0.454327 -0.262506 -0.004040
D 0.074381 0.015976 -0.759157 -0.162168 1.321225 0.959368
E 0.074381 0.015976 -0.759157 -0.162168 1.321225 0.959368
F -0.139248 2.523069 1.798354 0.177123 -0.817056 0.071136
G -0.139248 2.523069 1.798354 0.177123 -0.817056 0.071136
H 1.514792 -0.068565 -2.648990 -0.756524 0.502933 -1.225695
one two three four five six
A 100.000000 0.730111 0.574619 0.454327 -0.262506 -0.004040
B 0.074381 0.015976 -0.759157 -0.162168 1.321225 0.959368
C 0.074381 0.015976 -0.759157 -0.162168 1.321225 0.959368
D 0.074381 0.015976 -0.759157 -0.162168 1.321225 0.959368
E -0.139248 2.523069 1.798354 0.177123 -0.817056 0.071136
F -0.139248 2.523069 1.798354 0.177123 -0.817056 0.071136
G 1.514792 -0.068565 -2.648990 -0.756524 0.502933 -1.225695
H 1.514792 -0.068565 -2.648990 -0.756524 0.502933 -1.225695
one two three four five six
A 100.000000 0.730111 0.574619 0.454327 -0.262506 -0.004040
D 0.074381 0.015976 -0.759157 -0.162168 1.321225 0.959368
F -0.139248 2.523069 1.798354 0.177123 -0.817056 0.071136
H 1.514792 -0.068565 -2.648990 -0.756524 0.502933 -1.225695
one two three four five six
D 0.074381 0.015976 -0.759157 -0.162168 1.321225 0.959368
F -0.139248 2.523069 1.798354 0.177123 -0.817056 0.071136
H 1.514792 -0.068565 -2.648990 -0.756524 0.502933 -1.225695
one three five six
A 100.000000 0.574619 -0.262506 -0.004040
D 0.074381 -0.759157 1.321225 0.959368
F -0.139248 1.798354 -0.817056 0.071136
H 1.514792 -2.648990 0.502933 -1.225695
one two three four five six
A 100.000000 0.730111 0.574619 0.454327 -0.262506 -0.004040
D 0.074381 0.015976 -0.759157 -0.162168 1.321225 0.959368
F -0.139248 2.523069 1.798354 0.177123 -0.817056 0.071136
H 1.514792 -0.068565 -2.648990 -0.756524 0.502933 -1.225695
Process finished with exit code 0
da1 = pd.DataFrame(np.arange(12).reshape(4,3), index=['one','two','three','four'],columns=list('ABC'))
print(da1)
#1apply函数默认按照1列给函数作为参数进行计算的,结果是每一列对应有一个结果了。
#默认axis=0,即沿着行索引方向(自上而下)把每一列作为一个Series传给函数,所以每一列得到一个结果。
print(da1.apply(lambda x:x.max()-x.min()))
#按照每行计算的,即每行有个结果,#每一行的返回结果都是标量。
print(da1.apply(lambda x:x.max()-x.min(),axis=1))#每一行的返回结果都是标量。
#2返回的结果为序列,对列操作
def min_max(x):
    """Return the minimum and maximum of ``x`` as a two-element Series.

    Intended for ``DataFrame.apply``: each column (or each row with
    ``axis=1``) is passed in as ``x``, and the returned Series become the
    rows (or columns) of the resulting table.

    Args:
        x: A Series-like object providing ``min()`` and ``max()``.

    Returns:
        A ``pd.Series`` indexed by ``'min'`` and ``'max'``, in that order.
    """
    # The values must be listed in the same order as the labels. The earlier
    # version passed [x.max(), x.min()], which put the maximum under 'min'
    # and vice versa; the recorded output further down in this article was
    # produced by that version, so its 'min'/'max' rows are swapped.
    return pd.Series([x.min(), x.max()], index=['min', 'max'])
print(da1.apply(min_max))#注意apply会自动将每一列参数传进去函数中,所以函数需要形参,但是这个形参apply函数会自动将每一列传进去给x的
# 。且每次返回都是一列的最小与最大值的序列,3列即返回三个序列了,即返回一个二维表了。
#3返回的结果为序列,对行操作
print(da1.apply(min_max,axis=1))
#ipythonshell交互式提示符中独有的查看函数说明文档方式为 df.apply?
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
A B C
one 0 1 2
two 3 4 5
three 6 7 8
four 9 10 11
A 9
B 9
C 9
dtype: int64
one 2
two 2
three 2
four 2
dtype: int64
A B C
min 9 10 11
max 0 1 2
min max
one 2 0
two 5 3
three 8 6
four 11 9
Process finished with exit code 0
da1 = pd.DataFrame(np.random.randn(4,3), index=['one','two','three','four'],columns=list('ABC'))
print(da1)
#对数组中的每一个元素进行处理
#方式1
#formater=lambda x :'%.03f' %x
#print(da1.applymap(formater))#此时中的x就是数组中的每一个元素
#方式2
formater='{0:.03f}'.format#将函数作为参数传给下面的applymap函数了。冒号前的0表示第0个参数,即占位符的意思,可以写或不写
print(da1.applymap(formater))#此时中的x就是数组中的每一个元素
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
A B C
one -0.078113 -1.334115 -0.178684
two 1.340760 3.026996 -0.327963
three 0.861242 1.030327 -0.202078
four 0.100948 -2.405198 0.057714
A B C
one -0.078 -1.334 -0.179
two 1.341 3.027 -0.328
three 0.861 1.030 -0.202
four 0.101 -2.405 0.058
Process finished with exit code 0
排序
da1 = pd.DataFrame(np.random.randint(1,10,(4,3)), index=list('ABCD'),columns=['one','two','three'])
print(da1)
#1根据一列two排序
print(da1.sort_values(by='two'))
#2降序根据一列two排序
print(da1.sort_values(by='two',ascending=False))
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
one two three
A 8 1 8
B 9 4 4
C 1 1 8
D 9 3 8
one two three
A 8 1 8
C 1 1 8
D 9 3 8
B 9 4 4
one two three
B 9 4 4
D 9 3 8
A 8 1 8
C 1 1 8
Process finished with exit code 0
排名
#一维数组排名
s=pd.Series([3,6,2,6,4])
print(s)
#1根据值大小进行排名,如索引为2值为2就是排名第一
print(s.rank())
#2发现排名中有4.5名,可以定义将先出现的排名靠前,就没有4.5名了
print(s.rank(method='first'))
#3默认按照平均值排名,即出现2个6,位置为4+5求平均就是4.5名
print(s.rank(method='average'))
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
0 3
1 6
2 2
3 6
4 4
dtype: int64
0 2.0
1 4.5
2 1.0
3 4.5
4 3.0
dtype: float64
0 2.0
1 4.0
2 1.0
3 5.0
4 3.0
dtype: float64
0 2.0
1 4.5
2 1.0
3 4.5
4 3.0
dtype: float64
Process finished with exit code 0
#二维数组排名
da1 = pd.DataFrame(np.random.randint(1,10,(4,3)), index=list('ABCD'),columns=['one','two','three'])
print(da1)
#默认按照列的数值进行排名,即行方向。
print(da1.rank(method='first'))
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
one two three
A 5 5 9
B 9 3 6
C 8 3 3
D 2 8 5
one two three
A 2.0 3.0 4.0
B 4.0 1.0 3.0
C 3.0 2.0 1.0
D 1.0 4.0 2.0
Process finished with exit code 0
s=pd.Series(list('abbcdabacad'))
print(s)
#1统计序列中重复值
print(s.value_counts())
#2返回这个序列中唯一不重复值的序列出来
print(s.unique())
print(type(s.unique()))
#3判断s这个数组中元素是不是都在后面的列表中,在的对应s序列中就是True。True代表s中有这个值。且在后边的列表中
print(s.isin(['a','c','d']))
#4因为s.unique()就是数组['a' 'b' 'c' 'd'],所以s序列肯定有的值都在这里边了,所以s每一个位置上都为True了
print(s.isin(s.unique()))
结果:
D:\ProgramData\Anaconda3\python.exe D:/numpy-kexue/03.py
0 a
1 b
2 b
3 c
4 d
5 a
6 b
7 a
8 c
9 a
10 d
dtype: object
a 4
b 3
d 2
c 2
dtype: int64
['a' 'b' 'c' 'd']
<class 'numpy.ndarray'>
0 True
1 False
2 False
3 True
4 True
5 True
6 False
7 True
8 True
9 True
10 True
dtype: bool
0 True
1 True
2 True
3 True
4 True
5 True
6 True
7 True
8 True
9 True
10 True
dtype: bool
Process finished with exit code 0
来源:CSDN
作者:qq_43498494
链接:https://blog.csdn.net/qq_43498494/article/details/103950198