Pandas 精简实例入门

倖福魔咒の 提交于 2020-04-02 22:50:48

import pandas as pd
import numpy as np

0. 案例引入

# 由np直接生成的ndarray
stock_change = np.random.normal(0, 1, (10, 8))
stock_change
array([[ 0.74057955,  0.78604657, -0.15264135,  0.05680483,  0.09388135,
         0.7313751 , -1.52338443,  1.71156505],
       [ 0.42204925,  0.62541715, -1.41583042, -0.27434654,  0.98587136,
        -0.55797884,  0.31026482, -0.47964535],
       [ 0.99741102, -0.94397298, -0.40782973, -1.33631227, -0.0124836 ,
         1.1873408 , -0.25430393, -0.74264106],
       [ 0.34156662, -0.40621262,  0.82861416,  0.1272128 ,  1.04101412,
         0.79061324, -0.60325544,  1.29954581],
       [-1.23289547,  0.83789748,  1.19276989,  0.45092868, -1.7418129 ,
        -0.65362211, -0.17752493,  1.87679286],
       [-0.4268705 ,  1.14017572,  0.18261009, -0.28947877,  0.82489897,
         0.11566058, -0.53191371, -0.96065812],
       [ 0.92792797,  0.26086313,  0.08316582, -0.94533007, -0.77956139,
         0.23006703, -0.81971461, -1.36742474],
       [ 0.82241768,  0.54201367, -0.19331564,  0.50576697, -0.42545839,
        -0.24247517, -0.03526651, -0.02268451],
       [ 1.67480093, -1.23265948, -2.88199942, -1.07761987, -1.37844497,
        -0.13581683,  2.06013919,  1.18986057],
       [ 0.60744357,  0.52348326,  0.76418263, -0.73385554,  0.54857341,
         0.27310645, -0.26464179,  0.77370496]])
# 通过pd.DataFrame生成    (pd.DataFrame(ndarray))
stock_df = pd.DataFrame(stock_change)
stock_df
0 1 2 3 4 5 6 7
0 0.740580 0.786047 -0.152641 0.056805 0.093881 0.731375 -1.523384 1.711565
1 0.422049 0.625417 -1.415830 -0.274347 0.985871 -0.557979 0.310265 -0.479645
2 0.997411 -0.943973 -0.407830 -1.336312 -0.012484 1.187341 -0.254304 -0.742641
3 0.341567 -0.406213 0.828614 0.127213 1.041014 0.790613 -0.603255 1.299546
4 -1.232895 0.837897 1.192770 0.450929 -1.741813 -0.653622 -0.177525 1.876793
5 -0.426871 1.140176 0.182610 -0.289479 0.824899 0.115661 -0.531914 -0.960658
6 0.927928 0.260863 0.083166 -0.945330 -0.779561 0.230067 -0.819715 -1.367425
7 0.822418 0.542014 -0.193316 0.505767 -0.425458 -0.242475 -0.035267 -0.022685
8 1.674801 -1.232659 -2.881999 -1.077620 -1.378445 -0.135817 2.060139 1.189861
9 0.607444 0.523483 0.764183 -0.733856 0.548573 0.273106 -0.264642 0.773705
stock_df.shape
(10, 8)
# 添加行索引
stock_name = ['股票{}'.format(i+1) for i in range(stock_df.shape[0])]
stock_df = pd.DataFrame(data=stock_change, index=stock_name)
stock_df
0 1 2 3 4 5 6 7
股票1 0.740580 0.786047 -0.152641 0.056805 0.093881 0.731375 -1.523384 1.711565
股票2 0.422049 0.625417 -1.415830 -0.274347 0.985871 -0.557979 0.310265 -0.479645
股票3 0.997411 -0.943973 -0.407830 -1.336312 -0.012484 1.187341 -0.254304 -0.742641
股票4 0.341567 -0.406213 0.828614 0.127213 1.041014 0.790613 -0.603255 1.299546
股票5 -1.232895 0.837897 1.192770 0.450929 -1.741813 -0.653622 -0.177525 1.876793
股票6 -0.426871 1.140176 0.182610 -0.289479 0.824899 0.115661 -0.531914 -0.960658
股票7 0.927928 0.260863 0.083166 -0.945330 -0.779561 0.230067 -0.819715 -1.367425
股票8 0.822418 0.542014 -0.193316 0.505767 -0.425458 -0.242475 -0.035267 -0.022685
股票9 1.674801 -1.232659 -2.881999 -1.077620 -1.378445 -0.135817 2.060139 1.189861
股票10 0.607444 0.523483 0.764183 -0.733856 0.548573 0.273106 -0.264642 0.773705
# 添加列索引
# 引入df.date_range(),start-开始日期, end: 结束日期,  periods - 持续时间, frep- B:工作日, M:月, D:天
date = pd.date_range(start='2020-3-30', periods=stock_df.shape[1], freq='d')
date
DatetimeIndex(['2020-03-30', '2020-03-31', '2020-04-01', '2020-04-02',
               '2020-04-03', '2020-04-04', '2020-04-05', '2020-04-06'],
              dtype='datetime64[ns]', freq='D')
stock_df = pd.DataFrame(stock_change, index=stock_name, columns=date)
stock_df
2020-03-30 2020-03-31 2020-04-01 2020-04-02 2020-04-03 2020-04-04 2020-04-05 2020-04-06
股票1 0.740580 0.786047 -0.152641 0.056805 0.093881 0.731375 -1.523384 1.711565
股票2 0.422049 0.625417 -1.415830 -0.274347 0.985871 -0.557979 0.310265 -0.479645
股票3 0.997411 -0.943973 -0.407830 -1.336312 -0.012484 1.187341 -0.254304 -0.742641
股票4 0.341567 -0.406213 0.828614 0.127213 1.041014 0.790613 -0.603255 1.299546
股票5 -1.232895 0.837897 1.192770 0.450929 -1.741813 -0.653622 -0.177525 1.876793
股票6 -0.426871 1.140176 0.182610 -0.289479 0.824899 0.115661 -0.531914 -0.960658
股票7 0.927928 0.260863 0.083166 -0.945330 -0.779561 0.230067 -0.819715 -1.367425
股票8 0.822418 0.542014 -0.193316 0.505767 -0.425458 -0.242475 -0.035267 -0.022685
股票9 1.674801 -1.232659 -2.881999 -1.077620 -1.378445 -0.135817 2.060139 1.189861
股票10 0.607444 0.523483 0.764183 -0.733856 0.548573 0.273106 -0.264642 0.773705

1. Pandas 主要数据结构

1.1 DataFrame

stock_df
2020-03-30 2020-03-31 2020-04-01 2020-04-02 2020-04-03 2020-04-04 2020-04-05 2020-04-06
股票1 0.740580 0.786047 -0.152641 0.056805 0.093881 0.731375 -1.523384 1.711565
股票2 0.422049 0.625417 -1.415830 -0.274347 0.985871 -0.557979 0.310265 -0.479645
股票3 0.997411 -0.943973 -0.407830 -1.336312 -0.012484 1.187341 -0.254304 -0.742641
股票4 0.341567 -0.406213 0.828614 0.127213 1.041014 0.790613 -0.603255 1.299546
股票5 -1.232895 0.837897 1.192770 0.450929 -1.741813 -0.653622 -0.177525 1.876793
股票6 -0.426871 1.140176 0.182610 -0.289479 0.824899 0.115661 -0.531914 -0.960658
股票7 0.927928 0.260863 0.083166 -0.945330 -0.779561 0.230067 -0.819715 -1.367425
股票8 0.822418 0.542014 -0.193316 0.505767 -0.425458 -0.242475 -0.035267 -0.022685
股票9 1.674801 -1.232659 -2.881999 -1.077620 -1.378445 -0.135817 2.060139 1.189861
股票10 0.607444 0.523483 0.764183 -0.733856 0.548573 0.273106 -0.264642 0.773705
# 查看DataFrame形状,类似于2d array
stock_df.shape
(10, 8)
# 取行索引
stock_df.index
Index(['股票1', '股票2', '股票3', '股票4', '股票5', '股票6', '股票7', '股票8', '股票9', '股票10'], dtype='object')
# 取列索引
stock_df.columns
DatetimeIndex(['2020-03-30', '2020-03-31', '2020-04-01', '2020-04-02',
               '2020-04-03', '2020-04-04', '2020-04-05', '2020-04-06'],
              dtype='datetime64[ns]', freq='D')
# 取ndarray的值
stock_df.values
array([[ 0.74057955,  0.78604657, -0.15264135,  0.05680483,  0.09388135,
         0.7313751 , -1.52338443,  1.71156505],
       [ 0.42204925,  0.62541715, -1.41583042, -0.27434654,  0.98587136,
        -0.55797884,  0.31026482, -0.47964535],
       [ 0.99741102, -0.94397298, -0.40782973, -1.33631227, -0.0124836 ,
         1.1873408 , -0.25430393, -0.74264106],
       [ 0.34156662, -0.40621262,  0.82861416,  0.1272128 ,  1.04101412,
         0.79061324, -0.60325544,  1.29954581],
       [-1.23289547,  0.83789748,  1.19276989,  0.45092868, -1.7418129 ,
        -0.65362211, -0.17752493,  1.87679286],
       [-0.4268705 ,  1.14017572,  0.18261009, -0.28947877,  0.82489897,
         0.11566058, -0.53191371, -0.96065812],
       [ 0.92792797,  0.26086313,  0.08316582, -0.94533007, -0.77956139,
         0.23006703, -0.81971461, -1.36742474],
       [ 0.82241768,  0.54201367, -0.19331564,  0.50576697, -0.42545839,
        -0.24247517, -0.03526651, -0.02268451],
       [ 1.67480093, -1.23265948, -2.88199942, -1.07761987, -1.37844497,
        -0.13581683,  2.06013919,  1.18986057],
       [ 0.60744357,  0.52348326,  0.76418263, -0.73385554,  0.54857341,
         0.27310645, -0.26464179,  0.77370496]])
# 取转置
stock_df.T
股票1 股票2 股票3 股票4 股票5 股票6 股票7 股票8 股票9 股票10
2020-03-30 0.740580 0.422049 0.997411 0.341567 -1.232895 -0.426871 0.927928 0.822418 1.674801 0.607444
2020-03-31 0.786047 0.625417 -0.943973 -0.406213 0.837897 1.140176 0.260863 0.542014 -1.232659 0.523483
2020-04-01 -0.152641 -1.415830 -0.407830 0.828614 1.192770 0.182610 0.083166 -0.193316 -2.881999 0.764183
2020-04-02 0.056805 -0.274347 -1.336312 0.127213 0.450929 -0.289479 -0.945330 0.505767 -1.077620 -0.733856
2020-04-03 0.093881 0.985871 -0.012484 1.041014 -1.741813 0.824899 -0.779561 -0.425458 -1.378445 0.548573
2020-04-04 0.731375 -0.557979 1.187341 0.790613 -0.653622 0.115661 0.230067 -0.242475 -0.135817 0.273106
2020-04-05 -1.523384 0.310265 -0.254304 -0.603255 -0.177525 -0.531914 -0.819715 -0.035267 2.060139 -0.264642
2020-04-06 1.711565 -0.479645 -0.742641 1.299546 1.876793 -0.960658 -1.367425 -0.022685 1.189861 0.773705
# 查看头部几行数据, 默认5行
stock_df.head(5)
# 查看倒数几行数据
stock_df.tail()
2020-03-30 2020-03-31 2020-04-01 2020-04-02 2020-04-03 2020-04-04 2020-04-05 2020-04-06
股票6 -0.426871 1.140176 0.182610 -0.289479 0.824899 0.115661 -0.531914 -0.960658
股票7 0.927928 0.260863 0.083166 -0.945330 -0.779561 0.230067 -0.819715 -1.367425
股票8 0.822418 0.542014 -0.193316 0.505767 -0.425458 -0.242475 -0.035267 -0.022685
股票9 1.674801 -1.232659 -2.881999 -1.077620 -1.378445 -0.135817 2.060139 1.189861
股票10 0.607444 0.523483 0.764183 -0.733856 0.548573 0.273106 -0.264642 0.773705

1.1.1 设置索引

# 只能通过对整个index 重新赋值, 整行或者整列
data_index = [['股票__{}'.format(i+1) for i in range(stock_df.shape[0])]]
stock_df.index = data_index
stock_df
2020-03-30 2020-03-31 2020-04-01 2020-04-02 2020-04-03 2020-04-04 2020-04-05 2020-04-06
股票__1 0.740580 0.786047 -0.152641 0.056805 0.093881 0.731375 -1.523384 1.711565
股票__2 0.422049 0.625417 -1.415830 -0.274347 0.985871 -0.557979 0.310265 -0.479645
股票__3 0.997411 -0.943973 -0.407830 -1.336312 -0.012484 1.187341 -0.254304 -0.742641
股票__4 0.341567 -0.406213 0.828614 0.127213 1.041014 0.790613 -0.603255 1.299546
股票__5 -1.232895 0.837897 1.192770 0.450929 -1.741813 -0.653622 -0.177525 1.876793
股票__6 -0.426871 1.140176 0.182610 -0.289479 0.824899 0.115661 -0.531914 -0.960658
股票__7 0.927928 0.260863 0.083166 -0.945330 -0.779561 0.230067 -0.819715 -1.367425
股票__8 0.822418 0.542014 -0.193316 0.505767 -0.425458 -0.242475 -0.035267 -0.022685
股票__9 1.674801 -1.232659 -2.881999 -1.077620 -1.378445 -0.135817 2.060139 1.189861
股票__10 0.607444 0.523483 0.764183 -0.733856 0.548573 0.273106 -0.264642 0.773705
# stock_df.index[3] ='hahha'
# stock_df

1.1.2 重设索引

# reset_index在原来基础上新增一列索引
# drop=False(默认) - 不丢弃原来索引
# drop=True - 丢掉原来索引 index
stock_df.reset_index()
level_0 2020-03-30 00:00:00 2020-03-31 00:00:00 2020-04-01 00:00:00 2020-04-02 00:00:00 2020-04-03 00:00:00 2020-04-04 00:00:00 2020-04-05 00:00:00 2020-04-06 00:00:00
0 股票__1 0.740580 0.786047 -0.152641 0.056805 0.093881 0.731375 -1.523384 1.711565
1 股票__2 0.422049 0.625417 -1.415830 -0.274347 0.985871 -0.557979 0.310265 -0.479645
2 股票__3 0.997411 -0.943973 -0.407830 -1.336312 -0.012484 1.187341 -0.254304 -0.742641
3 股票__4 0.341567 -0.406213 0.828614 0.127213 1.041014 0.790613 -0.603255 1.299546
4 股票__5 -1.232895 0.837897 1.192770 0.450929 -1.741813 -0.653622 -0.177525 1.876793
5 股票__6 -0.426871 1.140176 0.182610 -0.289479 0.824899 0.115661 -0.531914 -0.960658
6 股票__7 0.927928 0.260863 0.083166 -0.945330 -0.779561 0.230067 -0.819715 -1.367425
7 股票__8 0.822418 0.542014 -0.193316 0.505767 -0.425458 -0.242475 -0.035267 -0.022685
8 股票__9 1.674801 -1.232659 -2.881999 -1.077620 -1.378445 -0.135817 2.060139 1.189861
9 股票__10 0.607444 0.523483 0.764183 -0.733856 0.548573 0.273106 -0.264642 0.773705
stock_df.reset_index(drop=True)
2020-03-30 2020-03-31 2020-04-01 2020-04-02 2020-04-03 2020-04-04 2020-04-05 2020-04-06
0 0.740580 0.786047 -0.152641 0.056805 0.093881 0.731375 -1.523384 1.711565
1 0.422049 0.625417 -1.415830 -0.274347 0.985871 -0.557979 0.310265 -0.479645
2 0.997411 -0.943973 -0.407830 -1.336312 -0.012484 1.187341 -0.254304 -0.742641
3 0.341567 -0.406213 0.828614 0.127213 1.041014 0.790613 -0.603255 1.299546
4 -1.232895 0.837897 1.192770 0.450929 -1.741813 -0.653622 -0.177525 1.876793
5 -0.426871 1.140176 0.182610 -0.289479 0.824899 0.115661 -0.531914 -0.960658
6 0.927928 0.260863 0.083166 -0.945330 -0.779561 0.230067 -0.819715 -1.367425
7 0.822418 0.542014 -0.193316 0.505767 -0.425458 -0.242475 -0.035267 -0.022685
8 1.674801 -1.232659 -2.881999 -1.077620 -1.378445 -0.135817 2.060139 1.189861
9 0.607444 0.523483 0.764183 -0.733856 0.548573 0.273106 -0.264642 0.773705

1.1.3 以某列为索引

stock_df.set_index(keys='2020-03-30', drop=False) #此处因为类型问题,都是drop 原来的index
2020-03-30 2020-03-31 2020-04-01 2020-04-02 2020-04-03 2020-04-04 2020-04-05 2020-04-06
2020-03-30
0.740580 0.740580 0.786047 -0.152641 0.056805 0.093881 0.731375 -1.523384 1.711565
0.422049 0.422049 0.625417 -1.415830 -0.274347 0.985871 -0.557979 0.310265 -0.479645
0.997411 0.997411 -0.943973 -0.407830 -1.336312 -0.012484 1.187341 -0.254304 -0.742641
0.341567 0.341567 -0.406213 0.828614 0.127213 1.041014 0.790613 -0.603255 1.299546
-1.232895 -1.232895 0.837897 1.192770 0.450929 -1.741813 -0.653622 -0.177525 1.876793
-0.426871 -0.426871 1.140176 0.182610 -0.289479 0.824899 0.115661 -0.531914 -0.960658
0.927928 0.927928 0.260863 0.083166 -0.945330 -0.779561 0.230067 -0.819715 -1.367425
0.822418 0.822418 0.542014 -0.193316 0.505767 -0.425458 -0.242475 -0.035267 -0.022685
1.674801 1.674801 -1.232659 -2.881999 -1.077620 -1.378445 -0.135817 2.060139 1.189861
0.607444 0.607444 0.523483 0.764183 -0.733856 0.548573 0.273106 -0.264642 0.773705
# 字典方式创建DataFrame
df = pd.DataFrame({'month': [1, 4, 7, 10],
                    'year': [2012, 2014, 2013, 2014],
                    'sale':[55, 40, 84, 31]})
df
month year sale
0 1 2012 55
1 4 2014 40
2 7 2013 84
3 10 2014 31
df.set_index(keys='month')
year sale
month
1 2012 55
4 2014 40
7 2013 84
10 2014 31
# 设置2个index, 就是MultiIndex (三维数据结构)
# df.set_index(keys=['month', 'year'])
df.set_index(['month', 'year'])
sale
month year
1 2012 55
4 2014 40
7 2013 84
10 2014 31

1.2 MultiIndex

df_m = df.set_index(['year', 'month'])
df_m
sale
year month
2012 1 55
2014 4 40
2013 7 84
2014 10 31
# index属性
# - names: levers的名称
# - levels: 每个level的列表值
df_m.index
MultiIndex([(2012,  1),
            (2014,  4),
            (2013,  7),
            (2014, 10)],
           names=['year', 'month'])
df_m.index.names
FrozenList(['year', 'month'])
df_m.index.levels
FrozenList([[2012, 2013, 2014], [1, 4, 7, 10]])

1.3 Series

# 自动生成从0开始的行索引 index
# Data must be 1-dimensional
pd.Series(np.arange(10))
0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32
# 手动指定index值
pd.Series([6.7,5.6,3,10,2], index=['a', 'b', 'c', 'd', 'e'])
a     6.7
b     5.6
c     3.0
d    10.0
e     2.0
dtype: float64
# 通过字典创建
se = pd.Series({'red':100, 'blue':200, 'green': 500, 'yellow':1000})
se
red        100
blue       200
green      500
yellow    1000
dtype: int64
# 取索引
se.index
Index(['red', 'blue', 'green', 'yellow'], dtype='object')
# 取array值
se.values
array([ 100,  200,  500, 1000], dtype=int64)
pd.Series(np.random.normal(0, 1, (10)))
0   -0.975747
1    0.021589
2   -0.384579
3   -0.412900
4    0.218133
5   -0.866525
6   -0.777209
7   -1.032130
8    0.202134
9    0.295274
dtype: float64

2.基本数据操作

2.1 索引操作

# 使用pd.read_csv()读取本地数据
data = pd.read_csv('./data/stock_day.csv')
data
open high close low volume price_change p_change ma5 ma10 ma20 v_ma5 v_ma10 v_ma20 turnover
2018-02-27 23.53 25.88 24.16 23.53 95578.03 0.63 2.68 22.942 22.142 22.875 53782.64 46738.65 55576.11 2.39
2018-02-26 22.80 23.78 23.53 22.80 60985.11 0.69 3.02 22.406 21.955 22.942 40827.52 42736.34 56007.50 1.53
2018-02-23 22.88 23.37 22.82 22.71 52914.01 0.54 2.42 21.938 21.929 23.022 35119.58 41871.97 56372.85 1.32
2018-02-22 22.25 22.76 22.28 22.02 36105.01 0.36 1.64 21.446 21.909 23.137 35397.58 39904.78 60149.60 0.90
2018-02-14 21.49 21.99 21.92 21.48 23331.04 0.44 2.05 21.366 21.923 23.253 33590.21 42935.74 61716.11 0.58
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2015-03-06 13.17 14.48 14.28 13.13 179831.72 1.12 8.51 13.112 13.112 13.112 115090.18 115090.18 115090.18 6.16
2015-03-05 12.88 13.45 13.16 12.87 93180.39 0.26 2.02 12.820 12.820 12.820 98904.79 98904.79 98904.79 3.19
2015-03-04 12.80 12.92 12.90 12.61 67075.44 0.20 1.57 12.707 12.707 12.707 100812.93 100812.93 100812.93 2.30
2015-03-03 12.52 13.06 12.70 12.52 139071.61 0.18 1.44 12.610 12.610 12.610 117681.67 117681.67 117681.67 4.76
2015-03-02 12.25 12.67 12.52 12.20 96291.73 0.32 2.62 12.520 12.520 12.520 96291.73 96291.73 96291.73 3.30

643 rows × 14 columns

# 去除一些列,简化数据
data = data.drop(["ma5","ma10","ma20","v_ma5","v_ma10","v_ma20"], axis=1)  # axis=1 去除对应的列,与numpy相反
data
open high close low volume price_change p_change turnover
2018-02-27 23.53 25.88 24.16 23.53 95578.03 0.63 2.68 2.39
2018-02-26 22.80 23.78 23.53 22.80 60985.11 0.69 3.02 1.53
2018-02-23 22.88 23.37 22.82 22.71 52914.01 0.54 2.42 1.32
2018-02-22 22.25 22.76 22.28 22.02 36105.01 0.36 1.64 0.90
2018-02-14 21.49 21.99 21.92 21.48 23331.04 0.44 2.05 0.58
... ... ... ... ... ... ... ... ...
2015-03-06 13.17 14.48 14.28 13.13 179831.72 1.12 8.51 6.16
2015-03-05 12.88 13.45 13.16 12.87 93180.39 0.26 2.02 3.19
2015-03-04 12.80 12.92 12.90 12.61 67075.44 0.20 1.57 2.30
2015-03-03 12.52 13.06 12.70 12.52 139071.61 0.18 1.44 4.76
2015-03-02 12.25 12.67 12.52 12.20 96291.73 0.32 2.62 3.30

643 rows × 8 columns

2.1.1 直接使用行列索引

# 必须先列后行
data['high']['2018-02-27']
25.88
# data['2018-02-27']['high']

2.1.2 使用loc和iloc取索引

# loc取字符串, 先行后列
data.loc['2018-02-27']['high'] 
25.88
# 两种取值方式都可以
data.loc['2018-02-27','high'] 
25.88
data.loc['2018-02-27':'2018-02-22', 'open']
2018-02-27    23.53
2018-02-26    22.80
2018-02-23    22.88
2018-02-22    22.25
Name: open, dtype: float64
# data.loc['high']['2018-02-27']
# iloc取索引数字,先行后列
data.iloc[:3, 3:5]
low volume
2018-02-27 23.53 95578.03
2018-02-26 22.80 60985.11
2018-02-23 22.71 52914.01

2.1.3 使用ix取混合索引

# ix可以去数字和字符串, 先行后列
# 现版本中已被取消
# data.ix[0:4, ['open', 'close', 'high', 'low']]
# 先通过data.index去除索引并切片
data.loc[data.index[0:4], ['open', 'close', 'high', 'low']]
open close high low
2018-02-27 23.53 24.16 25.88 23.53
2018-02-26 22.80 23.53 23.78 22.80
2018-02-23 22.88 22.82 23.37 22.71
2018-02-22 22.25 22.28 22.76 22.02
data.index
Index(['2018-02-27', '2018-02-26', '2018-02-23', '2018-02-22', '2018-02-14',
       '2018-02-13', '2018-02-12', '2018-02-09', '2018-02-08', '2018-02-07',
       ...
       '2015-03-13', '2015-03-12', '2015-03-11', '2015-03-10', '2015-03-09',
       '2015-03-06', '2015-03-05', '2015-03-04', '2015-03-03', '2015-03-02'],
      dtype='object', length=643)
data.iloc[0:4, data.columns.get_indexer(['open', 'close', 'high', 'low'])]
open close high low
2018-02-27 23.53 24.16 25.88 23.53
2018-02-26 22.80 23.53 23.78 22.80
2018-02-23 22.88 22.82 23.37 22.71
2018-02-22 22.25 22.28 22.76 22.02
data.columns.get_indexer(['close'])
array([2], dtype=int64)
data
open high close low volume price_change p_change turnover
2018-02-27 23.53 25.88 24.16 23.53 95578.03 0.63 2.68 2.39
2018-02-26 22.80 23.78 23.53 22.80 60985.11 0.69 3.02 1.53
2018-02-23 22.88 23.37 22.82 22.71 52914.01 0.54 2.42 1.32
2018-02-22 22.25 22.76 22.28 22.02 36105.01 0.36 1.64 0.90
2018-02-14 21.49 21.99 21.92 21.48 23331.04 0.44 2.05 0.58
... ... ... ... ... ... ... ... ...
2015-03-06 13.17 14.48 14.28 13.13 179831.72 1.12 8.51 6.16
2015-03-05 12.88 13.45 13.16 12.87 93180.39 0.26 2.02 3.19
2015-03-04 12.80 12.92 12.90 12.61 67075.44 0.20 1.57 2.30
2015-03-03 12.52 13.06 12.70 12.52 139071.61 0.18 1.44 4.76
2015-03-02 12.25 12.67 12.52 12.20 96291.73 0.32 2.62 3.30

643 rows × 8 columns

2.2 赋值操作

data
open high close low volume price_change p_change turnover
2018-02-27 23.53 25.88 24.16 23.53 95578.03 0.63 2.68 2.39
2018-02-26 22.80 23.78 23.53 22.80 60985.11 0.69 3.02 1.53
2018-02-23 22.88 23.37 22.82 22.71 52914.01 0.54 2.42 1.32
2018-02-22 22.25 22.76 22.28 22.02 36105.01 0.36 1.64 0.90
2018-02-14 21.49 21.99 21.92 21.48 23331.04 0.44 2.05 0.58
... ... ... ... ... ... ... ... ...
2015-03-06 13.17 14.48 14.28 13.13 179831.72 1.12 8.51 6.16
2015-03-05 12.88 13.45 13.16 12.87 93180.39 0.26 2.02 3.19
2015-03-04 12.80 12.92 12.90 12.61 67075.44 0.20 1.57 2.30
2015-03-03 12.52 13.06 12.70 12.52 139071.61 0.18 1.44 4.76
2015-03-02 12.25 12.67 12.52 12.20 96291.73 0.32 2.62 3.30

643 rows × 8 columns

# 赋值方式1, 直接取对应的属性值
data.volume = 100
data.head()
open high close low volume price_change p_change turnover
2018-02-27 23.53 25.88 24.16 23.53 100 0.63 2.68 2.39
2018-02-26 22.80 23.78 23.53 22.80 100 0.69 3.02 1.53
2018-02-23 22.88 23.37 22.82 22.71 100 0.54 2.42 1.32
2018-02-22 22.25 22.76 22.28 22.02 100 0.36 1.64 0.90
2018-02-14 21.49 21.99 21.92 21.48 100 0.44 2.05 0.58
# 赋值方式2, 类似于取切片
data['low'] = 100
data.head()
open high close low volume price_change p_change turnover
2018-02-27 23.53 25.88 24.16 100 100 0.63 2.68 2.39
2018-02-26 22.80 23.78 23.53 100 100 0.69 3.02 1.53
2018-02-23 22.88 23.37 22.82 100 100 0.54 2.42 1.32
2018-02-22 22.25 22.76 22.28 100 100 0.36 1.64 0.90
2018-02-14 21.49 21.99 21.92 100 100 0.44 2.05 0.58
# 直接取出Series
data.open.head()
2018-02-27    23.53
2018-02-26    22.80
2018-02-23    22.88
2018-02-22    22.25
2018-02-14    21.49
Name: open, dtype: float64
# 直接取出Series
data['open'].head()
2018-02-27    23.53
2018-02-26    22.80
2018-02-23    22.88
2018-02-22    22.25
2018-02-14    21.49
Name: open, dtype: float64

2.3 排序

data.head()
open high close low volume price_change p_change turnover
2018-02-27 23.53 25.88 24.16 100 100 0.63 2.68 2.39
2018-02-26 22.80 23.78 23.53 100 100 0.69 3.02 1.53
2018-02-23 22.88 23.37 22.82 100 100 0.54 2.42 1.32
2018-02-22 22.25 22.76 22.28 100 100 0.36 1.64 0.90
2018-02-14 21.49 21.99 21.92 100 100 0.44 2.05 0.58

2.3.1 以特征值排序

# by --> 传入特征值, 可以传一个或者多个,以列表形式,排前面的作为高优先级,默认升序
data.sort_values(by='open', ascending=False)
open high close low volume price_change p_change turnover
2015-06-15 34.99 34.99 31.69 100 100 -3.52 -10.00 6.82
2015-06-12 34.69 35.98 35.21 100 100 0.82 2.38 5.47
2015-06-10 34.10 36.35 33.85 100 100 0.51 1.53 9.21
2017-11-01 33.85 34.34 33.83 100 100 -0.61 -1.77 5.81
2015-06-11 33.17 34.98 34.39 100 100 0.54 1.59 5.92
... ... ... ... ... ... ... ... ...
2015-03-05 12.88 13.45 13.16 100 100 0.26 2.02 3.19
2015-03-04 12.80 12.92 12.90 100 100 0.20 1.57 2.30
2015-03-03 12.52 13.06 12.70 100 100 0.18 1.44 4.76
2015-09-02 12.30 14.11 12.36 100 100 -1.10 -8.17 2.40
2015-03-02 12.25 12.67 12.52 100 100 0.32 2.62 3.30

643 rows × 8 columns

data.sort_values(by=['open', 'high'], ascending=True)
open high close low volume price_change p_change turnover
2015-03-02 12.25 12.67 12.52 100 100 0.32 2.62 3.30
2015-09-02 12.30 14.11 12.36 100 100 -1.10 -8.17 2.40
2015-03-03 12.52 13.06 12.70 100 100 0.18 1.44 4.76
2015-03-04 12.80 12.92 12.90 100 100 0.20 1.57 2.30
2015-03-05 12.88 13.45 13.16 100 100 0.26 2.02 3.19
... ... ... ... ... ... ... ... ...
2015-06-11 33.17 34.98 34.39 100 100 0.54 1.59 5.92
2017-11-01 33.85 34.34 33.83 100 100 -0.61 -1.77 5.81
2015-06-10 34.10 36.35 33.85 100 100 0.51 1.53 9.21
2015-06-12 34.69 35.98 35.21 100 100 0.82 2.38 5.47
2015-06-15 34.99 34.99 31.69 100 100 -3.52 -10.00 6.82

643 rows × 8 columns

# Series 排序因为只有一个特征值,所以不需要传参
data.close.sort_values()
2015-09-02    12.36
2015-03-02    12.52
2015-03-03    12.70
2015-09-07    12.77
2015-03-04    12.90
              ...  
2017-11-01    33.83
2015-06-10    33.85
2015-06-11    34.39
2017-10-31    34.44
2015-06-12    35.21
Name: close, Length: 643, dtype: float64

2.3.2 以索引排序

# DataFrame 使用sort_index 以索引排序
data.sort_index()
open high close low volume price_change p_change turnover
2015-03-02 12.25 12.67 12.52 100 100 0.32 2.62 3.30
2015-03-03 12.52 13.06 12.70 100 100 0.18 1.44 4.76
2015-03-04 12.80 12.92 12.90 100 100 0.20 1.57 2.30
2015-03-05 12.88 13.45 13.16 100 100 0.26 2.02 3.19
2015-03-06 13.17 14.48 14.28 100 100 1.12 8.51 6.16
... ... ... ... ... ... ... ... ...
2018-02-14 21.49 21.99 21.92 100 100 0.44 2.05 0.58
2018-02-22 22.25 22.76 22.28 100 100 0.36 1.64 0.90
2018-02-23 22.88 23.37 22.82 100 100 0.54 2.42 1.32
2018-02-26 22.80 23.78 23.53 100 100 0.69 3.02 1.53
2018-02-27 23.53 25.88 24.16 100 100 0.63 2.68 2.39

643 rows × 8 columns

# Series 排序
data.high.sort_index()
2015-03-02    12.67
2015-03-03    13.06
2015-03-04    12.92
2015-03-05    13.45
2015-03-06    14.48
              ...  
2018-02-14    21.99
2018-02-22    22.76
2018-02-23    23.37
2018-02-26    23.78
2018-02-27    25.88
Name: high, Length: 643, dtype: float64

3. DataFrame运算

3.1 算数运算

data.head()
open high close low volume price_change p_change turnover
2018-02-27 23.53 25.88 24.16 100 100 0.63 2.68 2.39
2018-02-26 22.80 23.78 23.53 100 100 0.69 3.02 1.53
2018-02-23 22.88 23.37 22.82 100 100 0.54 2.42 1.32
2018-02-22 22.25 22.76 22.28 100 100 0.36 1.64 0.90
2018-02-14 21.49 21.99 21.92 100 100 0.44 2.05 0.58
# 推荐使用pd.方法
data['close'].add(100).head()
2018-02-27    124.16
2018-02-26    123.53
2018-02-23    122.82
2018-02-22    122.28
2018-02-14    121.92
Name: close, dtype: float64
# 使用符号运算
(data.close + 100).head()
2018-02-27    124.16
2018-02-26    123.53
2018-02-23    122.82
2018-02-22    122.28
2018-02-14    121.92
Name: close, dtype: float64
data.close.sub(10).head()
2018-02-27    14.16
2018-02-26    13.53
2018-02-23    12.82
2018-02-22    12.28
2018-02-14    11.92
Name: close, dtype: float64

3.2 逻辑运算

3.2.1 逻辑运算符 ( <, > , |, &)

data.head()
open high close low volume price_change p_change turnover
2018-02-27 23.53 25.88 24.16 100 100 0.63 2.68 2.39
2018-02-26 22.80 23.78 23.53 100 100 0.69 3.02 1.53
2018-02-23 22.88 23.37 22.82 100 100 0.54 2.42 1.32
2018-02-22 22.25 22.76 22.28 100 100 0.36 1.64 0.90
2018-02-14 21.49 21.99 21.92 100 100 0.44 2.05 0.58
# data.open 返回数据 True, False
# data[data.open] 逻辑判断的结果作为筛选依据
data['open'] > 23
2018-02-27     True
2018-02-26    False
2018-02-23    False
2018-02-22    False
2018-02-14    False
              ...  
2015-03-06    False
2015-03-05    False
2015-03-04    False
2015-03-03    False
2015-03-02    False
Name: open, Length: 643, dtype: bool
data[data.open>23].head()
open high close low volume price_change p_change turnover
2018-02-27 23.53 25.88 24.16 100 100 0.63 2.68 2.39
2018-02-01 23.71 23.86 22.42 100 100 -1.30 -5.48 1.66
2018-01-31 23.85 23.98 23.72 100 100 -0.11 -0.46 1.23
2018-01-30 23.71 24.08 23.83 100 100 0.05 0.21 0.81
2018-01-29 24.40 24.63 23.77 100 100 -0.73 -2.98 1.64
# 利用与或 (& |)完成逻辑判断 
# 优先级问题,多加括号
data[(data.close>23) & (data.close<24)].head()
open high close low volume price_change p_change turnover
2018-02-26 22.80 23.78 23.53 100 100 0.69 3.02 1.53
2018-02-05 22.45 23.39 23.27 100 100 0.65 2.87 1.31
2018-01-31 23.85 23.98 23.72 100 100 -0.11 -0.46 1.23
2018-01-30 23.71 24.08 23.83 100 100 0.05 0.21 0.81
2018-01-29 24.40 24.63 23.77 100 100 -0.73 -2.98 1.64

3.2.2 逻辑运算函数

# query(str) 传入字符串
data.query('close>23 & close<24').head()
open high close low volume price_change p_change turnover
2018-02-26 22.80 23.78 23.53 100 100 0.69 3.02 1.53
2018-02-05 22.45 23.39 23.27 100 100 0.65 2.87 1.31
2018-01-31 23.85 23.98 23.72 100 100 -0.11 -0.46 1.23
2018-01-30 23.71 24.08 23.83 100 100 0.05 0.21 0.81
2018-01-29 24.40 24.63 23.77 100 100 -0.73 -2.98 1.64
# isin() 可以传一个值, 也可以传一个列表范围, 判断是否在某个范围内
data['open'].isin([22.80, 23.00])
2018-02-27    False
2018-02-26     True
2018-02-23    False
2018-02-22    False
2018-02-14    False
              ...  
2015-03-06    False
2015-03-05    False
2015-03-04    False
2015-03-03    False
2015-03-02    False
Name: open, Length: 643, dtype: bool
data[data['open'].isin([22.80, 23.00])]
open high close low volume price_change p_change turnover
2018-02-26 22.8 23.78 23.53 100 100 0.69 3.02 1.53
2018-02-06 22.8 23.55 22.29 100 100 -0.97 -4.17 1.39
2017-12-18 23.0 23.49 23.13 100 100 0.12 0.52 0.74
2017-07-24 22.8 23.79 23.03 100 100 -0.17 -0.73 2.59
2017-06-21 23.0 23.84 23.57 100 100 -0.51 -2.12 5.13
2016-01-04 22.8 22.84 20.69 100 100 -2.28 -9.93 1.60

3.3 统计运算

3.3.1 describe()

# describe()方法可以快速的查看DataFrame的整体属性
# 25% - 第一四分位数(Q1),样本中从小到大排列后第25%的数据
# 50% - 中位数
data.describe()
open high close low volume price_change p_change turnover
count 643.000000 643.000000 643.000000 643.0 643.0 643.000000 643.000000 643.000000
mean 21.272706 21.900513 21.336267 100.0 100.0 0.018802 0.190280 2.936190
std 3.930973 4.077578 3.942806 0.0 0.0 0.898476 4.079698 2.079375
min 12.250000 12.670000 12.360000 100.0 100.0 -3.520000 -10.030000 0.040000
25% 19.000000 19.500000 19.045000 100.0 100.0 -0.390000 -1.850000 1.360000
50% 21.440000 21.970000 21.450000 100.0 100.0 0.050000 0.260000 2.500000
75% 23.400000 24.065000 23.415000 100.0 100.0 0.455000 2.305000 3.915000
max 34.990000 36.350000 35.210000 100.0 100.0 3.030000 10.030000 12.560000

3.3.2 统计函数

# max(), min()
data.max()
open             34.99
high             36.35
close            35.21
low             100.00
volume          100.00
price_change      3.03
p_change         10.03
turnover         12.56
dtype: float64
data.std()
# data.var()
open            3.930973
high            4.077578
close           3.942806
low             0.000000
volume          0.000000
price_change    0.898476
p_change        4.079698
turnover        2.079375
dtype: float64
data.median()
open             21.44
high             21.97
close            21.45
low             100.00
volume          100.00
price_change      0.05
p_change          0.26
turnover          2.50
dtype: float64
# idxmax ( index-max) 最大值的索引值
data.idxmax()
# data,idxmin()
open            2015-06-15
high            2015-06-10
close           2015-06-12
low             2018-02-27
volume          2018-02-27
price_change    2015-06-09
p_change        2015-08-28
turnover        2017-10-26
dtype: object

3.4 累计统计函数

# 常见累计统计函数为:
# cumsum - 累加
# cummax - 累计取最大值, 新的最大值替换原来的最大值
# cummin - 累计取最小值
# cumprod - 累积
data = data.sort_index()
data.p_change
2015-03-02    2.62
2015-03-03    1.44
2015-03-04    1.57
2015-03-05    2.02
2015-03-06    8.51
              ... 
2018-02-14    2.05
2018-02-22    1.64
2018-02-23    2.42
2018-02-26    3.02
2018-02-27    2.68
Name: p_change, Length: 643, dtype: float64
data.p_change.cumsum()
2015-03-02      2.62
2015-03-03      4.06
2015-03-04      5.63
2015-03-05      7.65
2015-03-06     16.16
               ...  
2018-02-14    112.59
2018-02-22    114.23
2018-02-23    116.65
2018-02-26    119.67
2018-02-27    122.35
Name: p_change, Length: 643, dtype: float64
# 利用Pandas自带的绘图功能, 需要运行2次才能出结果
data.p_change.cumsum().plot()
<matplotlib.axes._subplots.AxesSubplot at 0x1d54ecb2848>

data.p_change.cummax().plot()
<matplotlib.axes._subplots.AxesSubplot at 0x1d54ff73a88>

3.5 自定义函数

# apply(func), fun - lambda函数
data[['open']] # [[]]取出DataFrame
open
2015-03-02 12.25
2015-03-03 12.52
2015-03-04 12.80
2015-03-05 12.88
2015-03-06 13.17
... ...
2018-02-14 21.49
2018-02-22 22.25
2018-02-23 22.88
2018-02-26 22.80
2018-02-27 23.53

643 rows × 1 columns

data[['open']].apply(lambda x: x.max()-x.min()) # 默认axis=0
open    22.74
dtype: float64

4. Pandas内置画图

# DataFrame(x, y, kind='line')
# kind: 绘图的类型, line, bar, barh, hist, pie, scatter
# DataFrame
data['open'].plot(kind='hist')
<matplotlib.axes._subplots.AxesSubplot at 0x1d54ffe6c88>

5. 文件读取与存储

5.1 csv文件

# usecols -abs 读取特定列,列表形式传入
# sep=',' 分隔

# 读取文件
data = pd.read_csv('./data/stock_day.csv', usecols=['open', 'high', 'low'], sep=',')
data
open high low
2018-02-27 23.53 25.88 23.53
2018-02-26 22.80 23.78 22.80
2018-02-23 22.88 23.37 22.71
2018-02-22 22.25 22.76 22.02
2018-02-14 21.49 21.99 21.48
... ... ... ...
2015-03-06 13.17 14.48 13.13
2015-03-05 12.88 13.45 12.87
2015-03-04 12.80 12.92 12.61
2015-03-03 12.52 13.06 12.52
2015-03-02 12.25 12.67 12.20

643 rows × 3 columns

# 存储文件
# columns :存储指定列,
# index:是否存储index
data[:10].to_csv('./data/test_write_in.csv',columns=['high', 'low'], index=False)

5.2 hdf文件

# hdf文件格式是官方推荐的格式,存储读取速度快
# 压缩方式读取速度快,节省空间
# 支持跨平台

day_eps = pd.read_hdf('./data/stock_data/day/day_close.h5')
# 需要安装tables模块才能显示
# hdf文件不能直接打开,需要导入后才能打开
day_eps
000001.SZ 000002.SZ 000004.SZ 000005.SZ 000006.SZ 000007.SZ 000008.SZ 000009.SZ 000010.SZ 000011.SZ ... 001965.SZ 603283.SH 002920.SZ 002921.SZ 300684.SZ 002922.SZ 300735.SZ 603329.SH 603655.SH 603080.SH
0 16.30 17.71 4.58 2.88 14.60 2.62 4.96 4.66 5.37 6.02 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 17.02 19.20 4.65 3.02 15.97 2.65 4.95 4.70 5.37 6.27 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 17.02 17.28 4.56 3.06 14.37 2.63 4.82 4.47 5.37 5.96 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 16.18 16.97 4.49 2.95 13.10 2.73 4.89 4.33 5.37 5.77 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 16.95 17.19 4.55 2.99 13.18 2.77 4.97 4.42 5.37 5.92 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2673 12.96 35.99 22.84 4.37 9.85 16.66 8.47 7.52 6.20 17.88 ... 12.99 23.42 47.99 32.40 22.45 28.79 23.18 24.45 14.98 26.06
2674 13.08 35.84 23.02 4.41 9.85 16.66 8.49 7.48 6.01 17.75 ... 12.83 25.76 45.14 35.64 24.70 31.67 25.50 26.90 16.48 28.67
2675 13.47 35.67 22.40 4.32 9.85 16.66 8.49 7.38 5.97 17.45 ... 12.20 28.34 43.21 39.20 27.17 34.84 28.05 29.59 18.13 31.54
2676 13.40 35.15 22.29 4.29 9.85 16.66 8.56 7.04 5.84 17.49 ... 12.11 31.17 43.76 40.88 29.89 34.84 29.64 32.55 19.94 34.69
2677 13.55 35.55 22.20 4.37 9.85 16.66 8.67 7.06 5.99 17.76 ... 11.91 34.29 41.71 39.10 32.88 34.84 27.92 31.82 21.93 38.16

2678 rows × 3562 columns

# 存储格式为 .h5
day_eps_test = day_eps.to_hdf('./data/day_eps_test.h5', key='day_eps')
pd.read_hdf('./data/day_eps_test.h5')
000001.SZ 000002.SZ 000004.SZ 000005.SZ 000006.SZ 000007.SZ 000008.SZ 000009.SZ 000010.SZ 000011.SZ ... 001965.SZ 603283.SH 002920.SZ 002921.SZ 300684.SZ 002922.SZ 300735.SZ 603329.SH 603655.SH 603080.SH
0 16.30 17.71 4.58 2.88 14.60 2.62 4.96 4.66 5.37 6.02 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 17.02 19.20 4.65 3.02 15.97 2.65 4.95 4.70 5.37 6.27 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 17.02 17.28 4.56 3.06 14.37 2.63 4.82 4.47 5.37 5.96 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 16.18 16.97 4.49 2.95 13.10 2.73 4.89 4.33 5.37 5.77 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 16.95 17.19 4.55 2.99 13.18 2.77 4.97 4.42 5.37 5.92 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2673 12.96 35.99 22.84 4.37 9.85 16.66 8.47 7.52 6.20 17.88 ... 12.99 23.42 47.99 32.40 22.45 28.79 23.18 24.45 14.98 26.06
2674 13.08 35.84 23.02 4.41 9.85 16.66 8.49 7.48 6.01 17.75 ... 12.83 25.76 45.14 35.64 24.70 31.67 25.50 26.90 16.48 28.67
2675 13.47 35.67 22.40 4.32 9.85 16.66 8.49 7.38 5.97 17.45 ... 12.20 28.34 43.21 39.20 27.17 34.84 28.05 29.59 18.13 31.54
2676 13.40 35.15 22.29 4.29 9.85 16.66 8.56 7.04 5.84 17.49 ... 12.11 31.17 43.76 40.88 29.89 34.84 29.64 32.55 19.94 34.69
2677 13.55 35.55 22.20 4.37 9.85 16.66 8.67 7.06 5.99 17.76 ... 11.91 34.29 41.71 39.10 32.88 34.84 27.92 31.82 21.93 38.16

2678 rows × 3562 columns

5.3 json文件

# oritent: 读取方式
# lines: 是否按行读取

json_read = pd.read_json("./data/Sarcasm_Headlines_Dataset.json", orient="records", lines=True)
json_read
article_link headline is_sarcastic
0 https://www.huffingtonpost.com/entry/versace-b... former versace store clerk sues over secret 'b... 0
1 https://www.huffingtonpost.com/entry/roseanne-... the 'roseanne' revival catches up to our thorn... 0
2 https://local.theonion.com/mom-starting-to-fea... mom starting to fear son's web series closest ... 1
3 https://politics.theonion.com/boehner-just-wan... boehner just wants wife to listen, not come up... 1
4 https://www.huffingtonpost.com/entry/jk-rowlin... j.k. rowling wishes snape happy birthday in th... 0
... ... ... ...
26704 https://www.huffingtonpost.com/entry/american-... american politics in moral free-fall 0
26705 https://www.huffingtonpost.com/entry/americas-... america's best 20 hikes 0
26706 https://www.huffingtonpost.com/entry/reparatio... reparations and obama 0
26707 https://www.huffingtonpost.com/entry/israeli-b... israeli ban targeting boycott supporters raise... 0
26708 https://www.huffingtonpost.com/entry/gourmet-g... gourmet gifts for the foodie 2014 0

26709 rows × 3 columns

# lines 表示存储数据分行, 否则全部为一整行
json_read.to_json('./data/test.json', orient='records', lines=True)

6.高级处理

6.1 处理缺失值

# 缺失值一般使用nan(not a number)来表示
type(np.nan)
float
# 导入数据
movie = pd.read_csv('./data/IMDB-Movie-Data.csv')
movie.head()
Rank Title Genre Description Director Actors Year Runtime (Minutes) Rating Votes Revenue (Millions) Metascore
0 1 Guardians of the Galaxy Action,Adventure,Sci-Fi A group of intergalactic criminals are forced ... James Gunn Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S... 2014 121 8.1 757074 333.13 76.0
1 2 Prometheus Adventure,Mystery,Sci-Fi Following clues to the origin of mankind, a te... Ridley Scott Noomi Rapace, Logan Marshall-Green, Michael Fa... 2012 124 7.0 485820 126.46 65.0
2 3 Split Horror,Thriller Three girls are kidnapped by a man with a diag... M. Night Shyamalan James McAvoy, Anya Taylor-Joy, Haley Lu Richar... 2016 117 7.3 157606 138.12 62.0
3 4 Sing Animation,Comedy,Family In a city of humanoid animals, a hustling thea... Christophe Lourdelet Matthew McConaughey,Reese Witherspoon, Seth Ma... 2016 108 7.2 60545 270.32 59.0
4 5 Suicide Squad Action,Adventure,Fantasy A secret government agency recruits some of th... David Ayer Will Smith, Jared Leto, Margot Robbie, Viola D... 2016 123 6.2 393727 325.02 40.0
# 判断缺失值是否存在
# isnull() :nan - True
# notnull():nan - False
pd.isnull(movie)
Rank Title Genre Description Director Actors Year Runtime (Minutes) Rating Votes Revenue (Millions) Metascore
0 False False False False False False False False False False False False
1 False False False False False False False False False False False False
2 False False False False False False False False False False False False
3 False False False False False False False False False False False False
4 False False False False False False False False False False False False
... ... ... ... ... ... ... ... ... ... ... ... ...
995 False False False False False False False False False False True False
996 False False False False False False False False False False False False
997 False False False False False False False False False False False False
998 False False False False False False False False False False True False
999 False False False False False False False False False False False False

1000 rows × 12 columns

np.any(pd.isnull(movie))  # 其中有任何一个为True(nan值存在), 则返回True
True
np.all(pd.notnull(movie)) # 所有的元素都非nan
False

6.1.1 丢弃缺失值

# 直接丢弃含有nan的一行数据
data = movie.dropna()
np.any(pd.isnull(data))
False

6.1.2 替换缺失值 (常见:平均值或者0)

# 使用平均值替换
# inplace=True , 表示直接对原来movie值进行修改
data = movie['Revenue (Millions)'].fillna(movie['Revenue (Millions)'].mean())
# inplace默认为False, 返回了新的替换后的一个data数据, 原来的movie中仍含有nan
np.any(pd.isnull(movie['Revenue (Millions)']))
True
movie['Revenue (Millions)'].fillna(movie['Revenue (Millions)'].mean(), inplace=True)
# movie['Revenue (Millions)'] 中的nan 已经被替换
np.any(pd.isnull(movie['Revenue (Millions)']))
False

6.1.3 缺失值不是nan

# 全局取消证书验证
# 读取数据
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

wis = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data")
# 先将? 数据替换成nan
# 再对nan进行处理

wis
1000025 5 1 1.1 1.2 2 1.3 3 1.4 1.5 2.1
0 1002945 5 4 4 5 7 10 3 2 1 2
1 1015425 3 1 1 1 2 2 3 1 1 2
2 1016277 6 8 8 1 3 4 3 7 1 2
3 1017023 4 1 1 3 2 1 3 1 1 2
4 1017122 8 10 10 8 7 10 9 7 1 4
... ... ... ... ... ... ... ... ... ... ... ...
693 776715 3 1 1 1 3 2 1 1 1 2
694 841769 2 1 1 1 2 1 1 1 1 2
695 888820 5 10 10 3 7 3 8 10 2 4
696 897471 4 8 6 4 3 4 10 6 1 4
697 897471 4 8 8 5 4 5 10 4 1 4

698 rows × 11 columns

# to_replace: 被替换的值, value:去替换的值
wis = wis.replace(to_replace='?', value=np.nan)
wis = wis.dropna()
wis
1000025 5 1 1.1 1.2 2 1.3 3 1.4 1.5 2.1
0 1002945 5 4 4 5 7 10 3 2 1 2
1 1015425 3 1 1 1 2 2 3 1 1 2
2 1016277 6 8 8 1 3 4 3 7 1 2
3 1017023 4 1 1 3 2 1 3 1 1 2
4 1017122 8 10 10 8 7 10 9 7 1 4
... ... ... ... ... ... ... ... ... ... ... ...
693 776715 3 1 1 1 3 2 1 1 1 2
694 841769 2 1 1 1 2 1 1 1 1 2
695 888820 5 10 10 3 7 3 8 10 2 4
696 897471 4 8 6 4 3 4 10 6 1 4
697 897471 4 8 8 5 4 5 10 4 1 4

682 rows × 11 columns

6.2 数据离散化

# 数据离散化可以简化数据结构,将数据划分到若干离散的区间,可以简化数据结构, 常用于搭配one-hot编码
# 获取数据
data = pd.read_csv("./data/stock_day.csv")
data_p= data['p_change']
data_p
2018-02-27    2.68
2018-02-26    3.02
2018-02-23    2.42
2018-02-22    1.64
2018-02-14    2.05
              ... 
2015-03-06    8.51
2015-03-05    2.02
2015-03-04    1.57
2015-03-03    1.44
2015-03-02    2.62
Name: p_change, Length: 643, dtype: float64
# pd.qcut() 智能分组
# q: 分组数量
q_cut = pd.qcut(data_p, q=10)
q_cut
2018-02-27    (1.738, 2.938]
2018-02-26     (2.938, 5.27]
2018-02-23    (1.738, 2.938]
2018-02-22     (0.94, 1.738]
2018-02-14    (1.738, 2.938]
                   ...      
2015-03-06     (5.27, 10.03]
2015-03-05    (1.738, 2.938]
2015-03-04     (0.94, 1.738]
2015-03-03     (0.94, 1.738]
2015-03-02    (1.738, 2.938]
Name: p_change, Length: 643, dtype: category
Categories (10, interval[float64]): [(-10.030999999999999, -4.836] < (-4.836, -2.444] < (-2.444, -1.352] < (-1.352, -0.462] ... (0.94, 1.738] < (1.738, 2.938] < (2.938, 5.27] < (5.27, 10.03]]
# value_counts(): 每个分组区间内的数据数量
q_cut.value_counts()
(5.27, 10.03]                    65
(0.26, 0.94]                     65
(-0.462, 0.26]                   65
(-10.030999999999999, -4.836]    65
(2.938, 5.27]                    64
(1.738, 2.938]                   64
(-1.352, -0.462]                 64
(-2.444, -1.352]                 64
(-4.836, -2.444]                 64
(0.94, 1.738]                    63
Name: p_change, dtype: int64
# pd.cut(data, bins): 自己指定分组区间
bins  = [-100, -7, -5, -3, 0, 3, 5, 7, 100]
cut = pd.cut(data_p, bins=bins)
cut
2018-02-27      (0, 3]
2018-02-26      (3, 5]
2018-02-23      (0, 3]
2018-02-22      (0, 3]
2018-02-14      (0, 3]
                ...   
2015-03-06    (7, 100]
2015-03-05      (0, 3]
2015-03-04      (0, 3]
2015-03-03      (0, 3]
2015-03-02      (0, 3]
Name: p_change, Length: 643, dtype: category
Categories (8, interval[int64]): [(-100, -7] < (-7, -5] < (-5, -3] < (-3, 0] < (0, 3] < (3, 5] < (5, 7] < (7, 100]]
cut.value_counts()
(0, 3]        215
(-3, 0]       188
(3, 5]         57
(-5, -3]       51
(7, 100]       35
(5, 7]         35
(-100, -7]     34
(-7, -5]       28
Name: p_change, dtype: int64
# get_dummies() 取独热矩阵
pd.get_dummies(q_cut)
(-10.030999999999999, -4.836] (-4.836, -2.444] (-2.444, -1.352] (-1.352, -0.462] (-0.462, 0.26] (0.26, 0.94] (0.94, 1.738] (1.738, 2.938] (2.938, 5.27] (5.27, 10.03]
2018-02-27 0 0 0 0 0 0 0 1 0 0
2018-02-26 0 0 0 0 0 0 0 0 1 0
2018-02-23 0 0 0 0 0 0 0 1 0 0
2018-02-22 0 0 0 0 0 0 1 0 0 0
2018-02-14 0 0 0 0 0 0 0 1 0 0
... ... ... ... ... ... ... ... ... ... ...
2015-03-06 0 0 0 0 0 0 0 0 0 1
2015-03-05 0 0 0 0 0 0 0 1 0 0
2015-03-04 0 0 0 0 0 0 1 0 0 0
2015-03-03 0 0 0 0 0 0 1 0 0 0
2015-03-02 0 0 0 0 0 0 0 1 0 0

643 rows × 10 columns

data_dummy = pd.get_dummies(q_cut)

6.3 数据拼接

6.3.1 pd.concat()

# 不指定axis 可能会造成拼接错位,产生很多nan
pd.concat([data, data_dummy], axis=1)
open high close low volume price_change p_change ma5 ma10 ma20 ... (-10.030999999999999, -4.836] (-4.836, -2.444] (-2.444, -1.352] (-1.352, -0.462] (-0.462, 0.26] (0.26, 0.94] (0.94, 1.738] (1.738, 2.938] (2.938, 5.27] (5.27, 10.03]
2018-02-27 23.53 25.88 24.16 23.53 95578.03 0.63 2.68 22.942 22.142 22.875 ... 0 0 0 0 0 0 0 1 0 0
2018-02-26 22.80 23.78 23.53 22.80 60985.11 0.69 3.02 22.406 21.955 22.942 ... 0 0 0 0 0 0 0 0 1 0
2018-02-23 22.88 23.37 22.82 22.71 52914.01 0.54 2.42 21.938 21.929 23.022 ... 0 0 0 0 0 0 0 1 0 0
2018-02-22 22.25 22.76 22.28 22.02 36105.01 0.36 1.64 21.446 21.909 23.137 ... 0 0 0 0 0 0 1 0 0 0
2018-02-14 21.49 21.99 21.92 21.48 23331.04 0.44 2.05 21.366 21.923 23.253 ... 0 0 0 0 0 0 0 1 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2015-03-06 13.17 14.48 14.28 13.13 179831.72 1.12 8.51 13.112 13.112 13.112 ... 0 0 0 0 0 0 0 0 0 1
2015-03-05 12.88 13.45 13.16 12.87 93180.39 0.26 2.02 12.820 12.820 12.820 ... 0 0 0 0 0 0 0 1 0 0
2015-03-04 12.80 12.92 12.90 12.61 67075.44 0.20 1.57 12.707 12.707 12.707 ... 0 0 0 0 0 0 1 0 0 0
2015-03-03 12.52 13.06 12.70 12.52 139071.61 0.18 1.44 12.610 12.610 12.610 ... 0 0 0 0 0 0 1 0 0 0
2015-03-02 12.25 12.67 12.52 12.20 96291.73 0.32 2.62 12.520 12.520 12.520 ... 0 0 0 0 0 0 0 1 0 0

643 rows × 24 columns

6.3.2 pd.merge()

# 获取数据
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                        'key2': ['K0', 'K1', 'K0', 'K1'],
                        'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3']})

right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                        'key2': ['K0', 'K0', 'K0', 'K0'],
                        'C': ['C0', 'C1', 'C2', 'C3'],
                        'D': ['D0', 'D1', 'D2', 'D3']})
# 默认内连接
# on: 以什么作为键来拼接
result = pd.merge(left, right, on=['key1', 'key2'])
# 内连接就是取交集
result
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K0 A2 B2 C1 D1
2 K1 K0 A2 B2 C2 D2
# 外连接
pd.merge(left, right, on=['key1', 'key2'], how='outer')
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K0 K1 A1 B1 NaN NaN
2 K1 K0 A2 B2 C1 D1
3 K1 K0 A2 B2 C2 D2
4 K2 K1 A3 B3 NaN NaN
5 K2 K0 NaN NaN C3 D3
# 左连接
pd.merge(left, right, on=['key1', 'key2'], how='left')
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K0 K1 A1 B1 NaN NaN
2 K1 K0 A2 B2 C1 D1
3 K1 K0 A2 B2 C2 D2
4 K2 K1 A3 B3 NaN NaN
# 右连接
pd.merge(left, right, on=['key1', 'key2'], how='right')
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K0 A2 B2 C1 D1
2 K1 K0 A2 B2 C2 D2
3 K2 K0 NaN NaN C3 D3

6.4 交叉表和透视表

data.head()
open high close low volume price_change p_change ma5 ma10 ma20 v_ma5 v_ma10 v_ma20 turnover
2018-02-27 23.53 25.88 24.16 23.53 95578.03 0.63 2.68 22.942 22.142 22.875 53782.64 46738.65 55576.11 2.39
2018-02-26 22.80 23.78 23.53 22.80 60985.11 0.69 3.02 22.406 21.955 22.942 40827.52 42736.34 56007.50 1.53
2018-02-23 22.88 23.37 22.82 22.71 52914.01 0.54 2.42 21.938 21.929 23.022 35119.58 41871.97 56372.85 1.32
2018-02-22 22.25 22.76 22.28 22.02 36105.01 0.36 1.64 21.446 21.909 23.137 35397.58 39904.78 60149.60 0.90
2018-02-14 21.49 21.99 21.92 21.48 23331.04 0.44 2.05 21.366 21.923 23.253 33590.21 42935.74 61716.11 0.58
# 将date.index 转化为datetime格式
date = pd.to_datetime(data.index).weekday
data['week'] = date
data
open high close low volume price_change p_change ma5 ma10 ma20 v_ma5 v_ma10 v_ma20 turnover posi_neg week
2018-02-27 23.53 25.88 24.16 23.53 95578.03 0.63 2.68 22.942 22.142 22.875 53782.64 46738.65 55576.11 2.39 1 1
2018-02-26 22.80 23.78 23.53 22.80 60985.11 0.69 3.02 22.406 21.955 22.942 40827.52 42736.34 56007.50 1.53 1 0
2018-02-23 22.88 23.37 22.82 22.71 52914.01 0.54 2.42 21.938 21.929 23.022 35119.58 41871.97 56372.85 1.32 1 4
2018-02-22 22.25 22.76 22.28 22.02 36105.01 0.36 1.64 21.446 21.909 23.137 35397.58 39904.78 60149.60 0.90 1 3
2018-02-14 21.49 21.99 21.92 21.48 23331.04 0.44 2.05 21.366 21.923 23.253 33590.21 42935.74 61716.11 0.58 1 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2015-03-06 13.17 14.48 14.28 13.13 179831.72 1.12 8.51 13.112 13.112 13.112 115090.18 115090.18 115090.18 6.16 1 4
2015-03-05 12.88 13.45 13.16 12.87 93180.39 0.26 2.02 12.820 12.820 12.820 98904.79 98904.79 98904.79 3.19 1 3
2015-03-04 12.80 12.92 12.90 12.61 67075.44 0.20 1.57 12.707 12.707 12.707 100812.93 100812.93 100812.93 2.30 1 2
2015-03-03 12.52 13.06 12.70 12.52 139071.61 0.18 1.44 12.610 12.610 12.610 117681.67 117681.67 117681.67 4.76 1 1
2015-03-02 12.25 12.67 12.52 12.20 96291.73 0.32 2.62 12.520 12.520 12.520 96291.73 96291.73 96291.73 3.30 1 0

643 rows × 16 columns

# 把p_change 划分为0, 1两类
data['posi_neg'] = np.where(data['p_change']>0, 1, 0)
data.head()
open high close low volume price_change p_change ma5 ma10 ma20 v_ma5 v_ma10 v_ma20 turnover posi_neg week
2018-02-27 23.53 25.88 24.16 23.53 95578.03 0.63 2.68 22.942 22.142 22.875 53782.64 46738.65 55576.11 2.39 1 1
2018-02-26 22.80 23.78 23.53 22.80 60985.11 0.69 3.02 22.406 21.955 22.942 40827.52 42736.34 56007.50 1.53 1 0
2018-02-23 22.88 23.37 22.82 22.71 52914.01 0.54 2.42 21.938 21.929 23.022 35119.58 41871.97 56372.85 1.32 1 4
2018-02-22 22.25 22.76 22.28 22.02 36105.01 0.36 1.64 21.446 21.909 23.137 35397.58 39904.78 60149.60 0.90 1 3
2018-02-14 21.49 21.99 21.92 21.48 23331.04 0.44 2.05 21.366 21.923 23.253 33590.21 42935.74 61716.11 0.58 1 2
# 手动构造交叉表
count = pd.crosstab(data['week'], data['posi_neg'])
sum = count.sum(axis=1).astype(np.float32)
pro = count.div(sum, axis=0)
pro.plot(kind='bar', stacked=True)
<matplotlib.axes._subplots.AxesSubplot at 0x1d55bc28408>

# 自动构造交叉表
data.pivot_table(['posi_neg'], index='week')
posi_neg
week
0 0.496000
1 0.580153
2 0.537879
3 0.507812
4 0.535433

6.5 分组和聚合

6.5.1 pd.groupby()

# 创建数据
col =pd.DataFrame({'color': ['white','red','green','red','green'], 'object': ['pen','pencil','pencil','ashtray','pen'],'price1':[5.56,4.20,1.30,0.56,2.75],'price2':[4.75,4.12,1.60,0.75,3.15]})
col
color object price1 price2
0 white pen 5.56 4.75
1 red pencil 4.20 4.12
2 green pencil 1.30 1.60
3 red ashtray 0.56 0.75
4 green pen 2.75 3.15
# DataFrame 分组, 推荐使用
# 单独的分组没有意义,进行聚合(求值)才有价值
col.groupby(['color'])['price1'].mean()
color
green    2.025
red      2.380
white    5.560
Name: price1, dtype: float64
# 如果设置as_index= False 会创建一列新索引
col.groupby(['color'], as_index= False)['price1'].mean()
color price1
0 green 2.025
1 red 2.380
2 white 5.560
col.price1
0    5.56
1    4.20
2    1.30
3    0.56
4    2.75
Name: price1, dtype: float64
# Series 分组
col.price1.groupby(col['color']).mean()
color
green    2.025
red      2.380
white    5.560
Name: price1, dtype: float64

6.5.2 分组实例

# 获取数据
starbucks = pd.read_csv("./data/starbucks/directory.csv")
starbucks.head()
Brand Store Number Store Name Ownership Type Street Address City State/Province Country Postcode Phone Number Timezone Longitude Latitude
0 Starbucks 47370-257954 Meritxell, 96 Licensed Av. Meritxell, 96 Andorra la Vella 7 AD AD500 376818720 GMT+1:00 Europe/Andorra 1.53 42.51
1 Starbucks 22331-212325 Ajman Drive Thru Licensed 1 Street 69, Al Jarf Ajman AJ AE NaN NaN GMT+04:00 Asia/Dubai 55.47 25.42
2 Starbucks 47089-256771 Dana Mall Licensed Sheikh Khalifa Bin Zayed St. Ajman AJ AE NaN NaN GMT+04:00 Asia/Dubai 55.47 25.39
3 Starbucks 22126-218024 Twofour 54 Licensed Al Salam Street Abu Dhabi AZ AE NaN NaN GMT+04:00 Asia/Dubai 54.38 24.48
4 Starbucks 17127-178586 Al Ain Tower Licensed Khaldiya Area, Abu Dhabi Island Abu Dhabi AZ AE NaN NaN GMT+04:00 Asia/Dubai 54.54 24.51
# 以一个值为分组依据
starbucks.groupby(['Country']).count()
Brand Store Number Store Name Ownership Type Street Address City State/Province Postcode Phone Number Timezone Longitude Latitude
Country
AD 1 1 1 1 1 1 1 1 1 1 1 1
AE 144 144 144 144 144 144 144 24 78 144 144 144
AR 108 108 108 108 108 108 108 100 29 108 108 108
AT 18 18 18 18 18 18 18 18 17 18 18 18
AU 22 22 22 22 22 22 22 22 0 22 22 22
... ... ... ... ... ... ... ... ... ... ... ... ...
TT 3 3 3 3 3 3 3 3 0 3 3 3
TW 394 394 394 394 394 394 394 365 39 394 394 394
US 13608 13608 13608 13608 13608 13608 13608 13607 13122 13608 13608 13608
VN 25 25 25 25 25 25 25 25 23 25 25 25
ZA 3 3 3 3 3 3 3 3 2 3 3 3

73 rows × 12 columns

starbucks_count = starbucks.groupby(['Country']).count()
starbucks_count['Brand'].plot(kind='bar', figsize=(20, 8))
<matplotlib.axes._subplots.AxesSubplot at 0x1d563dbd148>

# 为了阅读方便,对数据进行排序后画图
starbucks_count.sort_values(by='Brand', ascending=False).head(20)['Brand'].plot(kind='bar', figsize=(20, 8))
<matplotlib.axes._subplots.AxesSubplot at 0x1d55ca68f48>

# 多种分组依据
starbucks.groupby(['Country', 'State/Province']).count().head(20)
Brand Store Number Store Name Ownership Type Street Address City Postcode Phone Number Timezone Longitude Latitude
Country State/Province
AD 7 1 1 1 1 1 1 1 1 1 1 1
AE AJ 2 2 2 2 2 2 0 0 2 2 2
AZ 48 48 48 48 48 48 7 20 48 48 48
DU 82 82 82 82 82 82 16 50 82 82 82
FU 2 2 2 2 2 2 1 0 2 2 2
RK 3 3 3 3 3 3 0 3 3 3 3
SH 6 6 6 6 6 6 0 5 6 6 6
UQ 1 1 1 1 1 1 0 0 1 1 1
AR B 21 21 21 21 21 21 18 5 21 21 21
C 73 73 73 73 73 73 71 24 73 73 73
M 5 5 5 5 5 5 2 0 5 5 5
S 3 3 3 3 3 3 3 0 3 3 3
X 6 6 6 6 6 6 6 0 6 6 6
AT 3 1 1 1 1 1 1 1 1 1 1 1
5 3 3 3 3 3 3 3 3 3 3 3
9 14 14 14 14 14 14 14 13 14 14 14
AU NSW 9 9 9 9 9 9 9 0 9 9 9
QLD 8 8 8 8 8 8 8 0 8 8 8
VIC 5 5 5 5 5 5 5 0 5 5 5
AW AW 3 3 3 3 3 3 0 3 3 3 3
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!