pandas基本数据对象及操作

1、Series

# --- Creating a Series ---
import numpy as np
import pandas as pd

# A Series built from a list gets a default integer index (0, 1, 2, ...).
countries = ['中国', '美国', '澳大利亚']
countries_s = pd.Series(countries)
print(type(countries_s))
print(countries_s)

numbers = [4, 5, 6]
print(pd.Series(numbers))

# A Series built from a dict uses the dict keys as its index labels.
country_dicts = {'CH': '中国',
                 'US': '美国',
                 'AU': '澳大利亚'}
country_dict_s = pd.Series(country_dicts)
# Give the index a name.
country_dict_s.index.name = 'Code'
# Give the data a name.
country_dict_s.name = 'Country'
print(country_dict_s)
print(country_dict_s.values)
print(country_dict_s.index)

# --- Handling missing data ---
countries = ['中国', '美国', '澳大利亚', None]
print(pd.Series(countries))
# In a numeric list pandas represents the missing entry as NaN.
numbers = [4, 5, 6, None]
print(pd.Series(numbers))
country_dicts = {'CH': '中国',
                 'US': '美国',
                 'AU': '澳大利亚'}
country_dict_s = pd.Series(country_dicts)
print(country_dict_s)

# Check whether a label exists in the index.
# A Series can also be viewed as a fixed-length, ordered dict.
print('CH' in country_dict_s)
print('NZ' in country_dict_s)
# iloc selects by integer position, loc and [] select by label.
print('iloc:', country_dict_s.iloc[1])
print('loc:', country_dict_s.loc['US'])
print('[]:', country_dict_s['US'])
# Passing a list of positions / labels returns a sub-Series.
print('iloc:\n', country_dict_s.iloc[[0, 2]])
print()
print('loc:\n', country_dict_s.loc[['US', 'AU']])

# --- Vectorized operations ---
# NOTE(review): this section only builds a 10000-element random Series and
# inspects it; no vectorized operation is shown here - the original example
# may be truncated.
s = pd.Series(np.random.randint(0, 1000, 10000))
print(s.head())
print(len(s))

2、DataFrame

# --- Creating a DataFrame ---
import pandas as pd

# Each Series becomes one row; the dict keys become the column names.
country1 = pd.Series({'Name': '中国',
                      'Language': 'Chinese',
                      'Area': '9.597M km2',
                      'Happiness Rank': 79})
country2 = pd.Series({'Name': '美国',
                      'Language': 'English (US)',
                      'Area': '9.834M km2',
                      'Happiness Rank': 14})
country3 = pd.Series({'Name': '澳大利亚',
                      'Language': 'English (AU)',
                      'Area': '7.692M km2',
                      'Happiness Rank': 9})
df = pd.DataFrame([country1, country2, country3], index=['CH', 'US', 'AU'])
# Note: in Jupyter, print(df) and a bare `df` expression render differently.
print(df)

# Adding data.
# A scalar value is broadcast to every row; a list must have exactly as many
# items as there are rows, otherwise pandas raises an error.
df['Location'] = '地球'
print(df)
df['Region'] = ['亚洲', '北美洲', '大洋洲']
print(df)

# --- DataFrame indexing ---
# Row indexing: loc selects by label, iloc by integer position.
print('loc:')
print(df.loc['CH'])
print(type(df.loc['CH']))
print('iloc:')
print(df.iloc[1])
print(df['Area'])
# Column indexing.
print(df['Area'])
print(type(df['Area']))
# Select several, not necessarily adjacent, columns.
print(df[['Name', 'Area']])
# Mixed indexing - note the difference in how each form is written.
print('先取出列，再取行：')
print(df['Area']['CH'])
print(df['Area'].loc['CH'])
print(df['Area'].iloc[0])
print('先取出行，再取列：')
print(df.loc['CH']['Area'])
print(df.iloc[0]['Area'])
# Transpose rows and columns.
print(df.T)

# --- Deleting data ---
print(df.drop(['CH']))
# drop() returns a modified copy; the original data is left untouched.
print(df)
# With inplace=True the original is modified and no copy is returned, so
# this print shows None.
print(df.drop(['CH'], inplace=True))
print(df)
# To drop a column, pass axis=1.
print(df.drop(['Area'], axis=1))
print(df)
# The del keyword can also be used to remove a column in place.
del df['Name']
print(df)

# --- DataFrame operations and loading ---
print(df['Happiness Rank'])
# Operating in place on a column taken from the DataFrame may modify the
# original data. NOTE(review): this depends on the pandas version / the
# copy-on-write setting; with copy-on-write enabled df is not changed.
ranks = df['Happiness Rank']
ranks += 2
print(ranks)
print(df)
# The safe approach is to work on an explicit copy().
ranks = df['Happiness Rank'].copy()
ranks += 2
print(ranks)
print(df)

# Load data from a csv file.
reprot_2015_df = pd.read_csv('./2015.csv')
print('2015年数据预览：')
# print(reprot_2015_df.head())
reprot_2015_df.head()
# info() writes its summary itself and returns None, so it is not wrapped in
# print() (that would emit an extra "None" line).
reprot_2015_df.info()

3、索引

# Dataset 2016.csv download: https://pan.baidu.com/s/1_D8rTk1Kl5io1qnBXMXhcA (password: u2vt)

# index_col selects the column used as the index;
# usecols restricts which columns are read.
reprot_2016_df = pd.read_csv('./2016.csv',
                             index_col='Country',
                             usecols=['Country', 'Happiness Rank', 'Happiness Score', 'Region'])
# Preview the data (displays only in a notebook).
reprot_2016_df.head()
print('列名(column)：', reprot_2016_df.columns)
print('行名(index)：', reprot_2016_df.index)

# An Index is immutable: item assignment raises TypeError. The error is
# caught and printed so the rest of the script keeps running.
try:
    reprot_2016_df.index[0] = '丹麦'
except TypeError as err:
    print(err)

# Reset the index.
# Note the difference between calling this with and without inplace.
reprot_2016_df.reset_index(inplace=True)
print(reprot_2016_df.head())

# Rename columns (non-inplace form: the result must be assigned back).
# Keys that do not match an existing column are silently ignored, so the
# keys must be spelled exactly like the column names.
reprot_2016_df = reprot_2016_df.rename(columns={'Region': '地区',
                                                'Happiness Rank': '排名',
                                                'Happiness Score': '幸福指数'})
reprot_2016_df.head()

# Rename columns using inplace=True (no assignment needed). After the call
# above the columns are already renamed, so this call changes nothing; it is
# kept to illustrate the inplace form.
reprot_2016_df.rename(columns={'Region': '地区',
                               'Happiness Rank': '排名',
                               'Happiness Score': '幸福指数'},
                      inplace=True)
reprot_2016_df.head()

4、Boolean Mask

print(reprot_2016_df.head())

# Keep only the countries in the Western Europe region.
# (Bare expressions like the ones below display their value in a notebook;
# in a plain script they are evaluated and discarded.)
# only_western_europe = reprot_2016_df['地区'] == 'Western Europe'
reprot_2016_df[reprot_2016_df['地区'] == 'Western Europe']

# Keep the countries in Western Europe whose rank is beyond 10.
# Each comparison is parenthesised because & binds tighter than == and >.
only_western_europe_10 = (reprot_2016_df['地区'] == 'Western Europe') & (reprot_2016_df['排名'] > 10)
only_western_europe_10

# Apply the combined boolean mask to get the final result.
reprot_2016_df[only_western_europe_10]

# Once familiar with the idea, it can be written on one line.
reprot_2016_df[(reprot_2016_df['地区'] == 'Western Europe') & (reprot_2016_df['排名'] > 10)]

5、层级索引

# Dataset 2015.csv download: https://pan.baidu.com/s/1-tBedyPvbuKQFJP5BdR1yA (password: j22j)

print(reprot_2015_df.head())

# Build a hierarchical (multi-level) index from two columns.
# (Bare expressions below display their value in a notebook; in a plain
# script they are evaluated and discarded.)
report_2015_df2 = reprot_2015_df.set_index(['Region', 'Country'])
report_2015_df2.head(20)

# Select by the level-0 label only.
report_2015_df2.loc['Western Europe']

# Select by both levels.
report_2015_df2.loc['Western Europe', 'Switzerland']

# Swap the order of the index levels.
report_2015_df2.swaplevel()

# Sort by an index level.
report_2015_df2.sort_index(level=0)

文章来源: pandas基本数据对象及操作

标签

pandas

happiness