pandas vs excel学习笔记

久未见 提交于 2020-03-04 02:59:46
#-*- coding: utf-8 -*-
import pandas as pd
import numpy as np



#1
#####1.1创建一个dataframe并保存为excel文件
# =============================================================================
# df = pd.DataFrame({'id':[1,2,3],'name':['a','b','c']})
# df = df.set_index('id')
# df.to_excel('e:/output-f.xlsx')
# =============================================================================



#2
# =============================================================================
# people = pd.read_excel('F:/pytoexcel_study/output.xlsx')
# print(people.shape)   ###返回32   列名不计算
# print(people.columns)   #显示列名
# print(people.head(2))
# print(people.head(2))
# print('==========================')
# print(people.tail(2)) 
# =============================================================================

#####2.1指定head
# =============================================================================
# people = pd.read_excel('F:/pytoexcel_study/2.1-head_process.xlsx',header=2)
# print(people.columns)
# print(people)
# =============================================================================

#####2.2 第一行为空时,仍需设置header
# =============================================================================
# people = pd.read_excel('F:/pytoexcel_study/2.2-head_blank.xlsx',header=2)
# print(people.columns)
# print(people)
# 
# =============================================================================

#####2.3 没有header
# =============================================================================
# people = pd.read_excel('F:/pytoexcel_study/2.3-head_none.xlsx',header=None)
# print(people.columns)
# print(people)
# print('============================')
# people.columns = ['id','name']    ###添加header
# print(people.columns)
# people.set_index('id',inplace=True)
# print(people)
# print(people.columns)
# people.to_excel('F:/pytoexcel_study/2.3-head_none-2.xlsx')
# =============================================================================

#####2.4  读取文件时指定索引列,避免系统自动生成索引列
# =============================================================================
# print('==================')
# people = pd.read_excel('F:/pytoexcel_study/2.3-head_none-3.xlsx',index_col='id')
# people.to_excel('F:/pytoexcel_study/2.3-head_none-4.xlsx')
# print(people)
# =============================================================================




#3
#####3.1 生成序列的3种方法
# Start from an empty Series; it is replaced by the dict-based one below.
s1 = pd.Series()
#s1.data
#s1.name
#s1.index

# 1. From a dict: keys become the index labels, values become the data.
d = dict(x=100, y=200, z=300)
s1 = pd.Series(d)
print(s1)

# 2. From two separate lists: one holds the values, the other the labels.
l1 = [100, 200, 300]
l2 = list('xyz')
s2 = pd.Series(data=l1, index=l2)
print(s2)

# 3. From list literals passed straight to the constructor.
s3 = pd.Series(data=[100, 200, 300], index=['x', 'y', 'z'])
print(s3)

#####3.2 Adding Series to a DataFrame
# Passing the Series as a dict makes each Series one column
# =============================================================================
# s1=pd.Series([1,2,3],index=[1,2,3],name='a')
# s2=pd.Series([10,20,30],index=[1,2,3],name='b')
# s3=pd.Series([100,200,300],index=[1,2,3],name='c')
# df1 = pd.DataFrame({s1.name:s1,s2.name:s2,s3.name:s3}) 
# print(df1)
# =============================================================================


# Passing the Series as a list makes each Series one row
# =============================================================================
# df2 = pd.DataFrame([s1,s2,s3]) 
# print(df2)
# =============================================================================



# A DataFrame aligns its Series on the union of their indexes; where the indexes differ, the missing cells become NaN.
# =============================================================================
# s1=pd.Series([1,2,3],index=[1,2,3],name='a')
# s2=pd.Series([10,20,30],index=[1,2,3],name='b')
# s3=pd.Series([100,200,300],index=[2,3,4],name='c')
# df1 = pd.DataFrame({s1.name:s1,s2.name:s2,s3.name:s3}) 
# print(df1)
# =============================================================================

#4
#####4.1 Reading a sheet whose first three rows and two leftmost columns are empty
#####When a column contains NaN, pandas converts it to float; setting dtype to str avoids this
#books = pd.read_excel('F:/pytoexcel_study/4.1-books.xlsx',skiprows=3,usecols='C:F',dtype={'id':str,'instore':str,'date':str})
#print(books)

#####4.2填写列
# =============================================================================
# from datetime import date,timedelta
# 
# start = date(2018,1,1) #填充DATA列时使用
# def add_month(d,md):
#     yd = md//12
#     m = d.month + md %12
#     if m!=12:
#         yd += m//12
#         m = m %12
#     return date(d.year + yd ,m,d.day)        
# 
# 
# books['id'].at[0] = 100
# =============================================================================
#####Filling via Series access
# =============================================================================
# for i in books.index:
#     books['id'].at[i] = i+1
#     books['instore'].at[i] = 'Yes' if i%2==0 else 'No'
#     #books['date'].at[i] = start
#     #books['date'].at[i] = start +timedelta(days=i) #按天加
#     #books['date'].at[i] = date(start.year+i,start.month,start.day) #按年加
#     books['date'].at[i] = add_month(start,i) #按月加
# =============================================================================

#####以dataframe方式填充
# =============================================================================
# for i in books.index:
#     books.at[i,'id'] = i+1
#     books.at[i,'instore'] = 'Yes' if i%2==0 else 'No'
#     books.at[i,'date'] = add_month(start,i) #按月加
# 
# 
# print(books)
# books.set_index('id',inplace=True)
# books.to_excel('F:/pytoexcel_study/4.2-books.xlsx')
# =============================================================================

 


#6
#####6.1函数式填充数据
# =============================================================================
# books = pd.read_excel('F:/pytoexcel_study/6.1-function_fill.xlsx',index_col='id')
# books['price'] = books['listprice'] * books['discount']   ##pandas中操作的是列,不是单元格
# print(books)                         #操作符的重载
# 
# =============================================================================

#####6.2迭代式填充数据,应用于部分数据需运算时
books = pd.read_excel('F:/pytoexcel_study/6.1-function_fill.xlsx', index_col='id')

# Only the rows labelled 5..10 are recalculated (range() excludes its upper
# bound); iterate over books.index instead to process every row.
for i in range(5, 11):
    # Write through DataFrame.at so the assignment lands in `books` itself.
    # The chained form books['price'].at[i] = ... assigns into an intermediate
    # Series, which is not guaranteed to update the frame and never does
    # under pandas copy-on-write.
    books.at[i, 'price'] = books.at[i, 'listprice'] * books.at[i, 'discount']
print(books)



#####6.3 apply()方式更新数据
def add_2(x):
    """Return ``x`` increased by 2."""
    increment = 2
    return x + increment
# Run the named helper over every value of the 'listprice' column and store
# the result back into the same column.
raised_prices = books['listprice'].apply(add_2)
books['listprice'] = raised_prices
print(books)


##### 6.4 Updating data with a lambda
# The workbook is read again, so the +6 below is applied to the values in the
# file rather than to the column already modified above.
books = pd.read_excel('F:/pytoexcel_study/6.1-function_fill.xlsx', index_col='id')
books['listprice'] = books['listprice'].apply(lambda price: price + 6)
print(books)





#7
#####7.1数据排序
products = pd.read_excel('F:/pytoexcel_study/7.1-sort.xlsx', index_col='id')
# Alternative orderings, kept for reference:
#products.sort_values(by='price',inplace=True) # low to high
#products.sort_values(by='price',inplace=True,ascending=False) # high to low
#products.sort_values(by=['worthy','price'],inplace=True,ascending=True) # several columns, all low to high

# Several columns, each with its own direction: 'worthy' ascending, then
# 'price' descending within equal 'worthy' values.
sort_columns = ['worthy', 'price']
sort_ascending = [True, False]
products = products.sort_values(by=sort_columns, ascending=sort_ascending)
print(products)



#8
#####8.1数据筛选
# =============================================================================
# students = pd.read_excel('F:/pytoexcel_study/8.1-filter.xlsx',index_col='id')
# 
# def age_18to_30(a):
#     return 18<=a<30
# def level_a(s):
#     return 85<=s<=100
# 
# 
# ######one way
# students = students.loc[students['age'].apply(age_18to_30)].loc[students['score'].apply(level_a)]
# ######two way
# students = students.loc[students.age.apply(age_18to_30)].loc[students.score.apply(level_a)]
# ######three way
# students = students.loc[students.age.apply(lambda x: 18<=x<30)] \
#            .loc[students.score.apply(lambda a:85<=a<=100)]
# print(students)
# 
# =============================================================================


#9
#####9.1柱状图
# =============================================================================
# import matplotlib.pyplot as plt
# students = pd.read_excel('F:/pytoexcel_study/9.1-char_bar.xlsx')
# students.sort_values(by='number',inplace=True,ascending=False)
# #用pandas绘图
# #students.plot.bar(x='field',y='number',color='orange',title='Internation Field')
# 
# 
# 
# #用matplotlib画图
# plt.bar(students.field,students.number,color='orange')
# plt.xticks(students.field,rotation='90')
# plt.xlabel('field')
# plt.ylabel('number')
# plt.title('Internation Number',fontsize=16)
# 
# #输出
# plt.tight_layout()
# plt.show()
# 
# 
# =============================================================================

#10
#####分组柱图
# =============================================================================
# import matplotlib.pyplot as plt
# 
# students = pd.read_excel('F:/pytoexcel_study/10.1-char_bar.xlsx')
# ######pandas实现
# students.sort_values(by='year2017',inplace=True,ascending=False)
# 
# students.plot.bar(x='field', y=['year2016','year2017'], color=['orange', 'Red'])
# plt.title('International Students by Field', fontsize=16)
# plt.xlabel('Field', fontweight='bold')
# plt.ylabel('Number', fontweight='bold')
# plt.tight_layout()
# ax = plt.gca()  ##获取当前轴
# ax.set_xticklabels(students['field'], rotation=40, ha='right')
# f = plt.gcf()
# f.subplots_adjust(left=0.1, bottom=0.42)  ##调整左侧和底部的宽度
# plt.show()
# =============================================================================



######课程代码pandas实现
# =============================================================================
# students = pd.read_excel('C:/Temp/Students.xlsx')
# students.sort_values(by='2017', inplace=True, ascending=False)
# print(students)
# students.plot.bar('Field', ['2016', '2017'], color=['orange', 'Red'])
# plt.title('International Students by Field', fontsize=16)
# plt.xlabel('Field', fontweight='bold')
# plt.ylabel('Number', fontweight='bold')
# # plt.tight_layout()
# ax = plt.gca()
# ax.set_xticklabels(students['Field'], rotation=40, ha='right')
# plt.gcf().subplots_adjust(left=0.2, bottom=0.42)
# plt.show()
# =============================================================================






######课程代码matplotlib实现

# =============================================================================
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt
# 
# students = pd.read_excel('F:/pytoexcel_study/10.1-char_bar.xlsx')
# students.sort_values(by='year2017', inplace=True, ascending=False)
# students.index = range(0, len(students))
# print(students)
# 
# bar_width = 0.6
# x_pos = np.arange(len(students) * 2, step=2)
# plt.bar(x_pos, students['year2016'], color='green', width=bar_width)
# plt.bar(x_pos + bar_width, students['year2017'], color='blue', width=bar_width)
# plt.xticks(x_pos + bar_width / 2, students['field'], rotation='90')
# plt.title('International Student by Field', fontsize=16)
# plt.xlabel('Field')
# plt.ylabel('Number')
# plt.tight_layout()
# plt.show()
# =============================================================================


#11
#####11.1 Stacked bar chart

import matplotlib.pyplot as plt

users = pd.read_excel('F:/pytoexcel_study/11.1-char_bar.xlsx')

# Per-user total over the three months that the (commented-out) stacked charts
# plot: oct, nov and dec.  The total previously added 'nov' twice and left
# 'dec' out, which skewed any ordering by 'total'.
users['total'] = users['oct'] + users['nov'] + users['dec']
######柱状叠加图
#users.sort_values(by='total',inplace=True,ascending=False)
#users.plot.bar(x='name',y=['oct','nov','dec'],stacked=True,title='User Behavior')


######水平叠加图
# =============================================================================
#users.sort_values(by='total',inplace=True,ascending=True)
#users.plot.barh(x='name',y=['oct','nov','dec'],stacked=True,title='User Behavior')
#plt.tight_layout()
#plt.show()
# =============================================================================


#12
#####12.1饼图
#students = pd.read_excel('F:/pytoexcel_study/12.1-char_pie.xlsx',index_col='From')

######one way
# =============================================================================
# students['2017'].sort_values(ascending=True).plot.pie(fontsize=4,startangle=-270)
# plt.title('Source of International Students',fontsize=16,fontweight='bold')
# plt.ylabel('2017',fontsize=12,fontweight='bold')
# plt.show()
# =============================================================================


######two way
# =============================================================================
# students['2017'].plot.pie(fontsize=4,counterclock=False,startangle=90)  #从大到小排序,正值是逆时针,负值是顺时针
# plt.title('Source of International Students',fontsize=16,fontweight='bold')
# plt.ylabel('2017',fontsize=12,fontweight='bold')
# plt.show()
# =============================================================================

#13
#####13.1 Line chart
#weeks = pd.read_excel('F:/pytoexcel_study/13.1-char_line.xlsx',index_col='Week')
#print(weeks.columns)

# =============================================================================
# weeks.plot(y=['Accessories', 'Bikes', 'Clothing', 'Components'])
# plt.title('Sales Weekly Trend',fontsize=16,fontweight='bold')
# plt.ylabel('Total')
# plt.xticks(weeks.index)   ###X轴的刻度
# plt.show()
# =============================================================================


# =============================================================================
# #####13.2叠加图
# weeks.plot.area(y=['Accessories', 'Bikes', 'Clothing', 'Components'])
# plt.title('Sales Weekly Trend',fontsize=16,fontweight='bold')
# plt.ylabel('Total')
# plt.xticks(weeks.index)   ###X轴的刻度
# plt.show()
# 
# 
# =============================================================================

#14
#####14.1散点图
#pd.options.display.max_columns = 777
#homes = pd.read_excel('F:/pytoexcel_study/14.1-char_dot.xlsx')

# =============================================================================
# homes.plot.scatter(x='sqft_living',y='price')
# plt.show()
# 
# =============================================================================



#15
#####15.1直方图

# =============================================================================
# homes.sqft_living.plot.hist(bins=100)
# plt.xticks(range(0,max(homes.sqft_living),500),fontsize=8,rotation=90)
# plt.show()
# =============================================================================

#####15.2密度图
# =============================================================================
# homes.sqft_living.plot.kde()
# plt.xticks(range(0,max(homes.sqft_living),500),fontsize=8,rotation=90)
# plt.show()
# =============================================================================

#####15.3 Correlation analysis
# =============================================================================
# print(homes.corr())   #数据的相关性矩阵
# 
# =============================================================================



#16
#####16.1表合并
# =============================================================================
# students = pd.read_excel('F:/pytoexcel_study/16.1-join.xlsx',sheet_name='Students')
# scores = pd.read_excel('F:/pytoexcel_study/16.1-join.xlsx',sheet_name='Scores')
# 
# =============================================================================

######one way
# =============================================================================
# table = students.merge(scores,how='left',on='ID').fillna(0)
# table.Score =table['Score'].astype(int)  #转换为整数
# =============================================================================


######two way
# =============================================================================
# students = pd.read_excel('F:/pytoexcel_study/16.1-join.xlsx',sheet_name='Students',index_col='ID')
# scores = pd.read_excel('F:/pytoexcel_study/16.1-join.xlsx',sheet_name='Scores',index_col='ID')
# table = students.join(scores,how='left').fillna(0)
# table.Score =table['Score'].astype(int)  #转换为整数
# 
# 
# print(table)
# =============================================================================


#17 Data validation
#####17.1
# Load the workbook that the (commented-out) score validation examples below
# operate on; no index column is specified, so pandas builds a default index.
students = pd.read_excel('F:/pytoexcel_study/17.1-check_data.xlsx')

######one way
# =============================================================================
# def score_validation(row):
#     try:
#         assert 0<=row.Score<=100
#     except:
#         print(f'#{row.ID}\tstudent {row.Name} has an invalid score {row.Score}.')
# 
# students.apply(score_validation,axis=1)
# =============================================================================

######two way
# =============================================================================
# def score_validation(row):
#     if not 0<=row.Score<=100:
#         print(f'#{row.ID}\tstudent {row.Name} has an invalid score {row.Score}.')
# 
# students.apply(score_validation,axis=1)
# 
# =============================================================================


#18 datatransformation
#####18.1

# =============================================================================
# employees = pd.read_excel('F:/pytoexcel_study/18.1-datatransformation.xlsx',index_col='ID')
# 
# df = employees['Full Name'].str.split(' ',expand=True)  #分开
# employees['First Name'] =df[0]
# employees['Last Name'] =df[1]
# 
# #employees.drop('Full Name',axis=1,inplace=True)
# 
# print(employees)
# =============================================================================


#19
#####19.1 平均和求和
# =============================================================================
# students = pd.read_excel('F:/pytoexcel_study/19.1-students.xlsx',index_col='ID')
# temp = students[['Test_1','Test_2','Test_3']]
# row_sum = temp.sum(axis=1)
# row_mean = temp.mean(axis=1)
# 
# students['total'] = row_sum
# students['average'] = row_mean
# col_mean = students[['Test_1','Test_2','Test_3','total','average']].mean()
# col_mean['Name'] = 'Summary'
# students = students.append(col_mean,ignore_index=True) #用APPEND方法加一行数据
# 
# print(students)
# =============================================================================



#20
#####20.1去除重复数据
#students = pd.read_excel('F:/pytoexcel_study/20.1-Students_Duplicates.xlsx')

# =============================================================================
# #去除重复数据
# students.drop_duplicates(subset='Name',inplace=True,keep='first') 
# #keep可选择first和last,first是保留第一次出现的数据
# =============================================================================

#####20.2定位重复数据
# =============================================================================
# dupe = students.duplicated(subset='Name') #返回bool类型数据
# #print(dupe.any()) #检查是否有重复项
# 
# #定位到重复数据
# dupe = dupe[dupe == True]  
# print(students.iloc[dupe.index])
# =============================================================================


#21 表旋转
#####21.1
# =============================================================================
# pd.options.display.max_columns = 999
# videos = pd.read_excel('F:/pytoexcel_study/21.1-videos.xlsx',index_col='Month')
# 
# table = videos.transpose()  #转置
# print(table)
# =============================================================================

#22 读取文本文件
#####22.1
# =============================================================================
# students = pd.read_csv('F:/pytoexcel_study/22.1-students.csv',index_col='ID')
# #print(students)
# 
# students2 = pd.read_csv('F:/pytoexcel_study/22.1-students.tsv',sep='\t',index_col='ID')
# #print(students2)
# 
# students3 = pd.read_csv('F:/pytoexcel_study/22.1-students.txt',sep='|',index_col='ID')
# print(students3)
# =============================================================================

#23
#####23.1
#import numpy as np
#orders = pd.read_excel('F:/pytoexcel_study/23.1-orders.xlsx')


#orders['Year'] = pd.DatetimeIndex(orders['Date']).year #提取年份

######one way
#pt1 = orders.pivot_table(index='Category',columns='Year',values='Total',aggfunc=np.sum)


######two way
# =============================================================================
# groups = orders.groupby(['Category','Year'])
# s= groups['Total'].sum()   #聚合后包含YEARCATEGORY列
# c= groups['ID'].count()
# groups2 = pd.DataFrame({'Sum':s,'Count':c})
# =============================================================================

######three way  简洁
#groups = orders.groupby(['Category','Year']).agg({'Total':'sum','ID':'count'})


#print(groups)

#24 
#####24.1 回归分析
# =============================================================================
# from scipy.stats import linregress
# sales = pd.read_excel('F:/pytoexcel_study/24.1-sales.xlsx',dtype={'Month':str})
# 
# slope,intercept,r,p,std_err=linregress(sales.index,sales.Revenue)
# exp = sales.index*slope+intercept
# 
# print(f'第37的值是:{slope*37+intercept}')
# 
# plt.scatter(sales.index,sales.Revenue)
# plt.plot(sales.index,exp,color='orange')
# 
# plt.title('Sales')
# plt.xticks(sales.index,sales.Month,rotation=90)
# plt.tight_layout()
# plt.show()
# =============================================================================



#25
#####25.1条件格式化

# =============================================================================
# def low_score_red(s):
#     color = 'red' if s<60 else 'green'
#     return f'color:{color}'
# 
# students = pd.read_excel('F:/pytoexcel_study/25.1-Students.xlsx')
# students.style.applymap(low_score_red, subset=['Test_1', 'Test_2', 'Test_3'])
# 
# ========================
# 
# import pandas as pd
# 
# def highest_score_green2(col):
#     return ['background-color:lime' if v==col.max() else 'background-color:red' for v in col]
# 
# students = pd.read_excel('c:/Temp/Students.xlsx')
# students.style.apply(highest_score_green2, subset=['Test_1', 'Test_2', 'Test_3'])
# 
# ========================
# #####25.3单元格背景颜色深浅
# import pandas as pd
# import seaborn as sns
# 
# color_map = sns.light_palette('green', as_cmap=True)
# 
# students = pd.read_excel('c:/Temp/Students.xlsx')
# students.style.background_gradient(cmap=color_map, subset=['Test_1','Test_2','Test_3'])
# 
# ========================
# =============================================================================

#import pandas as pd

#students.style.bar(color='orange', subset=['Test_1','Test_2','Test_3'])


#27 行操作

page1 = pd.read_excel('F:/pytoexcel_study/27.1-Students.xlsx',sheet_name='Page_001')
page2 = pd.read_excel('F:/pytoexcel_study/27.1-Students.xlsx',sheet_name='Page_002')


##### 27.1 Stack the two sheets and build a fresh 0..n-1 index
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so
# pd.concat is used for every row-wise merge in this section.
students = pd.concat([page1, page2]).reset_index(drop=True)

##### 27.2 Append a single record
stu = pd.Series({'ID':41,'Name':'aaa','Score':99})
# Turn the Series into a one-row frame whose columns are the Series labels.
# The mixed-type Series is object dtype, so infer_objects() restores numeric
# dtypes before concatenation and the numeric columns are not upcast.
new_row = stu.to_frame().T.infer_objects()
students = pd.concat([students, new_row], ignore_index=True)


##### 27.3 Modify values
###### one way: label-based scalar access with .at
students.at[39,'Name']='Ball'
students.at[39,'Score'] = 111

######two way  iloc方法进行替换
#stu = pd.Series({'ID':36,'Name':'ccc','Score':99})
#students.iloc[35] = stu


#print(students)

#####27.4插入行,先切片,然后合并
#stu = pd.Series({'ID':110,'Name':'ccc','Score':99})
#part1 = students[:20]
#part2 = students[20:]
#students = part1.append(stu,ignore_index=True).append(part2).reset_index(drop=True)

#####27.5删除行
######one way
#students.drop(index=[0,1,2],inplace=True)

######two way
#students.drop(index=range(3,9),inplace=True)


######three way
#students.drop(index=students[0:10].index,inplace=True)


#####27.6按条件删除

# =============================================================================
# for i in range(5,10):
#     students['Name'].at[i] = ''
# 
# missing = students.loc[students['Name']=='']
# 
# students.drop(index=missing.index,inplace=True)
# students=students.reset_index(drop=True)
# 
# print(students)
# =============================================================================



#28 列操作


page1 = pd.read_excel('F:/pytoexcel_study/28.1-Students.xlsx',sheet_name='Page_001')
page2 = pd.read_excel('F:/pytoexcel_study/28.1-Students.xlsx',sheet_name='Page_002')

##### 28.1 Append a column
students = pd.concat([page1,page2]).reset_index(drop=True)
#students['Age'] = np.repeat(25,len(students))
students['Age'] = np.arange(0,len(students))


##### 28.2 Drop columns
students.drop(columns=['Score','Age'],inplace=True)

##### 28.3 Insert a column at position 1
students.insert(1,column='Foo',value=np.repeat('foo',len(students)))

##### 28.4 Rename a column
students.rename(columns={'Foo':'FOO'},inplace=True)

##### 28.5 Drop rows containing missing values
# Cast to float first so the column can hold NaN.
students['ID'] = students['ID'].astype(float)
# Blank out the IDs of rows 5..14.  .loc label slices include both ends, and
# the index was reset to 0..n-1 above, so this covers the same rows as
# range(5, 15).  Assigning through .loc on the frame (not through the chained
# students['ID'].at[i]) guarantees the NaNs land in `students` itself.
students.loc[5:14, 'ID'] = np.nan

students.dropna(inplace=True)
print(students)


#29 连接数据库
#####29.1
#import pandas as pd
#import pyodbc 
#import sqlalchemy


# =============================================================================
# ######one way
# connection = pyodbc.connect('DRIVER={SQL Server}; SERVER=(local); DATABASE=AdventureWorks;USER=sa;PASSWORD=123456')   #连接
# query = "select * from Person"
# df1 = pd.read_sql_query(query,connection)
# 
# ######two way
# engine = sqlalchemy.create_engine('mssql+pyodbc://sa:123456@(local)/AdventureWorks?driver=SQL+Server') #连接
# df2 = pd.read_sql_query(query,engine)
# 
# 
# pd.options.display.max_columns = 999
# print(df1.head())
# print(df2.head())
# =============================================================================


#30
#####30.1
# =============================================================================
#import pandas as pd
#import numpy as np
# 
# 
# def get_circumcircle_area(l, h):
#     r = np.sqrt(l ** 2 + h ** 2) / 2
#     return r ** 2 * np.pi
# 
# 
# def wrapper(row):
#     return get_circumcircle_area(row['Length'], row['Height'])
# 
# 
# rects = pd.read_excel('C:/Temp/Rectangles.xlsx', index_col='ID')
# ######one way
# rects['Circumcircle Area'] = rects.apply(wrapper, axis=1)
# 
# ######two way
# rects['Circumcircle Area'] = rects.apply(lambda row:get_circumcircle_area(row['Length'], row['Height']), axis=1)
# 
# print(rects)
# =============================================================================
标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!