#-*- coding: utf-8 -*-
import pandas as pd
import numpy as np
#1
#####1.1创建一个dataframe并保存为excel文件
# =============================================================================
# df = pd.DataFrame({'id':[1,2,3],'name':['a','b','c']})
# df = df.set_index('id')
# df.to_excel('e:/output-f.xlsx')
# =============================================================================
#2
# =============================================================================
# people = pd.read_excel('F:/pytoexcel_study/output.xlsx')
# print(people.shape) ###返回3,2 列名不计算
# print(people.columns) #显示列名
# print(people.head(2))
# print(people.head(2))
# print('==========================')
# print(people.tail(2))
# =============================================================================
#####2.1指定head
# =============================================================================
# people = pd.read_excel('F:/pytoexcel_study/2.1-head_process.xlsx',header=2)
# print(people.columns)
# print(people)
# =============================================================================
#####2.2 第一行为空时,仍需设置header
# =============================================================================
# people = pd.read_excel('F:/pytoexcel_study/2.2-head_blank.xlsx',header=2)
# print(people.columns)
# print(people)
#
# =============================================================================
#####2.3 没有header
# =============================================================================
# people = pd.read_excel('F:/pytoexcel_study/2.3-head_none.xlsx',header=None)
# print(people.columns)
# print(people)
# print('============================')
# people.columns = ['id','name'] ###添加header
# print(people.columns)
# people.set_index('id',inplace=True)
# print(people)
# print(people.columns)
# people.to_excel('F:/pytoexcel_study/2.3-head_none-2.xlsx')
# =============================================================================
#####2.4 读取文件时指定索引列,避免系统自动生成索引列
# =============================================================================
# print('==================')
# people = pd.read_excel('F:/pytoexcel_study/2.3-head_none-3.xlsx',index_col='id')
# people.to_excel('F:/pytoexcel_study/2.3-head_none-4.xlsx')
# print(people)
# =============================================================================
#3
#####3.1 生成序列的3种方法
# Start from an empty Series; attributes such as s1.name and s1.index can be
# inspected on it (older pandas releases also exposed s1.data).
s1 = pd.Series()

# Way 1: build the Series from a dict - the keys become the index labels.
d = dict(x=100, y=200, z=300)
s1 = pd.Series(data=d)
print(s1)

# Way 2: pair a list of values with a separate list of index labels.
l1 = [100, 200, 300]
l2 = list('xyz')
s2 = pd.Series(data=l1, index=l2)
print(s2)

# Way 3: same as way 2, with both lists written inline.
s3 = pd.Series(data=[100, 200, 300], index=['x', 'y', 'z'])
print(s3)
#####3.2 Adding Series to a DataFrame
# When Series are passed to DataFrame as a dict, each Series becomes one column.
# =============================================================================
# s1=pd.Series([1,2,3],index=[1,2,3],name='a')
# s2=pd.Series([10,20,30],index=[1,2,3],name='b')
# s3=pd.Series([100,200,300],index=[1,2,3],name='c')
# df1 = pd.DataFrame({s1.name:s1,s2.name:s2,s3.name:s3})
# print(df1)
# =============================================================================
# When Series are passed to DataFrame as a list, each Series becomes one row.
# =============================================================================
# df2 = pd.DataFrame([s1,s2,s3])
# print(df2)
# =============================================================================
# DataFrame combines the Series by taking the union of their indexes; when the indexes differ, the result contains NaN values.
# =============================================================================
# s1=pd.Series([1,2,3],index=[1,2,3],name='a')
# s2=pd.Series([10,20,30],index=[1,2,3],name='b')
# s3=pd.Series([100,200,300],index=[2,3,4],name='c')
# df1 = pd.DataFrame({s1.name:s1,s2.name:s2,s3.name:s3})
# print(df1)
# =============================================================================
#4
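#####4.1 Reading data whose first three rows and two leftmost columns are empty
##### When pandas reads NaN values it converts the column to float; to avoid this, set dtype to str.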
#books = pd.read_excel('F:/pytoexcel_study/4.1-books.xlsx',skiprows=3,usecols='C:F',dtype={'id':str,'instore':str,'date':str})
#print(books)
#####4.2填写列
# =============================================================================
# from datetime import date,timedelta
#
# start = date(2018,1,1) #填充DATA列时使用
# def add_month(d,md):
# yd = md//12
# m = d.month + md %12
# if m!=12:
# yd += m//12
# m = m %12
# return date(d.year + yd ,m,d.day)
#
#
# books['id'].at[0] = 100
# =============================================================================
#####以serise方式填充
# =============================================================================
# for i in books.index:
# books['id'].at[i] = i+1
# books['instore'].at[i] = 'Yes' if i%2==0 else 'No'
# #books['date'].at[i] = start
# #books['date'].at[i] = start +timedelta(days=i) #按天加
# #books['date'].at[i] = date(start.year+i,start.month,start.day) #按年加
# books['date'].at[i] = add_month(start,i) #按月加
# =============================================================================
#####以dataframe方式填充
# =============================================================================
# for i in books.index:
# books.at[i,'id'] = i+1
# books.at[i,'instore'] = 'Yes' if i%2==0 else 'No'
# books.at[i,'date'] = add_month(start,i) #按月加
#
#
# print(books)
# books.set_index('id',inplace=True)
# books.to_excel('F:/pytoexcel_study/4.2-books.xlsx')
# =============================================================================
#6
#####6.1函数式填充数据
# =============================================================================
# books = pd.read_excel('F:/pytoexcel_study/6.1-function_fill.xlsx',index_col='id')
# books['price'] = books['listprice'] * books['discount'] ##pandas中操作的是列,不是单元格
# print(books) #操作符的重载
#
# =============================================================================
#####6.2迭代式填充数据,应用于部分数据需运算时
# Load the book list, using the 'id' column as the row index.
books = pd.read_excel('F:/pytoexcel_study/6.1-function_fill.xlsx', index_col='id')
# range() is half-open, so this visits ids 5 through 10 only.
# Iterate over books.index instead to process every row.
for i in range(5, 11):
    # Write through DataFrame.at rather than books['price'].at[i]: the chained
    # form assigns into an intermediate Series and is not guaranteed to update
    # the frame (it does not under pandas Copy-on-Write).
    books.at[i, 'price'] = books.at[i, 'listprice'] * books.at[i, 'discount']
print(books)
#####6.3 apply()方式更新数据
def add_2(x):
    """Return ``x`` increased by 2.

    Intended as the element-wise callback for ``Series.apply``.
    """
    return x + 2
# Run add_2 over every list price and store the result back in the column.
books['listprice'] = books.listprice.apply(func=add_2)
print(books)
#####6.4 lambda方式更新数据
# Re-read the workbook, replacing the frame that was modified earlier.
books = pd.read_excel(io='F:/pytoexcel_study/6.1-function_fill.xlsx', index_col='id')
# Same kind of update, written with an anonymous function: add 6 to each list price.
books['listprice'] = books.listprice.apply(func=lambda price: price + 6)
print(books)
#7
#####7.1数据排序
# Load the product list, using the 'id' column as the row index.
products = pd.read_excel(io='F:/pytoexcel_study/7.1-sort.xlsx', index_col='id')
# Alternative orderings:
#   products.sort_values(by='price', inplace=True)                              # low to high
#   products.sort_values(by='price', inplace=True, ascending=False)             # high to low
#   products.sort_values(by=['worthy', 'price'], inplace=True, ascending=True)  # both columns low to high
# Multi-column sort with a direction per column: 'worthy' ascending, then 'price' descending.
sort_keys = ['worthy', 'price']
products = products.sort_values(by=sort_keys, ascending=[True, False])
print(products)
#8
#####8.1数据筛选
# =============================================================================
# students = pd.read_excel('F:/pytoexcel_study/8.1-filter.xlsx',index_col='id')
#
# def age_18to_30(a):
# return 18<=a<30
# def level_a(s):
# return 85<=s<=100
#
#
# ######one way
# students = students.loc[students['age'].apply(age_18to_30)].loc[students['score'].apply(level_a)]
# ######two way
# students = students.loc[students.age.apply(age_18to_30)].loc[students.score.apply(level_a)]
# ######three way
# students = students.loc[students.age.apply(lambda x: 18<=x<30)] \
# .loc[students.score.apply(lambda a:85<=a<=100)]
# print(students)
#
# =============================================================================
#9
#####9.1柱状图
# =============================================================================
# import matplotlib.pyplot as plt
# students = pd.read_excel('F:/pytoexcel_study/9.1-char_bar.xlsx')
# students.sort_values(by='number',inplace=True,ascending=False)
# #用pandas绘图
# #students.plot.bar(x='field',y='number',color='orange',title='Internation Field')
#
#
#
# #用matplotlib画图
# plt.bar(students.field,students.number,color='orange')
# plt.xticks(students.field,rotation='90')
# plt.xlabel('field')
# plt.ylabel('number')
# plt.title('Internation Number',fontsize=16)
#
# #输出
# plt.tight_layout()
# plt.show()
#
#
# =============================================================================
#10
#####分组柱图
# =============================================================================
# import matplotlib.pyplot as plt
#
# students = pd.read_excel('F:/pytoexcel_study/10.1-char_bar.xlsx')
# ######pandas实现
# students.sort_values(by='year2017',inplace=True,ascending=False)
#
# students.plot.bar(x='field', y=['year2016','year2017'], color=['orange', 'Red'])
# plt.title('International Students by Field', fontsize=16)
# plt.xlabel('Field', fontweight='bold')
# plt.ylabel('Number', fontweight='bold')
# plt.tight_layout()
# ax = plt.gca() ##获取当前轴
# ax.set_xticklabels(students['field'], rotation=40, ha='right')
# f = plt.gcf()
# f.subplots_adjust(left=0.1, bottom=0.42) ##调整左侧和底部的宽度
# plt.show()
# =============================================================================
######课程代码pandas实现
# =============================================================================
# students = pd.read_excel('C:/Temp/Students.xlsx')
# students.sort_values(by='2017', inplace=True, ascending=False)
# print(students)
# students.plot.bar('Field', ['2016', '2017'], color=['orange', 'Red'])
# plt.title('International Students by Field', fontsize=16)
# plt.xlabel('Field', fontweight='bold')
# plt.ylabel('Number', fontweight='bold')
# # plt.tight_layout()
# ax = plt.gca()
# ax.set_xticklabels(students['Field'], rotation=40, ha='right')
# plt.gcf().subplots_adjust(left=0.2, bottom=0.42)
# plt.show()
# =============================================================================
######课程代码matplotlib实现
# =============================================================================
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt
#
# students = pd.read_excel('F:/pytoexcel_study/10.1-char_bar.xlsx')
# students.sort_values(by='year2017', inplace=True, ascending=False)
# students.index = range(0, len(students))
# print(students)
#
# bar_width = 0.6
# x_pos = np.arange(len(students) * 2, step=2)
# plt.bar(x_pos, students['year2016'], color='green', width=bar_width)
# plt.bar(x_pos + bar_width, students['year2017'], color='blue', width=bar_width)
# plt.xticks(x_pos + bar_width / 2, students['field'], rotation='90')
# plt.title('International Student by Field', fontsize=16)
# plt.xlabel('Field')
# plt.ylabel('Number')
# plt.tight_layout()
# plt.show()
# =============================================================================
#11
#####11.1 Stacked bar chart
import matplotlib.pyplot as plt
# Load the per-user monthly figures.
users = pd.read_excel('F:/pytoexcel_study/11.1-char_bar.xlsx')
# Total across the three plotted months. All three columns must be summed:
# adding 'nov' twice and leaving out 'dec' would give a wrong total and
# therefore a wrong sort order for the stacked charts.
users['total'] = users['oct'] + users['nov'] + users['dec']
######柱状叠加图
#users.sort_values(by='total',inplace=True,ascending=False)
#users.plot.bar(x='name',y=['oct','nov','dec'],stacked=True,title='User Behavior')
######水平叠加图
# =============================================================================
#users.sort_values(by='total',inplace=True,ascending=True)
#users.plot.barh(x='name',y=['oct','nov','dec'],stacked=True,title='User Behavior')
#plt.tight_layout()
#plt.show()
# =============================================================================
#12
#####12.1饼图
#students = pd.read_excel('F:/pytoexcel_study/12.1-char_pie.xlsx',index_col='From')
######one way
# =============================================================================
# students['2017'].sort_values(ascending=True).plot.pie(fontsize=4,startangle=-270)
# plt.title('Source of International Students',fontsize=16,fontweight='bold')
# plt.ylabel('2017',fontsize=12,fontweight='bold')
# plt.show()
# =============================================================================
######two way
# =============================================================================
# students['2017'].plot.pie(fontsize=4,counterclock=False,startangle=90) #从大到小排序,正值是逆时针,负值是顺时针
# plt.title('Source of International Students',fontsize=16,fontweight='bold')
# plt.ylabel('2017',fontsize=12,fontweight='bold')
# plt.show()
# =============================================================================
#13
#####13.1 Line chart
#weeks = pd.read_excel('F:/pytoexcel_study/13.1-char_line.xlsx',index_col='Week')
#print(weeks.columns)
# =============================================================================
# weeks.plot(y=['Accessories', 'Bikes', 'Clothing', 'Components'])
# plt.title('Sales Weekly Trend',fontsize=16,fontweight='bold')
# plt.ylabel('Total')
# plt.xticks(weeks.index) ###X轴的刻度
# plt.show()
# =============================================================================
# =============================================================================
# #####13.2叠加图
# weeks.plot.area(y=['Accessories', 'Bikes', 'Clothing', 'Components'])
# plt.title('Sales Weekly Trend',fontsize=16,fontweight='bold')
# plt.ylabel('Total')
# plt.xticks(weeks.index) ###X轴的刻度
# plt.show()
#
#
# =============================================================================
#14
#####14.1散点图
#pd.options.display.max_columns = 777
#homes = pd.read_excel('F:/pytoexcel_study/14.1-char_dot.xlsx')
# =============================================================================
# homes.plot.scatter(x='sqft_living',y='price')
# plt.show()
#
# =============================================================================
#15
#####15.1直方图
# =============================================================================
# homes.sqft_living.plot.hist(bins=100)
# plt.xticks(range(0,max(homes.sqft_living),500),fontsize=8,rotation=90)
# plt.show()
# =============================================================================
#####15.2密度图
# =============================================================================
# homes.sqft_living.plot.kde()
# plt.xticks(range(0,max(homes.sqft_living),500),fontsize=8,rotation=90)
# plt.show()
# =============================================================================
#####15.3 Correlation analysis of the data
# =============================================================================
# print(homes.corr()) #数据的相关性矩阵
#
# =============================================================================
#16
#####16.1表合并
# =============================================================================
# students = pd.read_excel('F:/pytoexcel_study/16.1-join.xlsx',sheet_name='Students')
# scores = pd.read_excel('F:/pytoexcel_study/16.1-join.xlsx',sheet_name='Scores')
#
# =============================================================================
######one way
# =============================================================================
# table = students.merge(scores,how='left',on='ID').fillna(0)
# table.Score =table['Score'].astype(int) #转换为整数
# =============================================================================
######two way
# =============================================================================
# students = pd.read_excel('F:/pytoexcel_study/16.1-join.xlsx',sheet_name='Students',index_col='ID')
# scores = pd.read_excel('F:/pytoexcel_study/16.1-join.xlsx',sheet_name='Scores',index_col='ID')
# table = students.join(scores,how='left').fillna(0)
# table.Score =table['Score'].astype(int) #转换为整数
#
#
# print(table)
# =============================================================================
#17数据校验
#####17.1
# Load the student sheet used by the score-validation examples below.
students = pd.read_excel(io='F:/pytoexcel_study/17.1-check_data.xlsx')
######one way
# =============================================================================
# def score_validation(row):
# try:
# assert 0<=row.Score<=100
# except:
# print(f'#{row.ID}\tstudent {row.Name} has an invalid score {row.Score}.')
#
# students.apply(score_validation,axis=1)
# =============================================================================
######two way
# =============================================================================
# def score_validation(row):
# if not 0<=row.Score<=100:
# print(f'#{row.ID}\tstudent {row.Name} has an invalid score {row.Score}.')
#
# students.apply(score_validation,axis=1)
#
# =============================================================================
#18 datatransformation
#####18.1
# =============================================================================
# employees = pd.read_excel('F:/pytoexcel_study/18.1-datatransformation.xlsx',index_col='ID')
#
# df = employees['Full Name'].str.split(' ',expand=True) #分开
# employees['First Name'] =df[0]
# employees['Last Name'] =df[1]
#
# #employees.drop('Full Name',axis=1,inplace=True)
#
# print(employees)
# =============================================================================
#19
#####19.1 平均和求和
# =============================================================================
# students = pd.read_excel('F:/pytoexcel_study/19.1-students.xlsx',index_col='ID')
# temp = students[['Test_1','Test_2','Test_3']]
# row_sum = temp.sum(axis=1)
# row_mean = temp.mean(axis=1)
#
# students['total'] = row_sum
# students['average'] = row_mean
# col_mean = students[['Test_1','Test_2','Test_3','total','average']].mean()
# col_mean['Name'] = 'Summary'
# students = students.append(col_mean,ignore_index=True) #用APPEND方法加一行数据
#
# print(students)
# =============================================================================
#20
#####20.1去除重复数据
#students = pd.read_excel('F:/pytoexcel_study/20.1-Students_Duplicates.xlsx')
# =============================================================================
# #去除重复数据
# students.drop_duplicates(subset='Name',inplace=True,keep='first')
# #keep可选择first和last,first是保留第一次出现的数据
# =============================================================================
#####20.2定位重复数据
# =============================================================================
# dupe = students.duplicated(subset='Name') #返回bool类型数据
# #print(dupe.any()) #检查是否有重复项
#
# #定位到重复数据
# dupe = dupe[dupe == True]
# print(students.iloc[dupe.index])
# =============================================================================
#21 表旋转
#####21.1
# =============================================================================
# pd.options.display.max_columns = 999
# videos = pd.read_excel('F:/pytoexcel_study/21.1-videos.xlsx',index_col='Month')
#
# table = videos.transpose() #转置
# print(table)
# =============================================================================
#22 读取文本文件
#####22.1
# =============================================================================
# students = pd.read_csv('F:/pytoexcel_study/22.1-students.csv',index_col='ID')
# #print(students)
#
# students2 = pd.read_csv('F:/pytoexcel_study/22.1-students.tsv',sep='\t',index_col='ID')
# #print(students2)
#
# students3 = pd.read_csv('F:/pytoexcel_study/22.1-students.txt',sep='|',index_col='ID')
# print(students3)
# =============================================================================
#23
#####23.1
#import numpy as np
#orders = pd.read_excel('F:/pytoexcel_study/23.1-orders.xlsx')
#orders['Year'] = pd.DatetimeIndex(orders['Date']).year #提取年份
######one way
#pt1 = orders.pivot_table(index='Category',columns='Year',values='Total',aggfunc=np.sum)
######two way
# =============================================================================
# groups = orders.groupby(['Category','Year'])
# s= groups['Total'].sum() #聚合后包含YEAR和CATEGORY列
# c= groups['ID'].count()
# groups2 = pd.DataFrame({'Sum':s,'Count':c})
# =============================================================================
######three way 简洁
#groups = orders.groupby(['Category','Year']).agg({'Total':'sum','ID':'count'})
#print(groups)
#24
#####24.1 回归分析
# =============================================================================
# from scipy.stats import linregress
# sales = pd.read_excel('F:/pytoexcel_study/24.1-sales.xlsx',dtype={'Month':str})
#
# slope,intercept,r,p,std_err=linregress(sales.index,sales.Revenue)
# exp = sales.index*slope+intercept
#
# print(f'第37的值是:{slope*37+intercept}')
#
# plt.scatter(sales.index,sales.Revenue)
# plt.plot(sales.index,exp,color='orange')
#
# plt.title('Sales')
# plt.xticks(sales.index,sales.Month,rotation=90)
# plt.tight_layout()
# plt.show()
# =============================================================================
#25
#####25.1条件格式化
# =============================================================================
# def low_score_red(s):
# color = 'red' if s<60 else 'green'
# return f'color:{color}'
#
# students = pd.read_excel('F:/pytoexcel_study/25.1-Students.xlsx')
# students.style.applymap(low_score_red, subset=['Test_1', 'Test_2', 'Test_3'])
#
# ========================
#
# import pandas as pd
#
# def highest_score_green2(col):
# return ['background-color:lime' if v==col.max() else 'background-color:red' for v in col]
#
# students = pd.read_excel('c:/Temp/Students.xlsx')
# students.style.apply(highest_score_green2, subset=['Test_1', 'Test_2', 'Test_3'])
#
# ========================
# #####25.3单元格背景颜色深浅
# import pandas as pd
# import seaborn as sns
#
# color_map = sns.light_palette('green', as_cmap=True)
#
# students = pd.read_excel('c:/Temp/Students.xlsx')
# students.style.background_gradient(cmap=color_map, subset=['Test_1','Test_2','Test_3'])
#
# ========================
# =============================================================================
#import pandas as pd
#students.style.bar(color='orange', subset=['Test_1','Test_2','Test_3'])
#27 行操作
# Load the two worksheets that hold the student records.
page1 = pd.read_excel('F:/pytoexcel_study/27.1-Students.xlsx', sheet_name='Page_001')
page2 = pd.read_excel('F:/pytoexcel_study/27.1-Students.xlsx', sheet_name='Page_002')
#####27.1 Stack the two tables and generate a new index
# DataFrame.append was removed in pandas 2.0, so pd.concat is used throughout.
students = pd.concat([page1, page2]).reset_index(drop=True)
#####27.2 Append a single record
stu = pd.Series({'ID': 41, 'Name': 'aaa', 'Score': 99})
# Turn the Series into a one-row frame; infer_objects() restores per-column
# dtypes so the numeric columns are not widened to object by the concat.
students = pd.concat([students, stu.to_frame().T.infer_objects()], ignore_index=True)
#####27.3 Modify values
###### Way 1: the .at accessor (row label, column name)
students.at[39, 'Name'] = 'Ball'
students.at[39, 'Score'] = 111
######two way iloc方法进行替换
#stu = pd.Series({'ID':36,'Name':'ccc','Score':99})
#students.iloc[35] = stu
#print(students)
#####27.4插入行,先切片,然后合并
#stu = pd.Series({'ID':110,'Name':'ccc','Score':99})
#part1 = students[:20]
#part2 = students[20:]
#students = part1.append(stu,ignore_index=True).append(part2).reset_index(drop=True)
#####27.5删除行
######one way
#students.drop(index=[0,1,2],inplace=True)
######two way
#students.drop(index=range(3,9),inplace=True)
######three way
#students.drop(index=students[0:10].index,inplace=True)
#####27.6按条件删除
# =============================================================================
# for i in range(5,10):
# students['Name'].at[i] = ''
#
# missing = students.loc[students['Name']=='']
#
# students.drop(index=missing.index,inplace=True)
# students=students.reset_index(drop=True)
#
# print(students)
# =============================================================================
#28 列操作
# Load the two worksheets that hold the student records.
page1 = pd.read_excel('F:/pytoexcel_study/28.1-Students.xlsx', sheet_name='Page_001')
page2 = pd.read_excel('F:/pytoexcel_study/28.1-Students.xlsx', sheet_name='Page_002')
#####28.1 Append a column
students = pd.concat([page1, page2]).reset_index(drop=True)
# students['Age'] = np.repeat(25, len(students))  # alternative: one constant value
students['Age'] = np.arange(0, len(students))
#####28.2 Delete columns
students.drop(columns=['Score', 'Age'], inplace=True)
#####28.3 Insert a column at position 1
students.insert(1, column='Foo', value=np.repeat('foo', len(students)))
#####28.4 Rename a column
students.rename(columns={'Foo': 'FOO'}, inplace=True)
#####28.5 Drop rows with missing values
# NaN is a float, so convert the column before blanking some ids.
students['ID'] = students['ID'].astype(float)
for i in range(5, 15):
    # Assign through DataFrame.at; the chained students['ID'].at[i] form is not
    # guaranteed to update the frame (it does not under pandas Copy-on-Write).
    students.at[i, 'ID'] = np.nan
students.dropna(inplace=True)
print(students)
#29 连接数据库
#####29.1
#import pandas as pd
#import pyodbc
#import sqlalchemy
# =============================================================================
# ######one way
# connection = pyodbc.connect('DRIVER={SQL Server}; SERVER=(local); DATABASE=AdventureWorks;USER=sa;PASSWORD=123456') #连接
# query = "select * from Person"
# df1 = pd.read_sql_query(query,connection)
#
# ######two way
# engine = sqlalchemy.create_engine('mssql+pyodbc://sa:123456@(local)/AdventureWorks?driver=SQL+Server') #连接
# df2 = pd.read_sql_query(query,engine)
#
#
# pd.options.display.max_columns = 999
# print(df1.head())
# print(df2.head())
# =============================================================================
#30
#####30.1
# =============================================================================
#import pandas as pd
#import numpy as np
#
#
# def get_circumcircle_area(l, h):
# r = np.sqrt(l ** 2 + h ** 2) / 2
# return r ** 2 * np.pi
#
#
# def wrapper(row):
# return get_circumcircle_area(row['Length'], row['Height'])
#
#
# rects = pd.read_excel('C:/Temp/Rectangles.xlsx', index_col='ID')
# ######one way
# rects['Circumcircle Area'] = rects.apply(wrapper, axis=1)
#
# ######two way
# rects['Circumcircle Area'] = rects.apply(lambda row:get_circumcircle_area(row['Length'], row['Height']), axis=1)
#
# print(rects)
# =============================================================================
# Source: CSDN
# Author: skymacro
# Link: https://blog.csdn.net/qq_42810165/article/details/104640089