由于需要合并四年的 Excel 数据文件,为了节省时间,我自己写了一段代码。虽然这只是工作中遇到的一个小知识点,但我希望把它记录下来,以便以后遇到相同的处理需求时可以直接参考,从而大大提升工作效率。有需要的朋友可以看一下。
import xlrd
import pandas as pd
import time
# Base names (without the ".xlsx" extension) of the yearly workbooks to merge,
# one workbook per August-to-July reporting period.
filename = [
    "本地户存活周期研究(2015-08-01至2016-07-31)",
    "本地户存活周期研究(2016-08-01至2017-07-31)",
    "本地户存活周期研究(2017-08-01至2018-07-31)",
    "本地户存活周期研究(2018-08-01至2019-07-31)",
]

# Folder holding the input workbooks; the merged workbook is written here too.
# The trailing slash matters because paths are built by plain concatenation.
desktop_root_direct = "C:/Users/liuqiping/Desktop/"
def read_data():  # Read the Excel data
    """Load the first sheet of every workbook named in ``filename``.

    Each workbook is opened from ``desktop_root_direct`` with a ".xlsx"
    suffix appended to its base name. The sheet's row and column counts are
    printed as a quick progress check.

    Returns:
        A list with one dict per workbook, in ``filename`` order. Each dict
        maps the first cell of a column (its header) to the list of the
        remaining cells in that column.

    NOTE(review): xlrd releases from 2.0 on are documented as reading only
    ``.xls`` files; confirm the installed xlrd can open these ``.xlsx`` files.
    """
    tables = []
    for base_name in filename:
        book = xlrd.open_workbook(desktop_root_direct + base_name + ".xlsx")
        sheet = book.sheet_by_index(0)
        row_count, col_count = sheet.nrows, sheet.ncols
        print(row_count, col_count)

        # Walk the sheet column by column: header cell -> values below it.
        columns = (sheet.col_values(index) for index in range(col_count))
        tables.append({cells[0]: cells[1:] for cells in columns})
    return tables
def merge_data(data_list):  # Merge the per-file tables with pd.merge() and save them
    """Outer-merge every table in ``data_list`` and write the result to Excel.

    The tables are merged left to right with ``pd.merge(..., how="outer")`` on
    the nine shared account columns listed in ``key_columns``. The merged
    table is written, without the index, to
    ``desktop_root_direct + "本地户存活周期研究_合并结果.xlsx"``.

    Args:
        data_list: Non-empty list of column dicts (header -> list of values),
            one per source workbook, as produced by ``read_data``.

    Raises:
        IndexError: If ``data_list`` is empty.
    """
    key_columns = ["账户ID", "账户名称", "账户一级行业", "账户二级行业", "账户三级行业",
                   "账号", "首次消费日", "注册日期", "分析行业"]

    # Seed the accumulator with the first table rather than a placeholder, so
    # a single-table input is written out as is instead of failing when a
    # DataFrame is built from the placeholder.
    result = pd.DataFrame(data_list[0])
    for columns in data_list[1:]:
        result = pd.merge(result, pd.DataFrame(columns), how="outer", on=key_columns)

    result.to_excel(desktop_root_direct + "本地户存活周期研究_合并结果.xlsx", index=False)
if __name__ == '__main__':
    # Script entry point: read all workbooks, merge them, report the run time.
    # perf_counter() is a monotonic clock meant for measuring intervals, so the
    # reported duration is not affected by system clock adjustments.
    start = time.perf_counter()
    data_list = read_data()
    merge_data(data_list)
    elapsed = time.perf_counter() - start
    print("程序运行时间:" + str(elapsed))
来源:CSDN
作者:liu_qiping
链接:https://blog.csdn.net/liu_qiping/article/details/103455583