1. Summary statistics of catering sales data
import pandas as pd
import matplotlib.pyplot as plt
# summary statistics of catering sales data
catering_sale = '../data/chapter3/demo/data/catering_sale.xls' # catering sales data
data = pd.read_excel(catering_sale, index_col='日期') # use the 日期 (date) column as the row index
print(data)
# filter out abnormal values (keep sales between 400 and 5000)
data = data[(data['销量'] > 400) & (data['销量'] < 5000)]
print(data)
# basic descriptive statistics
print(data.describe())
# dish profit data
catering_profit = '../data/chapter3/demo/data/catering_dish_profit.xls'
data = pd.read_excel(catering_profit, index_col='菜品名')
print(data)
data = data.sort_values(by='盈利', ascending=False)
print(data)
# rate = 1.0*data['盈利'].cumsum()/data['盈利'].sum()  # cumulative profit share (for a Pareto chart)
data.columns = ['food id', 'profit']  # rename the columns to English labels
data['profit'].plot(kind='bar')  # plot only the profit column, not the dish IDs
plt.xticks(rotation=0)
plt.xlabel('food name')
plt.ylabel('profit')
plt.show()
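The commented-out rate line above is the cumulative profit share; drawing it on a secondary axis turns the bar chart into a Pareto chart. A minimal sketch, reusing the renamed 'profit' column and the descending sort already applied above:
rate = 1.0 * data['profit'].cumsum() / data['profit'].sum()  # cumulative share of total profit
data['profit'].plot(kind='bar')
rate.plot(color='r', secondary_y=True, style='-o', linewidth=2)  # cumulative curve on a secondary y-axis
plt.ylabel('cumulative profit share')
plt.show()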
# correlation analysis of catering sales data
url = '../data/chapter3/demo/data/catering_sale_all.xls'
data = pd.read_excel(url, index_col='日期')
print(data)
print(data.corr()) # correlation coefficient matrix
print(data['百合酱蒸凤爪'].corr(data['翡翠蒸香茜饺'])) # correlation coefficient between the dishes 百合酱蒸凤爪 and 翡翠蒸香茜饺
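DataFrame.corr uses the Pearson coefficient by default; pandas also supports rank-based coefficients, which can be more robust when the sales distributions are skewed. A minimal sketch using standard pandas options:
print(data.corr(method='spearman'))  # Spearman rank correlation matrix
print(data['百合酱蒸凤爪'].corr(data['翡翠蒸香茜饺'], method='kendall'))  # Kendall's tau for one pair of dishes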
2. Lagrange interpolation
# fill missing values with Lagrange interpolation
import pandas as pd
from scipy.interpolate import lagrange
url = '../data/chapter4/demo/data/catering_sale.xls'
data = pd.read_excel(url)
print(data)
# mark abnormal values (outside 400-5000) as missing
data.loc[(data['销量'] < 400) | (data['销量'] > 5000), '销量'] = None
print(data)
# interpolate a single missing point with a Lagrange polynomial
def deal_data(s, n, k=5):
    # take up to k values on each side of position n; reindex returns NaN for out-of-range labels
    y = s.reindex(list(range(n-k, n)) + list(range(n+1, n+k+1)))
    y = y[y.notnull()]  # drop missing neighbours
    return lagrange(y.index, list(y))(n)  # fit the polynomial and evaluate it at n
for j in data.columns:
    for i in range(len(data)):
        if data[j].isnull()[i]:  # only interpolate positions that are missing
            data.loc[i, j] = deal_data(data[j], i)  # avoid chained assignment
print(data)
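scipy.interpolate.lagrange returns a numpy.poly1d, so deal_data fits a polynomial through the non-missing neighbours and evaluates it at the missing position; note that with k=5 the polynomial can reach degree 9 and may oscillate if many neighbours are missing. A tiny standalone illustration with made-up points:
from scipy.interpolate import lagrange
poly = lagrange([0, 1, 2, 4], [3.0, 5.0, 4.0, 6.0])  # fit a cubic through four known points
print(poly(3))  # evaluate the polynomial at the missing position 3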
# data normalization
url = '../data/chapter4/demo/data/normalization_data.xls'
data = pd.read_excel(url)
print(data)
# min-max normalization
print((data-data.min())/(data.max()-data.min()))
# zero-mean (z-score) normalization
print((data-data.mean())/data.std())
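A quick sanity check on the two expressions above: min-max scaling should map every column into [0, 1], and z-score scaling should leave every column with mean 0 and standard deviation 1.
minmax = (data - data.min()) / (data.max() - data.min())
zscore = (data - data.mean()) / data.std()
print(minmax.min())   # expect 0 in every column
print(minmax.max())   # expect 1 in every column
print(zscore.mean())  # expect approximately 0 in every column
print(zscore.std())   # expect 1 in every column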
# data discretization
import matplotlib.pyplot as plt
url = '../data/chapter4/demo/data/discretization_data.xls'
data = pd.read_excel(url)
print(data)
data = data['肝气郁结证型系数'].copy()
print(data)
k = 4
d1 = pd.cut(data, k, labels=range(k)) # equal-width discretization
print(d1.describe())
# equal-frequency discretization: use quantiles as bin edges
w = [1.0*i/k for i in range(k)]
w = data.describe(percentiles=w)[4:4+k+1]  # the 0%, 25%, 50%, 75% quantiles plus the max
w[0] = w[0]*(1-1e-10)  # shrink the lowest edge slightly so the minimum falls inside the first bin
d2 = pd.cut(data, w, labels=range(k))
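Building the quantile edges by hand mirrors the original code; pandas also provides qcut, which performs equal-frequency binning directly. A minimal equivalent sketch:
d2_alt = pd.qcut(data, k, labels=range(k))  # equal-frequency bins computed from quantiles
print(d2_alt.value_counts().sort_index())   # each bin should hold roughly len(data)/k records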
# discretization based on K-means clustering
from sklearn.cluster import KMeans
model = KMeans(n_clusters=k)  # note: the n_jobs argument was removed from KMeans in scikit-learn 1.0
# fit the model on the series reshaped to a single column
model.fit(data.values.reshape((len(data), 1)))
# sort the cluster centers
c = pd.DataFrame(model.cluster_centers_).sort_values(0)
# use the midpoints between adjacent cluster centers as bin boundaries
w = c.rolling(2).mean().iloc[1:]
w = [0] + list(w[0]) + [data.max()]  # add 0 and the maximum as the outer boundaries
d3 = pd.cut(data, w, labels=range(k))
# custom plotting function: one horizontal band of points per bin
def cluster_plot(d, k):
    plt.figure(figsize=(8, 3))
    for j in range(0, k):
        plt.plot(data[d == j], [j for i in d[d == j]], 'o')
    return plt
cluster_plot(d1, k).show()
cluster_plot(d2, k).show()
cluster_plot(d3, k).show()
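Beyond the scatter plots, printing how many records land in each bin makes the difference between the three schemes concrete:
for name, d in [('equal-width', d1), ('equal-frequency', d2), ('k-means', d3)]:
    print(name)
    print(d.value_counts().sort_index())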
3. Principal component analysis
# dimensionality reduction with principal component analysis
import pandas as pd
url = '../data/chapter4/demo/data/principal_component.xls'
data = pd.read_excel(url, header=None)  # the keyword is header, not headers
print(data)
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(data)
# component loading vectors (eigenvectors)
print(pca.components_)
# proportion of variance explained by each component
print(pca.explained_variance_ratio_)
# keep the first 3 principal components
pca = PCA(3)
pca.fit(data)
# reduce dimensionality: project the data onto the 3 components
low_data = pca.transform(data)
print('reduced data')
print(low_data)
# map the reduced data back to the original feature space
print('reconstructed data')
print(pca.inverse_transform(low_data))
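The choice of 3 components above can be cross-checked against the cumulative explained variance; scikit-learn's PCA also accepts a float n_components to keep just enough components for a target variance share. A minimal sketch:
import numpy as np
pca_full = PCA()
pca_full.fit(data)
print(np.cumsum(pca_full.explained_variance_ratio_))  # cumulative variance explained by the first m components
pca_95 = PCA(n_components=0.95)  # keep enough components to explain 95% of the variance
print(pca_95.fit_transform(data).shape)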
Source: CSDN
Author: 陌上无双
Link: https://blog.csdn.net/baidu_38812770/article/details/103568890