Data Analysis in Practice: Data Preprocessing

Submitted by 蹲街弑〆低调 on 2019-12-17 02:08:54

1. Summary statistics of the catering sales data

import pandas as pd
import matplotlib.pyplot as plt
# Summary statistics of the catering sales data
catering_sale = '../data/chapter3/demo/data/catering_sale.xls'  # catering sales data
data = pd.read_excel(catering_sale, index_col='日期')           # use the 日期 (date) column as the row index
print(data)

# keep only rows with sales between 400 and 5000 (filter out abnormal values)
data = data[(data['销量'] > 400) & (data['销量'] < 5000)]
print(data)

# basic descriptive statistics
print(data.describe())

# dish profit data
catering_profit = '../data/chapter3/demo/data/catering_dish_profit.xls'
data = pd.read_excel(catering_profit, index_col='菜品名')
print(data)


data = data.sort_values(by='盈利', ascending=False)   # sort dishes by profit in descending order
print(data)

# rate = 1.0*data['盈利'].cumsum()/data['盈利'].sum()   # cumulative profit ratio (see the Pareto-chart sketch below)
data.columns = ['food id', 'profit']        # rename the remaining columns for plotting
data['profit'].plot(kind='bar')             # bar chart of profit per dish
plt.xticks(rotation=0)

plt.xlabel('food name')
plt.ylabel('profit')
plt.show()
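The commented-out rate line above is the first step of a Pareto chart: plotting the cumulative share of profit over the sorted bars shows how few dishes contribute most of the profit. A minimal sketch, assuming the renamed 'profit' column above (variable names are mine, not from the original post):

rate = 1.0*data['profit'].cumsum()/data['profit'].sum()   # cumulative share of total profit
ax = data['profit'].plot(kind='bar')                       # profit bars, already sorted in descending order
rate.plot(ax=ax, secondary_y=True, style='r-o')            # cumulative ratio curve on a secondary y-axis
plt.show()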

# Correlation analysis of the catering sales data

url = '../data/chapter3/demo/data/catering_sale_all.xls'
data = pd.read_excel(url, index_col='日期')
print(data)

print(data.corr())            # correlation coefficient matrix of all dishes
print(data['百合酱蒸凤爪'].corr(data['翡翠蒸香茜饺']))   # correlation between '百合酱蒸凤爪' and '翡翠蒸香茜饺'
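To make the matrix easier to read, it can also be drawn as a heat map. A small sketch using only matplotlib (not part of the original post, which just prints the numbers):

corr = data.corr()
plt.matshow(corr)                                                 # correlation matrix as a heat map
plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)   # label ticks with the dish names
plt.yticks(range(len(corr.columns)), corr.columns)
plt.colorbar()
plt.show()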

2. Lagrange interpolation
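For reference, given the known neighbouring points (x_i, y_i), the Lagrange interpolating polynomial that scipy.interpolate.lagrange constructs is

L(x) = \sum_{i} y_i \prod_{j \ne i} \frac{x - x_j}{x_i - x_j},

and the missing value is estimated by evaluating L at the missing position.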

# Interpolate missing values with the Lagrange method

import pandas as pd
from scipy.interpolate import lagrange

url = '../data/chapter4/demo/data/catering_sale.xls'
data = pd.read_excel(url)
print(data)

# mark abnormal values as missing
data.loc[(data['销量'] < 400) | (data['销量'] > 5000), '销量'] = None
print(data)

# fill the missing values with Lagrange interpolation


def deal_data(s, n, k=5):
    # take up to k values on each side of position n (reindex returns NaN for out-of-range positions)
    y = s.reindex(list(range(n-k, n)) + list(range(n+1, n+k+1)))
    y = y[y.notnull()]                        # drop missing and out-of-range points
    return lagrange(y.index, list(y))(n)      # fit the Lagrange polynomial and evaluate it at position n


# go through every column and interpolate each missing entry
for j in data.columns:
    for i in range(len(data)):
        if data[j].isnull()[i]:               # only fill the missing positions
            data.loc[i, j] = deal_data(data[j], i)

print(data)

# Data normalization
url = '../data/chapter4/demo/data/normalization_data.xls'
data = pd.read_excel(url)
print(data)


# min-max normalization
print((data-data.min())/(data.max()-data.min()))

# zero-mean (z-score) normalization
print((data-data.mean())/data.std())
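Written as formulas, the two transformations above are

x' = \frac{x - \min x}{\max x - \min x}, \qquad x^{*} = \frac{x - \bar{x}}{\sigma},

where \bar{x} and \sigma are the column mean and standard deviation.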

# Data discretization
import matplotlib.pyplot as plt
url = '../data/chapter4/demo/data/discretization_data.xls'
data = pd.read_excel(url)
print(data)

data = data['肝气郁结证型系数'].copy()
print(data)

k = 4                                       # number of bins
d1 = pd.cut(data, k, labels=range(k))       # equal-width binning
print(d1.describe())

# equal-frequency binning
w = [1.0*i/k for i in range(k)]                 # quantile points
w = data.describe(percentiles=w)[4:4+k+1]       # use describe() to get the quantile values as bin edges
w.iloc[0] = w.iloc[0]*(1-1e-10)                 # shrink the first edge slightly so the minimum value falls inside the first bin
d2 = pd.cut(data, w, labels=range(k))
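A more direct way to get equal-frequency bins is pandas' qcut; a one-line sketch (not in the original code):

d2_alt = pd.qcut(data, k, labels=range(k))   # qcut splits at quantiles, so each bin holds roughly the same number of samples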
# clustering-based (k-means) binning
from sklearn.cluster import KMeans
model = KMeans(n_clusters=k, random_state=0)   # the n_jobs argument was removed in newer scikit-learn; random_state makes the bins reproducible
# train the model on the single feature
model.fit(data.values.reshape((len(data), 1)))
# cluster centres, sorted in ascending order
c = pd.DataFrame(model.cluster_centers_).sort_values(0)
# midpoints of adjacent centres serve as the bin boundaries
w = c.rolling(2).mean().iloc[1:]
w = [0] + list(w[0]) + [data.max()]            # prepend 0 and append the maximum as the outer edges
d3 = pd.cut(data, w, labels=range(k))


# custom plotting function: draw each bin's points on its own horizontal level
def cluster_plot(d, k):
    for j in range(0, k):
        plt.plot(data[d == j], [j for i in d[d == j]], 'o')
    return plt


cluster_plot(d1, k).show()
cluster_plot(d2, k).show()
cluster_plot(d3, k).show()

3. Principal component analysis

# Dimensionality reduction with principal component analysis

import pandas as pd

url = '../data/chapter4/demo/data/principal_component.xls'
data = pd.read_excel(url, header=None)   # the file has no header row; the keyword is header, not headers
print(data)

from sklearn.decomposition import PCA

pca = PCA()
pca.fit(data)

# component loadings (the eigenvectors)
print(pca.components_)
# proportion of variance explained by each component
print(pca.explained_variance_ratio_)
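A common way to decide how many components to keep (a short sketch, not in the original post) is to look at the cumulative explained variance and pick the smallest number of components above a chosen threshold, e.g. 0.95:

import numpy as np
print(np.cumsum(pca.explained_variance_ratio_))   # cumulative proportion of variance explained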

# keep the first 3 principal components
pca = PCA(3)
pca.fit(data)
# project the data onto the 3 components (dimensionality reduction)
low_data = pca.transform(data)
print('reduced data')
print(low_data)
# map the reduced data back to the original space
print('reconstructed data')
print(pca.inverse_transform(low_data))

 
