Implementing the ID3 Algorithm in Python, with Results on the Watermelon Dataset

Submitted by 给你一囗甜甜゛ on 2020-11-23 19:55:45

The dataset and the points to watch out for were already posted in my first BP-algorithm article; please take a look there.
I won't walk through the code in detail here, since there are plenty of write-ups online; if you run into problems, leave a comment.
Please give me a like and a follow.
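One thing worth knowing before reading the listing: fit() expects a DataFrame of categorical attributes with the class label in the last column when no separate label series is passed. If you don't have table_4.2.csv at hand, a tiny made-up stand-in like the sketch below is enough to smoke-test the tree (the column names and values here are hypothetical, not the actual watermelon data); it is reused in the usage example after the listing.

import pandas as pd

# Hypothetical mini dataset: categorical attributes, class label in the last column.
toyData = pd.DataFrame({
    'color':   ['green', 'black', 'black', 'white', 'green'],
    'texture': ['clear', 'clear', 'blurry', 'blurry', 'clear'],
    'good':    ['yes',   'yes',   'no',     'no',     'yes'],
})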
Here's the code:


# encoding: utf-8

import pandas as pd
import numpy as np

class DecisionTree:
    def __init__(self):
        self.model = None

    def calEntropy(self, y):  # entropy of a label series
        valRate = y.value_counts().apply(lambda x: x / y.size)  # relative frequency of each class
        # e.g. 8 'yes' vs 9 'no' gives -(8/17)*log2(8/17) - (9/17)*log2(9/17) ≈ 0.998
        valEntropy = np.inner(valRate, np.log2(valRate)) * -1
        return valEntropy

    def fit(self, xTrain, yTrain=None):
        if yTrain is None:  # if no label series is passed, use the last column as the class label
            yTrain = xTrain.iloc[:, -1]
            xTrain = xTrain.iloc[:, :len(xTrain.columns) - 1]
        self.model = self.buildDecisionTree(xTrain, yTrain)
        return self.model

    def buildDecisionTree(self, xTrain, yTrain):
        propNamesAll = xTrain.columns
        # print(propNamesAll)
        yTrainCounts = yTrain.value_counts()
        if yTrainCounts.size == 1:
            # print('only one class', yTrainCounts.index[0])
            return yTrainCounts.index[0]
        if len(propNamesAll) == 0:  # no attributes left: fall back to the majority class
            return yTrainCounts.index[0]
        entropyD = self.calEntropy(yTrain)

        # pick the attribute with the largest information gain
        maxGain = None
        maxEntropyPropName = None
        for propName in propNamesAll:
            propDatas = xTrain[propName]
            propClassSummary = propDatas.value_counts().apply(lambda x: x / propDatas.size)  # relative frequency of each attribute value

            sumEntropyByProp = 0
            for propClass, dvRate in propClassSummary.items():
                yDataByPropClass = yTrain[xTrain[propName] == propClass]
                entropyDv = self.calEntropy(yDataByPropClass)
                sumEntropyByProp += entropyDv * dvRate
            gainEach = entropyD - sumEntropyByProp
            if maxGain is None or gainEach > maxGain:
                maxGain = gainEach
                maxEntropyPropName = propName
        # print('select prop:', maxEntropyPropName, maxGain)
        propDatas = xTrain[maxEntropyPropName]
        propClassSummary = propDatas.value_counts().apply(lambda x: x / propDatas.size)  # relative frequency of each attribute value

        # grow one subtree per value of the chosen attribute
        retClassByProp = {}
        for propClass, dvRate in propClassSummary.items():
            whichIndex = xTrain[maxEntropyPropName] == propClass
            if not whichIndex.any():
                continue
            xDataByPropClass = xTrain[whichIndex]
            yDataByPropClass = yTrain[whichIndex]
            xDataByPropClass = xDataByPropClass.drop(columns=[maxEntropyPropName])  # drop the attribute that was just used for the split

            # print(propClass)
            # print(pd.concat([xDataByPropClass, yDataByPropClass], axis=1))

            retClassByProp[propClass] = self.buildDecisionTree(xDataByPropClass, yDataByPropClass)

        # internal nodes are dicts; leaves are plain class labels
        return {'Node': maxEntropyPropName, 'Edge': retClassByProp}

    def predictBySeries(self, modelNode, data):
        if not isinstance(modelNode, dict):  # reached a leaf: return its class label
            return modelNode
        nodePropName = modelNode['Node']
        propVal = data.get(nodePropName)
        for edge, nextNode in modelNode['Edge'].items():
            if propVal == edge:
                return self.predictBySeries(nextNode, data)
        return None  # attribute value never seen on this branch during training

    def predict(self, data):
        if isinstance(data, pd.Series):
            return self.predictBySeries(self.model, data)
        return data.apply(lambda d: self.predictBySeries(self.model, d), axis=1)

dataTrain = pd.read_csv(r"C:\Users\杨涵文\PycharmProjects\BP算法\方法一\data\table_4.2.csv", encoding="utf-8")

decisionTree = DecisionTree()
treeData = decisionTree.fit(dataTrain)
print(pd.DataFrame({'Prediction': decisionTree.predict(dataTrain), 'Actual': dataTrain.iloc[:, -1]}))

import json

print(json.dumps(treeData, ensure_ascii=False))
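As a quick end-to-end check that doesn't depend on the CSV path above, you can fit a second tree on the hypothetical toyData stand-in from earlier and classify one new sample. predict() accepts a single pd.Series (it routes it to predictBySeries) and returns None when the sample carries an attribute value that never occurred on that branch during training. One caveat for the real data: if your CSV contains a sample-ID column, drop it before calling fit(), because every ID is unique, so ID3 would split on it first and the tree would not generalize.

toyTree = DecisionTree()
toyTree.fit(toyData)  # the last column ('good') is taken as the label

sample = pd.Series({'color': 'white', 'texture': 'clear'})  # hypothetical unseen sample
print(toyTree.predict(sample))  # prints a class label, or None for an unseen branch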
