友情提示:
- 本文档根据林大贵的《Python+Spark 2.0 + Hadoop机器学习与大数据实战》整理得到,代码均为书中提供的源码(python 2.X版本)。
- 本文可以利用pandoc转换为docx文档,点击这里下载并安装pandoc后,在终端输入以下命令:
pandoc yourfilename.md -f markdown -t docx -s -o outputfilename.docx
Mllib
决策树二元分类
环境准备
这个阶段包括数据的下载和整理,去除缺失的数据,不符合规范的数据(比如乱码等),数据类型转换,将字符串类型映射为数字类型(建立字典),数据类型转换(将字符串类型的数字转换为浮点型的数字)。
- 导入这个模型我们需要的包
# -*- coding: UTF-8 -*- import sys from time import time import pandas as pd import matplotlib.pyplot as plt from pyspark import SparkConf, SparkContext from pyspark.mllib.tree import DecisionTree from pyspark.mllib.regression import LabeledPoint import numpy as np from pyspark.mllib.evaluation import BinaryClassificationMetrics
- 创建SparkContext,并设置日志级别
def SetLogger(sc):
    """Raise the log4j level of the "org", "akka" and root loggers to ERROR."""
    log4j = sc._jvm.org.apache.log4j
    manager = log4j.LogManager
    for logger_name in ("org", "akka"):
        manager.getLogger(logger_name).setLevel(log4j.Level.ERROR)
    manager.getRootLogger().setLevel(log4j.Level.ERROR)


# To run in cluster mode (Hadoop YARN or Spark standalone), first upload the
# data files to the HDFS directory as described in the book.
def SetPath(sc):
    """Set the global ``Path`` prefix according to the Spark master URL.

    A master starting with "local" selects the local file prefix; anything
    else selects the HDFS prefix.
    """
    global Path
    running_locally = sc.master.startswith("local")
    Path = "file:/mnt/hdfs/Hadoop/" if running_locally else "hdfs://master:9000/user/hduser/"


def CreateSparkContext():
    """Create the SparkContext, print its master, then apply SetLogger and SetPath."""
    sparkConf = (SparkConf()
                 .setAppName("RunDecisionTreeBinary")
                 .set("spark.ui.showConsoleProgress", "false"))
    sc = SparkContext(conf=sparkConf)
    print("master=" + sc.master)
    SetLogger(sc)
    SetPath(sc)
    return sc
数据准备
def convert_float(x):
    """Parse one raw field as a float; the placeholder "?" becomes 0."""
    return 0 if x == "?" else float(x)


def extract_label(record):
    """Return the last field of *record* (the label column) as a float."""
    return float(record[-1])


def extract_features(field, categoriesMap, featureEnd):
    """Build the feature vector for one split row.

    Args:
        field: list of string fields for one row.
        categoriesMap: mapping from the category string in ``field[3]`` to
            its one-hot index.
        featureEnd: exclusive end index of the numeric columns, which start
            at index 4.

    Returns:
        numpy array: one-hot category vector followed by the numeric values.
    """
    categoryFeatures = np.zeros(len(categoriesMap))
    categoryFeatures[categoriesMap[field[3]]] = 1
    # Use a distinct loop name so the ``field`` parameter is not shadowed.
    numericalFeatures = [convert_float(value) for value in field[4:featureEnd]]
    return np.concatenate((categoryFeatures, numericalFeatures))


def PrepareData(sc):
    """Load train.tsv and return (trainData, validationData, testData, categoriesMap).

    The three data sets are LabeledPoint RDDs produced by a random split with
    weights [8, 1, 1].
    """
    # ---------------------- 1. Load and convert the data -------------
    print("开始导入数据...")
    rawDataWithHeader = sc.textFile(Path + "Data/train.tsv")
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    rData = rawData.map(lambda x: x.replace("\"", ""))
    lines = rData.map(lambda x: x.split("\t"))
    print("共计:" + str(lines.count()) + "项")
    # ---------------------- 2. Build the RDD[LabeledPoint] -------------
    categoriesMap = lines.map(lambda fields: fields[3]) \
        .distinct().zipWithIndex().collectAsMap()
    # The last column is the label, so the features stop at len(r) - 1.
    labelpointRDD = lines.map(
        lambda r: LabeledPoint(
            extract_label(r),
            extract_features(r, categoriesMap, len(r) - 1)))
    # ---------------------- 3. Randomly split into three parts -------------
    (trainData, validationData, testData) = labelpointRDD.randomSplit([8, 1, 1])
    print("将数据分trainData:" + str(trainData.count()) +
          " validationData:" + str(validationData.count()) +
          " testData:" + str(testData.count()))
    return (trainData, validationData, testData, categoriesMap)
模型训练
- 建立trainData,使用trainData执行DecisionTree训练并建立模型
from pyspark.mllib.tree import DecisionTree
# Train a two-class decision tree on trainData; the empty
# categoricalFeaturesInfo dict declares no categorical features.
model = DecisionTree.trainClassifier(trainData,numClasses=2,categoricalFeaturesInfo={},\
    impurity='entropy',maxDepth=5,maxBins=5)
- 使用
trainEvaluateModel(trainData,validationData,impurityParm, maxDepthParm, maxBinsParm)
模型训练
def evaluateModel(model, validationData):
    """Return the area under the ROC curve of *model* on *validationData*."""
    predictions = model.predict(validationData.map(lambda point: point.features))
    labels = validationData.map(lambda point: point.label)
    return BinaryClassificationMetrics(predictions.zip(labels)).areaUnderROC


def trainEvaluateModel(trainData, validationData,
                       impurityParm, maxDepthParm, maxBinsParm):
    """Train a decision tree, score it on the validation set and report it.

    Returns:
        tuple: (AUC, duration, impurityParm, maxDepthParm, maxBinsParm, model),
        where duration covers both training and evaluation.
    """
    startTime = time()
    # numClasses is 2 because this is binary classification.
    model = DecisionTree.trainClassifier(
        trainData,
        numClasses=2,
        categoricalFeaturesInfo={},
        impurity=impurityParm,
        maxDepth=maxDepthParm,
        maxBins=maxBinsParm)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    report = "".join([
        "训练评估:使用参数",
        " impurity=", str(impurityParm),
        " maxDepth=", str(maxDepthParm),
        " maxBins=", str(maxBinsParm),
        " 所需时间=", str(duration),
        " 结果AUC = ", str(AUC),
    ])
    print(report)
    return (AUC, duration, impurityParm, maxDepthParm, maxBinsParm, model)
模型预测
- PredictData(sc,model,categoriesMap)
def PredictData(sc, model, categoriesMap):
    """Predict the first ten rows of test.tsv and print each result.

    The file is read from ``Path + "Data/test.tsv"``; the header row is
    dropped, double quotes are removed and rows are split on tabs. Every
    column from index 4 onward is used as a numeric feature.
    """
    print("开始导入数据...")
    rawDataWithHeader = sc.textFile(Path + "Data/test.tsv")
    header = rawDataWithHeader.first()
    lines = (rawDataWithHeader
             .filter(lambda x: x != header)
             .map(lambda x: x.replace("\"", ""))
             .map(lambda x: x.split("\t")))
    print("共计:" + str(lines.count()) + "项")
    dataRDD = lines.map(
        lambda r: (r[0], extract_features(r, categoriesMap, len(r))))
    DescDict = {
        0: "暂时性网页(ephemeral)",
        1: "长青网页(evergreen)"
    }
    for url, features in dataRDD.take(10):
        predictResult = model.predict(features)
        print("".join([
            " 网址: ", str(url), "\n",
            " ==>预测:", str(predictResult),
            " 说明:", DescDict[predictResult], "\n",
        ]))
模型评估
- 创建
scoreAndLabels
# Predict on the validation features, then pair each prediction with its label.
score = model.predict(validationData.map(lambda p:p.features))
scoreAndLabels = score.zip(validationData.map(lambda p:p.label))
# Peek at the first five (prediction, label) pairs.
scoreAndLabels.take(5)
- 编写
BinaryClassificationMetrics
计算AUC
from pyspark.mllib.evaluation import BinaryClassificationMetrics
# Wrap the (prediction, label) pairs and print the area under the ROC curve.
metrics = BinaryClassificationMetrics(scoreAndLabels)
print('AUC= ' + str(metrics.areaUnderROC))
- 使用
BinaryClassificationMetrics(scoreAndLabels).areaUnderROC
进行评估分析,计算AUC。
def evaluateModel(model, validationData):
    """Return the area under the ROC curve of *model* on *validationData*."""
    features = validationData.map(lambda point: point.features)
    predictions = model.predict(features)
    labels = validationData.map(lambda point: point.label)
    return BinaryClassificationMetrics(predictions.zip(labels)).areaUnderROC
参数评估
- 评估impurity参数,决策树分裂结点的方法
# Compare the two impurity measures while holding maxDepth and maxBins at 10.
impurityList = ['gini','entropy']
maxDepthList=[10]
maxBinList=[10]
# One trainEvaluateModel result tuple per parameter combination.
metrics = [trainEvaluateModel(trainData,validationData,impurity,maxDepth,maxBins) \
    for impurity in impurityList \
    for maxDepth in maxDepthList \
    for maxBins in maxBinList]
- 编写 evalParameter 评估单个参数
def evalParameter(trainData, validationData, evalparm,
                  impurityList, maxDepthList, maxBinsList):
    """Train one model per parameter combination and chart AUC vs. duration.

    Args:
        trainData, validationData: passed through to trainEvaluateModel.
        evalparm: which parameter is varied; one of "impurity", "maxDepth"
            or "maxBins". Its list supplies the chart's index labels.
        impurityList, maxDepthList, maxBinsList: candidate values.

    Raises:
        ValueError: if *evalparm* is not one of the three known names.
    """
    candidates = {
        "impurity": impurityList,
        "maxDepth": maxDepthList,
        "maxBins": maxBinsList,
    }
    # Validate before training so a typo fails fast instead of surfacing as
    # an unbound IndexList after every model has been trained.
    if evalparm not in candidates:
        raise ValueError("unknown evalparm: %r" % (evalparm,))
    metrics = [trainEvaluateModel(trainData, validationData,
                                  impurity, maxDepth, maxBins)
               for impurity in impurityList
               for maxDepth in maxDepthList
               for maxBins in maxBinsList]
    IndexList = candidates[evalparm][:]
    df = pd.DataFrame(metrics, index=IndexList,
                      columns=['AUC', 'duration', 'impurity', 'maxDepth', 'maxBins', 'model'])
    showchart(df, evalparm, 'AUC', 'duration', 0.5, 0.7)


# The keyword must be maxBinsList to match the signature above.
evalParameter(trainData, validationData, 'maxDepth',
              impurityList=['gini'],
              maxDepthList=[3, 5, 10, 15, 25],
              maxBinsList=[10])
- 可视化,编写
showchart()
函数
def showchart(df, evalparm, barData, lineData, yMin, yMax):
    """Draw column *barData* as bars and *lineData* as a red line on a twin y-axis.

    *evalparm* is used as the chart title and x-axis label; yMin/yMax bound
    the bar axis.
    """
    label_size = 12
    bar_axis = df[barData].plot(kind='bar', title=evalparm,
                                figsize=(10, 6), legend=True, fontsize=label_size)
    bar_axis.set_xlabel(evalparm, fontsize=label_size)
    bar_axis.set_ylim([yMin, yMax])
    bar_axis.set_ylabel(barData, fontsize=label_size)
    line_axis = bar_axis.twinx()
    line_axis.plot(df[[lineData]].values,
                   linestyle='-', marker='o', linewidth=2.0, color='r')
    plt.show()
- 找出准确率最高的参数组合,编写
evalAllParameter()
函数,包括impurity
,maxDepth
,maxBins
def evalAllParameter(trainData, validationData,
                     impurityList, maxDepthList, maxBinsList):
    """Train every parameter combination and return the model with the highest AUC."""
    results = []
    for impurity in impurityList:
        for maxDepth in maxDepthList:
            for maxBins in maxBinsList:
                results.append(trainEvaluateModel(
                    trainData, validationData, impurity, maxDepth, maxBins))
    # Highest AUC first; element 0 of each result tuple is the AUC.
    results.sort(key=lambda result: result[0], reverse=True)
    best_auc, _, best_impurity, best_depth, best_bins, best_model = results[0]
    print("调校后最佳参数:impurity:" + str(best_impurity) +
          " ,maxDepth:" + str(best_depth) +
          " ,maxBins:" + str(best_bins) +
          " ,结果AUC = " + str(best_auc))
    # The sixth element is the model trained with the best parameters.
    return best_model
检验AUC
- 检测模型是否过度训练,看一下testData的AUC是否过低。
# Score the model on the held-out test split to check for over-fitting.
AUC = evaluateModel(model,testData)
print("AUC = " + str(AUC))
驱动函数
if __name__ == "__main__":
    # Driver: prepare data, train a baseline, optionally tune ("-e" or "-a"),
    # then test, predict and dump the tree.
    print("RunDecisionTreeBinary")
    sc = CreateSparkContext()

    print("==========数据准备阶段===============")
    trainData, validationData, testData, categoriesMap = PrepareData(sc)
    for cached_split in (trainData, validationData, testData):
        cached_split.persist()

    print("==========训练评估阶段===============")
    baseline = trainEvaluateModel(trainData, validationData, "entropy", 10, 200)
    AUC, duration, impurityParm, maxDepthParm, maxBinsParm, model = baseline

    run_mode = sys.argv[1] if len(sys.argv) == 2 else None
    if run_mode == "-e":
        # NOTE(review): parametersEval is not defined in the decision-tree
        # listings shown in this document; confirm where it comes from.
        parametersEval(trainData, validationData)
    elif run_mode == "-a":
        print("-----所有参数训练评估找出最好的参数组合---------")
        model = evalAllParameter(trainData, validationData,
                                 ["gini", "entropy"],
                                 [3, 5, 10, 15, 20, 25],
                                 [3, 5, 10, 50, 100, 200])

    print("==========测试阶段===============")
    auc = evaluateModel(model, testData)
    print("使用test Data测试最佳模型,结果 AUC:" + str(auc))

    print("==========预测数据===============")
    PredictData(sc, model, categoriesMap)
    print(model.toDebugString())
逻辑回归二元分类
环境准备
- 导入这个模型需要的包
# -*- coding: UTF-8 -*- import sys from time import time import pandas as pd import matplotlib.pyplot as plt from pyspark import SparkConf, SparkContext from pyspark.mllib.classification import LogisticRegressionWithSGD from pyspark.mllib.regression import LabeledPoint import numpy as np from pyspark.mllib.evaluation import BinaryClassificationMetrics from pyspark.mllib.feature import StandardScaler
- 前期准备
def SetLogger(sc):
    """Raise the log4j level of the "org", "akka" and root loggers to ERROR."""
    log4j = sc._jvm.org.apache.log4j
    error_level_of = lambda: log4j.Level.ERROR
    log4j.LogManager.getLogger("org").setLevel(error_level_of())
    log4j.LogManager.getLogger("akka").setLevel(error_level_of())
    log4j.LogManager.getRootLogger().setLevel(error_level_of())


def SetPath(sc):
    """Set the global ``Path`` prefix: local file path for a "local" master, HDFS otherwise."""
    global Path
    if sc.master.startswith("local"):
        Path = "file:/home/hduser/pythonsparkexample/PythonProject/"
        return
    Path = "hdfs://master:9000/user/hduser/"


def CreateSparkContext():
    """Create the SparkContext, print its master, then apply SetLogger and SetPath."""
    sparkConf = (SparkConf()
                 .setAppName("LogisticRegressionWithSGD")
                 .set("spark.ui.showConsoleProgress", "false"))
    sc = SparkContext(conf=sparkConf)
    print("master=" + sc.master)
    SetLogger(sc)
    SetPath(sc)
    return sc
数据准备
- 标准化:使各数值特征字段具有共同的标准(尺度)
def extract_label(record):
    """Return the last field of *record* (the label column) as a float."""
    return float(record[-1])


def extract_features(field, categoriesMap, featureEnd):
    """Build one row's features: one-hot of ``field[3]`` plus numeric columns 4..featureEnd."""
    categoryFeatures = np.zeros(len(categoriesMap))
    categoryFeatures[categoriesMap[field[3]]] = 1
    # Use a distinct loop name so the ``field`` parameter is not shadowed.
    numericalFeatures = [convert_float(value) for value in field[4:featureEnd]]
    return np.concatenate((categoryFeatures, numericalFeatures))


def convert_float(x):
    """Parse one raw field as a float; the placeholder "?" becomes 0."""
    return 0 if x == "?" else float(x)


def PrepareData(sc):
    """Load train.tsv, standardize the features and split into three sets.

    Returns:
        tuple: (trainData, validationData, testData, categoriesMap); the
        data sets are LabeledPoint RDDs split with weights [8, 1, 1].
    """
    # ---------------------- 1. Load and convert the data -------------
    print("开始导入数据...")
    rawDataWithHeader = sc.textFile(Path + "data/train.tsv")
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    rData = rawData.map(lambda x: x.replace("\"", ""))
    lines = rData.map(lambda x: x.split("\t"))
    print("共计:" + str(lines.count()) + "项")
    # ---------------------- 2. Build the RDD[LabeledPoint] -------------
    categoriesMap = lines.map(lambda fields: fields[3]) \
        .distinct().zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r: extract_label(r))
    featureRDD = lines.map(lambda r: extract_features(r, categoriesMap, len(r) - 1))
    # A single joined print replaces the Python-2-only trailing-comma prints
    # and gives the same one-line output on Python 2 and 3.
    print("标准化之前:")
    print(", ".join(str(value) for value in featureRDD.first()))
    # withMean is True here; the naive Bayes version sets it to False.
    stdScaler = StandardScaler(withMean=True, withStd=True).fit(featureRDD)
    ScalerFeatureRDD = stdScaler.transform(featureRDD)
    print("标准化之后:")
    print(", ".join(str(value) for value in ScalerFeatureRDD.first()))
    labelpoint = labelRDD.zip(ScalerFeatureRDD)
    labelpointRDD = labelpoint.map(lambda r: LabeledPoint(r[0], r[1]))
    # ---------------------- 3. Randomly split into three parts -------------
    (trainData, validationData, testData) = labelpointRDD.randomSplit([8, 1, 1])
    print("将数据分trainData:" + str(trainData.count()) +
          " validationData:" + str(validationData.count()) +
          " testData:" + str(testData.count()))
    return (trainData, validationData, testData, categoriesMap)
模型训练
- 使用
LogisticRegressionWithSGD.train(trainData, numIterations, stepSize, miniBatchFraction)
进行模型训练
numIterations:使用SGD迭代的次数
stepSize:每次执行SGD迭代步长大小,默认为1
miniBatchFraction:每次迭代参与计算的样本比例,数值在0 - 1之间,默认为1
def evaluateModel(model, validationData):
    """Return the area under the ROC curve of *model* on *validationData*.

    Predictions and labels are both cast to float before being handed to
    BinaryClassificationMetrics.
    """
    score = model.predict(validationData.map(lambda p: p.features))
    labels = validationData.map(lambda p: p.label)
    # Index the pair instead of using a tuple-parameter lambda, which is a
    # syntax error on Python 3.
    scoreAndLabels = score.zip(labels) \
        .map(lambda pair: (float(pair[0]), float(pair[1])))
    metrics = BinaryClassificationMetrics(scoreAndLabels)
    return metrics.areaUnderROC


def trainEvaluateModel(trainData, validationData,
                       numIterations, stepSize, miniBatchFraction):
    """Train LogisticRegressionWithSGD, score it on the validation set and report it.

    Returns:
        tuple: (AUC, duration, numIterations, stepSize, miniBatchFraction,
        model), where duration covers both training and evaluation.
    """
    startTime = time()
    model = LogisticRegressionWithSGD.train(trainData,
                                            numIterations, stepSize, miniBatchFraction)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    # print() with one parenthesized string behaves the same on Python 2 and 3.
    print("训练评估:使用参数" +
          " numIterations=" + str(numIterations) +
          " stepSize=" + str(stepSize) +
          " miniBatchFraction=" + str(miniBatchFraction) +
          " 所需时间=" + str(duration) +
          " 结果AUC = " + str(AUC))
    return (AUC, duration, numIterations, stepSize, miniBatchFraction, model)
模型预测
- 使用test.tsv进行预测
def PredictData(sc, model, categoriesMap):
    """Predict the first ten rows of test.tsv and print each result.

    The file is read from ``Path + "data/test.tsv"``; the header row is
    dropped, double quotes are removed and rows are split on tabs.

    NOTE(review): PrepareData in this section standardizes the training
    features with StandardScaler, but the features built here are not passed
    through that scaler - confirm whether that is intended.
    """
    print("开始导入数据...")
    rawDataWithHeader = sc.textFile(Path + "data/test.tsv")
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    rData = rawData.map(lambda x: x.replace("\"", ""))
    lines = rData.map(lambda x: x.split("\t"))
    print("共计:" + str(lines.count()) + "项")
    # Every column from index 4 onward is used as a numeric feature.
    dataRDD = lines.map(lambda r: (r[0], extract_features(r, categoriesMap, len(r))))
    DescDict = {
        0: "暂时性网页(ephemeral)",
        1: "长青网页(evergreen)"
    }
    for data in dataRDD.take(10):
        predictResult = model.predict(data[1])
        # print() with one parenthesized string behaves the same on Python 2 and 3.
        print(" 网址: " + str(data[0]) + "\n" +
              " ==>预测:" + str(predictResult) +
              " 说明:" + DescDict[predictResult] + "\n")
模型评估
- 使用
BinaryClassificationMetrics(scoreAndLabels)
进行评估分析,计算AUC,标准化后的模型都需要浮点类型的转换。
def evaluateModel(model, validationData):
    """Return the area under the ROC curve of *model* on *validationData*.

    Predictions and labels are both cast to float before being handed to
    BinaryClassificationMetrics.
    """
    score = model.predict(validationData.map(lambda p: p.features))
    labels = validationData.map(lambda p: p.label)
    # Index the pair instead of using a tuple-parameter lambda, which is a
    # syntax error on Python 3.
    scoreAndLabels = score.zip(labels).map(
        lambda pair: (float(pair[0]), float(pair[1])))
    metrics = BinaryClassificationMetrics(scoreAndLabels)
    return metrics.areaUnderROC
参数评估
- 可视化函数
def showchart(df, evalparm, barData, lineData, yMin, yMax):
    """Draw column *barData* as bars and *lineData* as a red line on a twin y-axis.

    *evalparm* is used as the chart title and x-axis label; yMin/yMax bound
    the bar axis.
    """
    bar_series = df[barData]
    line_values = df[[lineData]].values
    bar_axis = bar_series.plot(kind='bar', title=evalparm, figsize=(10, 6),
                               legend=True, fontsize=12)
    bar_axis.set_xlabel(evalparm, fontsize=12)
    bar_axis.set_ylim([yMin, yMax])
    bar_axis.set_ylabel(barData, fontsize=12)
    bar_axis.twinx().plot(line_values, linestyle='-', marker='o',
                          linewidth=2.0, color='r')
    plt.show()
- 单个参数评估
def evalParameter(trainData, validationData, evalparm,
                  numIterationsList, stepSizeList, miniBatchFractionList):
    """Train one model per parameter combination and chart AUC vs. duration.

    Args:
        trainData, validationData: passed through to trainEvaluateModel.
        evalparm: which parameter is varied; one of "numIterations",
            "stepSize" or "miniBatchFraction". Its list supplies the chart's
            index labels.
        numIterationsList, stepSizeList, miniBatchFractionList: candidates.

    Raises:
        ValueError: if *evalparm* is not one of the three known names.
    """
    candidates = {
        "numIterations": numIterationsList,
        "stepSize": stepSizeList,
        "miniBatchFraction": miniBatchFractionList,
    }
    # Validate before training so a typo fails fast instead of surfacing as
    # an unbound IndexList after every model has been trained.
    if evalparm not in candidates:
        raise ValueError("unknown evalparm: %r" % (evalparm,))
    metrics = [trainEvaluateModel(trainData, validationData,
                                  numIterations, stepSize, miniBatchFraction)
               for numIterations in numIterationsList
               for stepSize in stepSizeList
               for miniBatchFraction in miniBatchFractionList]
    IndexList = candidates[evalparm][:]
    df = pd.DataFrame(metrics, index=IndexList,
                      columns=['AUC', 'duration', 'numIterations', 'stepSize',
                               'miniBatchFraction', 'model'])
    showchart(df, evalparm, 'AUC', 'duration', 0.5, 0.7)
- 多个参数评估
def parametersEval(trainData, validationData):
    """Evaluate numIterations, stepSize and miniBatchFraction one at a time.

    Each sweep varies one parameter while the other two are held fixed.
    """
    # (parameter under test, numIterations, stepSize, miniBatchFraction)
    sweeps = [
        ("numIterations", [5, 15, 20, 60, 100], [10], [1]),
        ("stepSize", [100], [10, 50, 100, 200], [1]),
        ("miniBatchFraction", [100], [100], [0.5, 0.8, 1]),
    ]
    for parameter, iterations, steps, fractions in sweeps:
        print("----- 评估" + parameter + "参数使用 ---------")
        evalParameter(trainData, validationData, parameter,
                      numIterationsList=iterations,
                      stepSizeList=steps,
                      miniBatchFractionList=fractions)
- 多个参数同时评估,包括
numIterations
、stepSize
、miniBatchFraction
numIterations:使用SGD迭代的次数
stepSize:每次执行SGD迭代步长大小,默认为1
miniBatchFraction:每次迭代参与计算的样本比例,数值在0 - 1之间,默认为1
def evalAllParameter(trainData, validationData,
                     numIterationsList, stepSizeList, miniBatchFractionList):
    """Train every parameter combination and return the model with the highest AUC."""
    results = []
    for numIterations in numIterationsList:
        for stepSize in stepSizeList:
            for miniBatchFraction in miniBatchFractionList:
                results.append(trainEvaluateModel(
                    trainData, validationData,
                    numIterations, stepSize, miniBatchFraction))
    # Highest AUC first; element 0 of each result tuple is the AUC.
    results.sort(key=lambda result: result[0], reverse=True)
    best_auc, _, best_iterations, best_step, best_fraction, best_model = results[0]
    print("调校后最佳参数:numIterations:" + str(best_iterations) +
          " ,stepSize:" + str(best_step) +
          " ,miniBatchFraction:" + str(best_fraction) +
          " ,结果AUC = " + str(best_auc))
    # The sixth element is the model trained with the best parameters.
    return best_model
检验AUC
# Score the model on the held-out test split.
AUC = evaluateModel(model,testData)
print("AUC = " + str(AUC))
驱动函数
if __name__ == "__main__":
    # Driver: prepare data, train a baseline, optionally tune ("-e" or "-a"),
    # then test and predict.
    print("RunLogisticRegressionWithSGDBinary")
    sc = CreateSparkContext()

    print("==========数据准备阶段===============")
    trainData, validationData, testData, categoriesMap = PrepareData(sc)
    for cached_split in (trainData, validationData, testData):
        cached_split.persist()

    print("==========训练评估阶段===============")
    baseline = trainEvaluateModel(trainData, validationData, 15, 10, 0.5)
    AUC, duration, numIterationsParm, stepSizeParm, miniBatchFractionParm, model = baseline

    run_mode = sys.argv[1] if len(sys.argv) == 2 else None
    if run_mode == "-e":
        parametersEval(trainData, validationData)
    elif run_mode == "-a":
        print("-----所有参数训练评估找出最好的参数组合---------")
        model = evalAllParameter(trainData, validationData,
                                 [3, 5, 10, 15],
                                 [10, 50, 100],
                                 [0.5, 0.8, 1])

    print("==========测试阶段===============")
    auc = evaluateModel(model, testData)
    print("使用test Data测试最佳模型,结果 AUC:" + str(auc))

    print("==========预测数据===============")
    PredictData(sc, model, categoriesMap)
支持向量机SVM二元分类
环境准备
- 导入
SVMWithSGD
和StandardScaler
模块
# -*- coding: UTF-8 -*-
import sys
from time import time
import pandas as pd
import matplotlib.pyplot as plt
from pyspark import SparkConf, SparkContext
from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.regression import LabeledPoint
import numpy as np
from pyspark.mllib.evaluation import BinaryClassificationMetrics
# The class is StandardScaler; PrepareData in this section calls it by that name.
from pyspark.mllib.feature import StandardScaler
def SetLogger(sc):
    """Raise the log4j level of the "org", "akka" and root loggers to ERROR."""
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
    logger.LogManager.getRootLogger().setLevel(logger.Level.ERROR)


def SetPath(sc):
    """Set the global ``Path`` prefix: local file path for a "local" master, HDFS otherwise."""
    global Path
    if sc.master[0:5] == "local":
        Path = "file:/home/hduser/pythonsparkexample/PythonProject/"
    else:
        Path = "hdfs://master:9000/user/hduser/"


def CreateSparkContext():
    """Create the SparkContext, print its master, then apply SetLogger and SetPath."""
    # The app name previously read "LogisticRegressionWithSGD" (copied from
    # the logistic-regression section); it now matches this section's driver
    # banner so the job is labelled correctly.
    sparkConf = SparkConf() \
        .setAppName("RunSVMWithSGDBinary") \
        .set("spark.ui.showConsoleProgress", "false")
    sc = SparkContext(conf=sparkConf)
    print("master=" + sc.master)
    SetLogger(sc)
    SetPath(sc)
    return sc
数据准备
def extract_label(record):
    """Return the last field of *record* (the label column) as a float."""
    return float(record[-1])


def extract_features(field, categoriesMap, featureEnd):
    """Build one row's features: one-hot of ``field[3]`` plus numeric columns 4..featureEnd."""
    categoryFeatures = np.zeros(len(categoriesMap))
    categoryFeatures[categoriesMap[field[3]]] = 1
    # Use a distinct loop name so the ``field`` parameter is not shadowed.
    numericalFeatures = [convert_float(value) for value in field[4:featureEnd]]
    return np.concatenate((categoryFeatures, numericalFeatures))


def convert_float(x):
    """Parse one raw field as a float; the placeholder "?" becomes 0."""
    return 0 if x == "?" else float(x)


def PrepareData(sc):
    """Load train.tsv, standardize the features and split into three sets.

    Returns:
        tuple: (trainData, validationData, testData, categoriesMap); the
        data sets are LabeledPoint RDDs split with weights [8, 1, 1].
    """
    # ---------------------- 1. Load and convert the data -------------
    print("开始导入数据...")
    rawDataWithHeader = sc.textFile(Path + "data/train.tsv")
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    rData = rawData.map(lambda x: x.replace("\"", ""))
    lines = rData.map(lambda x: x.split("\t"))
    print("共计:" + str(lines.count()) + "项")
    # ---------------------- 2. Build the RDD[LabeledPoint] -------------
    categoriesMap = lines.map(lambda fields: fields[3]) \
        .distinct().zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r: extract_label(r))
    featureRDD = lines.map(lambda r: extract_features(r, categoriesMap, len(r) - 1))
    # A single joined print replaces the Python-2-only print statements and
    # gives the same one-line output on Python 2 and 3.
    print("标准化之前:")
    print(", ".join(str(value) for value in featureRDD.first()))
    stdScaler = StandardScaler(withMean=True, withStd=True).fit(featureRDD)
    ScalerFeatureRDD = stdScaler.transform(featureRDD)
    print("标准化之后:")
    print(", ".join(str(value) for value in ScalerFeatureRDD.first()))
    labelpoint = labelRDD.zip(ScalerFeatureRDD)
    labelpointRDD = labelpoint.map(lambda r: LabeledPoint(r[0], r[1]))
    # ---------------------- 3. Randomly split into three parts -------------
    (trainData, validationData, testData) = labelpointRDD.randomSplit([8, 1, 1])
    print("将数据分trainData:" + str(trainData.count()) +
          " validationData:" + str(validationData.count()) +
          " testData:" + str(testData.count()))
    return (trainData, validationData, testData, categoriesMap)
模型训练
- 使用
SVMWithSGD.train(trainData, numIterations, stepSize, regParam)
进行模型训练
trainData:输入的训练数据LabeledPointRDD
numIterations:使用SGD迭代的次数,默认为100
stepSize:每次执行SGD迭代步长的大小,默认为1
regParam:正则化参数,数值为 0-1 之间
def trainEvaluateModel(trainData, validationData,
                       numIterations, stepSize, regParam):
    """Train SVMWithSGD, score it on the validation set and report it.

    Returns:
        tuple: (AUC, duration, numIterations, stepSize, regParam, model),
        where duration covers both training and evaluation.
    """
    startTime = time()
    model = SVMWithSGD.train(trainData, numIterations, stepSize, regParam)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    # print() with one parenthesized string behaves the same on Python 2 and 3.
    print("训练评估:使用参数" +
          " numIterations=" + str(numIterations) +
          " stepSize=" + str(stepSize) +
          " regParam=" + str(regParam) +
          " 所需时间=" + str(duration) +
          " 结果AUC = " + str(AUC))
    return (AUC, duration, numIterations, stepSize, regParam, model)
模型预测
- 利用 model.predict() 进行预测
def PredictData(sc, model, categoriesMap):
    """Predict the first ten rows of test.tsv and print each result.

    The file is read from ``Path + "data/test.tsv"``; the header row is
    dropped, double quotes are removed and rows are split on tabs.

    NOTE(review): PrepareData in this section standardizes the training
    features with StandardScaler, but the features built here are not passed
    through that scaler - confirm whether that is intended.
    """
    print("开始导入数据...")
    rawDataWithHeader = sc.textFile(Path + "data/test.tsv")
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    rData = rawData.map(lambda x: x.replace("\"", ""))
    lines = rData.map(lambda x: x.split("\t"))
    print("共计:" + str(lines.count()) + "项")
    # Every column from index 4 onward is used as a numeric feature.
    dataRDD = lines.map(lambda r: (r[0], extract_features(r, categoriesMap, len(r))))
    DescDict = {
        0: "暂时性网页(ephemeral)",
        1: "长青网页(evergreen)"
    }
    for data in dataRDD.take(10):
        predictResult = model.predict(data[1])
        # print() with one parenthesized string behaves the same on Python 2 and 3.
        print(" 网址: " + str(data[0]) + "\n" +
              " ==>预测:" + str(predictResult) +
              " 说明:" + DescDict[predictResult] + "\n")
模型评估
- 利用
BinaryClassificationMetrics(scoreAndLabels)
计算AUC评估结果,标准化后的模型都需要浮点类型的转换。
def evaluateModel(model, validationData):
    """Return the area under the ROC curve of *model* on *validationData*."""
    score = model.predict(validationData.map(lambda p: p.features))
    labels = validationData.map(lambda p: p.label)
    # Cast each (prediction, label) pair to floats. The pair is indexed
    # rather than unpacked in the lambda signature, because tuple parameters
    # are a syntax error on Python 3.
    scoreAndLabels = score.zip(labels) \
        .map(lambda pair: (float(pair[0]), float(pair[1])))
    metrics = BinaryClassificationMetrics(scoreAndLabels)
    return metrics.areaUnderROC
参数评估
- 利用
parametersEval()
进行单个参数评估
trainData:输入的训练数据LabeledPointRDD
numIterations:使用SGD迭代的次数,默认为100
stepSize:每次执行SGD迭代步长的大小,默认为1
regParam:正则化参数,数值为 0-1 之间
def parametersEval(trainData, validationData):
    """Evaluate numIterations, stepSize and regParam one at a time.

    Each sweep varies one parameter while the other two are held fixed.
    NOTE(review): the SVM section of this document shows no evalParameter
    that accepts regParamList - confirm where it is defined.
    """
    # (parameter under test, numIterations, stepSize, regParam)
    sweeps = [
        ("numIterations", [1, 3, 5, 15, 25], [100], [1]),
        ("stepSize", [25], [10, 50, 100, 200], [1]),
        ("regParam", [25], [100], [0.01, 0.1, 1]),
    ]
    for parameter, iterations, steps, regularizations in sweeps:
        print("----- 评估" + parameter + "参数使用 ---------")
        evalParameter(trainData, validationData, parameter,
                      numIterationsList=iterations,
                      stepSizeList=steps,
                      regParamList=regularizations)
- 利用
trainEvaluateModel
多个参数评估
def trainEvaluateModel(trainData, validationData,
                       numIterations, stepSize, regParam):
    """Train SVMWithSGD, score it on the validation set and report it.

    Returns:
        tuple: (AUC, duration, numIterations, stepSize, regParam, model),
        where duration covers both training and evaluation.
    """
    startTime = time()
    model = SVMWithSGD.train(trainData, numIterations, stepSize, regParam)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    # Build the message first; print() with a single string argument behaves
    # the same on Python 2 and 3.
    message = ("训练评估:使用参数" +
               " numIterations=" + str(numIterations) +
               " stepSize=" + str(stepSize) +
               " regParam=" + str(regParam) +
               " 所需时间=" + str(duration) +
               " 结果AUC = " + str(AUC))
    print(message)
    return (AUC, duration, numIterations, stepSize, regParam, model)
- 多个参数同时评估
def evalAllParameter(trainData, validationData,
                     numIterationsList, stepSizeList, regParamList):
    """Train every parameter combination and return the model with the highest AUC."""
    results = []
    for numIterations in numIterationsList:
        for stepSize in stepSizeList:
            for regParam in regParamList:
                results.append(trainEvaluateModel(
                    trainData, validationData,
                    numIterations, stepSize, regParam))
    # Highest AUC first; element 0 of each result tuple is the AUC.
    results.sort(key=lambda result: result[0], reverse=True)
    best_auc, _, best_iterations, best_step, best_reg, best_model = results[0]
    print("调校后最佳参数:numIterations:" + str(best_iterations) +
          " ,stepSize:" + str(best_step) +
          " ,regParam:" + str(best_reg) +
          " ,结果AUC = " + str(best_auc))
    return best_model
检验AUC
# Score the model on the held-out test split.
AUC = evaluateModel(model, testData)
print(AUC)
驱动函数
- 注意修改参数为SVMWithSGD训练评估所需要的参数
if __name__ == "__main__":
    # Driver: prepare data, train a baseline, optionally tune ("-e" or "-a"),
    # then test and predict.
    print("RunSVMWithSGDBinary")
    sc = CreateSparkContext()

    print("==========数据准备阶段===============")
    trainData, validationData, testData, categoriesMap = PrepareData(sc)
    for cached_split in (trainData, validationData, testData):
        cached_split.persist()

    print("==========训练评估阶段===============")
    baseline = trainEvaluateModel(trainData, validationData, 3, 50, 1)
    AUC, duration, numIterations, stepSize, regParam, model = baseline

    run_mode = sys.argv[1] if len(sys.argv) == 2 else None
    if run_mode == "-e":
        parametersEval(trainData, validationData)
    elif run_mode == "-a":
        print("-----所有参数训练评估找出最好的参数组合---------")
        model = evalAllParameter(trainData, validationData,
                                 [1, 3, 5, 15, 25],
                                 [10, 50, 100, 200],
                                 [0.01, 0.1, 1])

    print("==========测试阶段===============")
    auc = evaluateModel(model, testData)
    print("使用test Data测试最佳模型,结果 AUC:" + str(auc))

    print("==========预测数据===============")
    PredictData(sc, model, categoriesMap)
朴素贝叶斯二元分类
环境准备
- 导入我们需要的包
# -*- coding: UTF-8 -*- import sys from time import time import pandas as pd import matplotlib.pyplot as plt from pyspark import SparkConf, SparkContext # 导入NavieBayes模块 from pyspark.mllib.classification import NaiveBayes from pyspark.mllib.regression import LabeledPoint import numpy as np from pyspark.mllib.evaluation import BinaryClassificationMetrics from pyspark.mllib.feature import StandardScaler
- 设置SparkContext 环境
def SetLogger(sc):
    """Raise the log4j level of the "org", "akka" and root loggers to ERROR."""
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
    logger.LogManager.getRootLogger().setLevel(logger.Level.ERROR)


def SetPath(sc):
    """Set the global ``Path`` prefix: local file path for a "local" master, HDFS otherwise."""
    global Path
    if sc.master[0:5] == "local":
        Path = "file:/home/hduser/pythonsparkexample/PythonProject/"
    else:
        Path = "hdfs://master:9000/user/hduser/"


def CreateSparkContext():
    """Create the SparkContext, print its master, then apply SetLogger and SetPath."""
    # The app name previously read "LogisticRegressionWithSGD" (copied from
    # the logistic-regression section); it now matches this section's driver
    # banner so the job is labelled correctly.
    sparkConf = SparkConf() \
        .setAppName("RunNaiveBayesBinary") \
        .set("spark.ui.showConsoleProgress", "false")
    sc = SparkContext(conf=sparkConf)
    print("master=" + sc.master)
    SetLogger(sc)
    SetPath(sc)
    return sc
数据准备
- 修改
PrepareData()
加入数据标准化,将withMean
参数设置为False
,在提取特征值时必须保证特征值不小于0(负数会被转换为0)
def extract_label(record):
    """Return the last field of *record* (the label column) as a float."""
    return float(record[-1])


def extract_features(field, categoriesMap, featureEnd):
    """Build one row's features: one-hot of ``field[3]`` plus numeric columns 4..featureEnd."""
    categoryFeatures = np.zeros(len(categoriesMap))
    categoryFeatures[categoriesMap[field[3]]] = 1
    # Use a distinct loop name so the ``field`` parameter is not shadowed.
    numericalFeatures = [convert_float(value) for value in field[4:featureEnd]]
    return np.concatenate((categoryFeatures, numericalFeatures))


def convert_float(x):
    """Parse one raw field as a non-negative number.

    "?" becomes 0, and negative values are clamped to 0 because the naive
    Bayes features must not be negative.
    """
    ret = 0 if x == "?" else float(x)
    return 0 if ret < 0 else ret


def PrepareData(sc):
    """Load train.tsv, scale the features (std only) and split into three sets.

    Returns:
        tuple: (trainData, validationData, testData, categoriesMap); the
        data sets are LabeledPoint RDDs split with weights [8, 1, 1].
    """
    # ---------------------- 1. Load and convert the data -------------
    print("开始导入数据...")
    rawDataWithHeader = sc.textFile(Path + "data/train.tsv")
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    rData = rawData.map(lambda x: x.replace("\"", ""))
    lines = rData.map(lambda x: x.split("\t"))
    print("共计:" + str(lines.count()) + "项")
    # ---------------------- 2. Build the RDD[LabeledPoint] -------------
    categoriesMap = lines.map(lambda fields: fields[3]) \
        .distinct().zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r: extract_label(r))
    featureRDD = lines.map(lambda r: extract_features(r, categoriesMap, len(r) - 1))
    # A single joined print replaces the trailing-comma prints, which only
    # suppress the newline on Python 2.
    print("标准化之前:")
    print(", ".join(str(value) for value in featureRDD.first()))
    # withMean is deliberately False in this naive Bayes version.
    stdScaler = StandardScaler(withMean=False, withStd=True).fit(featureRDD)
    ScalerFeatureRDD = stdScaler.transform(featureRDD)
    print("标准化之后:")
    print(", ".join(str(value) for value in ScalerFeatureRDD.first()))
    labelpoint = labelRDD.zip(ScalerFeatureRDD)
    labelpointRDD = labelpoint.map(lambda r: LabeledPoint(r[0], r[1]))
    # ---------------------- 3. Randomly split into three parts -------------
    (trainData, validationData, testData) = labelpointRDD.randomSplit([8, 1, 1])
    print("将数据分trainData:" + str(trainData.count()) +
          " validationData:" + str(validationData.count()) +
          " testData:" + str(testData.count()))
    return (trainData, validationData, testData, categoriesMap)
模型训练
- 利用
NaiveBayes.train(trainData, lambdaParam)
训练模型 - 参数说明
NaiveBayes.train(input,lambda)
- input:输入的训练数据 LabelPointRDD
- lambda:设置lambda 参数,默认值为1.0
def trainEvaluateModel(trainData, validationData, lambdaParam):
    """Train NaiveBayes, score it on the validation set and report it.

    Returns:
        tuple: (AUC, duration, lambdaParam, model), where duration covers
        both training and evaluation.
    """
    startTime = time()
    model = NaiveBayes.train(trainData, lambdaParam)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    # print() with one parenthesized string behaves the same on Python 2 and 3.
    print("训练评估:使用参数" +
          " lambda=" + str(lambdaParam) +
          " 所需时间=" + str(duration) +
          " 结果AUC = " + str(AUC))
    return (AUC, duration, lambdaParam, model)


def evaluateModel(model, validationData):
    """Return the area under the ROC curve of *model* on *validationData*."""
    score = model.predict(validationData.map(lambda p: p.features))
    labels = validationData.map(lambda p: p.label)
    # Index the pair instead of using a tuple-parameter lambda, which is a
    # syntax error on Python 3.
    scoreAndLabels = score.zip(labels) \
        .map(lambda pair: (float(pair[0]), float(pair[1])))
    metrics = BinaryClassificationMetrics(scoreAndLabels)
    return metrics.areaUnderROC
模型评估
- 利用
BinaryClassificationMetrics(scoreAndLabels)
评估
def evaluateModel(model, validationData):
    """Return the area under the ROC curve of *model* on *validationData*.

    Predictions and labels are both cast to float before being handed to
    BinaryClassificationMetrics.
    """
    score = model.predict(validationData.map(lambda p: p.features))
    labels = validationData.map(lambda p: p.label)
    # Index the pair instead of using a tuple-parameter lambda, which is a
    # syntax error on Python 3.
    scoreAndLabels = score.zip(labels) \
        .map(lambda pair: (float(pair[0]), float(pair[1])))
    metrics = BinaryClassificationMetrics(scoreAndLabels)
    return metrics.areaUnderROC
参数评估
- 单个参数评估
def parametersEval(trainData, validationData):
    """Evaluate the lambda smoothing parameter over a fixed list of candidates."""
    lambda_candidates = [1.0, 3.0, 5.0, 15.0, 25.0, 30.0,
                         35.0, 40.0, 45.0, 50.0, 60.0]
    print("----- 评估lambda参数使用 ---------")
    evalParameter(trainData, validationData, "lambdaParam",
                  lambdaParamList=lambda_candidates)
- 参数评估
def evalParameter(trainData, validationData, evalparm, lambdaParamList):
    """Train one NaiveBayes model per lambda value and chart AUC vs. duration.

    Args:
        trainData, validationData: passed through to trainEvaluateModel.
        evalparm: accepted for signature parity with the other sections; the
            chart is always labelled "lambdaParam", the only tunable here.
        lambdaParamList: candidate lambda values, also used as index labels.
    """
    metrics = [trainEvaluateModel(trainData, validationData, lambdaParam)
               for lambdaParam in lambdaParamList]
    evalparm = "lambdaParam"
    IndexList = lambdaParamList
    # The column is named 'lambdaParam' with no leading space, so it can be
    # looked up by the parameter's real name.
    df = pd.DataFrame(metrics, index=IndexList,
                      columns=['AUC', 'duration', 'lambdaParam', 'model'])
    showchart(df, evalparm, 'AUC', 'duration', 0.5, 0.7)
- 可视化函数
def showchart(df, evalparm, barData, lineData, yMin, yMax):
    """Draw column *barData* as bars and *lineData* as a red line on a twin y-axis.

    *evalparm* is used as the chart title and x-axis label; yMin/yMax bound
    the bar axis.
    """
    text_size = 12
    bars = df[barData].plot(kind='bar', title=evalparm, figsize=(10, 6),
                            legend=True, fontsize=text_size)
    bars.set_xlabel(evalparm, fontsize=text_size)
    bars.set_ylim([yMin, yMax])
    bars.set_ylabel(barData, fontsize=text_size)
    overlay = bars.twinx()
    overlay.plot(df[[lineData]].values, linestyle='-', marker='o',
                 linewidth=2.0, color='r')
    plt.show()
- 多个参数同时评估
def evalAllParameter(training_RDD, validation_RDD, lambdaParamList):
    """Train one model per lambda value and return the one with the highest AUC.

    Args:
        training_RDD: training data passed to trainEvaluateModel.
        validation_RDD: validation data passed to trainEvaluateModel.
        lambdaParamList: candidate lambda values.
    """
    # Use the function's own arguments; the body previously read the globals
    # trainData/validationData and ignored what the caller passed in.
    metrics = [trainEvaluateModel(training_RDD, validation_RDD, lambdaParam)
               for lambdaParam in lambdaParamList]
    # Highest AUC first; element 0 of each result tuple is the AUC.
    Smetrics = sorted(metrics, key=lambda k: k[0], reverse=True)
    bestParameter = Smetrics[0]
    print("调校后最佳参数:lambdaParam:" + str(bestParameter[2]) +
          " ,结果AUC = " + str(bestParameter[0]))
    # Element 3 is the trained model.
    return bestParameter[3]
检验AUC
- 利用模型评估AUC大小
# Score the model on the held-out test split.
AUC = evaluateModel(model, testData)
print(AUC)
模型预测
def PredictData(sc, model, categoriesMap):
    """Load test.tsv and print the model's prediction for its first 10 rows."""
    print("开始导入数据...")
    rawDataWithHeader = sc.textFile(Path + "data/test.tsv")
    header = rawDataWithHeader.first()
    # Drop the header row, strip double quotes, then split each row on tabs.
    lines = (rawDataWithHeader
             .filter(lambda x: x != header)
             .map(lambda x: x.replace("\"", ""))
             .map(lambda x: x.split("\t")))
    print("共计:" + str(lines.count()) + "项")
    # Pair the first column of each row with its feature vector.
    dataRDD = lines.map(
        lambda r: (r[0], extract_features(r, categoriesMap, len(r))))
    DescDict = {
        0: "暂时性网页(ephemeral)",
        1: "长青网页(evergreen)",
    }
    for url, features in dataRDD.take(10):
        predictResult = model.predict(features)
        print(" 网址: " + str(url) + "\n" +
              " ==>预测:" + str(predictResult) +
              " 说明:" + DescDict[predictResult] + "\n")
驱动函数
if __name__ == "__main__":
    # Command line: no argument trains once; "-e" charts each lambda value;
    # "-a" searches all lambda values and keeps the best model.
    print("RunNaiveBayesBinary")
    sc = CreateSparkContext()

    print("==========数据准备阶段===============")
    (trainData, validationData, testData, categoriesMap) = PrepareData(sc)
    # Persist the splits because they are read repeatedly below.
    trainData.persist()
    validationData.persist()
    testData.persist()

    print("==========训练评估阶段===============")
    (AUC, duration, lambdaParam, model) = trainEvaluateModel(
        trainData, validationData, 60.0)
    if len(sys.argv) == 2 and sys.argv[1] == "-e":
        parametersEval(trainData, validationData)
    elif len(sys.argv) == 2 and sys.argv[1] == "-a":
        print("-----所有参数训练评估找出最好的参数组合---------")
        model = evalAllParameter(
            trainData, validationData,
            [1.0, 3.0, 5.0, 15.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0, 60.0])

    print("==========测试阶段===============")
    auc = evaluateModel(model, testData)
    print("使用test Data测试最佳模型,结果 AUC:" + str(auc))

    print("==========预测数据===============")
    PredictData(sc, model, categoriesMap)
决策树多元分类
环境准备
- 导入需要的包
# -*- coding: UTF-8 -*- import sys from time import time import pandas as pd import matplotlib.pyplot as plt from pyspark import SparkConf, SparkContext from pyspark.mllib.tree import DecisionTree from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.evaluation import MulticlassMetrics
- 设置SparkContext环境
def SetLogger(sc):
    """Set the log4j level of the "org", "akka" and root loggers to ERROR."""
    log4j = sc._jvm.org.apache.log4j
    log4j.LogManager.getLogger("org").setLevel(log4j.Level.ERROR)
    log4j.LogManager.getLogger("akka").setLevel(log4j.Level.ERROR)
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.ERROR)


def SetPath(sc):
    """Set the global ``Path`` prefix according to the Spark master.

    A master whose name starts with "local" reads from the local file
    system; any other master reads from HDFS.
    """
    global Path
    if sc.master.startswith("local"):
        Path = "file:/home/hduser/pythonsparkexample/PythonProject/"
    else:
        Path = "hdfs://master:9000/user/hduser/"


def CreateSparkContext():
    """Create the SparkContext, then configure logging and the data path."""
    # The application name now matches this script (RunDecisionTreeMulti);
    # it was previously a leftover "LogisticRegressionWithSGD".
    sparkConf = SparkConf() \
        .setAppName("RunDecisionTreeMulti") \
        .set("spark.ui.showConsoleProgress", "false")
    sc = SparkContext(conf=sparkConf)
    print("master=" + sc.master)
    SetLogger(sc)
    SetPath(sc)
    return sc
数据准备
- 执行
DecisionTree.trainClassifier
训练,规定label
一定是从0开始的,所以在提取label
时减去了1
def extract_label(record):
    """Return the last field of ``record`` as a float, shifted down by 1."""
    return float(record[-1]) - 1


def convert_float(x):
    """Convert a text field to float; the placeholder "?" becomes 0."""
    if x == "?":
        return 0
    return float(x)


def extract_features(record, featureEnd):
    """Return fields ``record[0:featureEnd]`` converted with convert_float.

    PrepareData passes ``len(record) - 1`` as ``featureEnd``, i.e. every
    field except the last one (the label).
    """
    return [convert_float(value) for value in record[:featureEnd]]


def PrepareData(sc):
    """Load covtype.data and split it into train/validation/test RDDs.

    Returns the tuple ``(trainData, validationData, testData)`` of
    LabeledPoint RDDs produced by ``randomSplit([8, 1, 1])``.
    """
    # ---- 1. Load the raw text and split each row on commas ----
    print("开始导入数据...")
    rawData = sc.textFile(Path + "data/covtype.data")
    print("共计:" + str(rawData.count()) + "项")
    lines = rawData.map(lambda x: x.split(","))

    # ---- 2. Build the RDD[LabeledPoint] used for training/evaluation ----
    labelpointRDD = lines.map(
        lambda r: LabeledPoint(extract_label(r),
                               extract_features(r, len(r) - 1)))

    # ---- 3. Randomly split into three parts and return them ----
    (trainData, validationData, testData) = labelpointRDD.randomSplit([8, 1, 1])
    summary = ("将数据分trainData:" + str(trainData.count()) +
               " validationData:" + str(validationData.count()) +
               " testData:" + str(testData.count()))
    print(summary)
    print(labelpointRDD.first())
    return (trainData, validationData, testData)
模型训练
- 利用
DecisionTree.trainClassifier(trainData,numClasses=7,categoricalFeaturesInfo={},impurity,maxDepth,maxBins)
,因为这里的分类数目为7,所以我们将numClasses设置为7。
def trainEvaluateModel(trainData, validationData,
                       impurityParm, maxDepthParm, maxBinsParm):
    """Train a 7-class decision tree and score it on the validation data.

    Returns the tuple ``(accuracy, duration, impurityParm, maxDepthParm,
    maxBinsParm, model)``, where ``duration`` is the elapsed time in seconds
    for training plus evaluation.
    """
    started = time()
    model = DecisionTree.trainClassifier(
        trainData,
        numClasses=7,
        categoricalFeaturesInfo={},
        impurity=impurityParm,
        maxDepth=maxDepthParm,
        maxBins=maxBinsParm)
    accuracy = evaluateModel(model, validationData)
    duration = time() - started
    report = "".join([
        "训练评估:使用参数",
        " impurityParm= %s" % impurityParm,
        " maxDepthParm= %s" % maxDepthParm,
        " maxBinsParm = %d." % maxBinsParm,
        " 所需时间=%d" % duration,
        " 结果accuracy = %f " % accuracy,
    ])
    print(report)
    return (accuracy, duration, impurityParm, maxDepthParm, maxBinsParm, model)
模型评估
- 利用
MulticlassMetrics(scoreAndLabels).accuracy
计算准确率
def evaluateModel(model, validationData):
    """Return the accuracy of ``model`` on ``validationData``.

    Predictions are zipped with the true labels and passed to
    MulticlassMetrics, whose ``accuracy`` value is returned.
    """
    predicted = model.predict(validationData.map(lambda p: p.features))
    actual = validationData.map(lambda p: p.label)
    scoreAndLabels = predicted.zip(actual)
    return MulticlassMetrics(scoreAndLabels).accuracy
参数评估
- 评估单个参数
def evalParameter(trainData, validationData, evaparm,
                  impurityList, maxDepthList, maxBinsList):
    """Train a model per parameter combination and chart accuracy/duration.

    Args:
        trainData: RDD used to train each model.
        validationData: RDD used to score each model.
        evaparm: which parameter is being varied: "impurity", "maxDepth"
            or "maxBins". Its list becomes the table index and x-axis.
        impurityList: impurity values to try.
        maxDepthList: maxDepth values to try.
        maxBinsList: maxBins values to try.

    Raises:
        ValueError: if ``evaparm`` is not one of the three known names.

    Note:
        One model is trained for every combination of the three lists while
        the index comes from a single list, so the other two lists should
        each hold exactly one value.
    """
    candidates = {
        "impurity": impurityList,
        "maxDepth": maxDepthList,
        "maxBins": maxBinsList,
    }
    # Validate before training: an unknown name used to surface only after
    # every model had been trained, as an unbound IndexList.
    if evaparm not in candidates:
        raise ValueError(
            "evaparm must be 'impurity', 'maxDepth' or 'maxBins', got %r"
            % (evaparm,))
    IndexList = candidates[evaparm][:]
    metrics = [trainEvaluateModel(trainData, validationData,
                                  impurity, maxDepth, maxBins)
               for impurity in impurityList
               for maxDepth in maxDepthList
               for maxBins in maxBinsList]
    df = pd.DataFrame(metrics, index=IndexList,
                      columns=['accuracy', 'duration', 'impurity',
                               'maxDepth', 'maxBins', 'model'])
    showchart(df, evaparm, 'accuracy', 'duration', 0.6, 1.0)
- 评估多个参数
def parametersEval(training_RDD, validation_RDD):
    """Evaluate impurity, maxDepth and maxBins one at a time.

    Each call to evalParameter varies a single parameter while the other
    two are fixed to one value.

    Args:
        training_RDD: RDD used to train each model.
        validation_RDD: RDD used to score each model.
    """
    # Pass the RDDs received as arguments; the body used to read the
    # module-level trainData / validationData and ignore its parameters.
    print("----- 评估impurity参数使用 ---------")
    evalParameter(training_RDD, validation_RDD, "impurity",
                  impurityList=["gini", "entropy"],
                  maxDepthList=[10],
                  maxBinsList=[10])
    print("----- 评估maxDepth参数使用 ---------")
    evalParameter(training_RDD, validation_RDD, "maxDepth",
                  impurityList=["gini"],
                  maxDepthList=[3, 5, 10, 15, 20, 25],
                  maxBinsList=[10])
    print("----- 评估maxBins参数使用 ---------")
    evalParameter(training_RDD, validation_RDD, "maxBins",
                  impurityList=["gini"],
                  maxDepthList=[10],
                  maxBinsList=[3, 5, 10, 50, 100, 200])
- 利用pandas自带的绘图工具绘图
def showchart(df, evalparm, barData, lineData, yMin, yMax):
    """Draw column ``barData`` as bars and ``lineData`` as a red line.

    Args:
        df: pandas DataFrame holding the evaluation results.
        evalparm: text used as the chart title and x-axis label.
        barData: name of the column drawn as bars on the left y-axis.
        lineData: name of the column drawn as a line on a twin axis.
        yMin: lower limit of the left y-axis.
        yMax: upper limit of the left y-axis.
    """
    # The keyword is "title"; it was misspelled "titl", which is not an
    # argument the plot call accepts.
    ax = df[barData].plot(kind='bar', title=evalparm, figsize=(10, 6),
                          legend=True, fontsize=12)
    ax.set_xlabel(evalparm, fontsize=12)
    ax.set_ylim([yMin, yMax])
    ax.set_ylabel(barData, fontsize=12)
    ax2 = ax.twinx()
    ax2.plot(df[[lineData]].values, linestyle='-', marker='o',
             linewidth=2.0, color='r')
    plt.show()
- 同时评估多个参数
def evalAllParameter(training_RDD, validation_RDD,
                     impurityList, maxDepthList, maxBinsList):
    """Train every parameter combination and return the most accurate model.

    Args:
        training_RDD: RDD used to train each model.
        validation_RDD: RDD used to score each model.
        impurityList: impurity values to try.
        maxDepthList: maxDepth values to try.
        maxBinsList: maxBins values to try.

    Returns:
        The model with the highest validation accuracy (first one on ties).

    Raises:
        ValueError: if the lists yield no combination to train.
    """
    # Use the RDDs passed in rather than the module-level trainData /
    # validationData, so the function works with any data set.
    metrics = [trainEvaluateModel(training_RDD, validation_RDD,
                                  impurity, maxDepth, maxBins)
               for impurity in impurityList
               for maxDepth in maxDepthList
               for maxBins in maxBinsList]
    if not metrics:
        raise ValueError("parameter lists must each contain at least one value")
    # Each entry is (accuracy, duration, impurity, maxDepth, maxBins, model).
    bestParameter = max(metrics, key=lambda k: k[0])
    print("调校后最佳参数:impurity:" + str(bestParameter[2]) +
          " ,maxDepth:" + str(bestParameter[3]) +
          " ,maxBins:" + str(bestParameter[4]) +
          " ,结果accuracy = " + str(bestParameter[0]))
    return bestParameter[5]
检验准确率(accuracy)
# Score the trained model on the test split and show the result. This
# evaluateModel returns accuracy; the snippet previously ended in a
# top-level "return AUC", which is a syntax error and an undefined name.
accuracy = evaluateModel(model, testData)
print(accuracy)
模型预测
def PredictData(sc, model):
    """Predict the first 100 rows of covtype.data and print each outcome."""
    # ---- 1. Load the raw text and split each row on commas ----
    rawData = sc.textFile(Path + "data/covtype.data")
    print("共计:" + str(rawData.count()) + "项")
    print("建立训练评估所需数据 RDD...")
    lines = rawData.map(lambda x: x.split(","))

    # ---- 2. Build the RDD[LabeledPoint] to predict on ----
    labelpointRDD = lines.map(
        lambda r: LabeledPoint(extract_label(r),
                               extract_features(r, len(r) - 1)))

    # ---- 3. Predict and report whether each prediction matches its label ----
    for lp in labelpointRDD.take(100):
        features = lp.features
        label = lp.label
        predict = model.predict(features)
        if label == predict:
            result = "正确"
        else:
            result = "错误"
        print("".join([
            "土地条件:海拔:" + str(features[0]),
            " 方位:" + str(features[1]),
            " 斜率:" + str(features[2]),
            " 水源垂直距离:" + str(features[3]),
            " 水源水平距离:" + str(features[4]),
            " 9点时阴影:" + str(features[5]),
            "....==>预测:" + str(predict),
            " 实际:" + str(label),
            "结果:" + result,
        ]))
驱动函数
if __name__ == "__main__":
    # Command line: no argument trains once; "-e" charts each parameter;
    # "-a" searches all combinations and keeps the best model.
    print("RunDecisionTreeMulti")
    sc = CreateSparkContext()

    print("==========数据准备阶段===============")
    (trainData, validationData, testData) = PrepareData(sc)
    # Persist the splits because they are read repeatedly below.
    trainData.persist()
    validationData.persist()
    testData.persist()

    print("==========训练评估阶段===============")
    (AUC, duration, impurityParm, maxDepthParm, maxBinsParm, model) = \
        trainEvaluateModel(trainData, validationData, "entropy", 15, 50)
    if len(sys.argv) == 2 and sys.argv[1] == "-e":
        parametersEval(trainData, validationData)
    elif len(sys.argv) == 2 and sys.argv[1] == "-a":
        print("-----所有参数训练评估找出最好的参数组合---------")
        model = evalAllParameter(trainData, validationData,
                                 ["gini", "entropy"],
                                 [3, 5, 10, 15],
                                 [3, 5, 10, 50])

    print("==========测试阶段===============")
    accuracy = evaluateModel(model, testData)
    print("使用test Data测试最佳模型,结果 accuracy:" + str(accuracy))

    print("==========预测数据===============")
    PredictData(sc, model)
决策树回归分析
环境准备
- 导入我们需要的包
# -*- coding: UTF-8 -*- import sys from time import time import pandas as pd import matplotlib.pyplot as plt from pyspark import SparkConf, SparkContext from pyspark.mllib.tree import DecisionTree from pyspark.mllib.regression import LabeledPoint import numpy as np from pyspark.mllib.evaluation import RegressionMetrics import math
- 搭建SparkContext环境
def SetLogger(sc):
    """Set the log4j level of the "org", "akka" and root loggers to ERROR."""
    log4j = sc._jvm.org.apache.log4j
    error_level = log4j.Level.ERROR
    log4j.LogManager.getLogger("org").setLevel(error_level)
    log4j.LogManager.getLogger("akka").setLevel(error_level)
    log4j.LogManager.getRootLogger().setLevel(error_level)


def SetPath(sc):
    """Set the global ``Path`` prefix according to the Spark master.

    A master whose name starts with "local" reads from the local file
    system; any other master reads from HDFS, so the data files must have
    been uploaded there first.
    """
    global Path
    is_local = sc.master[0:5] == "local"
    Path = ("file:/home/hduser/pythonsparkexample/PythonProject/"
            if is_local else "hdfs://master:9000/user/hduser/")


def CreateSparkContext():
    """Create the SparkContext, then configure logging and the data path."""
    sparkConf = SparkConf()
    sparkConf = sparkConf.setAppName("RunDecisionTreeRegression")
    sparkConf = sparkConf.set("spark.ui.showConsoleProgress", "false")
    sc = SparkContext(conf=sparkConf)
    print("master=" + sc.master)
    SetLogger(sc)
    SetPath(sc)
    return sc
数据准备
def extract_label(record):
    """Return the last field of ``record`` as a float label."""
    return float(record[-1])


def convert_float(x):
    """Convert a text field to float; the placeholder "?" becomes 0.0."""
    return 0.0 if x == "?" else float(x)


def extract_features(record, featureEnd):
    """Build the feature vector from the season field and a field slice.

    Args:
        record: list of text fields for one row.
        featureEnd: upper bound used for the slice ``record[4:featureEnd - 2]``.
            PrepareData passes ``len(record) - 1``, which leaves out the
            last three fields.

    Returns:
        A numpy array: the season (``record[2]``) followed by the slice.
    """
    # Convert the season field as one value. Iterating over record[2]
    # walked its characters, which is only correct for 1-character values.
    featureSeason = [convert_float(record[2])]
    features = [convert_float(field) for field in record[4:featureEnd - 2]]
    return np.concatenate((featureSeason, features))


def PrepareData(sc):
    """Load hour.csv and split it into train/validation/test RDDs.

    Returns the tuple ``(trainData, validationData, testData)`` of
    LabeledPoint RDDs produced by ``randomSplit([8, 1, 1])``.
    """
    # ---- 1. Load the raw text, drop the header row, split on commas ----
    print("开始导入数据...")
    rawDataWithHeader = sc.textFile(Path + "data/hour.csv")
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    lines = rawData.map(lambda x: x.split(","))
    print(lines.first())
    print("共计:" + str(lines.count()) + "项")

    # ---- 2. Build the RDD[LabeledPoint] used for training/evaluation ----
    labelpointRDD = lines.map(
        lambda r: LabeledPoint(extract_label(r),
                               extract_features(r, len(r) - 1)))
    print(labelpointRDD.first())

    # ---- 3. Randomly split into three parts and return them ----
    (trainData, validationData, testData) = labelpointRDD.randomSplit([8, 1, 1])
    print("将数据分trainData:" + str(trainData.count()) +
          " validationData:" + str(validationData.count()) +
          " testData:" + str(testData.count()))
    return (trainData, validationData, testData)
训练模型
- 在做决策树回归分析时,我们必须使用
DecisionTree.trainRegressor
模型,且无numClasses
参数,而且impurity
固定为variance
def trainEvaluateModel(trainData, validationData,
                       impurityParm, maxDepthParm, maxBinsParm):
    """Train a decision-tree regressor and score it on the validation data.

    Returns the tuple ``(RMSE, duration, impurityParm, maxDepthParm,
    maxBinsParm, model)``, where ``duration`` is the elapsed time in seconds
    for training plus evaluation.
    """
    started = time()
    model = DecisionTree.trainRegressor(
        trainData,
        categoricalFeaturesInfo={},
        impurity=impurityParm,
        maxDepth=maxDepthParm,
        maxBins=maxBinsParm)
    RMSE = evaluateModel(model, validationData)
    duration = time() - started
    report = "".join([
        "训练评估:使用参数",
        " impurityParm= %s" % impurityParm,
        " maxDepthParm= %s" % maxDepthParm,
        " maxBinsParm = %d." % maxBinsParm,
        " 所需时间=%d" % duration,
        " 结果RMSE = %f " % RMSE,
    ])
    print(report)
    return (RMSE, duration, impurityParm, maxDepthParm, maxBinsParm, model)
模型评估
- 以
RMSE
评估模型准确率,Root Mean Square Error
,通常RMSE越小,代表的误差就越小;它反映了“预测结果”与“标签字段”之间误差的平均大小。
- 使用
RegressionMetrics(scoreAndLabels).rootMeanSquaredError
计算RMSE。
def evaluateModel(model, validationData):
    """Return the root mean squared error of ``model`` on ``validationData``.

    Predictions are zipped with the true labels and passed to
    RegressionMetrics, whose ``rootMeanSquaredError`` value is returned.
    """
    predicted = model.predict(validationData.map(lambda p: p.features))
    actual = validationData.map(lambda p: p.label)
    scoreAndLabels = predicted.zip(actual)
    return RegressionMetrics(scoreAndLabels).rootMeanSquaredError
参数评估
- 单个参数评估
def parametersEval(training_RDD, validation_RDD):
    """Evaluate maxDepth and then maxBins, one parameter at a time.

    Impurity stays fixed at "variance" in both runs, and the parameter that
    is not being varied is fixed at 10.
    """
    fixed_impurity = ["variance"]

    print("----- 评估maxDepth参数使用 ---------")
    evalParameter(training_RDD, validation_RDD, "maxDepth",
                  impurityList=fixed_impurity,
                  maxDepthList=[3, 5, 10, 15, 20, 25],
                  maxBinsList=[10])

    print("----- 评估maxBins参数使用 ---------")
    evalParameter(training_RDD, validation_RDD, "maxBins",
                  impurityList=fixed_impurity,
                  maxDepthList=[10],
                  maxBinsList=[3, 5, 10, 50, 100, 200])
- 多个参数评估
def evalParameter(trainData, validationData, evaparm,
                  impurityList, maxDepthList, maxBinsList):
    """Train a model per parameter combination and chart RMSE/duration.

    Args:
        trainData: RDD used to train each model.
        validationData: RDD used to score each model.
        evaparm: which parameter is being varied: "impurity", "maxDepth"
            or "maxBins". Its list becomes the table index and x-axis.
        impurityList: impurity values to try.
        maxDepthList: maxDepth values to try.
        maxBinsList: maxBins values to try.

    Raises:
        ValueError: if ``evaparm`` is not one of the three known names.

    Note:
        One model is trained for every combination of the three lists while
        the index comes from a single list, so the other two lists should
        each hold exactly one value.
    """
    candidates = {
        "impurity": impurityList,
        "maxDepth": maxDepthList,
        "maxBins": maxBinsList,
    }
    # Validate before training: an unknown name used to surface only after
    # every model had been trained, as an unbound IndexList.
    if evaparm not in candidates:
        raise ValueError(
            "evaparm must be 'impurity', 'maxDepth' or 'maxBins', got %r"
            % (evaparm,))
    IndexList = candidates[evaparm][:]
    metrics = [trainEvaluateModel(trainData, validationData,
                                  impurity, maxdepth, maxBins)
               for impurity in impurityList
               for maxdepth in maxDepthList
               for maxBins in maxBinsList]
    df = pd.DataFrame(metrics, index=IndexList,
                      columns=['RMSE', 'duration', 'impurityParm',
                               'maxDepthParm', 'maxBinsParm', 'model'])
    showchart(df, evaparm, 'RMSE', 'duration', 0, 200)
- 可视化
def showchart(df, evalparm, barData, lineData, yMin, yMax):
    """Plot ``barData`` as a bar chart with ``lineData`` overlaid as a line.

    ``evalparm`` is used as the title and x-axis label; ``yMin``/``yMax``
    bound the bar chart's y-axis. The line is drawn in red on a twin axis.
    """
    bars = df[barData].plot(
        kind='bar',
        title=evalparm,
        figsize=(10, 6),
        legend=True,
        fontsize=12)
    bars.set_xlabel(evalparm, fontsize=12)
    bars.set_ylim([yMin, yMax])
    bars.set_ylabel(barData, fontsize=12)
    overlay = bars.twinx()
    line_values = df[[lineData]].values
    overlay.plot(line_values, linestyle='-', marker='o',
                 linewidth=2.0, color='r')
    plt.show()
- 多个参数同时评估
def evalAllParameter(training_RDD, validation_RDD,
                     impurityList, maxDepthList, maxBinsList):
    """Train every parameter combination and return the lowest-RMSE model.

    Args:
        training_RDD: RDD used to train each model.
        validation_RDD: RDD used to score each model.
        impurityList: impurity values to try.
        maxDepthList: maxDepth values to try.
        maxBinsList: maxBins values to try.

    Returns:
        The model with the lowest validation RMSE (first one on ties).

    Raises:
        ValueError: if the lists yield no combination to train.
    """
    # Use the RDDs passed in rather than the module-level trainData /
    # validationData, so the function works with any data set.
    metrics = [trainEvaluateModel(training_RDD, validation_RDD,
                                  impurity, maxdepth, maxBins)
               for impurity in impurityList
               for maxdepth in maxDepthList
               for maxBins in maxBinsList]
    if not metrics:
        raise ValueError("parameter lists must each contain at least one value")
    # Each entry is (RMSE, duration, impurity, maxDepth, maxBins, model);
    # a smaller RMSE is better, so take the minimum.
    bestParameter = min(metrics, key=lambda k: k[0])
    print("调校后最佳参数:impurity:" + str(bestParameter[2]) +
          " ,maxDepth:" + str(bestParameter[3]) +
          " ,maxBins:" + str(bestParameter[4]) +
          " ,结果RMSE = " + str(bestParameter[0]))
    return bestParameter[5]
检验RMSE
# Score the trained model on the test split and show the RMSE. The snippet
# previously ended in a top-level "return", which is a syntax error.
RMSE = evaluateModel(model, testData)
print(RMSE)
模型预测
def PredictData(sc, model):
    """Predict the first 100 rows of hour.csv and print a readable report.

    For each row the features are rendered as text, followed by the
    prediction truncated to int, the actual label, whether the two are
    equal, and the absolute error.
    """
    # ---- 1. Load the raw text, drop the header row, split on commas ----
    print("开始导入数据...")
    rawDataWithHeader = sc.textFile(Path + "data/hour.csv")
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    lines = rawData.map(lambda x: x.split(","))
    print("共计:" + str(lines.count()) + "项")

    # ---- 2. Build the RDD[LabeledPoint] to predict on ----
    labelpointRDD = lines.map(
        lambda r: LabeledPoint(extract_label(r),
                               extract_features(r, len(r) - 1)))

    # ---- 3. Lookup tables that turn coded features into text ----
    SeasonDict = {1: "春", 2: "夏", 3: "秋", 4: "冬"}
    HoildayDict = {0: "非假日", 1: "假日"}
    # hour.csv stores weekday with 0 = Sunday ... 6 = Saturday (its first
    # row, 2011-01-01, a Saturday, has weekday 6). The table used to start
    # at Monday, which labelled every day one day early.
    # NOTE(review): confirm against the copy of hour.csv actually used.
    WeekDict = {0: "日", 1: "一", 2: "二", 3: "三", 4: "四", 5: "五", 6: "六"}
    WorkDayDict = {1: "工作日", 0: "非工作日"}
    WeatherDict = {1: "晴", 2: "阴", 3: "小雨", 4: "大雨"}

    # ---- 4. Predict and print one line per row ----
    for lp in labelpointRDD.take(100):
        predict = int(model.predict(lp.features))
        label = lp.label
        features = lp.features
        result = ("正确" if (label == predict) else "错误")
        error = math.fabs(label - predict)
        # Feature values are floats; cast to int so the dictionary lookups
        # use exactly the integer keys defined above.
        dataDesc = (" 特征: " + SeasonDict[int(features[0])] + "季," +
                    str(features[1]) + "月," +
                    str(features[2]) + "时," +
                    HoildayDict[int(features[3])] + "," +
                    "星期" + WeekDict[int(features[4])] + "," +
                    WorkDayDict[int(features[5])] + "," +
                    WeatherDict[int(features[6])] + "," +
                    str(features[7] * 41) + "度," +
                    "体感" + str(features[8] * 50) + "度," +
                    "湿度" + str(features[9] * 100) + "," +
                    "风速" + str(features[10] * 67) +
                    " ==> 预测结果:" + str(predict) +
                    " , 实际:" + str(label) + result + ", 误差:" + str(error))
        print(dataDesc)
驱动函数
if __name__ == "__main__":
    # Command line: no argument trains once; "-e" charts each parameter;
    # "-a" searches all combinations and keeps the best model.
    print("RunDecisionTreeRegression")
    sc = CreateSparkContext()

    print("==========数据准备阶段===============")
    (trainData, validationData, testData) = PrepareData(sc)
    # Persist the splits because they are read repeatedly below.
    trainData.persist()
    validationData.persist()
    testData.persist()

    print("==========训练评估阶段===============")
    # The first element of the returned tuple is the validation RMSE.
    (AUC, duration, impurityParm, maxDepthParm, maxBinsParm, model) = \
        trainEvaluateModel(trainData, validationData, "variance", 10, 100)
    if len(sys.argv) == 2 and sys.argv[1] == "-e":
        parametersEval(trainData, validationData)
    elif len(sys.argv) == 2 and sys.argv[1] == "-a":
        print("-----所有参数训练评估找出最好的参数组合---------")
        model = evalAllParameter(trainData, validationData,
                                 ["variance"],
                                 [3, 5, 10, 15, 20, 25],
                                 [3, 5, 10, 50, 100, 200])

    print("==========测试阶段===============")
    RMSE = evaluateModel(model, testData)
    print("使用test Data测试最佳模型,结果 RMSE:" + str(RMSE))

    print("==========预测数据===============")
    PredictData(sc, model)
来源:https://www.cnblogs.com/Toxin/p/9685489.html