"""Run Pearson's chi-square independence test on a toy dataset with Spark ML.

Every feature in the ``features`` vector column is tested against the
``label`` column, and the resulting p-values, degrees of freedom and test
statistics are printed to stdout.
"""
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import ChiSquareTest
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("dataFrame").getOrCreate()

try:
    # Each row is (label, features): a categorical label plus a dense vector
    # holding two categorical feature values.
    data = [
        (0.0, Vectors.dense(0.5, 10.0)),
        (0.0, Vectors.dense(1.5, 20.0)),
        (1.0, Vectors.dense(1.5, 30.0)),
        (0.0, Vectors.dense(3.5, 30.0)),
        (0.0, Vectors.dense(3.5, 40.0)),
        (1.0, Vectors.dense(3.5, 40.0)),
    ]
    df = spark.createDataFrame(data, ["label", "features"])

    # ChiSquareTest.test returns a DataFrame; head() pulls its first row,
    # whose fields hold one entry per feature.
    r = ChiSquareTest.test(df, "features", "label").head()

    print(f"pValues: {r.pValues}")
    print(f"degreesOfFreedom: {r.degreesOfFreedom}")
    print(f"statistics: {r.statistics}")
finally:
    # Release the driver's Spark resources even if the job above fails.
    # NOTE(review): this assumes a standalone script; drop the stop() call if
    # the session is shared with other code (e.g. inside a notebook).
    spark.stop()
Output:
pValues: [0.6872892787909721,0.6822703303362126]
degreesOfFreedom: [2, 3]
statistics: [0.75,1.5]
文章来源: spark 卡方分布的假设检验