Spark DataFrame handling empty String in OneHotEncoder

前端 未结 3 1309
醉梦人生
醉梦人生 2021-01-18 03:54

I am importing a CSV file (using spark-csv) into a DataFrame which has empty String values. When I apply the OneHotEncoder, the applic

3条回答
  •  遥遥无期
    2021-01-18 04:47

    If the column contains null, the OneHotEncoder fails with a NullPointerException. Therefore I extended the UDF to translate null values as well.

    /**
     * Small local Spark application that one-hot encodes a string column
     * containing empty-string and null entries.
     *
     * Before indexing, empty strings are rewritten to "NA" and nulls to "null",
     * so that every row carries a non-empty, non-null category label by the time
     * it reaches `StringIndexer` and `OneHotEncoder`.
     */
    object OneHotEncoderExample {
      /**
       * Builds the sample DataFrame, substitutes empty/null categories, indexes
       * the category column, one-hot encodes the index, and prints both the
       * indexed and the encoded frames.
       *
       * @param args command-line arguments (not used)
       */
      def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setAppName("OneHotEncoderExample Application").setMaster("local[2]")
        val sc = new SparkContext(conf)
        // Everything that can throw runs inside try so the context is always
        // stopped, even when fit/transform/show fails.
        try {
          val sqlContext = new SQLContext(sc)

          // $example on$
          val df1 = sqlContext.createDataFrame(Seq(
            (0.0, "a"),
            (1.0, "b"),
            (2.0, "c"),
            (3.0, ""),
            (4.0, null),
            (5.0, "c")
          )).toDF("id", "category")

          import org.apache.spark.sql.functions.udf
          // Defined as a val so the UDF is constructed once rather than on every
          // reference. Maps "" -> "NA" and null -> "null"; any other value passes
          // through unchanged.
          // NOTE(review): a genuine category literally named "NA" or "null" would
          // be merged with the substituted rows - pick other sentinels if that
          // can occur in real data.
          val emptyValueSubstitution = udf[String, String] {
            case null => "null"
            case "" => "NA"
            case value => value
          }
          val df = df1.withColumn("category", emptyValueSubstitution(df1("category")))

          // Map each distinct category string to a numeric index.
          val indexer = new StringIndexer()
            .setInputCol("category")
            .setOutputCol("categoryIndex")
            .fit(df)
          val indexed = indexer.transform(df)
          indexed.show()

          // setDropLast(false) is passed so no category is left out of the
          // encoded vector.
          val encoder = new OneHotEncoder()
            .setInputCol("categoryIndex")
            .setOutputCol("categoryVec")
            .setDropLast(false)
          val encoded = encoder.transform(indexed)
          encoded.show()
          // $example off$
        } finally {
          sc.stop()
        }
      }
    }
    

提交回复
热议问题