How to use feature extraction with DStream in Apache Spark

ⅰ亾dé卋堺 提交于 2019-12-03 21:34:39

I followed the advice from the comments and split the procedure into 2 runs:

  • one that calculates the IDF model and saves it to a file

    /** Fits an IDF model on already-tokenized documents and saves it to `idfModelFile`.
      *
      * @param idfModelFile location the fitted model is written to; a model already
      *                     stored there is overwritten
      * @param rdd          pairs that become the "data" and "words" columns; the second
      *                     element is the document's token sequence
      */
    def trainFeatures(idfModelFile: File, rdd: RDD[(String, Seq[String])]): Unit = {
      val session: SparkSession = SparkSession.builder.getOrCreate
      val wordsDf = session.createDataFrame(rdd).toDF("data", "words")

      // Term frequencies via feature hashing; HashingTF is left at its defaults here.
      val hashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures")
      val featurizedDf = hashingTF.transform(wordsDf)

      // Fit the IDF estimator on the hashed term frequencies to obtain the model.
      val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
      val idfModel = idf.fit(featurizedDf)

      // Persist the fitted model so a later run can restore it with IDFModel.load.
      idfModel.write.overwrite().save(idfModelFile.getAbsolutePath)
    }
  • one that reads the IDF model from the file and simply runs it on all incoming information

    // Restore the IDF model that the training run saved to idfModelFile.
    val idfModel = IDFModel.load(idfModelFile.getAbsolutePath)

    // Each incoming pair becomes an ("update", "document") row.
    val documentDf = spark.createDataFrame(rdd).toDF("update", "document")

    // Split the raw document text into words.
    val tokenizer = new Tokenizer().setInputCol("document").setOutputCol("words")
    val wordsDf = tokenizer.transform(documentDf)

    // HashingTF is constructed with defaults, exactly as in the training run, so the
    // hashed term-frequency vectors have the same dimension the IDF model was fitted on.
    val hashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures")
    val featurizedDf = hashingTF.transform(wordsDf)

    // Apply the pre-fitted IDF weights; nothing is re-fitted on the incoming data.
    val extractor = idfModel.setInputCol("rawFeatures").setOutputCol("features")
    val featuresDf = extractor.transform(featurizedDf)

    // Keep only the original update and its feature vector.
    featuresDf.select("update", "features")