How do I create a set of ngrams in Spark?

后端 未结 1 401
无人及你
无人及你 2021-01-15 13:29

I am extracting Ngrams from a Spark 2.2 dataframe column using Scala, thus (trigrams in this example):

// Trigram extractor (n = 3) that reads its tokens from the "incol" column.
val ngram = new NGram().setN(3).setInputCol("incol")


        
相关标签:
1条回答
  • 2021-01-15 14:27

    If you want to combine these into vectors, you can port the Python answer by zero323 to Scala:

    import org.apache.spark.ml.feature._
    import org.apache.spark.ml.Pipeline
    
    /** Builds a pipeline that turns a token column into one combined n-gram count vector.
      *
      * For every size `i` in `1 to n` the pipeline holds an `NGram` stage writing the
      * column `{i}_grams` and a `CountVectorizer` stage writing `{i}_counts`; a final
      * `VectorAssembler` concatenates all count columns into `outputCol`.
      *
      * @param inputCol  name of the column holding the token sequences
      * @param outputCol name of the assembled feature-vector column
      * @param n         largest n-gram size to generate; must be at least 1
      * @return an unfitted `Pipeline` made of `2 * n + 1` stages
      * @throws IllegalArgumentException if `n` is smaller than 1
      */
    def buildNgrams(inputCol: String = "tokens",
                    outputCol: String = "features", n: Int = 3): Pipeline = {
      // Fail fast: with n < 1 there would be no count columns left to assemble.
      require(n >= 1, s"n must be at least 1, got $n")

      // Intermediate column names, defined once so the stages cannot drift apart.
      def gramsCol(i: Int): String = s"${i}_grams"
      def countsCol(i: Int): String = s"${i}_counts"

      val sizes = 1 to n

      // One n-gram extractor per size, all reading the same token column.
      val ngrams = sizes.map(i =>
        new NGram().setN(i)
          .setInputCol(inputCol).setOutputCol(gramsCol(i))
      )

      // One term-count vectorizer per n-gram column.
      val vectorizers = sizes.map(i =>
        new CountVectorizer()
          .setInputCol(gramsCol(i))
          .setOutputCol(countsCol(i))
      )

      // Concatenate the per-size count vectors into a single feature vector.
      val assembler = new VectorAssembler()
        .setInputCols(vectorizers.map(_.getOutputCol).toArray)
        .setOutputCol(outputCol)

      new Pipeline().setStages((ngrams ++ vectorizers :+ assembler).toArray)
    }
    
    // Single-row sample frame: an id plus an already tokenized sentence.
    val df = Seq(1 -> Seq("a", "b", "c", "d"))
      .toDF("id", "tokens")
    

    Result

    // Fit the default pipeline on the sample frame, then print one row without truncation.
    val fitted = buildNgrams().fit(df)
    fitted.transform(df).show(1, truncate = false)
    // +---+------------+------------+---------------+--------------+-------------------------------+-------------------------+-------------------+-------------------------------------+
    // |id |tokens      |1_grams     |2_grams        |3_grams       |1_counts                       |2_counts                 |3_counts           |features                             |
    // +---+------------+------------+---------------+--------------+-------------------------------+-------------------------+-------------------+-------------------------------------+
    // |1  |[a, b, c, d]|[a, b, c, d]|[a b, b c, c d]|[a b c, b c d]|(4,[0,1,2,3],[1.0,1.0,1.0,1.0])|(3,[0,1,2],[1.0,1.0,1.0])|(2,[0,1],[1.0,1.0])|[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0]|
    // +---+------------+------------+---------------+--------------+-------------------------------+-------------------------+-------------------+-------------------------------------+
    

    This could be simpler with a UDF:

    // UDF producing every k-gram of `xs` for k = 1..n, each joined with single spaces.
    val ngram = udf((xs: Seq[String], n: Int) =>
      (1 to n).flatMap { width =>
        // `sliding` can emit one window shorter than `width`; keep full-width windows only.
        xs.sliding(width)
          .filter(window => window.size == width)
          .map(window => window.mkString(" "))
      })
    
    // Make the UDF callable from SQL under the name used in the statement below.
    spark.udf.register("ngram", ngram)
    
    // Append an `ngrams` column (sizes 1 to 3) computed from `tokens` through the registered UDF.
    val ngramer = new SQLTransformer()
      .setStatement("SELECT *, ngram(tokens, 3) AS ngrams FROM __THIS__")
    
    ngramer.transform(df).show(truncate = false)
    // +---+------------+-----------------------------------------+
    // |id |tokens      |ngrams                                   |
    // +---+------------+-----------------------------------------+
    // |1  |[a, b, c, d]|[a, b, c, d, a b, b c, c d, a b c, b c d]|
    // +---+------------+-----------------------------------------+
    
    0 讨论(0)
提交回复
热议问题