filter spark dataframe with row field that is an array of strings

后端 未结 2 1355
遥遥无期
遥遥无期 2021-01-03 23:57

Using Spark 1.5 and Scala 2.10.6

I'm trying to filter a dataframe via a field "tags" that is an array of strings. Looking for all rows that have the tag 'private'…

相关标签:
2条回答
  • 2021-01-04 00:07

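    I think if you use where(array_contains(...)) it will work. Note that array_contains is a function defined in org.apache.spark.sql.functions (available since Spark 1.5), so it has to be imported first. Here's my result: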

    scala> import org.apache.spark.SparkContext
    import org.apache.spark.SparkContext
    
    scala> import org.apache.spark.sql.DataFrame
    import org.apache.spark.sql.DataFrame
    
    scala> def testData (sc: SparkContext): DataFrame = {
         |     val stringRDD = sc.parallelize(Seq
         |      ("""{ "name": "ned", "tags": ["blue", "big", "private"] }""",
         |       """{ "name": "albert", "tags": ["private", "lumpy"] }""",
         |       """{ "name": "zed", "tags": ["big", "private", "square"] }""",
         |       """{ "name": "jed", "tags": ["green", "small", "round"] }""",
         |       """{ "name": "ed", "tags": ["red", "private"] }""",
         |       """{ "name": "fred", "tags": ["public", "blue"] }"""))
         |     val sqlContext = new org.apache.spark.sql.SQLContext(sc)
         |     import sqlContext.implicits._
         |     sqlContext.read.json(stringRDD)
         |   }
    testData: (sc: org.apache.spark.SparkContext)org.apache.spark.sql.DataFrame
    
    scala>   
         | val df = testData (sc)
    df: org.apache.spark.sql.DataFrame = [name: string, tags: array<string>]
    
    scala> val report = df.select ("*").where (array_contains (df("tags"), "private"))
    report: org.apache.spark.sql.DataFrame = [name: string, tags: array<string>]
    
    scala> report.show
    +------+--------------------+
    |  name|                tags|
    +------+--------------------+
    |   ned|[blue, big, private]|
    |albert|    [private, lumpy]|
    |   zed|[big, private, sq...|
    |    ed|      [red, private]|
    +------+--------------------+
    

    Note that it works if you write where(array_contains(df("tags"), "private")), but if you write where(df("tags").array_contains("private")) (more directly analogous to what you wrote originally), it fails with the error "array_contains is not a member of org.apache.spark.sql.Column". Looking at the source code for Column, I see there is code to handle contains (it constructs a Contains instance), but nothing for array_contains. Maybe that's an oversight.

    0 讨论(0)
  • 2021-01-04 00:23

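    You can use an ordinal to refer to an element of the JSON array, e.g. in your case df("tags")(0). Note that this only inspects the first element of the array, so it matches a row only when the tag you are looking for is in position 0. Here is a working sample: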

    scala> val stringRDD = sc.parallelize(Seq("""
         |       { "name": "ed",
         |         "tags": ["private"]
         |       }""",
         |       """{ "name": "fred",
         |         "tags": ["public"]
         |       }""")
         |     )
    stringRDD: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[87] at parallelize at <console>:22
    
    scala> import sqlContext.implicits._
    import sqlContext.implicits._
    
    scala> sqlContext.read.json(stringRDD)
    res28: org.apache.spark.sql.DataFrame = [name: string, tags: array<string>]
    
    scala> val df=sqlContext.read.json(stringRDD)
    df: org.apache.spark.sql.DataFrame = [name: string, tags: array<string>]
    
    scala> df.columns
    res29: Array[String] = Array(name, tags)
    
    scala> df.dtypes
    res30: Array[(String, String)] = Array((name,StringType), (tags,ArrayType(StringType,true)))
    
    scala> val report = df.select("*").where(df("tags")(0).contains("private"))
    report: org.apache.spark.sql.DataFrame = [name: string, tags: array<string>]
    
    scala> report.show
    +----+-------------+
    |name|         tags|
    +----+-------------+
    |  ed|List(private)|
    +----+-------------+
    
    0 讨论(0)
提交回复
热议问题