I have a dataset which contains lines in the following format (tab separated):
Title<\t>Text
Now, for every word in Text, I want to count its occurrences.
Here is another version, using the DataFrame API:
// Read the raw file into a single-column DataFrame (column "value": String).
val viewsDF = spark.read.text("s3n://file.txt")

// Split each line on runs of whitespace into separate columns.
// NOTE: the original used "\\t" for col1 but "\\s+" for col2 — the same
// delimiter must be used for every column or the indices don't line up.
// Compute the split expression once and reuse it instead of re-splitting
// the string for every column.
val parts = split($"value", "\\s+")
val splitedViewsDF = viewsDF
  .withColumn("col1", parts.getItem(0))
  .withColumn("col2", parts.getItem(1))
  .drop($"value")
scala> val viewsDF=spark.read.text("spark-labs/data/wiki-pageviews.txt")
viewsDF: org.apache.spark.sql.DataFrame = [value: string]
scala> viewsDF.printSchema
root
|-- value: string (nullable = true)
scala> viewsDF.limit(5).show
+------------------+
| value|
+------------------+
| aa Main_Page 3 0|
| aa Main_page 1 0|
| aa User:Savh 1 0|
| aa Wikipedia 1 0|
|aa.b User:Savh 1 0|
+------------------+
scala> val splitedViewsDF = viewsDF.withColumn("col1", split($"value", "\\s+").getItem(0)).withColumn("col2", split($"value", "\\s+").getItem(1)).withColumn("col3", split($"value", "\\s+").getItem(2)).drop($"value")
splitedViewsDF: org.apache.spark.sql.DataFrame = [col1: string, col2: string ... 1 more field]
scala>
scala> splitedViewsDF.printSchema
root
|-- col1: string (nullable = true)
|-- col2: string (nullable = true)
|-- col3: string (nullable = true)
scala> splitedViewsDF.limit(5).show
+----+---------+----+
|col1| col2|col3|
+----+---------+----+
| aa|Main_Page| 3|
| aa|Main_page| 1|
| aa|User:Savh| 1|
| aa|Wikipedia| 1|
|aa.b|User:Savh| 1|
+----+---------+----+
scala>