Spark Advanced Window with dynamic last

星月不相逢 2021-02-09 11:14

Problem: A time series of user activity (a clickstream) is stored in Hive; the task is to enrich this data with a session id using Spark.

Session Definition: a session expires after 60 minutes of inactivity, and a single session can last at most 2 hours in total; a click that crosses either limit starts a new session.
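
Put differently, a click opens a new session whenever either threshold is crossed. A minimal plain-Scala sketch of that rule (the helper name isNewSession and its parameters are illustrative, not part of the original question):

    // gapMinutes: minutes since this user's previous click;
    // sessionMinutes: minutes accumulated in the current session, including this gap.
    def isNewSession(gapMinutes: Long, sessionMinutes: Long): Boolean =
      gapMinutes > 60 || sessionMinutes > 120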

4 Answers
  •  时光取名叫无心
    2021-02-09 11:53

    Complete solution

    import java.sql.Timestamp

    import scala.collection.mutable.ListBuffer

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.expressions.Window
    import org.apache.spark.sql.functions._
    import org.apache.spark.sql.types._

    // A new session starts after 60 minutes of inactivity,
    // and a single session may last at most 120 minutes in total.
    val interimSessionThreshold = 60
    val totalSessionTimeThreshold = 120

    val sparkSession = SparkSession.builder.master("local").appName("Window Function").getOrCreate()
    import sparkSession.implicits._
    
    // Sample clickstream: (click time in UTC, user id).
    val clickDF = sparkSession.createDataFrame(Seq(
        ("2018-01-01T11:00:00Z", "u1"),
        ("2018-01-01T12:10:00Z", "u1"),
        ("2018-01-01T13:00:00Z", "u1"),
        ("2018-01-01T13:50:00Z", "u1"),
        ("2018-01-01T14:40:00Z", "u1"),
        ("2018-01-01T15:30:00Z", "u1"),
        ("2018-01-01T16:20:00Z", "u1"),
        ("2018-01-01T16:50:00Z", "u1"),
        ("2018-01-01T11:00:00Z", "u2"),
        ("2018-01-02T11:00:00Z", "u2")
      )).toDF("clickTime", "user")
    
    // Parse the ISO-8601 click time into a proper timestamp column and drop the raw string.
    val newDF = clickDF
      .withColumn("clickTimestamp", unix_timestamp($"clickTime", "yyyy-MM-dd'T'HH:mm:ss'Z'").cast(TimestampType))
      .drop($"clickTime")
    
    val partitionWindow = Window.partitionBy($"user").orderBy($"clickTimestamp".asc)

    // Minutes elapsed since the same user's previous click (0 for a user's first click).
    val lagTest = lag($"clickTimestamp", 1).over(partitionWindow)
    val df_test = newDF.select(
      $"*",
      coalesce(((unix_timestamp($"clickTimestamp") - unix_timestamp(lagTest)) / 60D).cast("int"), lit(0))
        .as("diff_val_with_previous"))

    val distinctUser = df_test.select($"user").distinct.as[String].collect.toList

    // Rank each user's clicks chronologically so they can be replayed in order.
    val rankTest = rank().over(partitionWindow)
    val ddf = df_test.select($"*", rankTest.as("rank"))
    
    // One output row per click: the user, the click time and the assigned session id.
    case class FinalClick(user: String, clickTime: Timestamp, session: String)

    val rowList: ListBuffer[FinalClick] = new ListBuffer()

    // Sessionize each user on the driver: replay the clicks in chronological order and
    // start a new session whenever the inactivity gap exceeds interimSessionThreshold
    // or the accumulated session duration exceeds totalSessionTimeThreshold.
    distinctUser.foreach { user =>
      val clicks = ddf.filter($"user" === user).orderBy($"rank").collect()
      var cumulDiff = 0
      var sessionIndex = 1
      clicks.foreach { r =>
        val dp = r.getAs[Int]("diff_val_with_previous")
        cumulDiff += dp
        if (dp > interimSessionThreshold || cumulDiff > totalSessionTimeThreshold) {
          // This click opens a new session, so its accumulated duration restarts at zero.
          sessionIndex += 1
          cumulDiff = 0
        }
        rowList += FinalClick(r.getAs[String]("user"), r.getAs[Timestamp]("clickTimestamp"), user + sessionIndex)
      }
    }

    val dataFrame = rowList.toList.toDF("user", "clickTimestamp", "session")
    
    dataFrame.show
    
    +----+-------------------+-------+
    |user|     clickTimestamp|session|
    +----+-------------------+-------+
    |  u1|2018-01-01 11:00:00|    u11|
    |  u1|2018-01-01 12:10:00|    u12|
    |  u1|2018-01-01 13:00:00|    u12|
    |  u1|2018-01-01 13:50:00|    u12|
    |  u1|2018-01-01 14:40:00|    u13|
    |  u1|2018-01-01 15:30:00|    u13|
    |  u1|2018-01-01 16:20:00|    u13|
    |  u1|2018-01-01 16:50:00|    u14|
    |  u2|2018-01-01 11:00:00|    u21|
    |  u2|2018-01-02 11:00:00|    u22|
    +----+-------------------+-------+
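
    For comparison, the 60-minute inactivity rule on its own can be expressed with window functions alone, with no collect to the driver: flag each click whose gap exceeds the threshold and take a running sum of the flags per user. This is only a sketch reusing df_test, partitionWindow and interimSessionThreshold from the code above; it does not enforce the 2-hour total-duration cap, which is what forces the stateful per-user loop in the solution.

    // Sketch: window-function-only sessionization (inactivity rule, no total-duration cap).
    val newSessionFlag = when($"diff_val_with_previous" > interimSessionThreshold, 1).otherwise(0)

    val sessionized = df_test
      .withColumn("sessionNumber", sum(newSessionFlag).over(partitionWindow) + 1)
      .withColumn("session", concat($"user", $"sessionNumber".cast("string")))

    sessionized.show(false)

    On the sample data this splits u1 only at the 12:10 click (the one gap above 60 minutes), whereas the expected output above also splits at 14:40 and 16:50 because of the 2-hour cap, which is why the answer falls back to iterating over each user's clicks.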
    
    
    
    
