Problem: A clickstream of user activity is stored in Hive as time-series data. The task is to enrich each click with a session id using Spark.
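Session Definition (as implemented below): a click continues the user's current session when it arrives at most 60 minutes after that user's previous click and no more than 120 minutes after the first click of that session; otherwise it starts a new session.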
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import scala.collection.mutable.ListBuffer
import scala.util.control._
import spark.sqlContext.implicits._
import java.sql.Timestamp
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
// Session rules, both in minutes (click gaps are computed in minutes further down).
// Largest gap allowed between two consecutive clicks of the same session.
val interimSessionThreshold = 60
// Upper bound compared against the minutes accumulated within one session.
val totalSessionTimeThreshold = 120

// Local session; getOrCreate() hands back an already running session when there is one.
val sparkSession = SparkSession.builder.master("local").appName("Window Function").getOrCreate()

// Sample clickstream as (click time string, user id) pairs:
// eight clicks for u1 on one day and two clicks for u2 exactly one day apart.
val sampleClicks: Seq[(String, String)] = List(
  "2018-01-01T11:00:00Z" -> "u1",
  "2018-01-01T12:10:00Z" -> "u1",
  "2018-01-01T13:00:00Z" -> "u1",
  "2018-01-01T13:50:00Z" -> "u1",
  "2018-01-01T14:40:00Z" -> "u1",
  "2018-01-01T15:30:00Z" -> "u1",
  "2018-01-01T16:20:00Z" -> "u1",
  "2018-01-01T16:50:00Z" -> "u1",
  "2018-01-01T11:00:00Z" -> "u2",
  "2018-01-02T11:00:00Z" -> "u2"
)
val clickDF = sparkSession.createDataFrame(sampleClicks).toDF("clickTime", "user")
// Parse the click time string into a timestamp column and drop the raw string column.
// NOTE(review): 'Z' is quoted in the pattern, so it is matched as a literal character and
// the value is presumably read in the session time zone rather than UTC -- confirm.
val newDF = clickDF.withColumn(
  "clickTimestamp",
  unix_timestamp($"clickTime", "yyyy-MM-dd'T'HH:mm:ss'Z'").cast(TimestampType)
).drop($"clickTime")

// Per-user window, oldest click first.
val partitionWindow = Window.partitionBy($"user").orderBy($"clickTimestamp".asc)

// Previous click of the same user; null for the user's first click. No default value is
// passed: the former "0000-00-00 00:00:00" default is not a valid timestamp and only
// worked because it degraded to null.
val lagTest = lag($"clickTimestamp", 1).over(partitionWindow)

// Whole minutes elapsed since the user's previous click (truncated by the int cast).
val minutesSincePrevious = ((unix_timestamp($"clickTimestamp") - unix_timestamp(lagTest)) / 60D).cast("int")

// The first click of a user has no predecessor; make its gap an explicit 0 instead of
// leaving a null that consumers would have to special-case.
val df_test = newDF.select($"*", coalesce(minutesSincePrevious, lit(0)) as "diff_val_with_previous")

// Distinct user ids, collected to the driver.
val distinctUser = df_test.select($"user").distinct.as[String].collect().toList

// Position of each click within its user's timeline. row_number() is used instead of
// rank() so the values are always the contiguous sequence 1..n, even when two clicks share
// the same timestamp (rank() would repeat a value and skip the next one). The column keeps
// the name "rank" so existing consumers are unaffected.
val rankTest = row_number().over(partitionWindow)
val ddf = df_test.select($"*", rankTest as "rank")
// One output record: the user, the click's timestamp and the session id assigned to it.
case class finalClick(
  User: String,
  clickTime: Timestamp,
  session: String
)

// Driver-side accumulator that receives one finalClick per processed click.
val rowList: ListBuffer[finalClick] = ListBuffer.empty[finalClick]
// Walk every user's clicks in chronological order and tag each click with a session id of
// the form <user><sessionIndex>. A click stays in the current session while its gap to the
// previous click is at most interimSessionThreshold and the minutes accumulated in the
// session are at most totalSessionTimeThreshold; otherwise a new session starts with it.
distinctUser.foreach { userId =>
  // Fetch this user's clicks with a single job, already sorted, instead of running one
  // filter + head() job per click. Sorting here also removes the dependency on the "rank"
  // values being contiguous, which used to make head() throw on tied timestamps.
  // The rows end up on the driver either way, since every row is appended to rowList.
  val userClicks = ddf.filter($"user" === userId).orderBy($"clickTimestamp".asc, $"rank".asc).collect()

  var cumulDiff = 0     // minutes accumulated since the current session started
  var sessionIndex = 1  // 1-based session counter for this user

  userClicks.foreach { click =>
    val gapIdx = click.fieldIndex("diff_val_with_previous")
    // A null gap (no previous click) counts as 0 minutes.
    val gap = if (click.isNullAt(gapIdx)) 0 else click.getInt(gapIdx)
    cumulDiff += gap

    val staysInSession = gap <= interimSessionThreshold && cumulDiff <= totalSessionTimeThreshold
    if (!staysInSession) {
      // This click opens a new session; its own gap is not counted towards that session.
      sessionIndex += 1
      cumulDiff = 0
    }

    val user = click.getAs[String]("user")
    rowList += finalClick(user, click.getAs[Timestamp]("clickTimestamp"), s"$user$sessionIndex")
  }
}
// Turn the accumulated session rows into a DataFrame and print it.
// `sc` is not defined in this file; presumably the SparkContext predefined by spark-shell.
val sessionRows = rowList.toList
val dataFrame = sc.parallelize(sessionRows).toDF("user", "clickTimestamp", "session")
dataFrame.show()
+----+-------------------+-------+
|user|     clickTimestamp|session|
+----+-------------------+-------+
|  u1|2018-01-01 11:00:00|    u11|
|  u1|2018-01-01 12:10:00|    u12|
|  u1|2018-01-01 13:00:00|    u12|
|  u1|2018-01-01 13:50:00|    u12|
|  u1|2018-01-01 14:40:00|    u13|
|  u1|2018-01-01 15:30:00|    u13|
|  u1|2018-01-01 16:20:00|    u13|
|  u1|2018-01-01 16:50:00|    u14|
|  u2|2018-01-01 11:00:00|    u21|
|  u2|2018-01-02 11:00:00|    u22|
+----+-------------------+-------+