I have a Spark Dataframe in that consists of a series of dates:
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql.types import
datediff(Column end, Column start)
Returns the number of days from start to end.
https://spark.apache.org/docs/1.6.2/api/java/org/apache/spark/sql/functions.html
As of Spark 1.5 you can use unix_timestamp:
from pyspark.sql import functions as F
timeFmt = "yyyy-MM-dd'T'HH:mm:ss.SSS"
timeDiff = (F.unix_timestamp('EndDateTime', format=timeFmt)
- F.unix_timestamp('StartDateTime', format=timeFmt))
df = df.withColumn("Duration", timeDiff)
Note the Java style time format.
>>> df.show()
+---+--------------------+--------------------+--------+
| ID| EndDateTime| StartDateTime|Duration|
+---+--------------------+--------------------+--------+
|X01|2014-02-13T12:36:...|2014-02-13T12:31:...| 258|
|X02|2014-02-13T12:35:...|2014-02-13T12:32:...| 204|
|X03|2014-02-13T12:36:...|2014-02-13T12:32:...| 228|
|XO4|2014-02-13T12:37:...|2014-02-13T12:32:...| 269|
|XO5|2014-02-13T12:36:...|2014-02-13T12:33:...| 202|
+---+--------------------+--------------------+--------+
This can be done in spark-sql by converting the string date to timestamp and then getting the difference.
1: Convert to timestamp:
CAST(UNIX_TIMESTAMP(MY_COL_NAME,'dd-MMM-yy') as TIMESTAMP
2: Get the difference between dates using datediff
function.
This will be combined in a nested function like:
spark.sql("select COL_1, COL_2, datediff( CAST( UNIX_TIMESTAMP( COL_1,'dd-MMM-yy') as TIMESTAMP), CAST( UNIX_TIMESTAMP( COL_2,'dd-MMM-yy') as TIMESTAMP) ) as LAG_in_days from MyTable")
Below is the result:
+---------+---------+-----------+
| COL_1| COL_2|LAG_in_days|
+---------+---------+-----------+
|24-JAN-17|16-JAN-17| 8|
|19-JAN-05|18-JAN-05| 1|
|23-MAY-06|23-MAY-06| 0|
|18-AUG-06|17-AUG-06| 1|
+---------+---------+-----------+
Reference: https://docs-snaplogic.atlassian.net/wiki/spaces/SD/pages/2458071/Date+Functions+and+Properties+Spark+SQL
Thanks to David Griffin. Here's how to do this for future reference.
from pyspark.sql import SQLContext, Row
sqlContext = SQLContext(sc)
from pyspark.sql.types import StringType, IntegerType, StructType, StructField
from pyspark.sql.functions import udf
# Build sample data
rdd = sc.parallelize([('X01','2014-02-13T12:36:14.899','2014-02-13T12:31:56.876'),
('X02','2014-02-13T12:35:37.405','2014-02-13T12:32:13.321'),
('X03','2014-02-13T12:36:03.825','2014-02-13T12:32:15.229'),
('XO4','2014-02-13T12:37:05.460','2014-02-13T12:32:36.881'),
('XO5','2014-02-13T12:36:52.721','2014-02-13T12:33:30.323')])
schema = StructType([StructField('ID', StringType(), True),
StructField('EndDateTime', StringType(), True),
StructField('StartDateTime', StringType(), True)])
df = sqlContext.createDataFrame(rdd, schema)
# define timedelta function (obtain duration in seconds)
def time_delta(y,x):
from datetime import datetime
end = datetime.strptime(y, '%Y-%m-%dT%H:%M:%S.%f')
start = datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%f')
delta = (end-start).total_seconds()
return delta
# register as a UDF
f = udf(time_delta, IntegerType())
# Apply function
df2 = df.withColumn('Duration', f(df.EndDateTime, df.StartDateTime))
Applying time_delta()
will give you duration in seconds:
>>> df2.show()
ID EndDateTime StartDateTime Duration
X01 2014-02-13T12:36:... 2014-02-13T12:31:... 258
X02 2014-02-13T12:35:... 2014-02-13T12:32:... 204
X03 2014-02-13T12:36:... 2014-02-13T12:32:... 228
XO4 2014-02-13T12:37:... 2014-02-13T12:32:... 268
XO5 2014-02-13T12:36:... 2014-02-13T12:33:... 202
Use DoubleType instead of IntegerType
from pyspark.sql import SQLContext, Row
sqlContext = SQLContext(sc)
from pyspark.sql.types import StringType, IntegerType, StructType, StructField
from pyspark.sql.functions import udf
# Build sample data
rdd = sc.parallelize([('X01','2014-02-13T12:36:14.899','2014-02-13T12:31:56.876'),
('X02','2014-02-13T12:35:37.405','2014-02-13T12:32:13.321'),
('X03','2014-02-13T12:36:03.825','2014-02-13T12:32:15.229'),
('XO4','2014-02-13T12:37:05.460','2014-02-13T12:32:36.881'),
('XO5','2014-02-13T12:36:52.721','2014-02-13T12:33:30.323')])
schema = StructType([StructField('ID', StringType(), True),
StructField('EndDateTime', StringType(), True),
StructField('StartDateTime', StringType(), True)])
df = sqlContext.createDataFrame(rdd, schema)
# define timedelta function (obtain duration in seconds)
def time_delta(y,x):
from datetime import datetime
end = datetime.strptime(y, '%Y-%m-%dT%H:%M:%S.%f')
start = datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%f')
delta = (end-start).total_seconds()
return delta
# register as a UDF
f = udf(time_delta, DoubleType())
# Apply function
df2 = df.withColumn('Duration', f(df.EndDateTime, df.StartDateTime))
Here is a working version for spark 2.x derived from jason's answer
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession,SQLContext
from pyspark.sql.types import StringType, StructType, StructField
sc = SparkContext()
sqlContext = SQLContext(sc)
spark = SparkSession.builder.appName("Python Spark SQL basic example").getOrCreate()
rdd = sc.parallelize([('X01','2014-02-13T12:36:14.899','2014-02-13T12:31:56.876'),
('X02','2014-02-13T12:35:37.405','2014-02-13T12:32:13.321'),
('X03','2014-02-13T12:36:03.825','2014-02-13T12:32:15.229'),
('XO4','2014-02-13T12:37:05.460','2014-02-13T12:32:36.881'),
('XO5','2014-02-13T12:36:52.721','2014-02-13T12:33:30.323')])
schema = StructType([StructField('ID', StringType(), True),
StructField('EndDateTime', StringType(), True),
StructField('StartDateTime', StringType(), True)])
df = sqlContext.createDataFrame(rdd, schema)
# register as a UDF
from datetime import datetime
sqlContext.registerFunction("time_delta", lambda y,x:(datetime.strptime(y, '%Y-%m-%dT%H:%M:%S.%f')-datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%f')).total_seconds())
df.createOrReplaceTempView("Test_table")
spark.sql("SELECT ID,EndDateTime,StartDateTime,time_delta(EndDateTime,StartDateTime) as time_delta FROM Test_table").show()
sc.stop()