Splitting Date into Year, Month and Day, with inconsistent delimiters

前端未结

关注

 3  460

I am trying to split my Date column, which is currently a String type, into 3 columns: Year, Month and Day. I am using PySpark:

# Alias pyspark.sql.functions.split under the name split_date (it is not called here).
split_date=pyspark.sql.functions.split


                      
              相关标签:


      
      
        
          3条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  悲哀的现实        
                
              
                            
                2021-01-27 17:16
              
            
            
                                                                       
Try this :

# '[-/]' is a regex character class, so the column is split on either '-' or '/'.
# NOTE: the parts come back in input order, so data that mixes yyyy-mm-dd and
# mm/dd/yyyy still needs per-format handling to locate year, month and day.
date_delimiters = '[-/]'
split_date = pyspark.sql.functions.split(df['Date'], date_delimiters)

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  一整个雨季        
                
              
                            
                2021-01-27 17:30
              
            
            
                                                                       
You just need a little bit of extra code to recognize which date format each value uses.
For example, let's say your data is in the format below:

# Sample rows mixing dash-delimited (yyyy-mm-dd) and slash-delimited
# (mm/dd/yyyy) date strings, each paired with an integer key.
data = [
    ("2008-05-01", 1),
    ("2018-01-01", 2),
    ("03/14/2017", 3),
    ("01/01/2018", 4),
]
df = spark.createDataFrame(data, schema=['date', 'key'])

# Print the DataFrame to the console.
df.show()


Output:

+----------+---+
|      date|key|
+----------+---+
|2008-05-01|  1|
|2018-01-01|  2|
|03/14/2017|  3|
|01/01/2018|  4|
+----------+---+


Code:

from pyspark.sql.functions import *
from pyspark.sql.types import *

def splitUDF(row):
    """Split a date string into ``[year, month, day]`` string parts.

    The layout is recognised by its delimiter: a slash means
    ``mm/dd/yyyy`` and a dash means ``yyyy-mm-dd``. Intended to be
    wrapped as a UDF, so bad input yields ``None`` (a null cell)
    instead of raising and failing the whole job.

    Args:
        row: The date string to split, or ``None``.

    Returns:
        ``[yyyy, mm, dd]`` as strings, or ``None`` when ``row`` is
        ``None``, contains neither delimiter, or does not split into
        exactly three parts.
    """
    if row is None:
        return None

    # ``order`` maps the split positions onto (year, month, day).
    if "/" in row:
        parts = row.split("/")
        order = (2, 0, 1)  # mm/dd/yyyy
    elif "-" in row:
        parts = row.split("-")
        order = (0, 1, 2)  # yyyy-mm-dd
    else:
        # Unrecognised format: previously fell through to an
        # UnboundLocalError on the return statement.
        return None

    if len(parts) != 3:
        return None

    return [parts[i] for i in order]


# Wrap the splitter as a UDF whose result is an array of strings.
datSplitterUDF = udf(
    lambda row: splitUDF(row),
    ArrayType(StringType()),
)

# Apply the UDF to the date column, then pull array positions 0/1/2 out
# into integer year/month/day columns and print the result.
(
    df.select(datSplitterUDF(df.date).alias("dt"))
    .withColumn('year', col('dt').getItem(0).cast('int'))
    .withColumn('month', col('dt').getItem(1).cast('int'))
    .withColumn('day', col('dt').getItem(2).cast('int'))
    .show()
)


output:

+--------------+----+-----+---+
|            dt|year|month|day|
+--------------+----+-----+---+
|[2008, 05, 01]|2008|    5|  1|
|[2018, 01, 01]|2018|    1|  1|
|[2017, 03, 14]|2017|    3| 14|
|[2018, 01, 01]|2018|    1|  1|
+--------------+----+-----+---+

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  野的像风        
                
              
                            
                2021-01-27 17:38
              
            
            
                                                                       
Adding to @Pushkr's solution: you can also use dateutil's parser to turn the date string into a datetime. Here is a snippet that does that.

import pyspark.sql.functions as func
from pyspark.sql.types import *
from dateutil import parser

def parse_date(date):
    """Parse a date string into ``[year, month, day]`` integers.

    Args:
        date: A date string to hand to ``parser.parse``, or ``None``.

    Returns:
        ``[year, month, day]`` as ints taken from the parsed datetime,
        or ``None`` when ``date`` is ``None``.
    """
    # A null column value reaches a Python UDF as None, and
    # parser.parse(None) raises; pass the null through instead.
    if date is None:
        return None
    dt = parser.parse(date)
    return [dt.year, dt.month, dt.day]
# Wrap the parser as a UDF whose result is an array of integers.
udf_parse_date = func.udf(
    lambda x: parse_date(x),
    returnType=ArrayType(IntegerType()),
)

# Sample rows mixing dash-delimited and slash-delimited date strings.
data = [
    ("2008-05-01", 1),
    ("2018-01-01", 2),
    ("03/14/2017", 3),
    ("01/01/2018", 4),
]
df = spark.createDataFrame(data, schema=['date', 'key'])

# Keep the original columns and add the parsed array as 'date_parse'.
df = df.select('date', 'key', udf_parse_date('date').alias('date_parse'))

# Pull array positions 0/1/2 out into named year/month/day columns.
df_parsed = df.select(
    'key',
    *[
        func.col('date_parse').getItem(position).alias(label)
        for position, label in enumerate(('year', 'month', 'day'))
    ]
)

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
                             
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复