pyspark dataframe withColumn command not working

后端未结

关注

 3  1014

I have an input dataframe: df_input (updated df_input)

|comment|inp_col|inp_val|
|11     |a      |a1     |
|12     |a      |a2     |


                      
              相关标签:


      
      
        
          3条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  天涯浪人        
                
              
                            
                2021-01-16 17:58
              
            
            
                                                                       
You can simply use `regexp_replace` (from `pyspark.sql.functions`) like this:
# Build "new_col" as inp_val with every "&" removed.
# The PySpark function is regexp_replace; pyspark.sql.functions has no regex_replace.
from pyspark.sql.functions import col, regexp_replace

df.withColumn("new_col", regexp_replace(col("inp_val"), "&", ""))

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  失恋的感觉        
                
              
                            
                2021-01-16 18:01
              
            
            
                                                                       
Can you try out this solution? Your approach may run into a whole lot of problems.
import pyspark.sql.functions as F
from pyspark.sql.functions import col
from pyspark.sql.window import Window
# Test data: col3 holds either a plain value or an '&'-prefixed marker.
rows = [
    (1, 'a', '3'),
    (1, 'a', '4'),
    (1, 'b', '5'),
    (1, 'b', '7'),
    (2, 'c', '&b'),
    (2, 'c', '&a'),
    (2, 'd', '&b'),
]
tst = sqlContext.createDataFrame(rows, schema=['col1', 'col2', 'col3'])

# Shared predicate: true for rows whose col3 does not contain '&'.
is_plain_value = ~F.col('col3').contains('&')

# Pull out the character that follows the special character
# (the one-character substring starting at position 2).
tst_1 = tst.withColumn("col3_extract", F.substring(F.col('col3'), 2, 1))

# Keep only the rows whose values will be collected. Renaming col2 also
# sidesteps Spark self-join column ambiguity. (The substring search could
# equally be done with a regex function.)
tst_filter = tst.where(is_plain_value).withColumnRenamed('col2', 'col2_collect')

# One list of col3 values per col2_collect key.
tst_clct = tst_filter.groupby('col2_collect').agg(
    F.collect_list('col3').alias('col3_collect')
)

# Left-join the main table to the collected lists on the extracted key,
# then drop the helper key column.
join_condition = tst_1.col3_extract == tst_clct.col2_collect
tst_join = tst_1.join(tst_clct, on=join_condition, how='left').drop('col2_collect')

# Plain values become a one-element array; '&' rows take the collected list.
result_expr = F.when(is_plain_value, F.array(F.col('col3'))).otherwise(
    F.col('col3_collect')
)
tst_result = tst_join.withColumn("result", result_expr)

Results :
+----+----+----+------------+------------+------+
|col1|col2|col3|col3_extract|col3_collect|result|
+----+----+----+------------+------------+------+
|   2|   c|  &a|           a|      [3, 4]|[3, 4]|
|   2|   c|  &b|           b|      [7, 5]|[7, 5]|
|   2|   d|  &b|           b|      [7, 5]|[7, 5]|
|   1|   a|   3|            |        null|   [3]|
|   1|   a|   4|            |        null|   [4]|
|   1|   b|   5|            |        null|   [5]|
|   1|   b|   7|            |        null|   [7]|
+----+----+----+------------+------------+------+

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  有刺的猬        
                
              
                            
                2021-01-16 18:07
              
            
            
                                                                       
Try this: a self-join with a collected list, using an `rlike` join condition, is the way to go.
# Sample DataFrame used by this answer.
df.show()

#+-------+---------+---------+
#|comment|input_col|input_val|
#+-------+---------+---------+
#|     11|        a|        1|
#|     12|        a|        2|
#|     15|        b|        5|
#|     16|        b|        6|
#|     17|        c|       &b|
#|     17|        c|        7|
#+-------+---------+---------+

# Collect all input_val entries per input_col. The key is renamed to x1 so it
# does not clash with df's own input_col in the self-join.
values_by_key = (
    df.groupBy("input_col")
    .agg(F.collect_list("input_val").alias("y1"))
    .withColumnRenamed("input_col", "x1")
)

# Left self-join: a row picks up the list y1 of every key x1 that its
# input_val matches as a regex (e.g. '&b' matches key 'b').
joined = df.join(values_by_key, F.expr("""input_val rlike x1"""), 'left')

# Values that cast cleanly to int are wrapped in a one-element array;
# everything else takes the collected list from the join.
casts_to_int = F.col("input_val").cast("int").isNotNull()
new_col_expr = F.when(casts_to_int, F.array("input_val")).otherwise(F.col("y1"))

joined.withColumn("new_col", new_col_expr).drop("x1", "y1").show()

#+-------+---------+---------+-------+
#|comment|input_col|input_val|new_col|
#+-------+---------+---------+-------+
#|     11|        a|        1|    [1]|
#|     12|        a|        2|    [2]|
#|     15|        b|        5|    [5]|
#|     16|        b|        6|    [6]|
#|     17|        c|       &b| [5, 6]|
#|     17|        c|        7|    [7]|
#+-------+---------+---------+-------+

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
                             
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复