Collapse multiple columns into one column and generate an index variable

后端未结

关注

 4  950

I have three date columns as shown below

       Id Date1       Date2         Date3
       12 2005-12-22  NA            NA
       11 2009-10-11  NA            NA


                      
              相关标签:


      
      
        
          4条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  我寻月下人不归        
                
              
                            
                2021-01-22 09:17
              
            
            
                                                                       
Using base R, we could get the column index of the non-NA values for the 'Date' columns in each row by matrix multiplication 

 # Multiply the non-NA indicator matrix of the date columns by the column
 # positions 1..k; each row's result is the sum of the positions that hold a
 # value, i.e. the column index itself when a row has exactly one non-NA date.
 indx <- (!is.na(df1[-1])) %*% seq_along(df1[-1])


Or using max.col on the logical matrix (!is.na(df1[-1]))

 # Position of the first non-NA date column in each row. max.col() breaks
 # ties at random by default, so a row with more than one non-NA date would
 # give a different answer from run to run; "first" makes it deterministic.
 indx <- max.col(!is.na(df1[-1]), ties.method = "first")


Then create the new data.frame with the 'Id' column from 'df1', 'Date' extracted with a row/column index matrix, and 'Index' taken from 'indx' above.

 # Assemble the result: keep 'Id', pick one date per row from the date
 # columns using a two-column (row, column) index matrix, and report the
 # column position as 'Index'. seq_len() keeps the row index empty for a
 # zero-row input, where 1:nrow() would produce c(1, 0).
 data.frame(
   Id = df1[1],
   Date = df1[-1][cbind(seq_len(nrow(df1)), indx)],
   Index = indx
 )
 #  Id       Date Index
 #1 12 2005-12-22     1
 #2 11 2009-10-11     1
 #3 29 2005-04-11     2
 #4 45 2008-11-06     3
 #5 39 2006-01-02     3
 #6 44 2005-04-16     2


Or using dplyr/tidyr

 library(dplyr)
 library(tidyr)
 # Stack every column except Id into long form with pivot_longer(), which
 # supersedes gather(). The trailing digits of each column name become the
 # integer 'Index', and rows whose date is NA are dropped while pivoting.
 # names_transform needs a reasonably recent tidyr (1.1.0 or later).
 pivot_longer(df1, cols = -Id, names_to = "Index",
              names_pattern = "[^0-9]+([0-9]+)",
              names_transform = list(Index = as.integer),
              values_to = "Date", values_drop_na = TRUE) %>%
   # pivot_longer() emits rows Id by Id; sort by Index to get the
   # column-by-column order shown below, and return a plain data.frame.
   arrange(Index) %>%
   as.data.frame()
 #  Id Index       Date
 #1 12     1 2005-12-22
 #2 11     1 2009-10-11
 #3 29     2 2005-04-11
 #4 44     2 2005-04-16
 #5 45     3 2008-11-06
 #6 39     3 2006-01-02


data

# Example data: one row per Id, with exactly one of the three date columns
# filled in (as a character string) and the other two NA.
df1 <- data.frame(
  Id = c(12L, 11L, 29L, 45L, 39L, 44L),
  Date1 = c("2005-12-22", "2009-10-11", NA, NA, NA, NA),
  Date2 = c(NA, NA, "2005-04-11", NA, NA, "2005-04-16"),
  Date3 = c(NA, NA, NA, "2008-11-06", "2006-01-02", NA),
  # Keep the dates as character on every R version.
  stringsAsFactors = FALSE
)

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  离开以前        
                
              
                            
                2021-01-22 09:25
              
            
            
                                                                       
This is a classic use of reshape to go from "wide" to "long" format. If d is your data.frame:

# Wide -> long: stack Date1..Date3 into a single "Date" column, one row per
# (Id, source column) pair. "Index" records which of the three columns each
# value came from; rows are identified by "Id".
d2 <- reshape(
  data = d,
  direction = "long",
  varying = paste0("Date", 1:3),
  v.names = "Date",
  timevar = "Index",
  idvar = "Id"
)


Result:

> d2
     Id Index       Date
12.1 12     1 2005-12-22
11.1 11     1 2009-10-11
29.1 29     1       <NA>
45.1 45     1       <NA>
39.1 39     1       <NA>
44.1 44     1       <NA>
12.2 12     2       <NA>
11.2 11     2       <NA>
29.2 29     2 2005-04-11
45.2 45     2       <NA>
39.2 39     2       <NA>
44.2 44     2 2005-04-16
12.3 12     3       <NA>
11.3 11     3       <NA>
29.3 29     3       <NA>
45.3 45     3 2008-11-06
39.3 39     3 2006-01-02
44.3 44     3       <NA>


If you don't want all the NA values (above) you can subset:

> d2[!is.na(d2$Date),]
     Id Index       Date
12.1 12     1 2005-12-22
11.1 11     1 2009-10-11
29.2 29     2 2005-04-11
44.2 44     2 2005-04-16
45.3 45     3 2008-11-06
39.3 39     3 2006-01-02

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  萌比男神i        
                
              
                            
                2021-01-22 09:29
              
            
            
                                                                       
You can consider melting your data.

Here's an example:

library(data.table)
library(reshape2)
# Melt a data.table copy of the data: "Id" stays as the identifier, every
# other column is stacked into variable/value pairs, and NA values are dropped.
melt(
  data = as.data.table(mydf),
  id.vars = "Id",
  na.rm = TRUE
)
#    Id variable      value
# 1: 12    Date1 2005-12-22
# 2: 11    Date1 2009-10-11
# 3: 29    Date2 2005-04-11
# 4: 44    Date2 2005-04-16
# 5: 45    Date3 2008-11-06
# 6: 39    Date3 2006-01-02

## More specific to what you want:
# Same melt, then turn the former column name into a numeric index: strip
# the leading "Date" (anchored, so only the prefix is removed) and convert
# what is left to integer rather than leaving it as a character string.
# The trailing [] prints the result, which := would otherwise return invisibly.
melt(as.data.table(mydf), id.vars = "Id", na.rm = TRUE)[
  , variable := as.integer(sub("^Date", "", variable))][]
#    Id variable      value
# 1: 12        1 2005-12-22
# 2: 11        1 2009-10-11
# 3: 29        2 2005-04-11
# 4: 44        2 2005-04-16
# 5: 45        3 2008-11-06
# 6: 39        3 2006-01-02

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  悲哀的现实        
                
              
                            
                2021-01-22 09:32
              
            
            
                                                                       
You can also use tidyr with a small hack for the id:

library(tidyr)

# Blank out the NAs so that pasting the date columns together leaves just
# the one date present in each row. NOTE: this overwrites `df` in place.
df[is.na(df)] <- ''

# `id` is the position of the first non-empty date column in each row.
# max.col() works row by row, so the ids stay aligned with their rows;
# which() on the logical matrix lists hits column by column, which assigns
# the ids in sorted order instead of matching them to the right Id.
transform(unite(df, 'Date', Date1:Date3, sep=''),
          id=max.col(df[-1] != '', ties.method = 'first'))
#  Id       Date id
#1 12 2005-12-22  1
#2 11 2009-10-11  1
#3 29 2005-04-11  2
#4 45 2008-11-06  3
#5 39 2006-01-02  3
#6 44 2005-04-16  2

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
                             
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复