Split thousands of columns at a time by '/' on multiple lines, sort the values in the new rows and add 'NA' values

前端未结

关注

 4  1677

I would like to split a data frame with thousands of columns. The data frame looks like this:

# sample data of four columns
sample <-read.table(stdin(),header


                      
              相关标签:


      
      
        
          4条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  孤街浪徒        
                
              
                            
                2021-01-22 21:43
              
            
            
                                                                       
Here's an alternative solution that instead of generating multiple rows, generates a bit mask for each value indicating presence or absence of the "0" "1" "2" etc bit.

> sample <-read.table(stdin(),header=TRUE,sep="",
                          row.names=1,colClasses="character")
0: POS v1  v2  v3  v4
1: 152 0   0/1 0/2 0/1/2
2: 73  1   0   0/1 0/1
3: 185 0   1   0/3 0
4: 
> # transform the strings into bit masks
> B<-function(X)lapply(strsplit(X,"/"),
                function(n)Reduce(bitOr,bitwShiftL(1,as.numeric(n)),0))
> B("0/1")
[[1]]
[1] 3
> # apply it everywhere
> s<-colwise(B)(sample)
> rownames(s)<-rownames(sample)
> s
    v1 v2 v3 v4
152  1  3  5  7
73   2  1  3  3
185  1  2  9  1


While it's not what you asked for, assuming the set of enum values is small (0,1,2) it is much much more efficient in storage space and can be processed easily:

Which elements have v1 "0" and v3 "0" and "1"

> subset(s, bitAnd(v1,B("0")) & bitAnd(v4,B("0/1")))
    v1 v2 v3 v4
152  1  3  5  7
185  1  2  9  1

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  梦毁少年i        
                
              
                            
                2021-01-22 21:52
              
            
            
                                                                       
I guess the interesting data is really a matrix

m = as.matrix(sample[,-1])


The underlying data is then a vector with relatively few unique values; we'll map the unique values to their integer representation, using the map where possible to minimize the number of iterations in any loops that are necessary

s = as.character(m)
map = lapply(strsplit(setNames(unique(s), unique(s)), "/"), as.integer)


Here's the number of times each row needs to be replicated

row.len = apply(matrix(sapply(map, max)[s], ncol=ncol(m)), 1, max) + 1


and the offsets into s of each row

offset = head(c(1, cumsum(rep(row.len, ncol(m))) + 1), -1)


Calculate the values of each mapped element, and the index of the value in s

v = unlist(unname(map)[match(s, names(map))])
idx = rep(offset, sapply(map, length)[s]) + v


Finally, allocate the result matrix of NA's, and update the non-NA values

ans = matrix(NA_integer_, sum(row.len), ncol(m))
ans[idx] = v


As a function:

flatten <- function(sample) {
    m = as.matrix(sample[,-1])
    s = as.character(m)
    map = lapply(strsplit(setNames(unique(s), unique(s)), "/"), as.integer)
    row.len = apply(matrix(sapply(map, max)[s], ncol=ncol(m)), 1, max) + 1
    offset = head(c(1, cumsum(rep(row.len, ncol(m))) + 1), -1)
    v = unlist(unname(map)[match(s, names(map))])
    idx = rep(offset, sapply(map, length)[s]) + v
    ans = matrix(NA_integer_, sum(row.len), ncol(m),
        dimnames=list(NULL, colnames(sample)[-1]))
    ans[idx] = v
    cbind(POS=rep(sample[,1], row.len), as.data.frame(ans))
}


The slowest part of this will be the apply function for calculating row.len.  Some timing (I guess the dimensions are not correct for the problem...)

xx = do.call(rbind, replicate(10000, sample, simplify=FALSE))
dim(xx)
## [1] 30000     5
system.time(flatten(xx))
##   user  system elapsed 
##  0.192   0.000   0.194 


versus about 5s for the data.table solution above.
                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  借酒劲吻你        
                
              
                            
                2021-01-22 21:54
              
            
            
                                                                       
Here is a solution using data.table:

library("data.table")
dt <- data.table(df)
fun <- function(DT) {
  split <- strsplit(vapply(DT, as.character, character(1L)), "/")
  lapply(split, 
    function(x, max.len) as.numeric(x)[match(0:max.len, as.numeric(x))],
    max.len=max(as.numeric(unlist(split)))
) }
dt[, fun(.SD), by=POS]
#    POS v1 v2 v3 v4
# 1: 152  0  0  0  0
# 2: 152 NA  1 NA  1
# 3: 152 NA NA  2  2
# 4:  73 NA  0  0  0
# 5:  73  1 NA  1  1
# 6: 185  0 NA  0  0
# 7: 185 NA  1 NA NA
# 8: 185 NA NA NA NA
# 9: 185 NA NA  3 NA


The idea is to use data.table to execute our function fun against the data elements of each row (i.e. excluding POS).  data.table will stitch back POS for our modified result.

Here fun starts by converting each data row to a character vector, and then splitting by /, which will produce a list with for each item, a character vector with as many elements as there were /, + 1.

Finally, lapply cycles through each of these list items, converting them all to the same length vectors, filling in with NA, and sorting.

data.table recognizes the resulting list as representing columns for our result set, and adds back the POS column as noted earlier.



EDIT: the following addresses a question in the comments:

val <- "0/2/3:25:0.008,0.85,0.002:0.004,0.013,0.345"
first.colon <- strsplit(val, ":")[[1]][[1]]
strsplit(first.colon, "/")[[1]]
// [1] "0" "2" "3"


The key thing to understand is strsplit returns a list with as many elements as there are items in your input vector.  In this toy example there is only one item in the vector, so there is only one item in the list, though each item is a character vector that can have multiple values (in this case, 3 after we split by /).  So something like this should work (but I haven't tested debugged):

dt <- data.table(df)
fun <- function(DT) {
  split <- strsplit(vapply(DT, as.character, character(1L)), ":")
  split.2 <- vapply(split, `[[`, character(1L), 1)  # get just first value from `:` split
  split.2 <- strsplit(split.2, "/")
  lapply(split.2, 
    function(x, max.len) as.numeric(x)[match(0:max.len, as.numeric(x))],
    max.len=max(as.numeric(unlist(split)))
) }

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  面向向阳花        
                
              
                            
                2021-01-22 22:00
              
            
            
                                                                       
tmp <- apply(sample[-1], 1, function(x) {
  s <- strsplit(x, "\\/")
  num <- lapply(s, as.integer)
  ma <- max(unlist(num))
  vec <- rep(NA_integer_, ma + 1)
  sapply(num, function(y) replace(vec, y + 1, y)) 
})

res <- data.frame(POS = rep(sample[[1]], sapply(tmp, nrow)),
                  do.call(rbind, tmp))

#   POS v1 v2 v3 v4
# 1 152  0  0  0  0
# 2 152 NA  1 NA  1
# 3 152 NA NA  2  2
# 4  73 NA  0  0  0
# 5  73  1 NA  1  1
# 6 185  0 NA  0  0
# 7 185 NA  1 NA NA
# 8 185 NA NA NA NA
# 9 185 NA NA  3 NA

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
                             
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复