Count features for different ids in columns in R in faster way

前端未结

关注

 4  1610

I am trying to process a 20 GB data file in R. I have 16 GB of RAM and an i7 processor. I am reading the data using:

y<-read.table(file=\"sample.csv\", header =


                      
              相关标签:


      
      
        
          4条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  小鲜肉        
                
              
                            
                2021-01-27 14:32
              
            
            
                                                                       
How about table()? 

> set.seed(5)
> ids <- sample(1:3, 12, TRUE)
> features <- sample(1:4, 12, TRUE)
> cbind(ids, features)
      ids features
 [1,]   1        2
 [2,]   3        3
 [3,]   3        2
 [4,]   1        1
 [5,]   1        2
 [6,]   3        4
 [7,]   2        3
 [8,]   3        4
 [9,]   3        4
[10,]   1        3
[11,]   1        1
[12,]   2        1

> table(ids, features)
   features
ids 1 2 3 4
  1 2 2 1 0
  2 1 0 1 0
  3 0 1 1 3


So for example feature 4 appears 3 times in id 3.

EDIT: You can use as.data.frame() to "flatten" the table and get:

> as.data.frame(table(ids, features))
   ids features Freq
1    1        1    2
2    2        1    1
3    3        1    0
4    1        2    2
5    2        2    0
6    3        2    1
7    1        3    1
8    2        3    1
9    3        3    1
10   1        4    0
11   2        4    0
12   3        4    3

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  醉酒成梦        
                
              
                            
                2021-01-27 14:35
              
            
            
                                                                       
For your example:

# table(y) cross-tabulates the columns of y and sign() turns every non-zero
# count into 1, so summing along the first margin gives, for each level of the
# first column, how many distinct values of the other column occur with it.
# NOTE(review): assumes y has exactly two columns, id then feature -- confirm.
apply(sign(table(y)), 1, sum)
21 22 23 
 4  4  2 

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  梦谈多话        
                
              
                            
                2021-01-27 14:37
              
            
            
                                                                       
I would recommend the data.table package for this (fread is very fast!), then set up a loop that reads the file in chunks and accumulates the per-id feature counts. Here are some adapted lines from a function I use for looping over a file; it probably won't work as is, but it should give you an idea of what to do:

library(data.table)

# Rows to read per chunk; lower this if one chunk does not fit in memory.
# Stored as an integer so it prints without scientific notation.
chunk_rows <- 50000000L

# Count the lines of the file with `wc -l`. Its output looks like
# "<count> <file>" and may start with blanks, so trim it before dropping
# everything after the count. shQuote() protects paths containing spaces.
wc_out <- system2("wc", c("-l", shQuote(your.file)), stdout = TRUE)
LineNu <- as.numeric(sub(" .*$", "", trimws(wc_out)))
if (length(LineNu) != 1L || is.na(LineNu)) {
  stop("Could not determine the number of lines in ", your.file,
       call. = FALSE)
}

# Only the first line carries the header, so remember the column names and
# re-apply them to every chunk (chunks are read with header = FALSE).
col_names <- names(fread(your.file, nrows = 0, sep = ",", header = TRUE))

# Number of data rows already consumed before each chunk starts:
# 0, chunk_rows, 2 * chunk_rows, ... (LineNu - 1 data rows in total).
Starts <- seq(0, max(LineNu - 2, 0), by = chunk_rows)

# Keep only the small per-chunk summaries, never the raw chunks, so memory
# use stays bounded by one chunk.
chunk_counts <- vector("list", length(Starts))
for (i in seq_along(Starts)) {
  cat(paste0("Filtering next ", chunk_rows, " lines    ",
             i, " of ", length(Starts), " \n"))
  # skip = header line + rows handled by earlier chunks. nrows is an upper
  # bound, so the last (shorter) chunk simply ends at the end of the file.
  DT <- fread(your.file, skip = Starts[i] + 1, nrows = chunk_rows,
              sep = ",", header = FALSE, col.names = col_names)
  # .N is the number of rows per id within this chunk.
  chunk_counts[[i]] <- DT[, list(feature = .N), by = id]
  rm(DT)
  gc()
}

# Rows for one id can be spread over several chunks, so add up the
# per-chunk counts to get one row per id.
KEEP.DT <- rbindlist(chunk_counts)[, list(feature = sum(feature)), by = id]
rm(chunk_counts)


Note that rows for the same id can end up in different chunks, so the per-chunk counts (the DT[, ..., by = id] part) have to be summed again by id once all chunks have been read.
                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  臣服心动        
                
              
                            
                2021-01-27 14:38
              
            
            
                                                                       
I admit that I don't really understand the question the way it is written, but it sounds like "data.table" would be the way to go, and you should look into the special symbol .N (the number of rows in the current group). As already mentioned, fread is going to be much better than read.csv, so I'll assume that you've read the data into a data.table named "DT".

Here's a small one:

# Small example table: 6 rows for id 21, 5 rows for id 22 and 2 rows for id 23.
DT <- data.table(
  id = rep(c(21, 22, 23), times = c(6, 5, 2)),
  feature = c(
    234, 290, 234, 7802, 3467, 234,  # id 21
    235, 235, 1234, 236, 134,        # id 22
    9133, 223                        # id 23
  )
)
DT
#     id feature
#  1: 21     234
#  2: 21     290
#  3: 21     234
#  4: 21    7802
#  5: 21    3467
#  6: 21     234
#  7: 22     235
#  8: 22     235
#  9: 22    1234
# 10: 22     236
# 11: 22     134
# 12: 23    9133
# 13: 23     223


If you just wanted to count the number of each unique feature, you could do:

# .N is the number of rows in each group; grouping by both columns yields one
# row per distinct (id, feature) pair together with its row count.
DT[, .N, by = "id,feature"]
#     id feature N
#  1: 21     234 3
#  2: 21     290 1
#  3: 21    7802 1
#  4: 21    3467 1
#  5: 22     235 2
#  6: 22    1234 1
#  7: 22     236 1
#  8: 22     134 1
#  9: 23    9133 1
# 10: 23     223 1


If you wanted the count of the first "feature", by "id", you could use:

# Step 1: count rows per (id, feature) pair.
# Step 2: within each id, keep only the first row of that summary
#         (.SD is the per-group subset, so .SD[1] is its first row).
DT[, .N, by = "id,feature"][, .SD[1], by = "id"]
#    id feature N
# 1: 21     234 3
# 2: 22     235 2
# 3: 23    9133 1


If you wanted to get the most frequently occurring "feature" by "id" (which is the same result as above, in this case), you can try the following:

# Step 1: count rows per (id, feature) pair.
# Step 2: within each id, pick from every column of the per-group subset the
#         element at the position of the largest count; which.max() returns
#         the first maximum, so ties resolve to the earliest row.
DT[, .N, by = "id,feature"][, lapply(.SD, function(x) x[which.max(N)]), by = "id"]




Update

Based on your new description, this seems much easier.

Just merge your datasets and aggregate the counts. Again, fast to do in "data.table":

# Convert both data frames to data.tables keyed on (id, feature) so that they
# can be joined on those two columns.
DTY <- data.table(y, key = "id,feature")
DTX <- data.table(x, key = "id,feature")
# DTY[DTX] looks up every row of DTX in DTY by key; .N then counts the
# resulting rows per id.
DTY[DTX][, .N, by = id]
#    id N
# 1: 21 3
# 2: 22 2
# 3: 23 3


Or:

# Alternative order: first count rows per key combination (id, feature) in
# DTY, then look up the rows of DTX in that summary.
DTY[, .N, by = key(DTY)][DTX]
#    id feature N
# 1: 21     234 3
# 2: 22     235 2
# 3: 23     223 3


This is assuming that "x" and "y" are defined as the following to begin with:

# Lookup table: one (id, feature) pair per id plus a counts column.
x <- data.frame(
  id = 21:23,
  feature = c(234L, 235L, 223L),
  counts = c(3L, 2L, 3L)
)

# Raw observations: 17 (id, feature) rows -- 6 for id 21, 5 for id 22 and
# 6 for id 23.
y <- data.frame(
  id = rep(21:23, times = c(6L, 5L, 6L)),
  feature = c(
    234L, 290L, 234L, 7802L, 3467L, 234L,  # id 21
    235L, 235L, 1234L, 236L, 134L,         # id 22
    9133L, 223L, 245L, 223L, 122L, 223L    # id 23
  )
)

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
                             
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复