How to split data into training/testing sets using sample function

前端未结

关注

 24  1503

I\'ve just started using R and I\'m not sure how to incorporate my dataset with the following sample code:

sample(x, size, replace = FALSE, prob = NULL)


                      
              相关标签:


      
      
        
          24条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  星月不相逢        
                
              
                            
                2020-11-22 11:08
              
            
            
                                                                       
I bumped into this one, it can help too.

set.seed(12)
data = Sonar[sample(nrow(Sonar)),]#reshufles the data
bound = floor(0.7 * nrow(data))
df_train = data[1:bound,]
df_test = data[(bound+1):nrow(data),]

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  庸人自扰        
                
              
                            
                2020-11-22 11:08
              
            
            
                                                                       
We can divide data into a particular ratio here it is 80% train and 20% in a test dataset. 

ind <- sample(2, nrow(dataName), replace = T, prob = c(0.8,0.2))
train <- dataName[ind==1, ]
test <- dataName[ind==2, ]

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  一生所求        
                
              
                            
                2020-11-22 11:09
              
            
            
                                                                       
It can be easily done by:

set.seed(101) # Set Seed so that same sample can be reproduced in future also
# Now Selecting 75% of data as sample from total 'n' rows of the data  
sample <- sample.int(n = nrow(data), size = floor(.75*nrow(data)), replace = F)
train <- data[sample, ]
test  <- data[-sample, ]


By using caTools package:

require(caTools)
set.seed(101) 
sample = sample.split(data$anycolumn, SplitRatio = .75)
train = subset(data, sample == TRUE)
test  = subset(data, sample == FALSE)

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  梦如初夏        
                
              
                            
                2020-11-22 11:09
              
            
            
                                                                       
Assuming df is your data frame, and that you want to create 75% train and 25% test

all <- 1:nrow(df)
train_i <- sort(sample(all, round(nrow(df)*0.75,digits = 0),replace=FALSE))
test_i <- all[-train_i]


Then to create a train and test data frames

df_train <- df[train_i,]
df_test <- df[test_i,]

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  北荒        
                
              
                            
                2020-11-22 11:09
              
            
            
                                                                       
set.seed(123)
llwork<-sample(1:length(mydata),round(0.75*length(mydata),digits=0)) 
wmydata<-mydata[llwork, ]
tmydata<-mydata[-llwork, ]

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  执念已碎        
                
              
                            
                2020-11-22 11:10
              
            
            
                                                                       
There are numerous approaches to achieve data partitioning. For a more complete approach take a look at the createDataPartition function in the caTools package.

Here is a simple example:

data(mtcars)

## 75% of the sample size
smp_size <- floor(0.75 * nrow(mtcars))

## set the seed to make your partition reproducible
set.seed(123)
train_ind <- sample(seq_len(nrow(mtcars)), size = smp_size)

train <- mtcars[train_ind, ]
test <- mtcars[-train_ind, ]

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
   
          
     上一页
1
2
3
4
           
           
        
                                  
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复