Extract text using regex in R

后端未结

关注

 5  1949

一整个雨季 2021-01-25 02:15

I read a text file containing the data below and am trying to convert it to a data frame:

Id:   1
ASIN: 0827229534
  title: Patterns of Preaching: A Sermon Sampler
  group


      
      
        
          5条回答        

        
                    
            
            
                         
                
              
              
                
                   一生所求
                                             
                
                
                (楼主)
            
              
              
                2021-01-25 03:02
              

            
            
                        
I am mostly using base R here (apart from zoo and tidyr). The code may be a little long, but it gets the desired result.

# Turn a "field: value" text dump into a data frame with one row per Id.
# Uses zoo and tidyr through `::`, so both packages must be installed.
options(stringsAsFactors = FALSE)

# Input file
text <- readLines("https://raw.githubusercontent.com/pranavn91/PhD/master/Expt/sample.txt")

# Search words as per OP
search_words <- c("Id", "ASIN", "title", "group", "salesrank", "similar", "avg rating")

# Keep only the lines that mention at least one of the search words.
keep <- grepl(paste0(search_words, collapse = "|"), text)
textdf <- data.frame(text = text[keep], stringsAsFactors = FALSE)

# Make a key for each Id: a line of the form "Id: <digits>" is reduced to its
# digits and parsed as a number; any other line fails to parse and becomes NA.
# That coercion warning is expected, so it is silenced for this call only.
textdf$key <- suppressWarnings(
  as.numeric(gsub("Id:\\s+(\\d+)", "\\1", textdf$text))
)
if (interactive()) {
  View(textdf) # optional visual check; skipped when run non-interactively
}

# Propagate each key down to the lines that follow it. na.rm = FALSE keeps
# leading NA rows, so the result always has exactly one entry per row of
# textdf (the default would drop them and make the assignment fail).
textdf$key <- zoo::na.locf(textdf$key, na.rm = FALSE)

# Remove the text before "avg rating"
textdf$text <- gsub("(.*)(?=avg rating:\\s*\\d+)", "", textdf$text, perl = TRUE)
# Remove the text after "similar: <count>"
textdf$text <- gsub("(similar:\\s*\\d+)(.*)", "\\1", textdf$text, perl = TRUE)
# Remove leading and trailing blanks
textdf$text <- trimws(textdf$text)

# Replace only the first ":" with "+" and split on "+", so that later colons
# (e.g. inside a title) stay in the value. "+" was chosen as the separator
# because it is very uncommon in the data.
textdf$text <- sub(":", "+", textdf$text)
splits <- strsplit(textdf$text, "\\+")

# Pad every split to the same length (with NA) so the pieces can be row-bound.
max_len <- max(lengths(splits))
all_lyst_eq_len <- lapply(splits, `length<-`, max_len)
df_final <- data.frame(
  cbind(do.call("rbind", all_lyst_eq_len), textdf$key),
  stringsAsFactors = FALSE
)

# Remove the duplicates; there are some dups in the data.
df_final <- df_final[!duplicated(df_final), ]

# Reshape from long (X1 = field name, X2 = value) to one column per field.
# NOTE(review): tidyr::spread() is superseded by tidyr::pivot_wider(); kept
# here so the result stays a plain data.frame with the same layout.
df_f <- tidyr::spread(df_final, X1, X2)

# Final dataset
df_f[, c("Id", "ASIN", "title", "group", "similar", "avg rating")]


Output:

The text file is heavily wrapped, so I am adding a screenshot instead; my apologies to the community.

The output is the same as what the OP asked for.


    
             
                                                        
            
            
              
                
                0
              
                   
                
               讨论(0)
              
                                                  
              
              
                          
             
       
          
              
                                       
     查看其它5个回答


            
                         
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
                              			
        
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复