I want to remove duplicate words from a text file.
I have a text file which contains lines like the following:
None_None
ConfigHandler_56663624
ConfigHa
Here's an option that preserves order (unlike a set) but still has the same behaviour (note that the EOL character is deliberately stripped and blank lines are ignored)...
from collections import OrderedDict

with open('/home/jon/testdata.txt') as fin:
    lines = (line.rstrip() for line in fin)  # strip EOL characters
    unique_lines = OrderedDict.fromkeys(line for line in lines if line)  # skip blanks, keep first occurrence

print unique_lines.keys()
# ['None_None', 'ConfigHandler_56663624', 'ColumnConverter_56963312', 'PredicatesFactory_56963424', 'PredicateConverter_56963648', 'ConfigHandler_80134888']
Then you just need to write the above to your output file.
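A minimal sketch of that last step, assuming the unique_lines OrderedDict from above and a hypothetical output path:

with open('/home/jon/testdata_out.txt', 'w') as fout:
    # the keys of the OrderedDict are the unique lines, in first-seen order
    for line in unique_lines:
        fout.write(line + '\n')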
Here's how you can do it with sets (unordered results):
from pprint import pprint
with open('input.txt', 'r') as f:
    pprint(set(f.readlines()))
Additionally you may want to get rid of the newline characters first, as sketched below.
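A minimal sketch of that, stripping the trailing newline from each line before building the set (input.txt is just the illustrative filename from above):

from pprint import pprint

with open('input.txt', 'r') as f:
    # rstrip('\n') drops the EOL so 'foo\n' and 'foo' collapse into one entry
    pprint(set(line.rstrip('\n') for line in f))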
This way you get the same file out that was put in, minus the duplicates:
import os
import uuid

def _remove_duplicates(filePath):
    f = open(filePath, 'r')
    lines = f.readlines()
    lines_set = set(lines)  # drops duplicate lines
    tmp_file = str(uuid.uuid4())  # uniquely named temporary file
    out = open(tmp_file, 'w')
    for line in lines_set:
        out.write(line)
    f.close()
    out.close()  # flush and close before renaming over the original
    os.rename(tmp_file, filePath)
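For example (reusing the sample path from the first answer; purely illustrative):

_remove_duplicates('/home/jon/testdata.txt')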
Here is a simple solution using sets to remove the duplicates from the text file.
lines = open('workfile.txt', 'r').readlines()
lines_set = set(lines)
out = open('workfile.txt', 'w')
for line in lines_set:
    out.write(line)
out.close()
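Note that this only works because readlines() consumes the whole file before the second open(..., 'w') truncates it, and that the output order is arbitrary, since sets are unordered.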
If you just want de-duplicated output, you can use uniq and sort:
hvn@lappy: /tmp () $ sort -nr dup | uniq
PredicatesFactory_56963424
PredicateConverter_56963648
None_None
ConfigHandler_80134888
ConfigHandler_56663624
ColumnConverter_56963312
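As an aside, sort can de-duplicate by itself via its -u flag (sort -u dup), so the pipe through uniq isn't strictly necessary.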
For Python:
In [2]: with open("dup", 'rt') as f:
   ...:     lines = f.readlines()
   ...:
In [3]: lines
Out[3]:
['None_None\n',
'\n',
'ConfigHandler_56663624\n',
'ConfigHandler_56663624\n',
'ConfigHandler_56663624\n',
'ConfigHandler_56663624\n',
'\n',
'None_None\n',
'\n',
'ColumnConverter_56963312\n',
'ColumnConverter_56963312\n',
'\n',
'PredicatesFactory_56963424\n',
'PredicatesFactory_56963424\n',
'\n',
'PredicateConverter_56963648\n',
'PredicateConverter_56963648\n',
'\n',
'ConfigHandler_80134888\n',
'ConfigHandler_80134888\n',
'ConfigHandler_80134888\n',
'ConfigHandler_80134888\n']
In [4]: set(lines)
Out[4]:
set(['ColumnConverter_56963312\n',
'\n',
'PredicatesFactory_56963424\n',
'ConfigHandler_56663624\n',
'PredicateConverter_56963648\n',
'ConfigHandler_80134888\n',
'None_None\n'])
def remove_duplicates(infile):
    storehouse = set()  # lines seen so far
    with open('outfile.txt', 'w+') as out:
        for line in open(infile):
            if line not in storehouse:  # write each line only on first sight
                out.write(line)
                storehouse.add(line)

remove_duplicates('infile.txt')
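Unlike the pure set() approaches above, this streams the input and writes each line the first time it is seen, so it preserves the original order and only ever holds the set of unique lines in memory.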