How to find a columns set for a primary key candidate in CSV file?

前端未结

关注

 2  1365

I have a CSV file (not normalized, example, real file up to 100 columns):

   ID, CUST_NAME, CLIENT_NAME, PAYMENT_NUM, START_DATE, END_DATE
    1,     CUST1,


                      
              相关标签:


      
      
        
          2条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  广开言路        
                
              
                            
                2021-01-13 19:26
              
            
            
                                                                       
pandas and itertools will give you what you're looking for. 

import pandas
from itertools import chain, combinations

def key_options(items):
    return chain.from_iterable(combinations(items, r) for r in range(1, len(items)+1) )

df = pandas.read_csv('test.csv');

# iterate over all combos of headings, excluding ID for brevity
for candidate in key_options(list(df)[1:]):
    deduped = df.drop_duplicates(candidate)

    if len(deduped.index) == len(df.index):
        print ','.join(candidate)


This gives you the output:

PAYMENT_NUM, END_DATE
CUST_NAME, CLIENT_NAME, END_DATE
CUST_NAME, PAYMENT_NUM, END_DATE
CLIENT_NAME, PAYMENT_NUM, END_DATE
PAYMENT_NUM, START_DATE, END_DATE
CUST_NAME, CLIENT_NAME, PAYMENT_NUM, END_DATE
CUST_NAME, CLIENT_NAME, START_DATE, END_DATE
CUST_NAME, PAYMENT_NUM, START_DATE, END_DATE
CLIENT_NAME, PAYMENT_NUM, START_DATE, END_DATE
CUST_NAME, CLIENT_NAME, PAYMENT_NUM, START_DATE, END_DATE

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  梦谈多话        
                
              
                            
                2021-01-13 19:28
              
            
            
                                                                       
This is one way via itertools.combinations. It works by, for each set of columns, dropping duplicates and checking if the size of the dataframe changes.

This results in 44 distinct combinations of columns.

from itertools import combinations, chain

full_list = chain.from_iterable(combinations(df, i) for i in range(1, len(df.columns)+1))

n = len(df.index)

res = []
for cols in full_list:
    cols = list(cols)
    if len(df[cols].drop_duplicates().index) == n:
        res.append(cols)

print(len(res))  # 44

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
                             
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复