Create sub-columns in a dataframe using another dataframe

前端未结

关注

 3  427

I am new to Python and pandas. I have the following dataframe.

did           features   offset   word   JAPE_feature  manual_feature 
0             200


                      
              相关标签:


      
      
        
          3条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  [愿得一人]        
                
              
                            
                2021-01-29 04:10
              
            
            
                                                                       
These types of problems can be solved in many ways; here I use a simple one. Create a DataFrame whose column names are the values in the features list, then use comparison logic to fill it with 0 and 1. Other approaches can avoid explicit Python-level loops.

import pandas as pd

# Sample input: one feature value per id.
data = {
    'id': [0, 1, 2, 3, 4, 5, 7, 8, 9, 10],
    'features': [200, 200, 200, 200, 100, 100, 2200, 2600, 2600, 4600],
}

df1 = pd.DataFrame(data)

# Every feature value that should get its own 0/1 indicator column,
# including values that never occur in df1 (156, 162).
features_list = [100, 200, 2200, 2600, 156, 162, 4600]
id_list = df1['id'].to_list()

# Compare the whole 'features' column against each candidate value at once.
# This works row by row (so a repeated id cannot pick up another row's
# feature) and yields integer columns instead of object-dtype ones.
df2 = pd.DataFrame(
    {feature: df1['features'].eq(feature).astype(int) for feature in features_list}
)

# Put the ids in front of the indicator columns.
df2.insert(0, "id", id_list)

>>>(df2)
   id 100 200 2200 2600 156 162 4600
0   0   0   1    0    0   0   0    0
1   1   0   1    0    0   0   0    0
2   2   0   1    0    0   0   0    0
3   3   0   1    0    0   0   0    0
4   4   1   0    0    0   0   0    0
5   5   1   0    0    0   0   0    0
6   7   0   0    1    0   0   0    0
7   8   0   0    0    1   0   0    0
8   9   0   0    0    1   0   0    0
9  10   0   0    0    0   0   0    1

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  野的像风        
                
              
                            
                2021-01-29 04:14
              
            
            
                                                                       
Use get_dummies with DataFrame.reindex:

# Target column order; labels absent from the data become all-zero columns
# through fill_value.
feat = [100,200,2200,2600,156,162,4600,100]
# pop() removes 'features' from df before its indicator columns are joined back.
# dtype=int keeps the indicators as 0/1 integers: pandas >= 2.0 otherwise
# returns booleans, which would not match the zero-filled columns.
df = df.join(pd.get_dummies(df.pop('features'), dtype=int).reindex(feat, axis=1, fill_value=0))
print (df)
   id  100  200  2200  2600  156  162  4600  100
0   0    0    1     0     0    0    0     0    0
1   1    0    1     0     0    0    0     0    0
2   2    0    1     0     0    0    0     0    0
3   4    1    0     0     0    0    0     0    1
4   5    1    0     0     0    0    0     0    1
5   7    0    0     1     0    0    0     0    0
6   8    0    0     0     1    0    0     0    0
7   9    0    0     0     1    0    0     0    0
8  10    0    0     0     0    0    0     1    0


If you need a MultiIndex in the columns, pass mux to reindex, and also convert the id column to the index:

feat = [100,200,2200,2600,156,162,4600,100]
mux = pd.MultiIndex.from_product([['features'],feat])

df = pd.get_dummies(df.set_index('id')['features']).reindex(mux, axis=1, fill_value=0)
print (df)
   features                                   
       100  200  2200 2600 156  162  4600 100 
id                                            
0         0    0    0    0    0    0    0    0
1         0    0    0    0    0    0    0    0
2         0    0    0    0    0    0    0    0
4         0    0    0    0    0    0    0    0
5         0    0    0    0    0    0    0    0
7         0    0    0    0    0    0    0    0
8         0    0    0    0    0    0    0    0
9         0    0    0    0    0    0    0    0
10        0    0    0    0    0    0    0    0


EDIT:

# Columns to expand into 0/1 indicator columns.
cols = ['features', 'JAPE_feature', 'manual_feature']

# dtype=int keeps the indicators as 0/1 on pandas >= 2.0, where the default is bool.
df = pd.get_dummies(df, columns=cols, dtype=int)
# Split each label on its last underscore ('JAPE_feature_100' -> ('JAPE_feature', '100'))
# to build a two-level column index. Labels without an underscore get NaN as the
# second level. n must be passed by keyword: pandas >= 2.0 rejects it positionally.
df.columns = df.columns.str.rsplit('_', n=1, expand=True)
print (df)
  did offset  word features                    JAPE_feature                \
  NaN    NaN   NaN      100 200 2200 2600 4600          100 200 2200 2600   
0   0      0    aa        0   1    0    0    0            0   1    0    0   
1   0     11    bf        0   1    0    0    0            0   1    0    0   
2   0     12    vf        0   1    0    0    0            1   0    0    0   
3   0     13    rw        1   0    0    0    0            0   0    1    0   
4   0     14   asd        1   0    0    0    0            0   0    0    1   
5   0     16  dsdd        0   0    1    0    0            0   0    1    0   
6   0     18    wd        0   0    0    1    0            0   0    1    0   
7   0     20   wsw        0   0    0    1    0            0   0    0    1   
8   0     21    sd        0   0    0    0    1            0   0    0    0   

       manual_feature                     
  4600            100 200 2200 2600 4600  
0    0              0   1    0    0    0  
1    0              0   1    0    0    0  
2    0              1   0    0    0    0  
3    0              0   0    1    0    0  
4    0              1   0    0    0    0  
5    0              0   0    1    0    0  
6    0              0   0    0    1    0  
7    0              0   0    0    1    0  
8    1              0   0    0    0    1  


If you want to avoid missing values in the column MultiIndex for the columns that are not expanded, move those columns into the index first:

# Columns to expand into 0/1 indicator columns.
cols = ['features', 'JAPE_feature', 'manual_feature']
# Move every other column into the row index so that only the expanded
# columns remain as column labels.
df = df.set_index(df.columns.difference(cols).tolist())

# dtype=int keeps the indicators as 0/1 on pandas >= 2.0, where the default is bool.
df = pd.get_dummies(df, columns=cols, dtype=int)
# Split each label on its last underscore to build the two-level column index.
# n must be passed by keyword: pandas >= 2.0 rejects it positionally.
df.columns = df.columns.str.rsplit('_', n=1, expand=True)
print (df)
                features                    JAPE_feature                     \
                     100 200 2200 2600 4600          100 200 2200 2600 4600   
did offset word                                                               
0   0      aa          0   1    0    0    0            0   1    0    0    0   
    11     bf          0   1    0    0    0            0   1    0    0    0   
    12     vf          0   1    0    0    0            1   0    0    0    0   
    13     rw          1   0    0    0    0            0   0    1    0    0   
    14     asd         1   0    0    0    0            0   0    0    1    0   
    16     dsdd        0   0    1    0    0            0   0    1    0    0   
    18     wd          0   0    0    1    0            0   0    1    0    0   
    20     wsw         0   0    0    1    0            0   0    0    1    0   
    21     sd          0   0    0    0    1            0   0    0    0    1   

                manual_feature                     
                           100 200 2200 2600 4600  
did offset word                                    
0   0      aa                0   1    0    0    0  
    11     bf                0   1    0    0    0  
    12     vf                1   0    0    0    0  
    13     rw                0   0    1    0    0  
    14     asd               1   0    0    0    0  
    16     dsdd              0   0    1    0    0  
    18     wd                0   0    0    1    0  
    20     wsw               0   0    0    1    0  
    21     sd                0   0    0    0    1 


EDIT:

If you want to compare the columns in a list against the manual_feature column, use DataFrame.eq and convert the result to integers:

# Columns to check against manual_feature.
cols = ['JAPE_feature', 'features']
# Compare each selected column with manual_feature row by row,
# then turn True/False into 1/0.
is_match = df[cols].eq(df['manual_feature'], axis=0)
df1 = is_match.astype(int)
print(df1)
   JAPE_feature  features
0             1         1
1             1         1
2             1         0
3             1         0
4             0         1
5             1         1
6             0         1
7             1         1
8             1         1 

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  暗喜        
                
              
                            
                2021-01-29 04:25
              
            
            
                                                                       
Less fancy solution, but maybe easier to understand:

First, put the feature value of each row (the value that decides which column gets a 1 on that row) in a list called, for example, list_features.

Then:

# List all the possible features; repeated values collapse into one column,
# keeping first-seen order.
feat = [100,200,2200,2600,156,162,4600,100]
columns = list(dict.fromkeys(feat))

# Build one 0/1 row per entry of list_features, then create the df in one step.
# DataFrame.append was removed in pandas 2.0, and appending row by row copies
# the whole frame on every iteration.
rows = [{y: 1 if x == y else 0 for y in columns} for x in list_features]
df_final = pd.DataFrame(rows, columns=columns)

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
                             
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复