extracting rows from CSV file based on specific keywords

后端未结

关注

 2  433

I have created a code to help me retrieving the data from csv file

  import re
keywords = {\"metal\", \"energy\", \"team\", \"sheet\", \"solar\" \"finan


                      
              相关标签:


      
      
        
          2条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  后悔当初        
                
              
                            
                2021-01-29 05:51
              
            
            
                                                                       
You can do this using pandas as follows, if you are looking for rows that contain exactly one word from the list of keywords: 

# Keep only the rows whose "position" or "Job description" cell is exactly
# equal to one of the keywords, and write them to a new CSV file.
import pandas as pd

# NOTE: every entry needs its own comma. Two adjacent string literals
# ("solar" "financial") are silently concatenated into "solarfinancial".
keywords = ["metal", "energy", "team", "sheet", "solar", "financial", "transportation", "electrical", "scientists",
            "electronic", "workers"]

# read the csv data into a dataframe
# change "," to the data separator in your csv file
df = pd.read_csv("2006-data-8-8-2016.csv", sep=",")
# filter the data: keep only the rows where the position or the
# Job description column equals one of the keywords
df = df[df["position"].isin(keywords) | df["Job description"].isin(keywords)]
# write the data back to a csv file
df.to_csv("new_data.csv", sep=",", index=False)


If you are looking for substrings in the rows (e.g looking financial in financial engineering) then you can do the following: 

# Keep only the rows whose "position" or "Job description" cell contains one
# of the keywords as a substring, and write them to a new CSV file.
import re

import pandas as pd

# NOTE: every entry needs its own comma. Two adjacent string literals
# ("solar" "financial") are silently concatenated into "solarfinancial".
keywords = ["metal", "energy", "team", "sheet", "solar", "financial", "transportation", "electrical", "scientists",
            "electronic", "workers"]
# str.contains treats the pattern as a regular expression, so escape each
# keyword to match it literally before OR-ing them together.
searched_keywords = "|".join(re.escape(word) for word in keywords)

# read the csv data into a dataframe
# change "," to the data separator in your csv file
df = pd.read_csv("2006-data-8-8-2016.csv", sep=",")
# filter the data: keep only the rows that contain one of the keywords
# in the position or the Job description columns.
# na=False makes empty cells count as "no match" instead of producing NaN
# entries in the boolean mask.
in_position = df["position"].str.contains(searched_keywords, na=False)
in_description = df["Job description"].str.contains(searched_keywords, na=False)
df = df[in_position | in_description]
# write the data back to a csv file
df.to_csv("new_data.csv", sep=",", index=False)

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  情歌与酒        
                
              
                            
                2021-01-29 05:53
              
            
            
                                                                       
Try this, looping in a dataframe and write back a new dataframe to a csv file.

import pandas as pd

# Keep the "position" and "Job description" values of every row in which
# either column contains one of the keywords as a substring.
keywords = {"metal", "energy", "team", "sheet", "solar", "financial",
        "transportation", "electrical", "scientists",
        "electronic", "workers"}  # all your keywords

df = pd.read_csv("2006-data-8-8-2016.csv", sep=",")


def _mentions_keyword(cell):
    """Return True when *cell* is a string containing any keyword."""
    # Empty CSV cells are read as NaN (a float); `word in cell` would raise
    # TypeError on them, so anything that is not a string counts as no match.
    return isinstance(cell, str) and any(word in cell for word in keywords)


# One boolean per row: True when either column mentions a keyword.
matches = df["position"].map(_mentions_keyword) | df["Job description"].map(_mentions_keyword)

# Select the matching rows and only the two columns of interest.
output = df.loc[matches, ["position", "Job description"]]
output.to_csv("new_data.csv", index=False)


EDIT:
If you have many columns to add, the modified following code will do the job.

# Keep every column of each row in which "position" or "Job description"
# contains one of the keywords (the `keywords` set defined earlier).
df = pd.read_csv("2006-data-8-8-2016.csv", sep=",")


def _mentions_keyword(cell):
    """Return True when *cell* is a string containing any keyword."""
    # Empty CSV cells are read as NaN (a float); `word in cell` would raise
    # TypeError on them, so anything that is not a string counts as no match.
    return isinstance(cell, str) and any(word in cell for word in keywords)


# One boolean per row: True when either column mentions a keyword.
matches = df["position"].map(_mentions_keyword) | df["Job description"].map(_mentions_keyword)

# Boolean indexing copies all columns of the matching rows in one step,
# instead of appending to an empty frame one row at a time.
output = df[matches].reset_index(drop=True)

output.to_csv("new_data.csv", index=False)

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
                             
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复