问题
I have a DataFrame column in which each row holds one long string that I would like to parse. I am new to regex and have not worked with it before. What I have below returns only the first name, at best. I am wondering whether this string is easier to parse with a regex or by building a dictionary to iterate through. Here is what I have at the moment. The order of the positions (C, W, D, G, UTIL) is not always the same, and I will be writing a for loop to iterate through multiple rows just like this one.
import pandas as pd
import numpy as np
import re
# Asker's original (non-working) attempt, kept as posted.
# Build a one-column frame ('Lineup') holding two lineup strings, one per row.
df = pd.DataFrame(data=np.array([['C Mark Scheifele C Pierre-Luc Dubois UTIL Zach Parise W Mats Zuccarello W Oliver Bjorkstrand W Nick Foligno D Ryan Suter D Seth Jones G Devan Dubnyk'],['UTIL Kyle Connor C Pierre-Luc Dubois C Boone Jenner W Mats Zuccarello W Oliver Bjorkstrand W Nick Foligno D Ryan Suter D Seth Jones G Devan Dubnyk']]), columns=['Lineup'])
# Each assignment below runs re.findall over str(df['Lineup']), i.e. the
# printed form of the whole column rather than the text of a single row, and
# assigns the resulting list of matches as a new column.
# `\w+` matches one run of word characters only, so it stops at the first
# space or hyphen after the position label (first name at most).
# NOTE(review): pandas presumably shortens long values in that printed form,
# which would limit what the patterns can see -- confirm.
df['C1'] = re.findall(r" C \w+",str(df['Lineup']))
df['C2'] = re.findall(r'C \w+',str(df['Lineup']))
# W1, W2 and W3 use the identical pattern and input, so all three receive the
# same list of matches.
df['W1'] = re.findall(r'W \w+',str(df['Lineup']))
df['W2'] = re.findall(r'W \w+',str(df['Lineup']))
df['W3'] = re.findall(r'W \w+',str(df['Lineup']))
df['D1'] = re.findall(r'D \w+',str(df['Lineup']))
# NOTE(review): this assigns 'D1' a second time; the desired output listed
# further down names a 'D2' column -- presumably a typo, confirm.
df['D1'] = re.findall(r'D \w+',str(df['Lineup']))
df['G']= re.findall(r'G \w+',str(df['Lineup']))
df['UTIL'] = re.findall(r'UTIL \w+',str(df['Lineup']))
I would like to store these values in the DataFrame, like this:
df['C1'] = Mark Scheifele
df['C2'] = Pierre-Luc Dubois
df['W1'] = Mats Zuccarello
df['W2'] = Oliver Bjorkstrand
df['W3'] = Nick Foligno
df['D1'] = Ryan Suter
df['D2'] = Seth Jones
df['G']= Devan Dubnyk
df['UTIL'] = Zach Parise
RESULT DATAFRAME
# Expected outcome: one row per lineup, one column per roster slot.
df_result = pd.DataFrame(
    columns=['C1', 'C2', 'W1', 'W2', 'W3', 'D1', 'D2', 'G', 'UTIL'],
    data=np.array([
        # Lineup 1
        ['Mark Scheifele', 'Pierre-Luc Dubois',
         'Mats Zuccarello', 'Oliver Bjorkstrand', 'Nick Foligno',
         'Ryan Suter', 'Seth Jones',
         'Devan Dubnyk',
         'Zach Parise'],
        # Lineup 2
        ['Boone Jenner', 'Pierre-Luc Dubois',
         'Mats Zuccarello', 'Oliver Bjorkstrand', 'Nick Foligno',
         'Ryan Suter', 'Seth Jones',
         'Devan Dubnyk',
         'Kyle Connor'],
    ]),
)
回答1:
import pandas as pd
import numpy as np
import re
def calc_col(col):
    '''Derive column names from the position labels found in a lineup string.

    Every run of upper-case letters that is followed by a space is treated
    as a position label (the delimiter between player names). A label that
    occurs once is returned unchanged; a label that occurs several times
    gets a running number, assigned in order of appearance.

    The names are returned in the order in which the labels occur in
    ``col``, so they line up one-to-one with the player names that follow
    each label, even when repeated labels are not adjacent.

    Eg. labels found : ['W','W','W','D','D','G','C','C','UTIL']
        returned list: ['W1','W2','W3','D1','D2','G','C1','C2','UTIL']

    Parameters:
        col: the lineup string to scan.

    Returns:
        A list of column names, one per label occurrence.
    '''
    labels = re.findall(" ?([A-Z]+) ", col)

    # First pass: how often does each label occur in total?
    totals = {}
    for label in labels:
        totals[label] = totals.get(label, 0) + 1

    # Second pass: emit one name per occurrence. Exact dictionary lookups
    # are used (not substring tests) so that e.g. 'D' and 'DH' never clash.
    seen = {}
    names = []
    for label in labels:
        if totals[label] == 1:
            names.append(label)
            continue
        seen[label] = seen.get(label, 0) + 1
        names.append(label + str(seen[label]))
    return names
# Two sample lineups, one string per row, in a single 'Lineup' column.
df = pd.DataFrame(data=np.array([['C Mark Scheifele C Pierre-Luc Dubois UTIL Zach Parise W Mats Zuccarello W Oliver Bjorkstrand W Nick Foligno D Ryan Suter D Seth Jones G Devan Dubnyk'],['UTIL Kyle Connor C Pierre-Luc Dubois C Boone Jenner W Mats Zuccarello W Oliver Bjorkstrand W Nick Foligno D Ryan Suter D Seth Jones G Devan Dubnyk']]), columns=['Lineup'])

# Replace every position label (plus the spaces around it) with a newline,
# so each row becomes "\nName\nName\n...".
extr_row = df['Lineup'].replace(to_replace =" ?[A-Z]+ ", value="\n", regex = True)

# Build one single-row frame per lineup: the names become the values, the
# labels computed by calc_col become the column names, and the columns are
# sorted so every row ends up in the same column order.
row_frames = []
for lineup, names in zip(df['Lineup'], extr_row):
    # [1:] drops the empty piece in front of the leading newline.
    df_temp = pd.DataFrame(names.split("\n")[1:]).T
    df_temp.columns = calc_col(lineup)
    row_frames.append(df_temp[sorted(df_temp)])

# DataFrame.append was removed in pandas 2.0; concatenate all rows in one
# call instead. ignore_index=True gives the rows a fresh 0..n-1 index.
df_final = pd.concat(row_frames, ignore_index=True)
df_final
Please see the picture below for the final data frame. This should work for any number of rows:
回答2:
This version lets you handle ids in any order and in varying numbers (varying counts of each id),
and more. However, it relies on the assumption that a completely capitalised word is an id.
import pandas as pd
def get_df(string):
    """Turn one tokenised lineup into a single-row DataFrame.

    A token that is entirely upper-case is treated as an id (position
    label). Every token after an id, up to the next id, is joined with
    spaces to form the name for that id, so names may consist of any
    number of words. Each id is numbered by occurrence ("C1", "C2", "G1").
    Tokens that appear before the first id are ignored.

    Args:
        string: the lineup, either as a list of whitespace-separated
            tokens (e.g. ``lineup.split()``) or as the raw string, which
            is then split on whitespace here.

    Returns:
        A DataFrame with one row; the columns are the numbered ids in
        order of appearance and the values are the matching names.
    """
    tokens = string.split() if isinstance(string, str) else list(string)

    occurs = {}       # id -> number of times seen so far
    columns = []      # numbered ids, in order of appearance
    name_parts = []   # one list of name tokens per entry in `columns`
    for token in tokens:
        if token.isupper():
            occurs[token] = occurs.get(token, 0) + 1
            columns.append(f"{token}{occurs[token]}")
            name_parts.append([])
        elif name_parts:
            # Collect until the next id instead of assuming exactly two
            # tokens; this also avoids indexing past the end of the list.
            name_parts[-1].append(token)

    return pd.DataFrame(data=[[" ".join(parts) for parts in name_parts]], columns=columns)
# Sample lineups. Adjacent string literals are joined into a single string,
# which keeps the source lines short without changing the values.
data = [
    ('C Mark Scheifele C Pierre-Luc Dubois UTIL Zach Parise W Mats Zuccarello W Oliver Bjorkstrand W Nick Foligno D Ryan Suter '
     'D Seth Jones G Devan Dubnyk'),
    ('UTIL Kyle Connor C Pierre-Luc Dubois C Boone Jenner W Mats Zuccarello W Oliver Bjorkstrand '
     'W Nick Foligno D Ryan Suter D Seth Jones G Devan Dubnyk'),
]

# Print one single-row frame per lineup.
for lineup in data:
    tokens = lineup.split()
    print(get_df(tokens))
Try this if you want to combine the returned data frames into one; hopefully it returns the data you're aiming for.
# Build one single-row frame per lineup and stack them into one frame.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
# sort=True orders the columns alphabetically, because the lineups may list
# their ids in different orders.
df = pd.concat([get_df(i.split()) for i in data], sort=True)
# Print the combined frame rather than only the last lineup's frame.
print(df)
C1 C2 D1 D2 G1 UTIL1 W1 W2 W3
0 Mark Scheifele Pierre-Luc Dubois Ryan Suter Seth Jones Devan Dubnyk Zach Parise Mats Zuccarello Oliver Bjorkstrand Nick Foligno
0 Pierre-Luc Dubois Boone Jenner Ryan Suter Seth Jones Devan Dubnyk Kyle Connor Mats Zuccarello Oliver Bjorkstrand Nick Foligno
来源:https://stackoverflow.com/questions/59890489/python-regex-or-dictionary