问题
I'm trying to train a random forest regressor to predict the hourly wage of an employee given the job description supplied. Note, I've signed an NDA and cannot upload real data. The below "observation" is synthetic:
sample_row = {'job_posting_id': 'id_01',
'buyer_vertical': 'Business Services',
'currency': 'USD',
'fg_onet_code': '43-9011.00',
'jp_title': 'Computer Operator',
'jp_description': "Performs information security-related risk and compliance activities, including but not limited to data protection, legal, and regulatory compliance. Evaluates and records Vendor Risk Assessment responses and supporting documentation in the eGRC tool. Assists with consolidating risk and compliance data for management reporting and metrics. Update and/or create process documentation and business requirements. Skills Required Experience Last Used Bachelor's Degree or Equivalent 4 years Currently Using Required Experience in regulatory compliance and risk analysis, data privacy and other industry standards of an enterprise compliance program 1 year 1 year ago Required Understanding of industry security standards/best practices (e.g., NIST, UCF, ISO) 1 year 1 year ago Required Understanding of compliance and Information Security related laws and regulations (e.g., GLBA, PCI, HIPAA) 1 year 1 year ago Required Familiarity in computer operations required; (e.g. knowledge of Windows/Unix operating systems, authentication methods, firewalls, routers, web services, etc.) 1 year 2 years ago Experience in performing Third Party Vendor Risk Assessments 1 year 1 year ago Experience using RSA Archer eGRC application 1 year 1 year ago Proficiency in Microsoft Office tools and applications 2 years Currently Using Required Strong analytical, evaluative, and problem-solving abilities 2 years Currently Using Required Excellent written and oral communication and listening skills 2 years Currently Using Required",
'onet_salary': '22.04'}
I've tried to streamline my code through a user defined object:
class Model():
def __init__(self,model_instance,data,text_col,target_col,test_size=0.3,random_state=1):
self.model = model_instance
self.text = data[text_col]
self.target = data[target_col]
self.test_size = test_size
self.random_state = random_state
self._split_data()
self._fit()
def _fit(self):
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
self.pipeline = Pipeline([
('bow', CountVectorizer(analyzer=self._text_process)), # strings to token integer counts
('tfidf', TfidfTransformer()), # integer counts to weighted TF-IDF scores
('classifier', model), # train given model on TF-IDF vectors
])
self.pipeline.fit(self.text_train,self.value_train)
self.preds = self.pipeline.predict(self.text_test)
def _text_process(self,text):
def remove_non_ascii(text):
return ''.join(i for i in text if ord(i)<128)
text = remove_non_ascii(text)
import string
from nltk.corpus import stopwords
nopunc = [char for char in text if char not in string.punctuation]
nopunc = ''.join(nopunc)
return [word.lower() for word in nopunc.split() if word.lower() not in stopwords.words('english')]
def _split_data(self):
from sklearn.model_selection import train_test_split
self.text_train, self.text_test, self.value_train, self.value_test = train_test_split(
self.text, self.target, test_size=self.test_size, random_state=self.random_state)
def predict(self,observation):
return self.pipeline.predict(observation)
def performance_metrics(self):
from sklearn.metrics import classification_report
print(classification_report(self.preds,self.value_test))
Now when I call an instance of the class, I return the following error:
rfr = RandomForestRegressor()
m0 = Model(model_instance=rfr,data=unique_templates,text_col='jp_description',target_col='onet_salary')
>>>> ValueError: could not convert string to float: 'UKN'
What can I do to troubleshoot the issue? Is there a token in the test set that's not in the train set, which is being converted to 'UKN' - which triggers the error?
Any recommendations?
Edit 1: Expanded error message:
ValueError Traceback (most recent call last)
<ipython-input-69-d21643a194e8> in <module>
1 rfr = RandomForestRegressor()
----> 2 m0 = Model(model_instance=rfr,data=unique_templates,text_col='jp_description',target_col='onet_salary')
<ipython-input-68-88d95cf81383> in __init__(self, model_instance, data, text_col, target_col, test_size, random_state)
9
10 self._split_data()
---> 11 self._fit()
12
13 def _fit(self):
<ipython-input-68-88d95cf81383> in _fit(self)
21 ('classifier', model),
22 ])
---> 23 self.pipeline.fit(self.text_train,self.value_train)
24 self.preds = self.pipeline.predict(self.text_test)
25
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
354 self._log_message(len(self.steps) - 1)):
355 if self._final_estimator != 'passthrough':
--> 356 self._final_estimator.fit(Xt, y, **fit_params)
357 return self
358
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py in fit(self, X, y, sample_weight)
276
277 if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
--> 278 y = np.ascontiguousarray(y, dtype=DOUBLE)
279
280 if expanded_class_weight is not None:
~/opt/anaconda3/lib/python3.7/site-packages/numpy/core/_asarray.py in ascontiguousarray(a, dtype)
177
178 """
--> 179 return array(a, dtype, copy=False, order='C', ndmin=1)
180
181
ValueError: could not convert string to float: 'UKN'
来源:https://stackoverflow.com/questions/60580392/python-nlp-valueerror-could-not-convert-string-to-float-ukn