问题
I have implemented emotion detection using an LSTM. I first trained my model on a dataset of reviews labelled with their emotions, and then implemented the prediction part, where I feed in a new dataset to predict the emotion of each review. However, the system is giving me wrong results. Could you please show me which part of the code is incorrect and help me correct my errors so that I get good results? I am posting my code below.
For the implementation, I first trained my model and saved it so that I can load it in my prediction part.
Code for my training part:
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Load the labelled training data. The code below reads a "text" column
# (one review per row) and an "emotions" column (the label of that review).
df = pd.read_csv('C:/Users/User/Desktop/Coding/parsehubjsonfileeg/goodhotelresult.csv')
print(df)

# Preparing data for model training.
# Tokenization: since the data is already tokenized and lowercased, we just
# need to split the words on single spaces.
input_sentences = [text.split(" ") for text in df["text"].values.tolist()]
labels = df["emotions"].values.tolist()

# Build the vocabulary (word index): every distinct word gets the next free
# integer id, in order of first appearance.
# NOTE(review): ids start at 0, which is also the default padding value of
# pad_sequences, so the first vocabulary word cannot be told apart from
# padding. Reserving 0 for padding would require the prediction script to use
# the identical mapping and the model to be retrained.
word2id = dict()
for sentence in input_sentences:
    for word in sentence:
        if word not in word2id:
            word2id[word] = len(word2id)

# Length (in words) of the longest sentence; used as the padded input length.
max_words = max((len(sentence) for sentence in input_sentences), default=0)

# Build label2id and its inverse id2label.
# The labels are sorted before numbering because the iteration order of a set
# of strings can change from one Python process to the next (string hashing is
# randomized). Without a fixed order, the ids used while training would not
# match the ids rebuilt later when predicting, and every probability would be
# reported under the wrong emotion.
label2id = {label: index for index, label in enumerate(sorted(set(labels), key=str))}
id2label = {index: label for label, index in label2id.items()}

print(label2id)
print(id2label)
print(word2id)
# Turn the tokenized sentences and their labels into numeric arrays.
import keras
from keras.preprocessing.sequence import pad_sequences

# Replace every word by its vocabulary id, sentence by sentence.
encoded_sentences = []
for sentence in input_sentences:
    encoded_sentences.append([word2id[word] for word in sentence])

# Replace every label by its integer id.
encoded_labels = [label2id[label] for label in labels]

# Pad all sentences to the length of the longest one.
X = pad_sequences(encoded_sentences, maxlen=max_words)

# One-hot encode the label ids as a float32 array.
Y = keras.utils.to_categorical(encoded_labels, num_classes=len(label2id), dtype='float32')

# Report the resulting array shapes.
print("Shape of X: {}".format(X.shape))
print("Shape of Y: {}".format(Y.shape))
# Build the bidirectional LSTM model with attention.
embedding_dim = 100  # dimension of the word embeddings (also used as the LSTM and dense layer size)

# Input: one sequence of max_words integer word ids per sample.
sequence_input = keras.Input(shape=(max_words,), dtype='int32')

# Word embedding layer.
embedded_inputs = keras.layers.Embedding(
    len(word2id) + 1,
    embedding_dim,
    input_length=max_words,
)(sequence_input)

# Apply dropout to prevent overfitting.
embedded_inputs = keras.layers.Dropout(0.2)(embedded_inputs)

# Bidirectional LSTM over the embedded inputs, returning one output per time
# step. Bidirectional is taken from the public keras.layers namespace rather
# than the private keras.layers.wrappers module path.
lstm_outs = keras.layers.Bidirectional(
    keras.layers.LSTM(embedding_dim, return_sequences=True)
)(embedded_inputs)

# Apply dropout to the LSTM outputs to prevent overfitting.
lstm_outs = keras.layers.Dropout(0.2)(lstm_outs)

# Attention mechanism: one score per time step (Dense(1) applied to every
# step), reshaped to a vector of length max_words and normalized with softmax.
# The layer is named 'attention_vec' so it can be looked up by name later.
attention_vector = keras.layers.TimeDistributed(keras.layers.Dense(1))(lstm_outs)
attention_vector = keras.layers.Reshape((max_words,))(attention_vector)
attention_vector = keras.layers.Activation('softmax', name='attention_vec')(attention_vector)

# Combine the LSTM outputs with the attention weights along the time axis.
attention_output = keras.layers.Dot(axes=1)([lstm_outs, attention_vector])

# Classification head: fully connected layer, then softmax over the labels.
fc = keras.layers.Dense(embedding_dim, activation='relu')(attention_output)
output = keras.layers.Dense(len(label2id), activation='softmax')(fc)

# Finally build and compile the model.
model = keras.Model(inputs=[sequence_input], outputs=output)
model.compile(loss="categorical_crossentropy", metrics=["accuracy"], optimizer='adam')

# Print model summary.
model.summary()
# Train the model for 2 epochs, holding out 10% of the samples for validation.
# NOTE(review): the earlier comment said "10 iterations" but the call uses
# epochs=2 - confirm which was intended.
model.fit(X, Y, epochs=2, batch_size=64, validation_split=0.1, shuffle=True)
# Save the trained model to disk.
# NOTE(review): the file name ends in '.py' although it holds a saved model,
# not Python source; the prediction script loads this exact path, so rename
# both places together if the name is changed.
model.save('trainmodelsave.py')
Then I did this for my prediction part:
import pandas as pd #data processing, csv file I/O(e.g pd.read_csv)
import json
import re
import string
from nltk.corpus import stopwords
import datetime
from nltk.tokenize import word_tokenize
_STOP_WORDS = None  # English stop-word set, built on first use and then reused


def remove_stopwords(text):
    """Tokenize *text* and return the tokens that are not English stop words.

    The text is split with NLTK's ``word_tokenize`` and every token found in
    NLTK's English stop-word list is dropped. The stop-word set is built once
    and cached, because this function is called once per review and the list
    does not change between calls.

    Args:
        text: The string to tokenize and filter.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words("english"))
    return [word for word in word_tokenize(text) if word not in _STOP_WORDS]
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Re-read the training CSV and rebuild the word and label id mappings with the
# same procedure the training script uses, so that the ids fed to the saved
# model match the ids it was trained with.
df = pd.read_csv('C:/Users/User/Desktop/Coding/parsehubjsonfileeg/goodhotelresult.csv')

# Tokenization: since the data is already tokenized and lowercased, we just
# need to split the words on single spaces.
input_sentences = [text.split(" ") for text in df["text"].values.tolist()]
labels = df["emotions"].values.tolist()

# Build the vocabulary (word index): every distinct word gets the next free
# integer id, in order of first appearance.
word2id = dict()
for sentence in input_sentences:
    for word in sentence:
        if word not in word2id:
            word2id[word] = len(word2id)

# Length (in words) of the longest training sentence; new samples are padded
# to this length before prediction.
max_words = max((len(sentence) for sentence in input_sentences), default=0)

# Build label2id and its inverse id2label.
# The labels are sorted before numbering because the iteration order of a set
# of strings can change from one Python process to the next (string hashing is
# randomized). With an unsorted set, the ids rebuilt here need not match the
# ids the model was trained with, and the predicted probabilities end up
# attached to the wrong emotion names. The training script must number the
# labels in the same sorted order.
label2id = {label: index for index, label in enumerate(sorted(set(labels), key=str))}
id2label = {index: label for label, index in label2id.items()}
from keras.models import load_model

# Load the model that the training script saved.
model = load_model('trainmodelsave.py')
print(model)

import keras

# Wrap the loaded model in a second model that shares its input and returns
# both the class probabilities and the output of the 'attention_vec' layer.
# The keyword is `outputs` (plural); `output=` is not the documented argument
# name of keras.Model.
model_with_attentions = keras.Model(
    inputs=model.input,
    outputs=[model.output, model.get_layer('attention_vec').output],
)
###########################################################
#ADD FOR LOOP DIRECTLY HERE
# Read the scraped data from the JSON file.
with open('C:/Users/User/Desktop/Coding/parsehubjsonfileeg/all.json', encoding='utf8') as source_file:
    data = json.load(source_file)

# Container for the entries we want to keep.
new_data = {'selection1': []}
print(new_data)

# Keep only the entries that carry a 'reviews' key, echoing each kept
# entry's reviews followed by a separator line.
for item in data['selection1']:
    if 'reviews' not in item:
        continue
    new_data['selection1'].append(item)
    print(item['reviews'])
    print('--')

# Write the filtered entries back to disk.
with open('output2.json', 'w') as sink_file:
    json.dump(new_data, sink_file)

selection1 = new_data['selection1']
# Empty lists that collect the columns of the result table.
names = []      # hotel name, one entry per review
dates = []      # review date as YYYY-MM-DD, one entry per review
commentss = []  # raw review text, one entry per review
labels = []     # {emotion: probability} dict, one entry per review
hotelname = []  # hotel name, one entry per hotel

# Translation table that deletes every punctuation character; built once
# because it is the same for every review.
punctuation_table = str.maketrans('', '', string.punctuation)

for item in selection1:
    name = item['name']
    hotelname.append(name)

    for review in item['reviews']:
        names.append(name)

        # Convert the date, e.g. from 'January 12, 2020' to '2020-01-12'.
        date = pd.to_datetime(review['date']).strftime("%Y-%m-%d")
        dates.append(date)

        # Clean the review text: lowercase, remove digits, remove punctuation,
        # trim surrounding whitespace, then tokenize and drop stop words.
        # NOTE(review): the training text is only split on spaces; confirm it
        # was cleaned the same way, otherwise the tokens produced here will
        # not line up with the training vocabulary.
        comment = review['review']
        lcomment = comment.lower()
        result = re.sub(r'\d+', '', lcomment)
        results = result.translate(punctuation_table).strip()
        comments = remove_stopwords(results)
        commentss.append(comment)
        print('>>>>>>', comments)

        # Encode the tokens as vocabulary ids; words that are not in the
        # training vocabulary are skipped.
        encoded_samples = [[word2id[word] for word in comments if word in word2id]]

        # Pad to the input length the model expects.
        encoded_samples = keras.preprocessing.sequence.pad_sequences(encoded_samples, maxlen=max_words)

        # Predict class probabilities (and attention weights) for this review.
        probabilities, attentions = model_with_attentions.predict(encoded_samples)

        # Position i of the probability vector belongs to label id i.
        label_probs = {id2label[index]: prob for index, prob in enumerate(probabilities[0])}
        labels.append(label_probs)
        print(label_probs)
My results are like this:
>>>>>> ['wasnt', 'impressed', 'poor', 'choice', 'anything', 'inclusive', 'option', 'snaks', 'day', 'poor', 'choice', 'free', 'drinks', 'mini', 'bar', 'even', 'coffee', 'payable', 'entertainment', 'evening', 'surprising', 'would', 'come', 'back', 'price', 'go', 'maritim', 'much', 'much', 'better']
{'happy': 0.30163398, 'enjoy': 0.12936097, 'sadness': 0.018949889, 'trust': 0.19013356, 'joy': 0.08350239, 'disgust': 0.13000967, 'anger': 0.14640959}
>>>>>> ['bad', 'experience', 'lazy', 'employees', 'specially', 'boat', 'house', 'food', 'horrible', 'took', 'long', 'get', 'food', 'door', 'room', 'blockedoverall', 'service', 'hotel', 'poor']
{'happy': 0.08209001, 'enjoy': 0.26885188, 'sadness': 0.017319722, 'trust': 0.3754914, 'joy': 0.04761887, 'disgust': 0.059040256, 'anger': 0.14958787}
>>>>>> ['hotel', 'nice', 'lack', 'staff', 'lunch', 'indication', 'hotel', 'situated', 'road']
{'happy': 0.6219977, 'enjoy': 0.046003498, 'sadness': 0.0028672628, 'trust': 0.04223141, 'joy': 0.079679504, 'disgust': 0.14226186, 'anger': 0.06495871}
>>>>>> ['impressed', 'service', 'got', 'mari', 'deal', 'quick', 'efficient', 'hotel', 'turned', 'exactly', 'promised', 'lovely', 'time', 'warm', 'welcome', 'great', 'food']
{'happy': 0.91739607, 'enjoy': 0.0040345276, 'sadness': 4.3373333e-05, 'trust': 0.0034020818, 'joy': 0.01539256, 'disgust': 0.052098893, 'anger': 0.007632463}
>>>>>> ['great', 'stay', 'thank', 'azuri', 'team']
I am getting wrong results. What do I need to do to correct this? Please help me.
来源:https://stackoverflow.com/questions/60457126/wrong-prediction-when-using-lstm