Wrong prediction when using lstm

问题

I have implemented emotion detection using an LSTM. I first trained my model on a dataset of reviews labelled with their emotions, then implemented the prediction part, where I feed in a new dataset to predict the emotion of each review. However, the system is giving me wrong results. Can you please show me which part of the code is incorrect and help me correct my errors so that I can get good results? I am posting my code below.

For the implementation, I first trained my model and saved it so that I can load it in my prediction part.

Code for my training part:

# Training script: builds the vocabulary and label index from the labelled
# reviews CSV, trains a bidirectional LSTM with an attention layer, and saves
# the trained model to disk.
import keras
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.preprocessing.sequence import pad_sequences

df = pd.read_csv('C:/Users/User/Desktop/Coding/parsehubjsonfileeg/goodhotelresult.csv')
print(df)

# Preparing data for model training.
# Tokenization: the data is already tokenized and lowercased, so each text
# only needs to be split on single spaces.
input_sentences = [text.split(" ") for text in df["text"].values.tolist()]
labels = df["emotions"].values.tolist()

# Vocabulary (word index): every distinct word gets the next free integer id,
# in order of first appearance. max_words tracks the longest sentence and is
# used as the fixed sequence length of the model.
# NOTE(review): ids start at 0, which is also the default padding value of
# pad_sequences, so the first vocabulary word is indistinguishable from
# padding. Starting ids at 1 would avoid this (the Embedding below already
# has len(word2id) + 1 rows), but the prediction script must build its ids
# the same way and the model must be retrained, so it is left unchanged here.
word2id = dict()
max_words = 0
for sentence in input_sentences:
    for word in sentence:
        if word not in word2id:
            word2id[word] = len(word2id)
    max_words = max(max_words, len(sentence))

# Label index. The labels are sorted before numbering because the iteration
# order of a set of strings can differ between interpreter runs (string
# hashing is randomized per process). Numbering an unsorted set gives each run
# a different label <-> id mapping, so any script that rebuilds this mapping
# to decode the model's outputs must also number the sorted label set.
label2id = {label: idx for idx, label in enumerate(sorted(set(labels)))}
id2label = {idx: label for label, idx in label2id.items()}
print(label2id)
print(id2label)
print(word2id)

# Encode words and labels with their integer ids.
X = [[word2id[word] for word in sentence] for sentence in input_sentences]
Y = [label2id[label] for label in labels]

# Pad every encoded sentence to max_words entries.
X = pad_sequences(X, max_words)

# One-hot encode the label ids.
Y = keras.utils.to_categorical(Y, num_classes=len(label2id), dtype='float32')

# Print shapes
print("Shape of X: {}".format(X.shape))
print("Shape of Y: {}".format(Y.shape))

# Build LSTM model with attention
embedding_dim = 100  # dimension of the word embeddings (also the LSTM width)

# Input tensor: one padded sequence of word ids per sample
sequence_input = keras.Input(shape=(max_words,), dtype='int32')

# Word embedding layer
embedded_inputs = keras.layers.Embedding(len(word2id) + 1,
                                         embedding_dim,
                                         input_length=max_words)(sequence_input)

# Apply dropout to prevent overfitting
embedded_inputs = keras.layers.Dropout(0.2)(embedded_inputs)

# Bidirectional LSTM over the embedded inputs, returning one output per time
# step. keras.layers.Bidirectional is the public location of the wrapper; the
# keras.layers.wrappers module path is a legacy one.
lstm_outs = keras.layers.Bidirectional(
    keras.layers.LSTM(embedding_dim, return_sequences=True)
)(embedded_inputs)

# Apply dropout to LSTM outputs to prevent overfitting
lstm_outs = keras.layers.Dropout(0.2)(lstm_outs)

# Attention mechanism: score every time step with a single dense unit,
# normalise the scores with softmax over the time steps, then contract the
# LSTM outputs with those weights along the time axis.
attention_vector = keras.layers.TimeDistributed(keras.layers.Dense(1))(lstm_outs)
attention_vector = keras.layers.Reshape((max_words,))(attention_vector)
attention_vector = keras.layers.Activation('softmax', name='attention_vec')(attention_vector)
attention_output = keras.layers.Dot(axes=1)([lstm_outs, attention_vector])

# Classifier head: fully connected layer followed by a softmax over the labels
fc = keras.layers.Dense(embedding_dim, activation='relu')(attention_output)
output = keras.layers.Dense(len(label2id), activation='softmax')(fc)

# Finally building model
model = keras.Model(inputs=[sequence_input], outputs=output)
model.compile(loss="categorical_crossentropy", metrics=["accuracy"], optimizer='adam')

# Print model summary
model.summary()

# Train the model for 2 epochs, holding out 10% of the data for validation
model.fit(X, Y, epochs=2, batch_size=64, validation_split=0.1, shuffle=True)

# NOTE(review): the '.py' extension is misleading (this file is a saved model,
# not Python source) and newer Keras releases derive or validate the save
# format from the extension. It is kept because the prediction script loads
# the model under exactly this name.
model.save('trainmodelsave.py')

Then I did this for my prediction part:

import pandas as pd #data processing, csv file I/O(e.g pd.read_csv)
import json
import re
import string
from nltk.corpus import stopwords
import datetime
from nltk.tokenize import word_tokenize
def remove_stopwords(text):
    """Tokenize *text* and return its tokens without English stop words.

    The text is split with nltk's ``word_tokenize`` and every token found in
    nltk's English stop-word list is dropped. Matching is exact, so the
    caller is expected to lowercase the text beforehand if stop words should
    be removed regardless of case.

    Returns the remaining tokens as a list, in their original order.
    """
    english_stop_words = set(stopwords.words("english"))
    return [token for token in word_tokenize(text) if token not in english_stop_words]


from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


import keras
from keras.models import load_model

# Rebuild the vocabulary, the sequence length and the label index from the
# training CSV, then load the trained model. The encodings produced here must
# match the ones the model was trained with.
df = pd.read_csv('C:/Users/User/Desktop/Coding/parsehubjsonfileeg/goodhotelresult.csv')

# Tokenization: the data is already tokenized and lowercased, so each text
# only needs to be split on single spaces.
input_sentences = [text.split(" ") for text in df["text"].values.tolist()]
labels = df["emotions"].values.tolist()

# Vocabulary (word index): every distinct word gets the next free integer id,
# in order of first appearance; max_words is the longest sentence length and
# is used below as the padding length.
word2id = dict()
max_words = 0
for sentence in input_sentences:
    for word in sentence:
        if word not in word2id:
            word2id[word] = len(word2id)
    max_words = max(max_words, len(sentence))

# Label index. The labels are sorted before numbering because the iteration
# order of a set of strings can differ between interpreter runs (string
# hashing is randomized per process). Numbering an unsorted set would pair the
# model's output positions with the wrong emotion names. The training script
# must number its labels the same way (from the sorted label set), and the
# model must have been trained with that numbering.
label2id = {label: idx for idx, label in enumerate(sorted(set(labels)))}
id2label = {idx: label for label, idx in label2id.items()}

model = load_model('trainmodelsave.py')
print(model)

# Second view of the same network that also exposes the attention weights of
# the layer named 'attention_vec'. The keyword is `outputs`; `output` is a
# legacy spelling that current Keras versions reject.
model_with_attentions = keras.Model(inputs=model.input,
                                    outputs=[model.output,
                                             model.get_layer('attention_vec').output])


###########################################################
#ADD FOR LOOP DIRECTLY HERE

#File I/O Open function for read data from JSON File
# Read the scraped hotel data from the JSON file.
with open('C:/Users/User/Desktop/Coding/parsehubjsonfileeg/all.json', encoding='utf8') as file_object:
    data = json.load(file_object)

# Keep only the entries that actually have a 'reviews' field.
new_data = {'selection1': []}
print(new_data)
for item in data['selection1']:
    if 'reviews' in item:
        new_data['selection1'].append(item)
        print(item['reviews'])
        print('--')

# Save the filtered entries.
with open('output2.json', 'w') as f:
    json.dump(new_data, f)

selection1 = new_data['selection1']

# Parallel lists, one entry per review (hotelname: one entry per hotel), kept
# so that a dataframe can be built from them.
names = []
dates = []
commentss = []
labels = []
hotelname = []

# Loop-invariant text-cleaning helpers, built once instead of once per review.
digits_pattern = re.compile(r'\d+')
punctuation_table = str.maketrans('', '', string.punctuation)

for item in selection1:
    name = item['name']
    hotelname.append(name)

    for review in item['reviews']:
        names.append(name)
        # Normalise the date, e.g. 'January 12, 2020' -> '2020-01-12'.
        date = pd.to_datetime(review['date']).strftime("%Y-%m-%d")
        dates.append(date)

        comment = review['review']
        # Lowercase, drop digits, drop punctuation and surrounding whitespace,
        # then tokenize and remove stop words.
        cleaned = digits_pattern.sub('', comment.lower())
        cleaned = cleaned.translate(punctuation_table).strip()
        comments = remove_stopwords(cleaned)
        commentss.append(comment)  # the raw review text, not the cleaned tokens
        print('>>>>>>', comments)

        # Encode only the words known to the vocabulary; unknown words are
        # silently dropped.
        encoded_samples = [[word2id[word] for word in comments if word in word2id]]

        # Pad to the sequence length the model expects.
        encoded_samples = keras.preprocessing.sequence.pad_sequences(encoded_samples, maxlen=max_words)

        # Predict: probs[0] holds one probability per label id for this
        # review, attentions holds the output of the 'attention_vec' layer.
        probs, attentions = model_with_attentions.predict(encoded_samples)
        label_probs = {id2label[class_id]: prob for class_id, prob in enumerate(probs[0])}
        labels.append(label_probs)
        print(label_probs)

My results are like this:

>>>>>> ['wasnt', 'impressed', 'poor', 'choice', 'anything', 'inclusive', 'option', 'snaks', 'day', 'poor', 'choice', 'free', 'drinks', 'mini', 'bar', 'even', 'coffee', 'payable', 'entertainment', 'evening', 'surprising', 'would', 'come', 'back', 'price', 'go', 'maritim', 'much', 'much', 'better']
{'happy': 0.30163398, 'enjoy': 0.12936097, 'sadness': 0.018949889, 'trust': 0.19013356, 'joy': 0.08350239, 'disgust': 0.13000967, 'anger': 0.14640959}
>>>>>> ['bad', 'experience', 'lazy', 'employees', 'specially', 'boat', 'house', 'food', 'horrible', 'took', 'long', 'get', 'food', 'door', 'room', 'blockedoverall', 'service', 'hotel', 'poor']
{'happy': 0.08209001, 'enjoy': 0.26885188, 'sadness': 0.017319722, 'trust': 0.3754914, 'joy': 0.04761887, 'disgust': 0.059040256, 'anger': 0.14958787}
>>>>>> ['hotel', 'nice', 'lack', 'staff', 'lunch', 'indication', 'hotel', 'situated', 'road']
{'happy': 0.6219977, 'enjoy': 0.046003498, 'sadness': 0.0028672628, 'trust': 0.04223141, 'joy': 0.079679504, 'disgust': 0.14226186, 'anger': 0.06495871}
>>>>>> ['impressed', 'service', 'got', 'mari', 'deal', 'quick', 'efficient', 'hotel', 'turned', 'exactly', 'promised', 'lovely', 'time', 'warm', 'welcome', 'great', 'food']
{'happy': 0.91739607, 'enjoy': 0.0040345276, 'sadness': 4.3373333e-05, 'trust': 0.0034020818, 'joy': 0.01539256, 'disgust': 0.052098893, 'anger': 0.007632463}
>>>>>> ['great', 'stay', 'thank', 'azuri', 'team']

I am getting wrong results. What do I need to do to correct this? Please help me.

来源：https://stackoverflow.com/questions/60457126/wrong-prediction-when-using-lstm

标签

python

pandas

machine-learning

keras

lstm