Example 1
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

# define documents
docs = ['Well done!',
        'Good work',
        'Great effort',
        'nice work',
        'Excellent!',
        'Weak',
        'Poor effort!',
        'not good',
        'poor work',
        'Could have done better.']
# define class labels
labels = array([1,1,1,1,1,0,0,0,0,0])
# integer encode the documents
vocab_size = 50
encoded_docs = [one_hot(d, vocab_size) for d in docs]
print(encoded_docs)
# pad documents to a max length of 4 words
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)
# define the model
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())
# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))
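It helps to see what the two preprocessing calls return before the model ever runs: one_hot hashes each word into [1, vocab_size), so two words can land on the same integer, and pad_sequences right-pads shorter sequences with zeros. A minimal sketch (the integers come from a hash and vary between runs, so the values in the comments are only illustrative):

from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences

# hash each word to an integer in [1, 50); collisions are possible
encoded = one_hot('Could have done better.', 50)
print(encoded)    # e.g. [14, 7, 31, 22] -- one integer per word

# pad/truncate every sequence to exactly 4 integers, zeros on the right
padded = pad_sequences([encoded, one_hot('Weak', 50)], maxlen=4, padding='post')
print(padded)     # e.g. [[14  7 31 22]
                  #       [ 9  0  0  0]]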
Example 2
# -*- coding: utf-8 -*-
from keras.layers.core import Activation, Dense
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import collections
import nltk
nltk.download('punkt')
import numpy as np

## EDA: find the longest sentence and count word frequencies
maxlen = 0
word_freqs = collections.Counter()
num_recs = 0
with open('train_data.txt', 'r') as f:
    for line in f:
        label, sentence = line.strip().split("\t")
        words = nltk.word_tokenize(sentence.lower())
        if len(words) > maxlen:
            maxlen = len(words)
        for word in words:
            word_freqs[word] += 1
        num_recs += 1
print('max_len ', maxlen)
print('nb_words ', len(word_freqs))

## Prepare the data
MAX_FEATURES = 2000
MAX_SENTENCE_LENGTH = 40
vocab_size = min(MAX_FEATURES, len(word_freqs)) + 2
word2index = {x[0]: i+2 for i, x in enumerate(word_freqs.most_common(MAX_FEATURES))}
word2index["PAD"] = 0
word2index["UNK"] = 1
index2word = {v: k for k, v in word2index.items()}
X = np.empty(num_recs, dtype=list)
y = np.zeros(num_recs)
i = 0
with open('train_data.txt', 'r') as f:
    for line in f:
        label, sentence = line.strip().split("\t")
        words = nltk.word_tokenize(sentence.lower())
        seqs = []
        for word in words:
            if word in word2index:
                seqs.append(word2index[word])
            else:
                seqs.append(word2index["UNK"])
        X[i] = seqs
        y[i] = int(label)
        i += 1
X = sequence.pad_sequences(X, maxlen=MAX_SENTENCE_LENGTH)

## Train/test split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

## Build the network
EMBEDDING_SIZE = 128
HIDDEN_LAYER_SIZE = 64
BATCH_SIZE = 32
NUM_EPOCHS = 10
model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_SIZE, input_length=MAX_SENTENCE_LENGTH))
model.add(LSTM(HIDDEN_LAYER_SIZE, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1))
model.add(Activation("sigmoid"))
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

## Train the network
model.fit(Xtrain, ytrain, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, validation_data=(Xtest, ytest))

## Predict
score, acc = model.evaluate(Xtest, ytest, batch_size=BATCH_SIZE)
print("\nTest score: %.3f, accuracy: %.3f" % (score, acc))
print('{}  {}  {}'.format('Predicted', 'Actual', 'Sentence'))
for i in range(5):
    idx = np.random.randint(len(Xtest))
    xtest = Xtest[idx].reshape(1, MAX_SENTENCE_LENGTH)
    ylabel = ytest[idx]
    whk = model.predict(xtest)   # raw sigmoid output, shape (1, 1)
    print(whk)
    ypred = whk[0][0]
    sent = " ".join([index2word[x] for x in xtest[0] if x != 0])
    print(' {}      {}     {}'.format(int(round(ypred)), int(ylabel), sent))

##### Classify sentences typed in by hand
INPUT_SENTENCES = ['I love reading.', 'You are so boring.']
XX = np.empty(len(INPUT_SENTENCES), dtype=list)
i = 0
for sentence in INPUT_SENTENCES:
    words = nltk.word_tokenize(sentence.lower())
    seq = []
    for word in words:
        if word in word2index:
            seq.append(word2index[word])
        else:
            seq.append(word2index['UNK'])
    XX[i] = seq
    i += 1
XX = sequence.pad_sequences(XX, maxlen=MAX_SENTENCE_LENGTH)
labels = [int(round(x[0])) for x in model.predict(XX)]
label2word = {1: 'positive', 0: 'negative'}
for i in range(len(INPUT_SENTENCES)):
    print('{} {}'.format(label2word[labels[i]], INPUT_SENTENCES[i]))
whk = model.predict(XX)
print(whk)
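Once trained, you will usually want to reuse the model without rerunning the whole script. A minimal sketch, assuming you persist the vocabulary with pickle alongside the Keras HDF5 file (the file names sentiment_lstm.h5 and word2index.pkl are illustrative, not from the example above):

import pickle
from keras.models import load_model

# save the trained network and the vocabulary it depends on
model.save('sentiment_lstm.h5')          # illustrative file name
with open('word2index.pkl', 'wb') as f:
    pickle.dump(word2index, f)

# later, in a fresh process: restore both and predict as before
model = load_model('sentiment_lstm.h5')
with open('word2index.pkl', 'rb') as f:
    word2index = pickle.load(f)

Saving word2index matters because a reloaded network is useless without the exact word-to-integer mapping it was trained with.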
The contents of train_data.txt look roughly like this (one record per line: a label, a tab, then the sentence):
1	The Da Vinci Code book is just awesome.
1	this was the first clive cussler i've ever read, but even books like Relic, and Da Vinci code were more plausible than this.
1	i liked the Da Vinci Code a lot.
1	i liked the Da Vinci Code a lot.
1	I liked the Da Vinci Code but it ultimatly didn't seem to hold it's own.
1	that's not even an exaggeration ) and at midnight we went to Wal-Mart to buy the Da Vinci Code, which is amazing of course.
1	I loved the Da Vinci Code, but now I want something better and different!..
1	i thought da vinci code was great, same with kite runner.
1	The Da Vinci Code is actually a good movie...
1	I thought the Da Vinci Code was a pretty good book.
1	The Da Vinci Code is one of the most beautiful movies ive ever seen.
1	The Da Vinci Code is an * amazing * book, do not get me wrong.
1	then I turn on the light and the radio and enjoy my Da Vinci Code.
1	The Da Vinci Code was REALLY good.
1	i love da vinci code....
1	i loved da vinci code..
1	TO NIGHT:: THE DA VINCI CODE AND A BEAUTIFUL MIND...
1	THE DA VINCI CODE is AN AWESOME BOOK....
1	Thing is, I enjoyed The Da Vinci Code.
1	very da vinci code slash amazing race.
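If you don't have the corpus at hand, a tiny stand-in file is enough to smoke-test the pipeline end to end. A minimal sketch; the negative sentences below are made up for illustration, and a real file needs both classes and many more records:

# write a toy train_data.txt in the expected label<TAB>sentence format
# (this overwrites any existing train_data.txt in the working directory)
records = [
    ('1', 'The Da Vinci Code book is just awesome.'),
    ('1', 'i love da vinci code....'),
    ('0', 'this movie was painfully boring.'),         # made-up negative example
    ('0', 'i would not recommend this book at all.'),  # made-up negative example
]
with open('train_data.txt', 'w') as f:
    for label, sentence in records:
        f.write('%s\t%s\n' % (label, sentence))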
Example 3: time series forecasting
import numpy
import matplotlib.pyplot as plt
from pandas import read_csv
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

# build supervised pairs: X is the window at time t, Y the value at t+look_back
def create_dataset(dataset, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset)-look_back-1):
        a = dataset[i:(i+look_back), 0]
        dataX.append(a)
        dataY.append(dataset[i + look_back, 0])
    return numpy.array(dataX), numpy.array(dataY)

dataframe = read_csv('international-airline-passengers.csv', usecols=[1], engine='python', skipfooter=3)
dataset = dataframe.values
dataset = dataset.astype('float32')
train_size = int(len(dataset) * 0.67)
print(dataset)
# scale the series into [0, 1] before feeding it to the LSTM
scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(dataset)
test_size = len(dataset) - train_size
train, test = dataset[0:train_size, :], dataset[train_size:len(dataset), :]
# reshape into X=t and Y=t+1
look_back = 1
trainX, trainY = create_dataset(train, look_back)
testX, testY = create_dataset(test, look_back)
# LSTM input shape is [samples, time steps, features]
trainX = numpy.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
testX = numpy.reshape(testX, (testX.shape[0], 1, testX.shape[1]))
model = Sequential()
model.add(LSTM(4, input_shape=(1, look_back)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(trainX, trainY, epochs=100, batch_size=1, verbose=2)
testPredict = model.predict(testX)
testPredict = scaler.inverse_transform(testPredict)
# predict from three hand-picked values; use transform, not fit_transform,
# so the new inputs are scaled with the range learned from the full dataset
whk = [[112], [390], [622]]
whk = scaler.transform(whk)
testOne = numpy.reshape(whk, (3, 1, 1))
print(testOne)
testPredict = model.predict(testOne)
testPredict = scaler.inverse_transform(testPredict)
print(testPredict)
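It is worth seeing what create_dataset actually builds on a toy array: with look_back=1 each input is one value and the target is the next value, and the len(dataset)-look_back-1 loop bound drops the final pair. A minimal sketch reusing the create_dataset defined above:

import numpy

toy = numpy.array([[10.0], [20.0], [30.0], [40.0], [50.0]])
toyX, toyY = create_dataset(toy, look_back=1)
print(toyX)   # [[10.] [20.] [30.]]  -- inputs at time t
print(toyY)   # [20. 30. 40.]        -- targets at time t+1
# note the last pair (40 -> 50) is dropped by the loop bound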
The international-airline-passengers.csv file is small; its contents look roughly like this:
"Month","International airline passengers: monthly totals in thousands. Jan 49 ? Dec 60" "1949-01",112 "1949-02",118 "1949-03",132 "1949-04",129 "1949-05",121 "1949-06",135 "1949-07",148 "1949-08",148 "1949-09",136 "1949-10",119 "1949-11",104 "1949-12",118 "1950-01",115 "1950-02",126 "1950-03",141 "1950-04",135 "1950-05",125 "1950-06",149 "1950-07",170 "1950-08",170 "1950-09",158 "1950-10",133 "1950-11",114 "1950-12",140 "1951-01",145 "1951-02",150 "1951-03",178 "1951-04",163 "1951-05",172 "1951-06",178