Convolutional neural networks for text: word2vec, TF-IDF, TextRank, character-level convolution, word-level convolution, and CNN text-classification models (Conv1D one-dimensional and Conv2D two-dimensional convolution)
Least squares (the LS algorithm) is in fact a specific application of the L2 norm: it minimizes the sum of squared residuals.
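A quick way to see this: the least-squares objective is exactly the squared L2 norm of the residual vector. A minimal NumPy sketch on toy data (the numbers are illustrative, not from the text):

import numpy as np

# toy linear regression: fit y ~= X @ w in the least-squares sense
X = np.array([[1.0, 1.0], [1.0, 2.0], [1.0, 3.0]])
y = np.array([1.0, 2.0, 2.5])

# np.linalg.lstsq minimizes ||y - X @ w||_2^2, i.e. the sum of squared residuals
w, _, _, _ = np.linalg.lstsq(X, y, rcond=None)

residual = y - X @ w
print(np.sum(residual ** 2))          # sum of squared residuals
print(np.linalg.norm(residual) ** 2)  # squared L2 norm -- the same number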
##### tools used for text classification #####
import re
import csv
import numpy as np
import tensorflow as tf

# Text-cleaning function for titles fed to word2vec
def text_clearTitle_word2vec(text, n=12):
    text = text.lower()                   # convert the text to lower case
    text = re.sub(r"[^a-z]", " ", text)   # replace non-letter characters (^ negates the set)
    text = re.sub(r" +", " ", text)       # collapse repeated spaces
    # text = re.sub(" ", "", text)        # (optional) remove all spaces
    text = text.strip()                   # strip leading and trailing spaces
    text = text + " eos"                  # append the end-of-sequence token
    text = text.split(" ")
    return text
# Convert labels to one-hot format
def get_label_one_hot(label_list):
    values = np.array(label_list)
    n_values = np.max(values) + 1
    return np.eye(n_values)[values]
# Build the training set and labels
def get_word2vec_dataset(n=12):
    agnews_label = []
    agnews_title = []
    agnews_train = csv.reader(open("./dataset/train.csv", "r"))
    for line in agnews_train:
        agnews_label.append(int(line[0]))
        agnews_title.append(text_clearTitle_word2vec(line[1]))
    from gensim.models import word2vec                                        # import gensim
    model = word2vec.Word2Vec(agnews_title, size=64, min_count=0, window=5)   # set the training parameters
    train_dataset = []
    for line in agnews_title:
        length = len(line)
        if length > n:
            line = line[:n]
            word2vec_matrix = model.wv[line]
            train_dataset.append(word2vec_matrix)
        else:
            word2vec_matrix = model.wv[line]
            pad_length = n - length
            pad_matrix = np.zeros([pad_length, 64]) + 1e-10
            word2vec_matrix = np.concatenate([word2vec_matrix, pad_matrix], axis=0)
            train_dataset.append(word2vec_matrix)
    train_dataset = np.expand_dims(train_dataset, 3)
    label_dataset = get_label_one_hot(agnews_label)
    return train_dataset, label_dataset
# The word2vec_CNN model
def word2vec_CNN():
    xs = tf.keras.Input([12, 64, 1])                                          # title matrix: 12 words x 64-dim vectors x 1 channel
    conv_3 = tf.keras.layers.Conv2D(12, [3, 64], activation=tf.nn.relu)(xs)   # convolution with a [3,64] kernel and 12 channels
    conv_5 = tf.keras.layers.Conv2D(12, [5, 64], activation=tf.nn.relu)(xs)   # convolution with a [5,64] kernel and 12 channels
    conv_7 = tf.keras.layers.Conv2D(12, [7, 64], activation=tf.nn.relu)(xs)   # convolution with a [7,64] kernel and 12 channels
    # Max-pool each convolution result over the sequence axis and flatten it to 2-D
    conv_3_mean = tf.keras.layers.Flatten()(tf.reduce_max(conv_3, axis=1, keepdims=True))
    conv_5_mean = tf.keras.layers.Flatten()(tf.reduce_max(conv_5, axis=1, keepdims=True))
    conv_7_mean = tf.keras.layers.Flatten()(tf.reduce_max(conv_7, axis=1, keepdims=True))
    flatten = tf.concat([conv_3_mean, conv_5_mean, conv_7_mean], axis=1)      # concatenate the pooled features
    fc_1 = tf.keras.layers.Dense(128, activation=tf.nn.relu)(flatten)         # fully connected layer for classification
    logits = tf.keras.layers.Dense(5, activation=tf.nn.softmax)(fc_1)         # classification output
    model = tf.keras.Model(inputs=xs, outputs=logits)
    return model
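A brief usage sketch for the helpers above (my own illustrative check, not part of the original listing): because AG News labels run from 1 to 4, get_label_one_hot produces five columns with column 0 left unused, which is why the Dense output layers in this post have 5 units; and word2vec_CNN expects the (12, 64, 1) title matrices produced by get_word2vec_dataset:

labels_demo = get_label_one_hot([1, 2, 3, 4])
print(labels_demo.shape)        # (4, 5): five one-hot columns, column 0 always zero

model = word2vec_CNN()
model.summary()                 # three parallel Conv2D branches, concatenated, then Dense(128) and Dense(5)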
########### Define TF-IDF ###########
import math

class TFIDF_score:
    def __init__(self, corpus, model=None):
        self.corpus = corpus
        self.model = model
        self.idfs = self.__idf()

    def __idf(self):
        idfs = {}
        d = 0.0
        # count, for each word, the number of documents it appears in
        for doc in self.corpus:
            d += 1
            counted = []
            for word in doc:
                if word not in counted:
                    counted.append(word)
                    if word in idfs:
                        idfs[word] += 1
                    else:
                        idfs[word] = 1
        # compute the inverse document frequency of each word
        for word in idfs:
            idfs[word] = math.log(d / float(idfs[word]))
        return idfs

    def __get_TFIDF_score(self, text):
        word_tfidf = {}
        for word in text:                        # iterate over every word in the document
            if word in word_tfidf:               # count term frequency
                word_tfidf[word] += 1
            else:
                word_tfidf[word] = 1
        for word in word_tfidf:
            word_tfidf[word] *= self.idfs[word]  # multiply by IDF to get the TF-IDF value
        # sort the TF-IDF values from most to least important
        values_list = sorted(word_tfidf.items(), key=lambda item: item[1], reverse=True)
        return values_list

    def get_TFIDF_result(self, text):
        values_list = self.__get_TFIDF_score(text)
        value_list = []
        for value in values_list:
            value_list.append(value[0])
        return value_list
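A minimal usage sketch for the class above (assuming agnews_text is the cleaned corpus used throughout this post; the actual keywords depend on the data):

tfidf = TFIDF_score(agnews_text)                    # compute IDF values over the whole corpus
keywords = tfidf.get_TFIDF_result(agnews_text[0])   # words of one document, most important first
print(keywords[:10])                                # top ten TF-IDF keywords of that document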
############ Compute TF-IDF ############
import math

def idf(corpus):
    idfs = {}
    d = 0.0
    # count, for each word, the number of documents it appears in
    for doc in corpus:
        d += 1
        counted = []
        for word in doc:
            if word not in counted:
                counted.append(word)
                if word in idfs:
                    idfs[word] += 1
                else:
                    idfs[word] = 1
    # compute the inverse document frequency of each word
    for word in idfs:
        idfs[word] = math.log(d / float(idfs[word]))
    return idfs

idfs = idf(agnews_text)   # IDF of every word in the corpus; agnews_text is the cleaned corpus described in the data-cleaning section
for text in agnews_text:                      # iterate over every document in the corpus
    word_tfidf = {}
    for word in text:                         # iterate over every word in the document
        if word in word_tfidf:                # count term frequency
            word_tfidf[word] += 1
        else:
            word_tfidf[word] = 1
    for word in word_tfidf:
        word_tfidf[word] *= idfs[word]        # word_tfidf now holds the TF-IDF value of each word
    values_list = sorted(word_tfidf.items(), key=lambda item: item[1], reverse=True)  # sort by value, descending
    values_list = [value[0] for value in values_list]  # the words of this document, ordered by importance
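For reference, the quantity computed above is the standard TF-IDF weight: with D documents in the corpus and df(w) documents containing word w, the score of word w in document d is

    tfidf(w, d) = tf(w, d) * log(D / df(w))

where tf(w, d) is the raw count of w in d, matching the word_tfidf and idfs dictionaries built above.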
################ word2vec training ################
import csv
import tools
import numpy as np

agnews_label = []
agnews_title = []
agnews_text = []
agnews_train = csv.reader(open("./dataset/train.csv", "r"))
for line in agnews_train:
    agnews_label.append(np.float32(line[0]))
    agnews_title.append(tools.text_clear(line[1]))
    agnews_text.append(tools.text_clear(line[2]))

print("start training the model")
from gensim.models import word2vec
model = word2vec.Word2Vec(agnews_text, size=64, min_count=0, window=5, iter=128)
model_name = "corpusWord2Vec.bin"
model.save(model_name)

# reload the saved model and continue training it on the titles
from gensim.models import word2vec
model = word2vec.Word2Vec.load('./corpusWord2Vec.bin')
model.train(agnews_title, epochs=model.epochs, total_examples=model.corpus_count)
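Once corpusWord2Vec.bin has been saved, the learned vectors can be inspected directly; a small sketch (the query word is illustrative and must be a token that actually occurs in the cleaned, stemmed corpus):

from gensim.models import word2vec
model = word2vec.Word2Vec.load("./corpusWord2Vec.bin")
print(model.wv["bank"].shape)                    # a 64-dimensional vector, matching size=64
print(model.wv.most_similar("bank", topn=5))     # the five nearest words in embedding space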
############### word2vec_CNN ###############
def word2vec_CNN():
    xs = tf.keras.Input([12, 64, 1])                                          # title matrix: 12 words x 64-dim vectors x 1 channel
    conv_3 = tf.keras.layers.Conv2D(12, [3, 64], activation=tf.nn.relu)(xs)   # convolution with a [3,64] kernel and 12 channels
    conv_5 = tf.keras.layers.Conv2D(12, [5, 64], activation=tf.nn.relu)(xs)   # convolution with a [5,64] kernel and 12 channels
    conv_7 = tf.keras.layers.Conv2D(12, [7, 64], activation=tf.nn.relu)(xs)   # convolution with a [7,64] kernel and 12 channels
    # Max-pool each convolution result over the sequence axis and flatten it to 2-D
    conv_3_mean = tf.keras.layers.Flatten()(tf.reduce_max(conv_3, axis=1, keepdims=True))
    conv_5_mean = tf.keras.layers.Flatten()(tf.reduce_max(conv_5, axis=1, keepdims=True))
    conv_7_mean = tf.keras.layers.Flatten()(tf.reduce_max(conv_7, axis=1, keepdims=True))
    flatten = tf.concat([conv_3_mean, conv_5_mean, conv_7_mean], axis=1)      # concatenate the pooled features
    fc_1 = tf.keras.layers.Dense(128, activation=tf.nn.relu)(flatten)         # fully connected layer for classification
    logits = tf.keras.layers.Dense(5, activation=tf.nn.softmax)(fc_1)         # classification output
    model = tf.keras.Model(inputs=xs, outputs=logits)
    return model
################## Define the TextRank class ##################
class TextRank_score:
    def __init__(self, agnews_text):
        self.agnews_text = agnews_text
        self.filter_list = self.__get_agnews_text()
        self.win = self.__get_win()
        self.agnews_text_dict = self.__get_TextRank_score_dict()

    def __get_agnews_text(self):
        sentence = []
        for text in self.agnews_text:
            for word in text:
                sentence.append(word)
        return sentence

    def __get_win(self):
        win = {}
        for i in range(len(self.filter_list)):
            if self.filter_list[i] not in win.keys():
                win[self.filter_list[i]] = set()
            if i - 5 < 0:
                lindex = 0
            else:
                lindex = i - 5
            for j in self.filter_list[lindex:i + 5]:
                win[self.filter_list[i]].add(j)
        return win

    def __get_TextRank_score_dict(self):
        time = 0
        score = {w: 1.0 for w in self.filter_list}
        while (time < 50):
            for k, v in self.win.items():
                s = score[k] / len(v)
                score[k] = 0
                for i in v:
                    score[i] += s
            time += 1
        agnews_text_dict = {}
        for key in score:
            agnews_text_dict[key] = score[key]
        return agnews_text_dict

    def __get_TextRank_score(self, text):
        temp_dict = {}
        for word in text:
            if word in self.agnews_text_dict.keys():
                temp_dict[word] = self.agnews_text_dict[word]
        # sort the TextRank scores from most to least important
        values_list = sorted(temp_dict.items(), key=lambda item: item[1], reverse=True)
        return values_list

    def get_TextRank_result(self, text):
        temp_dict = {}
        for word in text:
            if word in self.agnews_text_dict.keys():
                temp_dict[word] = self.agnews_text_dict[word]
        values_list = sorted(temp_dict.items(), key=lambda item: item[1], reverse=True)
        value_list = []
        for value in values_list:
            value_list.append(value[0])
        return value_list
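A minimal usage sketch for the class above (again assuming agnews_text is the cleaned corpus; building the co-occurrence windows over the full corpus can take a while):

textrank = TextRank_score(agnews_text)                    # build the window graph and iterate the scores
keywords = textrank.get_TextRank_result(agnews_text[0])   # words of one document, most important first
print(keywords[:10])                                      # top ten TextRank keywords of that document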
################### Build the AGNews dataset ###################
import csv
import numpy as np
import tools

agnews_label = []                                            # empty label list
agnews_title = []                                            # empty title list
agnews_train = csv.reader(open("./dataset/train.csv", "r"))  # read the dataset
for line in agnews_train:                                    # iterate over the data line by line
    agnews_label.append(int(line[0]))                        # append the label to the label list
    agnews_title.append(tools.text_clearTitle(line[1]))      # append the cleaned title text

train_dataset = []
for title in agnews_title:
    string_matrix = tools.get_handle_string_matrix(title)    # build the character matrix of the title
    train_dataset.append(string_matrix)                      # append the matrix to the training list
train_dataset = np.array(train_dataset)                      # convert the Python list to a NumPy array
label_dataset = tools.get_label_one_hot(agnews_label)        # convert the labels to one-hot format
################### Tools used to build the AGNews dataset ###################
import re
import csv
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Clean English text
stoplist = stopwords.words('english')

def text_clear(text):
    text = text.lower()                     # convert the text to lower case
    text = re.sub(r"[^a-z]", " ", text)     # replace non-letter characters (^ negates the set)
    text = re.sub(r" +", " ", text)         # collapse repeated spaces
    text = text.strip()                     # strip leading and trailing spaces
    text = text.split(" ")
    text = [word for word in text if word not in stoplist]   # remove stop words
    text = [PorterStemmer().stem(word) for word in text]     # reduce each word to its stem
    text.append("eos")                      # append the end-of-sequence token
    text = ["bos"] + text                   # prepend the begin-of-sequence token
    return text

# Clean a title
def text_clearTitle(text):
    text = text.lower()                     # convert the text to lower case
    text = re.sub(r"[^a-z]", " ", text)     # replace non-letter characters (^ negates the set)
    text = re.sub(r" +", " ", text)         # collapse repeated spaces
    # text = re.sub(" ", "", text)          # (optional) remove all spaces
    text = text.strip()                     # strip leading and trailing spaces
    text = text + " eos"                    # append the end-of-sequence token
    return text

# Build one-hot labels for the titles
def get_label_one_hot(label_list):
    values = np.array(label_list)
    n_values = np.max(values) + 1
    return np.eye(n_values)[values]

# Build the one-hot matrix of a text
def get_one_hot(char_list, alphabet_title=None):
    if alphabet_title is None:              # default character set
        alphabet_title = "abcdefghijklmnopqrstuvwxyz "
    values = np.array(char_list)            # positions of the characters
    n_values = len(alphabet_title) + 1      # size of the character set plus one
    return np.eye(n_values)[values]
# Get the position of each character of a text in the alphabet
def get_char_list(string, alphabet_title=None):
    if alphabet_title is None:
        alphabet_title = "abcdefghijklmnopqrstuvwxyz "
    char_list = []
    for char in string:                     # iterate over the characters of the string
        num = alphabet_title.index(char)    # position of the character in the alphabet
        char_list.append(num)               # collect the position codes
    return char_list

# Build the character matrix of a text
def get_string_matrix(string):
    char_list = get_char_list(string)
    string_matrix = get_one_hot(char_list)
    return string_matrix

# Build the character matrix padded or truncated to a fixed length
def get_handle_string_matrix(string, n=64):
    string_length = len(string)
    if string_length > n:
        string = string[:n]                 # truncate strings longer than n characters
        string_matrix = get_string_matrix(string)
        return string_matrix
    else:
        string_matrix = get_string_matrix(string)
        handle_length = n - string_length   # number of padding rows needed
        pad_matrix = np.zeros([handle_length, 28])   # 28 = len(alphabet) + 1 one-hot columns
        string_matrix = np.concatenate([string_matrix, pad_matrix], axis=0)
        return string_matrix
# Build the dataset
def get_dataset():
    agnews_label = []
    agnews_title = []
    agnews_train = csv.reader(open("./dataset/train.csv", "r"))
    for line in agnews_train:
        agnews_label.append(int(line[0]))
        agnews_title.append(text_clearTitle(line[1]))
    train_dataset = []
    for title in agnews_title:
        string_matrix = get_handle_string_matrix(title)
        train_dataset.append(string_matrix)
    train_dataset = np.array(train_dataset)
    label_dataset = get_label_one_hot(agnews_label)
    return train_dataset, label_dataset
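A small sanity check of the character-level tools above (the title string is illustrative): every cleaned title becomes a 64 x 28 matrix, 64 characters by 28 one-hot columns (26 letters, the space, plus one spare column):

title = text_clearTitle("Wall St. Bears Claw Back Into the Black")
matrix = get_handle_string_matrix(title)
print(matrix.shape)     # (64, 28) after truncation or zero-padding to 64 characters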
###################### The AGNews dataset processed with the word-level (word2vec) model ######################
def get_word2vec_dataset(n=12):
    agnews_label = []                          # create the label list
    agnews_title = []                          # create the title list
    agnews_train = csv.reader(open("./dataset/train.csv", "r"))
    for line in agnews_train:                  # read the data into the corresponding lists
        agnews_label.append(int(line[0]))
        agnews_title.append(text_clearTitle_word2vec(line[1]))   # clean the text before storing it
    from gensim.models import word2vec         # import gensim
    model = word2vec.Word2Vec(agnews_title, size=64, min_count=0, window=5)   # set the training parameters
    train_dataset = []                         # create the training-set list
    for line in agnews_title:                  # check the length of each title
        length = len(line)                     # length of the title
        if length > n:                         # titles longer than n words
            line = line[:n]                    # keep only the first n words
            word2vec_matrix = model.wv[line]   # look up the word2vec matrix
            train_dataset.append(word2vec_matrix)   # add the word2vec matrix to the training set
        else:                                  # pad titles shorter than n words
            word2vec_matrix = model.wv[line]   # look up the word2vec matrix
            pad_length = n - length            # number of padding rows needed
            pad_matrix = np.zeros([pad_length, 64]) + 1e-10   # padding matrix with a small constant value
            word2vec_matrix = np.concatenate([word2vec_matrix, pad_matrix], axis=0)   # pad the matrix
            train_dataset.append(word2vec_matrix)   # add the word2vec matrix to the training set
    train_dataset = np.expand_dims(train_dataset, 3)    # add a channel axis to the 3-D array
    label_dataset = get_label_one_hot(agnews_label)     # convert the labels to one-hot format
    return train_dataset, label_dataset
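With the settings above each title becomes a 12 x 64 word-vector matrix, and the channel axis added at the end makes the tensor four-dimensional, which is exactly what the Conv2D model expects; a quick hedged check (shapes follow from n=12, size=64 and the 5-column one-hot labels; rebuilding the dataset retrains word2vec, so this is slow):

train_dataset, label_dataset = get_word2vec_dataset(n=12)
print(train_dataset.shape)    # expected (num_titles, 12, 64, 1)
print(label_dataset.shape)    # expected (num_titles, 5)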
#################### Character-level CNN model ####################
def char_CNN():
    xs = tf.keras.Input([64, 28])                                         # 64 characters x 28 one-hot columns
    conv_1 = tf.keras.layers.Conv1D(1, 3, activation=tf.nn.relu)(xs)      # first convolution layer
    conv_1 = tf.keras.layers.BatchNormalization()(conv_1)
    conv_2 = tf.keras.layers.Conv1D(1, 5, activation=tf.nn.relu)(conv_1)  # second convolution layer
    conv_2 = tf.keras.layers.BatchNormalization()(conv_2)
    conv_3 = tf.keras.layers.Conv1D(1, 5, activation=tf.nn.relu)(conv_2)  # third convolution layer
    conv_3 = tf.keras.layers.BatchNormalization()(conv_3)
    flatten = tf.keras.layers.Flatten()(conv_3)
    fc_1 = tf.keras.layers.Dense(512, activation=tf.nn.relu)(flatten)     # fully connected layer
    logits = tf.keras.layers.Dense(5, activation=tf.nn.softmax)(fc_1)
    model = tf.keras.Model(inputs=xs, outputs=logits)
    return model
import csv
import numpy as np
import tools
import tensorflow as tf
from sklearn.model_selection import train_test_split

train_dataset, label_dataset = tools.get_dataset()
# split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(train_dataset, label_dataset, test_size=0.1, random_state=217)
batch_size = 12
train_data = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(batch_size)
model = tools.char_CNN()    # build the model
model.compile(optimizer=tf.optimizers.Adam(1e-3), loss=tf.losses.categorical_crossentropy, metrics=['accuracy'])
model.fit(train_data, epochs=1)
score = model.evaluate(X_test, y_test)
print("last score:", score)
#################### Training the word-convolution CNN model ####################
import tools
import tensorflow as tf
from sklearn.model_selection import train_test_split

train_dataset, label_dataset = tools.get_word2vec_dataset()   # build the dataset
# split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(train_dataset, label_dataset, test_size=0.1, random_state=217)
batch_size = 12
train_data = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(batch_size)
model = tools.word2vec_CNN()   # build the model
model.compile(optimizer=tf.optimizers.Adam(1e-3), loss=tf.losses.categorical_crossentropy, metrics=['accuracy'])
model.fit(train_data, epochs=1)
score = model.evaluate(X_test, y_test)
print("last score:", score)
Source: CSDN
Author: zimiao552147572
Link: https://blog.csdn.net/zimiao552147572/article/details/104208463