The naive Bayes algorithm is simple and efficient. In this post we look at how it can be applied to the question of who wrote 《红楼梦》 (Dream of the Red Chamber).
The first step, of course, is getting the text. I downloaded a random txt copy from the internet (I was rushing to hand in a first draft at the time...). Classification has to be done chapter by chapter, so once we have the text we first split it into chapters, then strip punctuation, segment it into words, and compute word frequencies.
# -*- coding: utf-8 -*-
import re
import string
import collections as coll

import jieba

jieba.load_userdict('E:\\forpython\\红楼梦词汇大全.txt')  # load the Sogou "Dream of the Red Chamber" lexicon as a jieba user dictionary


class textprocesser:
    def __init__(self):
        pass

    # Split the novel into its 120 chapters and save each one to its own txt file
    def divide_into_chapter(self):
        red = open('E:\\forpython\\红楼梦.txt', encoding='utf-8')
        each_line = red.readline()
        chapter_count = 0
        chapter_text = ''
        compiled_rule = re.compile('第[一二三四五六七八九十百]+回 ')

        while each_line:
            if re.findall(compiled_rule, each_line):
                # a new chapter heading: flush everything collected so far
                file_name = 'chap' + str(chapter_count)
                file_out = open('E:\\forpython\\chapters\\' + file_name + '.txt', 'a', encoding='utf-8')
                file_out.write(chapter_text)
                chapter_count += 1
                file_out.close()
                chapter_text = each_line
            else:
                chapter_text += each_line

            each_line = red.readline()

        # flush the final chapter, which the loop above never writes out
        file_out = open('E:\\forpython\\chapters\\chap' + str(chapter_count) + '.txt', 'a', encoding='utf-8')
        file_out.write(chapter_text)
        file_out.close()

        red.close()

    # Segment a single chapter into words
    def segmentation(self, text, text_count):
        file_name = 'chap' + str(text_count) + '-words.txt'
        file_out = open('E:\\forpython\\chapter2words\\' + file_name, 'a', encoding='utf-8')
        delset = str.maketrans('', '', string.punctuation)  # translation table that deletes ASCII punctuation

        line = text.readline()

        while line:
            seg_list = jieba.cut(line, cut_all=False)
            words = ' '.join(seg_list)
            words = words.translate(delset)             # remove English punctuation
            words = ''.join(words.split('\n'))          # remove line breaks
            words = self.delCNf(words)                  # remove Chinese punctuation
            words = re.sub('[ \u3000]+', ' ', words)    # collapse extra spaces
            file_out.write(words)
            line = text.readline()

        file_out.close()
        text.close()

    # Segment every chapter
    def do_segmentation(self):
        for loop in range(1, 121):
            file_name = 'chap' + str(loop) + '.txt'
            file_in = open('E:\\forpython\\chapters\\' + file_name, 'r', encoding='utf-8')

            self.segmentation(file_in, loop)

            file_in.close()

    # Remove Chinese punctuation: keep only CJK characters, letters, digits and whitespace
    def delCNf(self, line):
        regex = re.compile(r'[^\u4e00-\u9fa5a-zA-Z0-9\s]')
        return regex.sub('', line)

    # Count word frequencies once punctuation has been removed
    def count_words(self, text, textID):
        line = str(text)
        words = line.split()
        words_dict = coll.Counter(words)  # word-frequency dictionary

        file_name = 'chap' + str(textID) + '-wordcount.txt'
        file_out = open('E:\\forpython\\chapter-wordcount\\' + file_name, 'a', encoding='utf-8')

        # sort by frequency and write out
        sorted_result = sorted(words_dict.items(), key=lambda d: d[1], reverse=True)
        for one in sorted_result:
            file_out.write(one[0] + '\t' + str(one[1]) + '\n')

        file_out.close()

    # Count word frequencies for every chapter
    def do_wordcount(self):
        for loop in range(1, 121):
            file_name = 'chap' + str(loop) + '-words.txt'
            file_in = open('E:\\forpython\\chapter2words\\' + file_name, 'r', encoding='utf-8')
            line = file_in.readline()

            text = ''
            while line:
                text += line
                line = file_in.readline()
            self.count_words(text, loop)
            file_in.close()


if __name__ == '__main__':
    processer = textprocesser()
    processer.divide_into_chapter()
    processer.do_segmentation()
    processer.do_wordcount()
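Before moving on, it can help to eyeball what the cleaning steps actually produce. The snippet below is a small sketch (not part of the original script, and the sample sentence is made up) that pushes one line through the same jieba segmentation and punctuation stripping used in segmentation() above:

# -*- coding: utf-8 -*-
# Minimal sketch: run one made-up sentence through the same cleaning steps
# used in textprocesser.segmentation() and print the result.
import re
import string

import jieba

sample = '宝玉笑道：“好妹妹，你又何苦呢！”'  # hypothetical input line
words = ' '.join(jieba.cut(sample, cut_all=False))                  # segment into words
words = words.translate(str.maketrans('', '', string.punctuation))  # remove English punctuation
words = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9\s]', '', words)           # remove Chinese punctuation
words = re.sub('[ \u3000]+', ' ', words)                            # collapse extra spaces
print(words)  # space-separated words with all punctuation gone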
For text classification, I feel the most important step is choosing the feature vector. After going through the relevant literature, I decided to use fifty-odd classical Chinese function words plus twenty-odd words that appear in every one of the 120 chapters (the use of function words is not influenced by the plot; it depends only on the author's writing habits). The code that builds the feature vector follows.
# -*- coding: utf-8 -*-
import re
import string
import collections as coll

import jieba

jieba.load_userdict('E:\\forpython\\红楼梦词汇大全.txt')  # load the Sogou "Dream of the Red Chamber" lexicon as a jieba user dictionary


class featureVector:
    def __init__(self):
        pass

    # Remove Chinese punctuation: keep only CJK characters, letters, digits and whitespace
    def delCNf(self, line):
        regex = re.compile(r'[^\u4e00-\u9fa5a-zA-Z0-9\s]')
        return regex.sub('', line)

    # Segment the whole novel into words
    def cut_words(self):
        red = open('E:\\forpython\\红楼梦.txt', 'r', encoding='utf-8')
        file_out = open('E:\\forpython\\红楼梦-词.txt', 'a', encoding='utf-8')
        delset = str.maketrans('', '', string.punctuation)  # translation table that deletes ASCII punctuation

        line = red.readline()

        while line:
            seg_list = jieba.cut(line, cut_all=False)
            words = ' '.join(seg_list)
            words = words.translate(delset)             # remove English punctuation
            words = ''.join(words.split('\n'))          # remove line breaks
            words = self.delCNf(words)                  # remove Chinese punctuation
            words = re.sub('[ \u3000]+', ' ', words)    # collapse extra spaces
            file_out.write(words)
            line = red.readline()

        file_out.close()
        red.close()

    # Count word frequencies over the whole novel
    def count_words(self):
        data = open('E:\\forpython\\红楼梦-词.txt', 'r', encoding='utf-8')
        line = data.read()
        data.close()
        words = line.split()
        words_dict = coll.Counter(words)  # word-frequency dictionary

        file_out = open('E:\\forpython\\红楼梦-词频.txt', 'a', encoding='utf-8')

        # sort by frequency and write out
        sorted_result = sorted(words_dict.items(), key=lambda d: d[1], reverse=True)
        for one in sorted_result:
            file_out.write(one[0] + '\t' + str(one[1]) + '\n')

        file_out.close()

    def get_featureVector(self):
        # Put the 120 segmented chapters into one list
        everychapter = []
        for loop in range(1, 121):
            data = open('E:\\forpython\\chapter2words\\chap' + str(loop) + '-words.txt', 'r', encoding='utf-8')
            each_chapter = data.read()
            everychapter.append(each_chapter)
            data.close()

        temp = open('E:\\forpython\\红楼梦-词.txt', 'r', encoding='utf-8')
        word_beg = temp.read()
        word_beg = word_beg.split(' ')
        temp.close()

        # Find the words that occur in every single chapter
        cleanwords = []
        for loop in range(1, 121):
            data = open('E:\\forpython\\chapter2words\\chap' + str(loop) + '-words.txt', 'r', encoding='utf-8')
            words_list = list(set(data.read().split()))
            data.close()
            cleanwords.extend(words_list)

        cleanwords_dict = coll.Counter(cleanwords)
        cleanwords_dict = {k: v for k, v in cleanwords_dict.items() if v >= 120}
        cleanwords_f = list(cleanwords_dict.keys())

        # Merge them with the list of classical function words
        xuci = open('E:\\forpython\\文言虚词.txt', 'r', encoding='utf-8')
        xuci_list = xuci.read().split()
        xuci.close()
        featureVector = list(set(xuci_list + cleanwords_f))
        featureVector.remove('\ufeff')  # drop the BOM carried over from the function-word file

        # Write the feature words to a file
        file_out = open('E:\\forpython\\红楼梦-特征向量.txt', 'a', encoding='utf-8')
        for one in featureVector:
            file_out.write(one + '\n')

        file_out.close()
        return featureVector


if __name__ == '__main__':
    vectorbuilter = featureVector()
    vectorbuilter.cut_words()
    vectorbuilter.count_words()
    vectorbuilter.get_featureVector()
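A quick check on the output of get_featureVector(): reload the saved feature-word file and count its entries. Given the selection described above (fifty-odd function words plus twenty-odd all-chapter words), the count should land around seventy. A small sketch using the same path as the script above:

# Minimal check: reload the feature words written by get_featureVector()
with open('E:\\forpython\\红楼梦-特征向量.txt', 'r', encoding='utf-8') as f:
    feature_words = [w.strip() for w in f if w.strip()]

print(len(feature_words))   # expected to be around 70
print(feature_words[:10])   # peek at the first few feature words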
Naive Bayes text classification then simply uses the frequencies of the feature-vector words to represent each chapter (being lazy here, I just pasted a screenshot from my defense slides).
Once all one hundred and twenty chapters are vectorized with these feature words, you end up with a 120×70 array. The rest is straightforward: pick a training set. From the first 80 chapters I took chapters 20 through 29 and labelled them class 1; from the last 40 chapters I took chapters 110 through 119 and labelled them class 2.
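The vectorization code itself is not listed here; it sits in the author's get_trainset module, which the classifier script below uses as ts.get_train_set().get_all_vector(). As a rough idea of what that module has to do, here is a minimal sketch under that assumption: for each chapter, count how often every feature word occurs and stack the counts into a 120×N array.

# -*- coding: utf-8 -*-
# get_trainset.py -- hypothetical sketch of the vectorization module used below;
# the author's real implementation is not shown in the post.
import collections as coll

import numpy as np


class get_train_set:
    def get_all_vector(self):
        # feature words produced by featureVector.get_featureVector()
        with open('E:\\forpython\\红楼梦-特征向量.txt', 'r', encoding='utf-8') as f:
            feature_words = [w.strip() for w in f if w.strip()]

        vectors = []
        for loop in range(1, 121):
            path = 'E:\\forpython\\chapter2words\\chap' + str(loop) + '-words.txt'
            with open(path, 'r', encoding='utf-8') as f:
                counts = coll.Counter(f.read().split())
            # one row per chapter: the raw frequency of each feature word in that chapter
            vectors.append([counts[w] for w in feature_words])

        return np.array(vectors)  # shape: (120, number of feature words)

MultinomialNB is built for exactly this kind of count data, so the raw frequencies can be fed in without any further scaling.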
# -*- coding: utf-8 -*-
import numpy as np
from sklearn.naive_bayes import MultinomialNB

import get_trainset as ts  # the author's vectorization module

x_train = ts.get_train_set().get_all_vector()  # 120 x 70 array: one feature-word frequency vector per chapter


class result:
    def __init__(self):
        pass

    def have_Xtrainset(self):
        # chapters 20-29 (indices 19-28) and chapters 110-119 (indices 109-118) form the training set
        Xtrainset = x_train
        Xtrainset = np.vstack((Xtrainset[19:29], Xtrainset[109:119]))
        return Xtrainset

    def as_num(self, x):
        y = '{:.10f}'.format(x)
        return y

    def built_model(self):
        x_trainset = self.have_Xtrainset()
        y_classset = np.repeat(np.array([1, 2]), [10, 10])  # labels: ten 1s followed by ten 2s

        NBclf = MultinomialNB()
        NBclf.fit(x_trainset, y_classset)  # fit the model

        all_vector = x_train

        result = NBclf.predict(all_vector)
        print('Predicted classes of the first ' + str(len(result[0:80])) + ' chapters:')
        print(result[0:80])
        print('Predicted classes of the last ' + str(len(result[80:120])) + ' chapters:')
        print(result[80:120])

        diff_chapter = [80, 81, 83, 84, 87, 88, 90, 100]  # 0-based indices of the later chapters assigned to class 1
        for i in diff_chapter:
            tempr = NBclf.predict_proba(np.atleast_2d(all_vector[i]))  # predict_proba expects a 2-D array
            print('Class probabilities of chapter ' + str(i + 1) + ':')
            print(str(self.as_num(tempr[0][0])) + ' ' + str(self.as_num(tempr[0][1])))


if __name__ == '__main__':
    res = result()
    res.built_model()
The code above simply calls scikit-learn's MultinomialNB; I went through the details in the previous post.
Looking at the final classification results, there is a fairly clear dividing point around chapter 82, which suggests that the first 80 chapters and the last 40 really do differ noticeably in writing style. This agrees reasonably well with the long-standing view among Redology scholars.
As for why 8 of the last 40 chapters were assigned to class 1 (chapters 81, 82, 84, 85, 88, 89, 91 and 101, all close to chapter 80), the discrepancy may come from how the narrative is stitched together around the break. And since the copy of the novel used here was downloaded from the internet and its edition is unknown, it may also simply be an artifact of that particular edition.
The code certainly still has plenty of room for improvement; please bear with my humble attempt...
Source: oschina, https://my.oschina.net/u/4351449/blog/4328227