1、知识点
"""
1)cut():返回一个可迭代的生成器(generator)
    a) codecs.open() 打开文件时指定编码,解决编码问题
    b) f.readline() 每次读取一行;也可以使用 f.readlines() 一次读取所有行,返回由各行组成的列表
    c) words = " ".join(jieba.cut(line)) 对一行文本分词,词与词之间用空格分隔
2)lcut():直接返回分词结果的 list 列表
"""
2、标点符号处理,并分词,存储到文件中
def fenCi():
    """
    Strip punctuation, segment the text with jieba, and store the result in a file.

    Reads "深渊主宰系统.txt" (UTF-8) line by line. The jieba tokens of each line
    are joined with single spaces, a fixed set of punctuation marks is removed,
    and the cleaned line is appended to "seg.txt". Cleaned lines that start with
    '-' or '.', are exactly '\\r\\n', or are shorter than 10 characters are
    skipped. Newlines are stripped before writing, so the output file is not
    split into lines.

    The length of every cleaned line is printed, including skipped ones.

    :return: None
    """
    # Removed in this order; "..." is a multi-character mark, so a plain
    # character-translation table cannot replace this loop.
    punctuation = (",", "!", "“", "”", "。", "?", ":", "...", "、")

    # Context managers guarantee both files are closed (and seg.txt flushed)
    # even if segmentation raises part-way through.
    with codecs.open("深渊主宰系统.txt", 'r', encoding='utf-8') as src, \
            open("seg.txt", 'w', encoding='utf-8') as dst:
        for line in src:
            line = line.strip(' ')
            words = " ".join(jieba.cut(line))
            for mark in punctuation:
                words = words.replace(mark, "")
            words = words.strip(' ')
            print(len(words))
            if (words.startswith(('-', '.'))
                    or words == '\r\n'
                    or len(words) < 10):
                continue
            # write() instead of writelines(): writelines() on a str iterates
            # it character by character.
            dst.write(words.strip('\n'))
3、中文分词统计
def zhongwen():
    """
    Count Chinese word frequencies and print the most frequent words.

    Reads all of "深渊主宰系统.txt" (UTF-8), segments it with jieba.lcut (which
    returns the tokens as a list), removes a fixed set of punctuation marks and
    surrounding spaces / CR / LF from each token, and counts every token that
    is at least two characters long after cleaning. The 15 most frequent words
    are printed in descending order of count; if fewer than 15 distinct words
    exist, all of them are printed.

    :return: None
    """
    punctuation = (",", "!", "“", "”", "。", "?", ":", "...", "、")

    # Close the file as soon as its content has been read.
    with codecs.open("深渊主宰系统.txt", 'r', encoding='utf-8') as src:
        text = src.read()

    counts = {}
    for word in jieba.lcut(text):
        for mark in punctuation:
            word = word.replace(mark, "")
        word = word.strip(' ').strip('\r\n')
        # Skip empty tokens and single-character tokens.
        if len(word) <= 1:
            continue
        counts[word] = counts.get(word, 0) + 1  # word count

    # Sort by occurrence count, descending (stable: ties keep first-seen order).
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)

    # Slicing instead of indexing range(15) avoids an IndexError when the
    # text contains fewer than 15 distinct words.
    for word, counter in ranked[:15]:
        print("单词:{},次数:{}".format(word, counter))
4、英文分词统计
def get_txt():
    """
    Read "1.txt" (UTF-8) and return its text normalized for word counting.

    The text is lower-cased and every character of a fixed punctuation set is
    replaced by a single space.

    :return: the normalized text as a str
    """
    special_chars = '!"#$%&()*+,-./:;<=>?@[\\]^_‘{|}~'
    # The context manager closes the file once its content has been read.
    with open("1.txt", "r", encoding='UTF-8') as src:
        txt = src.read()
    txt = txt.lower()
    # Replace every special character with a space in one pass.
    return txt.translate(str.maketrans(dict.fromkeys(special_chars, " ")))


def yingwen():
    """
    Count English word frequencies and print the most frequent words.

    Splits the text returned by get_txt() on whitespace, ignores
    single-character words, and prints the 5 most frequent words with their
    counts in descending order. If fewer than 5 distinct words exist, all of
    them are printed.

    :return: None
    """
    file_txt = get_txt()
    words = file_txt.split()  # split on whitespace to get the word list
    counts = {}
    for word in words:
        if len(word) == 1:
            continue
        counts[word] = counts.get(word, 0) + 1

    # Stable descending sort: words with equal counts keep first-seen order.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)

    # Slicing instead of indexing range(5) avoids an IndexError when the
    # text contains fewer than 5 distinct words.
    for word, count in ranked[:5]:
        print("{0:<5}->{1:>5}".format(word, count))
来源:https://www.cnblogs.com/ywjfx/p/11003872.html