python jieba分词小说与词频统计

1、知识点

"""
1)cut()
    a) codecs.open() 解决编码问题
    b) f.readline() 每次读取一行；也可以使用 f.readlines() 一次读取所有行（返回由各行组成的列表）
    c) words =" ".join(jieba.cut(line))分词，每个词用空格分隔
2)lcut()
    返回一个list列表
"""

2、标点符号处理并分词，将结果存储到文件中

def fenCi():
    """
    Segment the novel line by line, strip punctuation, and save the result.

    Reads "深渊主宰系统.txt" as UTF-8, segments each line with ``jieba.cut``,
    joins the tokens with single spaces, removes a fixed set of punctuation
    marks, and writes every kept line to "seg.txt" with its trailing newline
    removed. The length of each cleaned line is printed as progress output.

    A line is skipped when its cleaned text starts with '-' or '.', or is
    shorter than 10 characters.

    :return: None
    """
    # Marks are removed one after another, in this order.
    punctuation = ("，", "！", "“", "”", "。", "？", "：", "...", "、")

    # Context managers close both files (and flush the output) even when
    # segmentation raises part-way through the novel.
    with codecs.open("深渊主宰系统.txt", 'r', encoding='utf-8') as f, \
            open("seg.txt", 'w', encoding='utf-8') as f1:
        for line in f:
            words = " ".join(jieba.cut(line.strip(' ')))
            for mark in punctuation:
                words = words.replace(mark, "")
            words = words.strip(' ')
            print(len(words))
            # Skip separator lines and very short fragments. A bare line
            # break is shorter than 10 characters, so it needs no own check.
            if words.startswith(('-', '.')) or len(words) < 10:
                continue
            f1.write(words.strip('\n'))

3、中文分词统计

def zhongwen():
    """
    Count Chinese word frequencies in the novel and print the most common.

    Reads "深渊主宰系统.txt" as UTF-8, segments the whole text with
    ``jieba.lcut`` (which returns a list of tokens), removes a fixed set of
    punctuation marks from every token, and counts the tokens that are at
    least two characters long. Up to 15 of the most frequent words are
    printed, most frequent first.

    :return: None
    """
    # Marks are removed one after another, in this order.
    punctuation = ("，", "！", "“", "”", "。", "？", "：", "...", "、")

    # Close the input file as soon as its content has been read.
    with codecs.open("深渊主宰系统.txt", 'r', encoding='utf-8') as src:
        text = src.read()

    counts = {}
    for word in jieba.lcut(text):
        for mark in punctuation:
            word = word.replace(mark, "")
        word = word.strip(' ').strip('\r\n')
        # Ignore empty tokens and single characters.
        if len(word) < 2:
            continue
        counts[word] = counts.get(word, 0) + 1

    # Sort by occurrence count, descending.
    items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    # Slicing instead of indexing avoids an IndexError when the text holds
    # fewer than 15 distinct words.
    for word, counter in items[:15]:
        print("单词：{},次数：{}".format(word, counter))

4、英文分词统计

def get_txt(path="1.txt"):
    """
    Load an English text file and normalise it for word counting.

    The text is lower-cased and every listed punctuation character is
    replaced by a single space, so a later ``str.split()`` yields bare words.

    :param path: file to read as UTF-8; defaults to "1.txt", the file name
        that was previously hard-coded.
    :return: the normalised text as a single string
    """
    # The earlier list had a typographic quote (‘) where the ASCII backtick
    # belongs; both are replaced here so the previous behaviour is kept.
    specials = '!"#$%&()*+,-./:;<=>?@[\\]^_`‘{|}~'

    # The context manager closes the file once its content has been read.
    with open(path, "r", encoding='UTF-8') as src:
        txt = src.read().lower()

    # One pass that maps each special character to a space.
    return txt.translate(str.maketrans(specials, " " * len(specials)))

def yingwen():
    """
    Count English word frequencies and print the most common words.

    Takes the normalised text from ``get_txt()``, splits it on whitespace,
    and counts every word longer than one character. Up to 5 of the most
    frequent words are printed, most frequent first, as ``word->count``.

    :return: None
    """
    counts = {}
    for word in get_txt().split():
        # Single-character tokens are not counted.
        if len(word) == 1:
            continue
        counts[word] = counts.get(word, 0) + 1

    # Sort by occurrence count, descending.
    items = sorted(counts.items(), key=lambda x: x[1], reverse=True)

    # Slicing instead of indexing avoids an IndexError when the text holds
    # fewer than 5 distinct words.
    for word, count in items[:5]:
        print("{0:<5}->{1:>5}".format(word, count))

来源：https://www.cnblogs.com/ywjfx/p/11003872.html

标签

jieba

中文分词

分词

python