jieba库

jieba库是一款优秀的 Python 第三方中文分词库。jieba 支持四种分词模式：精确模式、全模式、搜索引擎模式，以及新增的 paddle 模式。

精确模式：试图将语句最精确地切分，不存在冗余数据，适合文本分析。

全模式：将语句中所有可能是词的词语都切分出来，速度很快，但是存在冗余数据，不能消除歧义。

搜索引擎模式：在精确模式的基础上，对长词再次进行切分，提高召回率。

paddle模式：利用 PaddlePaddle 深度学习框架，训练序列标注（双向GRU）网络模型实现分词。

更多详情请查看：https://github.com/fxsjy/jieba

常用示例

import jieba
import jieba.posseg
import jieba.analyse


'''模式分词'''
# Mode-based segmentation with jieba.cut:
#   cut_all=True  -> full mode
#   cut_all=False -> accurate mode (the default)
# jieba.lcut(sentence, cut_all=False) takes the same arguments but returns
# a list.
sentence = "我想去拉斯维加斯"
for token in jieba.cut(sentence, cut_all=False):
    print(token)
print("----------------")

'''搜索引擎分词'''
# Search-engine mode segmentation.
# jieba.lcut_for_search(sentence) is the list-returning variant.
sentence = "我想去拉斯维加斯"
search_tokens = jieba.cut_for_search(sentence)
for token in search_tokens:
    print(token)
print("----------------")


'''词性查询'''
# Part-of-speech lookup: each result carries a word and its POS flag.
sentence = "我想去拉斯维加斯"

# jieba's default mode.
for word, flag in jieba.posseg.cut(sentence):
    print(word + "--------" + flag)

# Paddle mode must be enabled first. It is supported from version 0.40
# onward; earlier versions do not have it.
jieba.enable_paddle()
paddle_pairs = jieba.posseg.cut(sentence, use_paddle=True)
for pair in paddle_pairs:
    print('%s %s' % (pair.word, pair.flag))


'''导入自建字典（大量词语添加）'''
# Load a user-built dictionary file (suited to adding many words at once).
# The path is specific to one machine, so a missing file is reported and
# skipped rather than aborting the remaining examples.
userdict_path = 'D:\\mxx\\doc\\金庸武功招式.txt'
try:
    jieba.load_userdict(userdict_path)
except FileNotFoundError:
    print("user dictionary not found, skipping: " + userdict_path)

'''添加词语到字典（少量词语添加）'''
# Add a single word to the dictionary (suited to adding a few words),
# then segment a sentence that contains it.
jieba.add_word("云计算")
sentence = "我要学习云计算技术"
for token in jieba.cut(sentence):
    print(token)

'''提取文本中词频比较高的关键词'''
# Keyword extraction based on the TF-IDF algorithm.
# withWeight=True makes each result unpack into a keyword and its weight.
sentence = "我想去拉斯维加斯"
tfidf_tags = jieba.analyse.extract_tags(sentence, topK=1, withWeight=True)
for keyword, weight in tfidf_tags:
    print(keyword, weight)

# Keyword extraction based on the TextRank algorithm, restricted to the
# listed part-of-speech flags.
sentence = "我想去拉斯维加斯"
allowed_pos = ('ns', 'n', 'vn', 'v')
textrank_tags = jieba.analyse.textrank(
    sentence, topK=20, withWeight=True, allowPOS=allowed_pos
)
for keyword, weight in textrank_tags:
    print(keyword, weight)


'''返回词语在原文的起止位置'''
# Print each word together with its start and end position in the original
# text.
for word, start, end in jieba.tokenize(u'我想去拉斯维加斯'):
    print("word: %s\t\t start: %d \t\t end:%d" % (word, start, end))