# Word segmentation by simulated annealing.
#
# A segmentation of `text` is encoded as a string of '0'/'1' flags (`segs`):
# a '1' at index i means "a word ends after text[i]".

from random import randint


def segment(text, segs):
    """Split `text` at the positions flagged in `segs`.

    The same routine can be used for sentence or word segmentation.

    Args:
        text: the unsegmented string.
        segs: string of '0'/'1' flags; a '1' at index i places a boundary
            after text[i].

    Returns:
        List of substrings of `text` in order. The remainder after the last
        boundary is always appended, so the list is never empty.
    """
    words = []
    last = 0
    for i, flag in enumerate(segs):
        if flag == '1':
            words.append(text[last:i + 1])
            last = i + 1
    words.append(text[last:])
    return words


def evaluate(text, segs):
    """Score a segmentation; lower is better.

    The score is the number of words in the segmented text plus the length
    of the lexicon, where the lexicon is the distinct words joined by single
    spaces.
    """
    words = segment(text, segs)
    text_size = len(words)
    lexicon_size = len(' '.join(set(words)))
    return text_size + lexicon_size


def flip(segs, pos):
    """Return a copy of `segs` with the flag at `pos` inverted ('1' <-> '0')."""
    return segs[:pos] + str(1 - int(segs[pos])) + segs[pos + 1:]


def flip_n(segs, n):
    """Invert `n` randomly chosen positions of `segs` to form a new guess.

    Positions are drawn independently, so the same position may be picked
    (and flipped back) more than once.
    """
    for _ in range(n):
        segs = flip(segs, randint(0, len(segs) - 1))
    return segs


def anneal(text, segs, iterations, cooling_rate):
    """Search for a low-scoring segmentation of `text` by simulated annealing.

    Starting from `segs`, each temperature step tries `iterations` random
    perturbations (flipping round(temperature) flags) and keeps the
    best-scoring one. The temperature starts at len(segs) and is divided by
    `cooling_rate` after every step until it drops to 0.5 or below. The
    current score and segmentation are printed after every step.

    Args:
        text: the unsegmented string.
        segs: initial '0'/'1' flag string.
        iterations: number of guesses evaluated per temperature step.
        cooling_rate: divisor applied to the temperature each step; must be
            greater than 1 so that the temperature actually decreases.

    Returns:
        The best flag string found.

    Raises:
        ValueError: if `cooling_rate` is not greater than 1.
    """
    if cooling_rate <= 1:
        # Dividing by a value <= 1 never lowers the temperature, so the
        # loop below would not terminate.
        raise ValueError("cooling_rate must be greater than 1")

    temperature = float(len(segs))
    while temperature > 0.5:
        best_segs, best = segs, evaluate(text, segs)
        for _ in range(iterations):
            guess = flip_n(segs, int(round(temperature)))
            score = evaluate(text, guess)
            # Keep whichever of the guess and the current best scores lower.
            if score < best:
                best, best_segs = score, guess
        segs = best_segs
        temperature = temperature / cooling_rate
        # Single-argument print() emits the same text on Python 2 and 3.
        print("%d %s" % (evaluate(text, segs), segment(text, segs)))
    print("")
    return segs


# Example (the search is random, so the output differs from run to run):
#
# >>> text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
# >>> seg1 = "0000000000000001000000000010000000000000000100000000000"
# >>> anneal(text, seg1, 5000, 1.2)
# 60 ['doyouseetheki', 'tty', 'see', 'thedoggy', 'doyouliketh', 'ekittylike', 'thedoggy']
# 58 ['doy', 'ouseetheki', 'ttysee', 'thedoggy', 'doy', 'o', 'ulikethekittylike', 'thedoggy']
# 56 ['doyou', 'seetheki', 'ttysee', 'thedoggy', 'doyou', 'liketh', 'ekittylike', 'thedoggy']
# 54 ['doyou', 'seethekit', 'tysee', 'thedoggy', 'doyou', 'likethekittylike', 'thedoggy']
# 53 ['doyou', 'seethekit', 'tysee', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
# 51 ['doyou', 'seethekittysee', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
# 42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
# '0000100100000001001000000010000100010000000100010000000'
有了足够的数据,就可能以一个合理的准确度自动将文本分割成词汇。这种方法可用于为那些词的边界没有任何视觉表示的书写系统分词。
来源:CSDN
作者:weixin_42774642
链接:https://blog.csdn.net/weixin_42774642/article/details/103968329