BERT使用记录/KenLM避坑

使用 BERT 生成词向量：

##### 运行此脚本 
# Directory holding the pretrained checkpoint files used below
# (vocab.txt, bert_config.json, bert_model.ckpt).
# NOTE: a shell assignment must not have spaces around '='.
export BERT_BASE_DIR=./chinese_L-12_H-768_A-12
# Directory holding the input text; the JSON output is written here too.
export Data_Dir=./data

# Continuation lines are indented with plain ASCII spaces: a full-width
# (U+3000) space is not shell whitespace and would become part of the flag.
python bert-master/extract_features.py \
  --input_file="$Data_Dir/train_ch.txt" \
  --output_file="$Data_Dir/output.json" \
  --vocab_file="$BERT_BASE_DIR/vocab.txt" \
  --bert_config_file="$BERT_BASE_DIR/bert_config.json" \
  --init_checkpoint="$BERT_BASE_DIR/bert_model.ckpt" \
  --layers=-1,-2,-3,-4 \
  --max_seq_length=128 \
  --batch_size=8

结果文件是这种形式：

 {"linex_index": 0, "features":[{"token": "[CLS]", "layers": [{"index": -1, "values":[-0.919886, 0.656876, -0.58464654]}]}]}

解码代码：

import re
import json

# Path of the JSON-lines feature file to read; passed to fun() as file1.
# Left empty here as a placeholder: fill it in before running.
src = ''
# Path of the text file to write; passed to fun() as file2.
# Left empty here as a placeholder: fill it in before running.
tgt = ''

def fun(file1,file2):
    """Pull one vector out of every JSON line of file1 and write it to file2.

    Each non-blank line of file1 is parsed as a JSON object. The value at
    ``obj["features"][1]["layers"][1]["values"]`` is written to file2 as
    ``str(values)`` followed by an empty line. A progress message is printed
    after every 1000 processed lines.

    Args:
        file1: Path of the UTF-8 JSON-lines input file.
        file2: Path of the UTF-8 output file; it is overwritten.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        IndexError: If a record has fewer than two entries under
            "features", or the selected entry has fewer than two "layers".
        TypeError / AttributeError: If an expected key is missing.
    """
    with open(file1,'r',encoding='utf-8') as fl1, \
            open(file2,'w',encoding='utf-8') as fl2:
        k = 0
        # Iterate the file lazily; the feature dump can be very large, so
        # avoid materialising every line with readlines().
        for line in fl1:
            # A blank line (e.g. a trailing newline) would make json.loads fail.
            if not line.strip():
                continue
            k += 1
            record = json.loads(line)
            # Index 1 selects the second entry of "features", i.e. it skips
            # whatever sits at index 0.
            # NOTE(review): index 0 is presumably the "[CLS]" token -- confirm
            # that skipping it is intended.
            token_entry = record.get('features')[1]
            # Index 1 selects the second entry of "layers".
            # NOTE(review): which model layer this is depends on the order
            # given to --layers when the file was produced -- confirm.
            layer_entry = token_entry.get('layers')[1]
            values = layer_entry.get("values")
            # The extra newline separates vectors for readability.
            fl2.write(str(values)+'\n'+'\n')
            if k%1000==0:
                print("Done"+' '+str(k))

# Run the extraction using the paths configured in src and tgt.
fun(src,tgt)