import codecs
import json
import os
import re
from random import choice

import numpy as np
import pandas as pd

from keras_bert import load_trained_model_from_checkpoint, Tokenizer
# Path to the BERT vocabulary file (Google's Chinese BERT-Base checkpoint layout).
dict_path = './bert/chinese_L-12_H-768_A-12/vocab.txt'

# Build token -> id mapping; ids follow vocab.txt line order, as keras_bert expects.
token_dict = {}
with codecs.open(dict_path, 'r', 'utf8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)
class OurTokenizer(Tokenizer):
    """Character-level tokenizer for Chinese BERT.

    Overrides keras_bert's Tokenizer so that every character becomes its
    own token; whitespace and out-of-vocabulary characters are mapped to
    dedicated placeholder tokens instead of being dropped.
    """

    def _tokenize(self, text):
        """Split *text* into one token per character.

        Returns a list of tokens where in-vocabulary characters pass
        through unchanged, whitespace maps to '[unused1]', and any other
        character maps to '[UNK]'.
        """
        R = []
        for c in text:
            if c in self._token_dict:
                R.append(c)
            elif self._is_space(c):
                # Spaces are represented by the untrained [unused1] token
                # so sequence alignment with the raw text is preserved.
                R.append('[unused1]')
            else:
                # Any remaining character is out-of-vocabulary.
                R.append('[UNK]')
        return R
# Checking if Disqus is accessible...  (page-scrape artifact, not code)