First, let's go straight into the class, starting with the from_modelstring and from_modelpath methods.
@classmethod
def from_modelstring(cls, string, *args, **kwargs):
    b = base64.b64decode(string)   # decode the base64 model string back to raw bytes
    z = bz2.decompress(b)          # decompress the bz2 payload in one shot
    model = loads(z)               # unpickle the model tuple
    # Unpack the model; nb_ptc and nb_pc arrive as flat lists
    nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output = model
    nb_numfeats = int(len(nb_ptc) / len(nb_pc))   # number of features (an int)

    # reconstruct pc and ptc
    nb_pc = np.array(nb_pc)        # e.g. [1, 2, 3]
    # nb_numfeats is the number of rows, len(nb_pc) the number of columns, e.g.
    # nb_ptc = array([[0, 4, 3],
    #                 [2, 1, 5]])
    nb_ptc = np.array(nb_ptc).reshape(nb_numfeats, len(nb_pc))

    return cls(nb_ptc, nb_pc, nb_numfeats, nb_classes, tk_nextmove, tk_output,
               *args, **kwargs)

@classmethod
def from_modelpath(cls, path, *args, **kwargs):
    with open(path) as f:
        return cls.from_modelstring(f.read().encode(), *args, **kwargs)

def __init__(self, nb_ptc, nb_pc, nb_numfeats, nb_classes, tk_nextmove, tk_output,
             norm_probs=NORM_PROBS):
    self.nb_ptc = nb_ptc
    self.nb_pc = nb_pc
    self.nb_numfeats = nb_numfeats
    self.nb_classes = nb_classes
    self.tk_nextmove = tk_nextmove
    self.tk_output = tk_output   # the unpacked values are assigned directly

    if norm_probs:
        def norm_probs(pd):
            """
            Renormalize log-probs into a proper distribution (sum 1)
            The technique for dealing with underflow is described in
            http://jblevins.org/log/log-sum-exp
            """
            # Ignore overflow when computing the exponential. Large values
            # in the exp produce a result of inf, which does not affect
            # the correctness of the calculation (as 1/x -> 0 as x -> inf).
            # On Linux this does not actually trigger a warning, but on
            # Windows this causes a RuntimeWarning, so we explicitly
            # suppress it.
            with np.errstate(over='ignore'):
                pd = (1 / np.exp(pd[None, :] - pd[:, None]).sum(1))
            return pd
    else:
        def norm_probs(pd):
            return pd

    self.norm_probs = norm_probs

    # Maintain a reference to the full model, in case we change our language set
    # multiple times.
    self.__full_model = nb_ptc, nb_pc, nb_classes
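To see the storage format concretely, here is a small round-trip sketch. The toy model tuple below is invented purely for illustration (the real model string ships inside langid.py); it just shows that from_modelstring reverses three steps, base64 decode, bz2 decompress, unpickle, and then reshapes the flat nb_ptc list into a features-by-classes matrix.

import base64, bz2, pickle
import numpy as np

# A tiny made-up model tuple in the order langid expects:
# (nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output)
toy_model = ([0, 4, 3, 2, 1, 5],        # flat nb_ptc: 2 features x 3 classes
             [0.5, 0.3, 0.2],           # nb_pc, one prior per class
             ['en', 'zh', 'ja'],        # nb_classes
             [0] * 256,                 # tk_nextmove (stub)
             {})                        # tk_output (stub)

# Pack it the way the shipped model string is packed ...
packed = base64.b64encode(bz2.compress(pickle.dumps(toy_model)))

# ... and unpack it the way from_modelstring does.
nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output = pickle.loads(
    bz2.decompress(base64.b64decode(packed)))

nb_numfeats = int(len(nb_ptc) / len(nb_pc))          # 6 / 3 = 2
nb_ptc = np.array(nb_ptc).reshape(nb_numfeats, len(nb_pc))
print(nb_ptc)
# [[0 4 3]
#  [2 1 5]]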
These methods load the data from the author's pre-trained model. Next, look at set_languages:
def set_languages(self, langs=None):
    logger.debug("restricting languages to: %s", langs)

    # Unpack the full original model. This is needed in case the language set
    # has been previously trimmed, and the new set is not a subset of the current
    # set.
    nb_ptc, nb_pc, nb_classes = self.__full_model

    if langs is None:
        self.nb_classes = nb_classes
        self.nb_ptc = nb_ptc
        self.nb_pc = nb_pc
    else:
        # We were passed a restricted set of languages. Trim the arrays accordingly
        # to speed up processing.
        for lang in langs:
            if lang not in nb_classes:
                raise ValueError("Unknown language code %s" % lang)

        subset_mask = np.fromiter((l in langs for l in nb_classes), dtype=bool)
        self.nb_classes = [c for c in nb_classes if c in langs]
        self.nb_ptc = nb_ptc[:, subset_mask]
        self.nb_pc = nb_pc[subset_mask]
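A minimal numpy sketch of the trimming step, with made-up class names and array sizes: the boolean subset_mask keeps only the columns of nb_ptc and the entries of nb_pc that belong to the requested languages.

import numpy as np

nb_classes = ['en', 'zh', 'ja']                 # toy class list
nb_pc = np.array([0.5, 0.3, 0.2])               # toy per-class priors
nb_ptc = np.arange(6).reshape(2, 3)             # 2 features x 3 classes

langs = ['zh', 'en']
subset_mask = np.fromiter((l in langs for l in nb_classes), dtype=bool)
# array([ True,  True, False])

print([c for c in nb_classes if c in langs])    # ['en', 'zh']
print(nb_ptc[:, subset_mask])                   # keeps only the 'en' and 'zh' columns
print(nb_pc[subset_mask])                       # [0.5 0.3]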
This restricts detection to an explicit set of candidate languages, which also speeds up classification because the arrays are trimmed. For example (note that langs is passed as a list):

langid.set_languages(['zh', 'en'])
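A quick before/after check of the restriction; langid.classify returns a (language, score) tuple, and the exact score depends on the bundled model, so the comments are only indicative.

import langid

print(langid.classify("I do not speak English"))   # picks from the full language set
langid.set_languages(['zh', 'en'])
print(langid.classify("I do not speak English"))   # now only 'zh' or 'en' can be returned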
Next, step into instance2fv:
def instance2fv(self, text):
    """
    Map an instance into the feature space of the trained model.
    """
    if sys.version_info > (3, 0):
        # Python 3: bytes already iterate as integer byte values
        if isinstance(text, str):
            text = text.encode('utf8')
    else:
        # Python 2
        if isinstance(text, unicode):
            text = text.encode('utf8')
        # Convert the text to a sequence of ascii values, e.g. [125, 54, 74, 65]
        text = map(ord, text)

    arr = np.zeros((self.nb_numfeats,), dtype='uint32')   # one counter per feature

    # Count the number of times we enter each state
    state = 0
    statecount = defaultdict(int)   # missing states default to a count of 0
    for letter in text:
        state = self.tk_nextmove[(state << 8) + letter]
        statecount[state] += 1
    # statecount now maps DFA state ids (ints) to visit counts, e.g. {17: 3, 42: 4}

    # Update all the productions corresponding to the state
    for state in statecount:
        for index in self.tk_output.get(state, []):
            arr[index] += statecount[state]

    return arr
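To make the state-counting loop concrete, here is a toy sketch with hand-built tables. tk_nextmove and tk_output below are hypothetical stand-ins (the real ones are generated during training and are far larger, and tk_nextmove is a flat array rather than a dict); the lookup convention (state << 8) + byte and the two counting loops are the same as above.

from collections import defaultdict
import numpy as np

# Hypothetical toy tables: from state 0, byte 'a' leads to state 1;
# from state 1, byte 'b' leads to state 2. Unlisted transitions fall
# back to state 0 via the defaultdict.
tk_nextmove = defaultdict(int, {
    (0 << 8) + ord('a'): 1,
    (1 << 8) + ord('b'): 2,
})
tk_output = {1: [0], 2: [1]}   # state 1 fires feature 0, state 2 fires feature 1
nb_numfeats = 2

def toy_instance2fv(text):
    arr = np.zeros((nb_numfeats,), dtype='uint32')
    state = 0
    statecount = defaultdict(int)
    for letter in text.encode('utf8'):            # iterate over raw byte values
        state = tk_nextmove[(state << 8) + letter]
        statecount[state] += 1
    for state in statecount:
        for index in tk_output.get(state, []):
            arr[index] += statecount[state]
    return arr

print(toy_instance2fv("abab"))   # [1 1]: states 1 and 2 were each entered once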