频道栏目
首页 > 资讯 > Python > 正文

python 中langid的阅读。

17-08-01        来源:[db:作者]  
收藏   我要投稿

langid的源码结构

在这里看到功能跟方法

首先直接看 class 中的 from_modelstring、from_modelpath 两个方法

@classmethod
def from_modelstring(cls, string, *args, **kwargs):
    """
    Build an identifier instance from an in-memory model string.

    The model string is base64-encoded, bz2-compressed, pickled data.
    Any extra *args/**kwargs are forwarded to the class constructor.

    NOTE(review): the decoded payload is unpickled below -- only load
    model strings from a trusted source, since pickle can execute
    arbitrary code during deserialization.
    """
    # Undo the packaging layers: base64 -> bz2 -> pickle.
    b = base64.b64decode(string)
    z = bz2.decompress(b)
    model = loads(z)

    # The model is a 5-tuple; nb_ptc and nb_pc arrive as flat lists.
    nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output = model
    nb_numfeats = int(len(nb_ptc) / len(nb_pc))

    # Reconstruct pc and ptc as numpy arrays; ptc is reshaped to
    # (num_features, num_classes), e.g. array([[0, 4, 3],
    #                                          [2, 1, 5]])
    nb_pc = np.array(nb_pc)
    nb_ptc = np.array(nb_ptc).reshape(nb_numfeats, len(nb_pc))

    return cls(nb_ptc, nb_pc, nb_numfeats, nb_classes, tk_nextmove, tk_output, *args, **kwargs)

    @classmethod
    def from_modelpath(cls, path, *args, **kwargs):
        """
        Build an identifier instance from a model file on disk.

        The file holds the base64 text form of the model; it is read as
        text, re-encoded to bytes, and handed to from_modelstring.
        """
        with open(path) as model_file:
            contents = model_file.read()
        return cls.from_modelstring(contents.encode(), *args, **kwargs)

    def __init__(self, nb_ptc, nb_pc, nb_numfeats, nb_classes, tk_nextmove, tk_output,
                 norm_probs=NORM_PROBS):
        """
        Store the model components and pick a probability-normalisation
        strategy.  When ``norm_probs`` is truthy, per-instance log-probs
        are renormalised into a proper distribution; otherwise they are
        passed through untouched.
        """
        self.nb_ptc = nb_ptc
        self.nb_pc = nb_pc
        self.nb_numfeats = nb_numfeats
        self.nb_classes = nb_classes
        self.tk_nextmove = tk_nextmove
        self.tk_output = tk_output

        if not norm_probs:
            # Identity: hand back the raw values unchanged.
            def norm_probs(pd):
                return pd
        else:
            def norm_probs(pd):
                """
                Renormalize log-probs into a proper distribution (sum 1)
                The technique for dealing with underflow is described in
                http://jblevins.org/log/log-sum-exp
                """
                # Large values inside exp() overflow to inf, which does not
                # hurt the result (1/x -> 0 as x -> inf).  Linux stays quiet
                # but Windows raises a RuntimeWarning, so suppress it.
                with np.errstate(over='ignore'):
                    return 1 / np.exp(pd[None, :] - pd[:, None]).sum(1)

        self.norm_probs = norm_probs

        # Keep the untrimmed model around so the language set can be
        # restored or re-trimmed any number of times later.
        self.__full_model = nb_ptc, nb_pc, nb_classes

从作者训练的模型中导入数据,然后看set_languages

    def set_languages(self, langs=None):
        """
        Restrict identification to ``langs`` (an iterable of language
        codes), or restore the full language set when ``langs`` is None.

        Raises ValueError for any code absent from the trained model.
        """
        logger.debug("restricting languages to: %s", langs)

        # Always start from the untrimmed model: the current arrays may
        # already be a subset, and the newly requested set need not be
        # contained in it.
        nb_ptc, nb_pc, nb_classes = self.__full_model

        if langs is None:
            # No restriction requested: reinstate the complete model.
            self.nb_classes = nb_classes
            self.nb_ptc = nb_ptc
            self.nb_pc = nb_pc
            return

        # Validate every code before touching any state.
        for lang in langs:
            if lang not in nb_classes:
                raise ValueError("Unknown language code %s" % lang)

        # Trim the arrays to the requested languages to speed up processing.
        subset_mask = np.fromiter((l in langs for l in nb_classes), dtype=bool)
        self.nb_classes = [c for c in nb_classes if c in langs]
        self.nb_ptc = nb_ptc[:, subset_mask]
        self.nb_pc = nb_pc[subset_mask]

设置可选语言检测语句
例如:

langid.set_languages(['zh', 'en'])

再进入instance2fv

    def instance2fv(self, text):
        """
        Map an instance into the feature space of the trained model.

        Returns a uint32 numpy array of length nb_numfeats holding the
        occurrence count of each feature in ``text``.
        """
        if (sys.version_info > (3, 0)):
            # Python3: work on utf-8 bytes (iterating bytes yields ints).
            if isinstance(text, str):
                text = text.encode('utf8')
        else:
            # Python2: encode unicode, then turn the byte string into a
            # sequence of ordinals, e.g. [125, 54, 74, 65].
            if isinstance(text, unicode):
                text = text.encode('utf8')
            text = map(ord, text)

        feature_counts = np.zeros((self.nb_numfeats,), dtype='uint32')

        # Walk the tokenizer automaton over the bytes, counting how many
        # times each state is entered (missing states default to 0).
        state = 0
        statecount = defaultdict(int)
        for byte in text:
            state = self.tk_nextmove[(state << 8) + byte]
            statecount[state] += 1

        # Credit every feature produced by each visited state.
        for state, count in statecount.items():
            for feat_index in self.tk_output.get(state, []):
                feature_counts[feat_index] += count

        return feature_counts
相关TAG标签
上一篇:css3新增的属性
下一篇:SSO单点登录模型
相关文章
图文推荐

关于我们 | 联系我们 | 广告服务 | 投资合作 | 版权申明 | 在线帮助 | 网站地图 | 作品发布 | Vip技术培训 | 举报中心

版权所有: 红黑联盟--致力于做实用的IT技术学习网站