
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._meta_levenshtein.

Meta-Levenshtein distance
"""

from collections import Counter
from math import log1p

from numpy import float64 as np_float
from numpy import zeros as np_zeros

from ._distance import _Distance
from ._jaro_winkler import JaroWinkler
from ..corpus import UnigramCorpus
from ..tokenizer import QGrams, WhitespaceTokenizer

__all__ = ['MetaLevenshtein']


class MetaLevenshtein(_Distance):
    r"""Meta-Levenshtein distance.

    Meta-Levenshtein distance :cite:`Moreau:2008` combines Soft-TFIDF with
    Levenshtein alignment.

    .. versionadded:: 0.4.0
    """

    def __init__(
        self, tokenizer=None, corpus=None, metric=None, normalizer=max, **kwargs
    ):
        """Initialize MetaLevenshtein instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        corpus : UnigramCorpus
            A unigram corpus :py:class:`UnigramCorpus`. If None, a corpus
            will be created from the two words when a similarity function is
            called.
        metric : _Distance
            A string distance measure class for making soft matches, by
            default Jaro-Winkler.
        normalizer : function
            A function that takes a list and computes a normalization term
            by which the edit distance is divided (max by default). Another
            good option is the sum function.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGrams
            tokenizer with this q value.


        .. versionadded:: 0.4.0

        """
        super(MetaLevenshtein, self).__init__(**kwargs)
        self._corpus = corpus
        self._metric = metric
        self._normalizer = normalizer

        qval = 2 if 'qval' not in self.params else self.params['qval']
        self.params['tokenizer'] = (
            tokenizer
            if tokenizer is not None
            else WhitespaceTokenizer()
            if qval == 0
            else QGrams(qval=qval, start_stop='$#', skip=0, scaler=None)
        )

        if self._metric is None:
            self._metric = JaroWinkler()
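
    # How dist_abs (below) weights tokens: each token t of a word gets the
    # Soft-TFIDF weight vw(t) = log1p(tf(t)) * idf(t), where tf(t) is t's
    # count within the word and idf(t) comes from the unigram corpus.
    # Aligning token a (from src) with token b (from tar) then costs
    # metric.dist(a, b) * vw_src(a) * vw_tar(b), so identical tokens align
    # for free and dissimilar, highly weighted tokens are costly to
    # substitute.  These costs replace the unit substitution cost in an
    # otherwise standard Levenshtein dynamic program.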

    def dist_abs(self, src, tar):
        """Return the Meta-Levenshtein distance of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Meta-Levenshtein distance

        Examples
        --------
        >>> cmp = MetaLevenshtein()
        >>> cmp.dist_abs('cat', 'hat')
        0.6155602628882225
        >>> cmp.dist_abs('Niall', 'Neil')
        2.538900657220556
        >>> cmp.dist_abs('aluminum', 'Catalan')
        6.940747163450747
        >>> cmp.dist_abs('ATCG', 'TAGC')
        3.2311205257764453


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 0.0
        if not src:
            return float(len(tar))
        if not tar:
            return float(len(src))

        src_tok = self.params['tokenizer'].tokenize(src)
        src_ordered = src_tok.get_list()
        src_tok = src_tok.get_counter()

        tar_tok = self.params['tokenizer'].tokenize(tar)
        tar_ordered = tar_tok.get_list()
        tar_tok = tar_tok.get_counter()

        if self._corpus is None:
            corpus = UnigramCorpus(word_tokenizer=self.params['tokenizer'])
            corpus.add_document(src)
            corpus.add_document(tar)
        else:
            corpus = self._corpus

        dists = Counter()
        s_toks = set(src_tok.keys())
        t_toks = set(tar_tok.keys())
        for s_tok in s_toks:
            for t_tok in t_toks:
                dists[(s_tok, t_tok)] = (
                    self._metric.dist(s_tok, t_tok) if s_tok != t_tok else 0
                )

        vws_dict = {}
        vwt_dict = {}
        for token in src_tok.keys():
            vws_dict[token] = log1p(src_tok[token]) * corpus.idf(token)
        for token in tar_tok.keys():
            vwt_dict[token] = log1p(tar_tok[token]) * corpus.idf(token)

        def _dist(s_tok, t_tok):
            return dists[(s_tok, t_tok)] * vws_dict[s_tok] * vwt_dict[t_tok]

        d_mat = np_zeros(
            (len(src_ordered) + 1, len(tar_ordered) + 1), dtype=np_float
        )
        for i in range(len(src_ordered) + 1):
            d_mat[i, 0] = i
        for j in range(len(tar_ordered) + 1):
            d_mat[0, j] = j

        for i in range(len(src_ordered)):
            for j in range(len(tar_ordered)):
                d_mat[i + 1, j + 1] = min(
                    d_mat[i + 1, j] + 1,  # ins
                    d_mat[i, j + 1] + 1,  # del
                    d_mat[i, j]
                    + _dist(src_ordered[i], tar_ordered[j]),  # sub/==
                )

        return d_mat[len(src_ordered), len(tar_ordered)]

    def dist(self, src, tar):
        """Return the normalized Meta-Levenshtein distance of two strings.

        The Meta-Levenshtein distance is normalized by dividing it by a
        normalization term. When a corpus is supplied, that term is the
        normalizer function (max, by default) applied to the distances
        between src and whitespace strings of the lengths of tar and src;
        otherwise it is the normalizer applied to the lengths of the two
        strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            The normalized Meta-Levenshtein distance between src & tar

        Examples
        --------
        >>> cmp = MetaLevenshtein()
        >>> round(cmp.dist('cat', 'hat'), 12)
        0.205186754296
        >>> round(cmp.dist('Niall', 'Neil'), 12)
        0.507780131444
        >>> cmp.dist('aluminum', 'Catalan')
        0.8675933954313434
        >>> cmp.dist('ATCG', 'TAGC')
        0.8077801314441113


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 0.0
        return self.dist_abs(src, tar) / (
            self._normalizer(
                [
                    self.dist_abs(src, ' ' * len(tar)),
                    self.dist_abs(src, ' ' * len(src)),
                ]
            )
            if self._corpus
            else self._normalizer([len(src), len(tar)])
        )
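

# A minimal usage sketch, not part of the original module: it exercises the
# constructor options documented above -- the default configuration, a custom
# q-gram length via ``qval``, a different soft-match metric, and ``sum`` in
# place of ``max`` as the normalizer.  ``_demo`` is a hypothetical helper
# name introduced only for illustration.
def _demo():
    # Imported lazily to avoid a circular import at module load time.
    from abydos.distance import Levenshtein

    # Default: bigram (qval=2) tokens, Jaro-Winkler soft matches,
    # max-normalization.
    cmp_default = MetaLevenshtein()
    print(cmp_default.dist_abs('Niall', 'Neil'))
    print(cmp_default.dist('Niall', 'Neil'))

    # Custom: trigram tokens, Levenshtein soft matches, sum-normalization.
    cmp_custom = MetaLevenshtein(qval=3, metric=Levenshtein(), normalizer=sum)
    print(cmp_custom.dist('Niall', 'Neil'))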


if __name__ == '__main__':
    import doctest

    doctest.testmod()