Source code for abydos.distance._meta_levenshtein
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._meta_levenshtein.
Meta-Levenshtein distance
"""
from collections import Counter
from math import log1p
from numpy import float as np_float
from numpy import zeros as np_zeros
from ._distance import _Distance
from ._jaro_winkler import JaroWinkler
from ..corpus import UnigramCorpus
from ..tokenizer import QGrams, WhitespaceTokenizer
__all__ = ['MetaLevenshtein']
[docs]class MetaLevenshtein(_Distance):
r"""Meta-Levenshtein distance.
Meta-Levenshtein distance :cite:`Moreau:2008` combines Soft-TFIDF with
Levenshtein alignment.
.. versionadded:: 0.4.0
"""
def __init__(
self,
tokenizer=None,
corpus=None,
metric=None,
normalizer=max,
**kwargs
):
"""Initialize MetaLevenshtein instance.
Parameters
----------
tokenizer : _Tokenizer
A tokenizer instance from the :py:mod:`abydos.tokenizer` package
corpus : UnigramCorpus
A unigram corpus :py:class:`UnigramCorpus`. If None, a corpus will
be created from the two words when a similarity function is called.
metric : _Distance
A string distance measure class for making soft matches, by default
Jaro-Winkler.
normalizer : function
A function that takes an list and computes a normalization term
by which the edit distance is divided (max by default). Another
good option is the sum function.
**kwargs
Arbitrary keyword arguments
Other Parameters
----------------
qval : int
The length of each q-gram. Using this parameter and tokenizer=None
will cause the instance to use the QGram tokenizer with this
q value.
.. versionadded:: 0.4.0
"""
super(MetaLevenshtein, self).__init__(**kwargs)
self._corpus = corpus
self._metric = metric
self._normalizer = normalizer
qval = 2 if 'qval' not in self.params else self.params['qval']
self.params['tokenizer'] = (
tokenizer
if tokenizer is not None
else WhitespaceTokenizer()
if qval == 0
else QGrams(qval=qval, start_stop='$#', skip=0, scaler=None)
)
if self._metric is None:
self._metric = JaroWinkler()
[docs] def dist_abs(self, src, tar):
"""Return the Meta-Levenshtein distance of two strings.
Parameters
----------
src : str
Source string for comparison
tar : str
Target string for comparison
Returns
-------
float
Meta-Levenshtein distance
Examples
--------
>>> cmp = MetaLevenshtein()
>>> cmp.dist_abs('cat', 'hat')
0.6155602628882225
>>> cmp.dist_abs('Niall', 'Neil')
2.538900657220556
>>> cmp.dist_abs('aluminum', 'Catalan')
6.940747163450747
>>> cmp.dist_abs('ATCG', 'TAGC')
3.2311205257764453
.. versionadded:: 0.4.0
"""
if src == tar:
return 0.0
if not src:
return float(len(tar))
if not tar:
return float(len(src))
src_tok = self.params['tokenizer'].tokenize(src)
src_ordered = src_tok.get_list()
src_tok = src_tok.get_counter()
tar_tok = self.params['tokenizer'].tokenize(tar)
tar_ordered = tar_tok.get_list()
tar_tok = tar_tok.get_counter()
if self._corpus is None:
corpus = UnigramCorpus(word_tokenizer=self.params['tokenizer'])
corpus.add_document(src)
corpus.add_document(tar)
else:
corpus = self._corpus
dists = Counter()
s_toks = set(src_tok.keys())
t_toks = set(tar_tok.keys())
for s_tok in s_toks:
for t_tok in t_toks:
dists[(s_tok, t_tok)] = (
self._metric.dist(s_tok, t_tok) if s_tok != t_tok else 0
)
vws_dict = {}
vwt_dict = {}
for token in src_tok.keys():
vws_dict[token] = log1p(src_tok[token]) * corpus.idf(token)
for token in tar_tok.keys():
vwt_dict[token] = log1p(tar_tok[token]) * corpus.idf(token)
def _dist(s_tok, t_tok):
return dists[(s_tok, t_tok)] * vws_dict[s_tok] * vwt_dict[t_tok]
d_mat = np_zeros(
(len(src_ordered) + 1, len(tar_ordered) + 1), dtype=np_float
)
for i in range(len(src_ordered) + 1):
d_mat[i, 0] = i
for j in range(len(tar_ordered) + 1):
d_mat[0, j] = j
for i in range(len(src_ordered)):
for j in range(len(tar_ordered)):
d_mat[i + 1, j + 1] = min(
d_mat[i + 1, j] + 1, # ins
d_mat[i, j + 1] + 1, # del
d_mat[i, j]
+ _dist(src_ordered[i], tar_ordered[j]), # sub/==
)
return d_mat[len(src_ordered), len(tar_ordered)]
[docs] def dist(self, src, tar):
"""Return the normalized Levenshtein distance between two strings.
The Levenshtein distance is normalized by dividing the Levenshtein
distance (calculated by any of the three supported methods) by the
greater of the number of characters in src times the cost of a delete
and the number of characters in tar times the cost of an insert.
For the case in which all operations have :math:`cost = 1`, this is
equivalent to the greater of the length of the two strings src & tar.
Parameters
----------
src : str
Source string for comparison
tar : str
Target string for comparison
Returns
-------
float
The normalized Levenshtein distance between src & tar
Examples
--------
>>> cmp = MetaLevenshtein()
>>> round(cmp.dist('cat', 'hat'), 12)
0.205186754296
>>> round(cmp.dist('Niall', 'Neil'), 12)
0.507780131444
>>> cmp.dist('aluminum', 'Catalan')
0.8675933954313434
>>> cmp.dist('ATCG', 'TAGC')
0.8077801314441113
.. versionadded:: 0.1.0
.. versionchanged:: 0.3.6
Encapsulated in class
"""
if src == tar:
return 0.0
return self.dist_abs(src, tar) / (
self._normalizer(
[
self.dist_abs(src, ' ' * len(tar)),
self.dist_abs(src, ' ' * len(src)),
]
)
if self._corpus
else self._normalizer([len(src), len(tar)])
)
if __name__ == '__main__':
import doctest
doctest.testmod()