Source code for abydos.distance._monge_elkan

# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._monge_elkan.

Monge-Elkan similarity & distance
"""

from deprecation import deprecated

from ._distance import _Distance
from ._levenshtein import Levenshtein, sim_levenshtein
from .. import __version__
from ..tokenizer import QGrams

__all__ = ['MongeElkan', 'dist_monge_elkan', 'sim_monge_elkan']


class MongeElkan(_Distance):
    """Monge-Elkan similarity.

    Monge-Elkan is defined in :cite:`Monge:1996`.

    Note: Monge-Elkan is NOT a symmetric similarity algorithm, so the
    similarity of src to tar is not necessarily equal to the similarity of
    tar to src. When the symmetric argument is True, both
    :math:`sim_{Monge-Elkan}(src, tar)` and
    :math:`sim_{Monge-Elkan}(tar, src)` are calculated and averaged, which
    yields a symmetric value at the cost of doubling the computation time.

    .. versionadded:: 0.3.6
    """

    def __init__(self, sim_func=None, symmetric=False, **kwargs):
        """Initialize MongeElkan instance.

        Parameters
        ----------
        sim_func : function
            The internal similarity metric to employ. A ``_Distance``
            instance is also accepted, in which case its ``sim`` method is
            used. When omitted, ``Levenshtein().sim`` is used.
        symmetric : bool
            Return a symmetric similarity measure
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.0

        """
        super(MongeElkan, self).__init__(**kwargs)

        # Normalise the metric to a plain two-argument callable.
        if sim_func is None:
            sim_func = Levenshtein().sim
        elif isinstance(sim_func, _Distance):
            sim_func = sim_func.sim

        self._sim_func = sim_func
        self._symmetric = symmetric

    def sim(self, src, tar):
        """Return the Monge-Elkan similarity of two strings.

        Both strings are tokenized with a default ``QGrams`` tokenizer. Each
        source token is scored against every target token with the internal
        similarity metric, and the best score per source token is averaged
        over the source tokens. In symmetric mode the same computation is
        repeated with the roles swapped, and the two results are averaged.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Monge-Elkan similarity; 1.0 when the strings are equal and 0.0
            when either string yields no tokens

        Examples
        --------
        >>> cmp = MongeElkan()
        >>> cmp.sim('cat', 'hat')
        0.75
        >>> round(cmp.sim('Niall', 'Neil'), 12)
        0.666666666667
        >>> round(cmp.sim('aluminum', 'Catalan'), 12)
        0.388888888889
        >>> cmp.sim('ATCG', 'TAGC')
        0.5


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        if src == tar:
            return 1.0

        src_grams = sorted(QGrams().tokenize(src).get_list())
        tar_grams = sorted(QGrams().tokenize(tar).get_list())

        if not (src_grams and tar_grams):
            return 0.0

        compare = self._sim_func

        def mean_best_match(outer, inner):
            # Average, over the outer tokens, of the best score each one
            # reaches against any inner token. Argument order matters
            # because the internal metric need not be symmetric.
            total = 0
            for left in outer:
                total += max(
                    float('-inf'),
                    *(compare(left, right) for right in inner)
                )
            return total / len(outer)

        score = mean_best_match(src_grams, tar_grams)

        if self._symmetric:
            reverse_score = mean_best_match(tar_grams, src_grams)
            score = (score + reverse_score) / 2

        return score
@deprecated(
    deprecated_in='0.4.0',
    removed_in='0.6.0',
    current_version=__version__,
    details='Use the MongeElkan.sim method instead.',
)
def sim_monge_elkan(src, tar, sim_func=sim_levenshtein, symmetric=False):
    """Return the Monge-Elkan similarity of two strings.

    Deprecated convenience wrapper: builds a :py:class:`MongeElkan` from
    the given options and delegates to :py:meth:`MongeElkan.sim`.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison
    sim_func : function
        The internal similarity metric to employ
    symmetric : bool
        Return a symmetric similarity measure

    Returns
    -------
    float
        Monge-Elkan similarity

    Examples
    --------
    >>> sim_monge_elkan('cat', 'hat')
    0.75
    >>> round(sim_monge_elkan('Niall', 'Neil'), 12)
    0.666666666667
    >>> round(sim_monge_elkan('aluminum', 'Catalan'), 12)
    0.388888888889
    >>> sim_monge_elkan('ATCG', 'TAGC')
    0.5

    .. versionadded:: 0.1.0

    """
    measure = MongeElkan(sim_func=sim_func, symmetric=symmetric)
    return measure.sim(src, tar)
@deprecated(
    deprecated_in='0.4.0',
    removed_in='0.6.0',
    current_version=__version__,
    details='Use the MongeElkan.dist method instead.',
)
def dist_monge_elkan(src, tar, sim_func=sim_levenshtein, symmetric=False):
    """Return the Monge-Elkan distance between two strings.

    Deprecated convenience wrapper: builds a :py:class:`MongeElkan` from
    the given options and delegates to :py:meth:`MongeElkan.dist`.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison
    sim_func : function
        The internal similarity metric to employ
    symmetric : bool
        Return a symmetric similarity measure

    Returns
    -------
    float
        Monge-Elkan distance

    Examples
    --------
    >>> dist_monge_elkan('cat', 'hat')
    0.25
    >>> round(dist_monge_elkan('Niall', 'Neil'), 12)
    0.333333333333
    >>> round(dist_monge_elkan('aluminum', 'Catalan'), 12)
    0.611111111111
    >>> dist_monge_elkan('ATCG', 'TAGC')
    0.5

    .. versionadded:: 0.1.0

    """
    measure = MongeElkan(sim_func=sim_func, symmetric=symmetric)
    return measure.dist(src, tar)
if __name__ == '__main__':
    # Executed as a script: run the doctest examples embedded in this
    # module's docstrings.
    from doctest import testmod

    testmod()