Source code for abydos.distance._eudex

# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._eudex.

eudex distance functions
"""

from types import GeneratorType

from deprecation import deprecated

from ._distance import _Distance
from .. import __version__
from ..phonetic import eudex

__all__ = ['Eudex', 'dist_eudex', 'eudex_hamming', 'sim_eudex']


[docs]class Eudex(_Distance): """Distance between the Eudex hashes of two terms. Cf. :cite:`Ticki:2016`. .. versionadded:: 0.3.6 """
[docs] @staticmethod def gen_fibonacci(): """Yield the next Fibonacci number. Based on https://www.python-course.eu/generators.php Starts at Fibonacci number 3 (the second 1) Yields ------ int The next Fibonacci number .. versionadded:: 0.3.0 .. versionchanged:: 0.3.6 Encapsulated in class """ num_a, num_b = 1, 2 while True: yield num_a num_a, num_b = num_b, num_a + num_b
[docs] @staticmethod def gen_exponential(base=2): """Yield the next value in an exponential series of the base. Starts at base**0 Parameters ---------- base : int The base to exponentiate Yields ------ int The next power of `base` .. versionadded:: 0.3.0 .. versionchanged:: 0.3.6 Encapsulated in class """ exp = 0 while True: yield base ** exp exp += 1
def __init__(self, weights='exponential', max_length=8, **kwargs): """Initialize Eudex instance. Parameters ---------- weights : str, iterable, or generator function The weights or weights generator function - If set to ``None``, a simple Hamming distance is calculated. - If set to ``exponential``, weight decays by powers of 2, as proposed in the eudex specification: https://github.com/ticki/eudex. - If set to ``fibonacci``, weight decays through the Fibonacci series, as in the eudex reference implementation. - If set to a callable function, this assumes it creates a generator and the generator is used to populate a series of weights. - If set to an iterable, the iterable's values should be integers and will be used as the weights. In all cases, the weights should be ordered or generated from least significant to most significant, so larger values should generally come first. max_length : int The number of characters to encode as a eudex hash **kwargs Arbitrary keyword arguments .. versionadded:: 0.4.0 """ super(Eudex, self).__init__(**kwargs) self._weights = weights self._max_length = max_length
[docs] def dist_abs(self, src, tar, normalized=False): """Calculate the distance between the Eudex hashes of two terms. Parameters ---------- src : str Source string for comparison tar : str Target string for comparison normalized : bool Normalizes to [0, 1] if True Returns ------- int The Eudex Hamming distance Examples -------- >>> cmp = Eudex() >>> cmp.dist_abs('cat', 'hat') 128 >>> cmp.dist_abs('Niall', 'Neil') 2 >>> cmp.dist_abs('Colin', 'Cuilen') 10 >>> cmp.dist_abs('ATCG', 'TAGC') 403 >>> cmp = Eudex(weights='fibonacci') >>> cmp.dist_abs('cat', 'hat') 34 >>> cmp.dist_abs('Niall', 'Neil') 2 >>> cmp.dist_abs('Colin', 'Cuilen') 7 >>> cmp.dist_abs('ATCG', 'TAGC') 117 >>> cmp = Eudex(weights=None) >>> cmp.dist_abs('cat', 'hat') 1 >>> cmp.dist_abs('Niall', 'Neil') 1 >>> cmp.dist_abs('Colin', 'Cuilen') 2 >>> cmp.dist_abs('ATCG', 'TAGC') 9 >>> # Using the OEIS A000142: >>> cmp = Eudex(weights=[1, 1, 2, 6, 24, 120, 720, 5040]) >>> cmp.dist_abs('cat', 'hat') 5040 >>> cmp.dist_abs('Niall', 'Neil') 1 >>> cmp.dist_abs('Colin', 'Cuilen') 7 >>> cmp.dist_abs('ATCG', 'TAGC') 15130 .. versionadded:: 0.3.0 .. versionchanged:: 0.3.6 Encapsulated in class """ # Calculate the eudex hashes and XOR them xored = eudex(src, max_length=self._max_length) ^ eudex( tar, max_length=self._max_length ) # Simple hamming distance (all bits are equal) if not self._weights: binary = bin(xored) distance = binary.count('1') if normalized: return distance / (len(binary) - 2) return distance # If weights is a function, it should create a generator, # which we now use to populate a list if callable(self._weights): weights = self._weights() elif self._weights == 'exponential': weights = Eudex.gen_exponential() elif self._weights == 'fibonacci': weights = Eudex.gen_fibonacci() elif hasattr(self._weights, '__iter__') and not isinstance( self._weights, str ): weights = self._weights[::-1] else: raise ValueError('Unrecognized weights value or type.') if isinstance(weights, GeneratorType): weights = [next(weights) for _ in range(self._max_length)][::-1] # Sum the weighted hamming distance distance = 0 max_distance = 0 while (xored or normalized) and weights: max_distance += 8 * weights[-1] distance += bin(xored & 0xFF).count('1') * weights.pop() xored >>= 8 if normalized: distance /= max_distance return distance
[docs] def dist(self, src, tar): """Return normalized distance between the Eudex hashes of two terms. This is Eudex distance normalized to [0, 1]. Parameters ---------- src : str Source string for comparison tar : str Target string for comparison Returns ------- int The normalized Eudex Hamming distance Examples -------- >>> cmp = Eudex() >>> round(cmp.dist('cat', 'hat'), 12) 0.062745098039 >>> round(cmp.dist('Niall', 'Neil'), 12) 0.000980392157 >>> round(cmp.dist('Colin', 'Cuilen'), 12) 0.004901960784 >>> round(cmp.dist('ATCG', 'TAGC'), 12) 0.197549019608 .. versionadded:: 0.3.0 .. versionchanged:: 0.3.6 Encapsulated in class """ return self.dist_abs(src, tar, True)
[docs]@deprecated( deprecated_in='0.4.0', removed_in='0.6.0', current_version=__version__, details='Use the Eudex.dist_abs method instead.', ) def eudex_hamming( src, tar, weights='exponential', max_length=8, normalized=False ): """Calculate the Hamming distance between the Eudex hashes of two terms. This is a wrapper for :py:meth:`Eudex.eudex_hamming`. Parameters ---------- src : str Source string for comparison tar : str Target string for comparison weights : str, iterable, or generator function The weights or weights generator function max_length : int The number of characters to encode as a eudex hash normalized : bool Normalizes to [0, 1] if True Returns ------- int The Eudex Hamming distance Examples -------- >>> eudex_hamming('cat', 'hat') 128 >>> eudex_hamming('Niall', 'Neil') 2 >>> eudex_hamming('Colin', 'Cuilen') 10 >>> eudex_hamming('ATCG', 'TAGC') 403 >>> eudex_hamming('cat', 'hat', weights='fibonacci') 34 >>> eudex_hamming('Niall', 'Neil', weights='fibonacci') 2 >>> eudex_hamming('Colin', 'Cuilen', weights='fibonacci') 7 >>> eudex_hamming('ATCG', 'TAGC', weights='fibonacci') 117 >>> eudex_hamming('cat', 'hat', weights=None) 1 >>> eudex_hamming('Niall', 'Neil', weights=None) 1 >>> eudex_hamming('Colin', 'Cuilen', weights=None) 2 >>> eudex_hamming('ATCG', 'TAGC', weights=None) 9 >>> # Using the OEIS A000142: >>> eudex_hamming('cat', 'hat', [1, 1, 2, 6, 24, 120, 720, 5040]) 5040 >>> eudex_hamming('Niall', 'Neil', [1, 1, 2, 6, 24, 120, 720, 5040]) 1 >>> eudex_hamming('Colin', 'Cuilen', [1, 1, 2, 6, 24, 120, 720, 5040]) 7 >>> eudex_hamming('ATCG', 'TAGC', [1, 1, 2, 6, 24, 120, 720, 5040]) 15130 .. versionadded:: 0.3.0 """ return Eudex(weights, max_length).dist_abs(src, tar, normalized)
[docs]@deprecated( deprecated_in='0.4.0', removed_in='0.6.0', current_version=__version__, details='Use the Eudex.dist method instead.', ) def dist_eudex(src, tar, weights='exponential', max_length=8): """Return normalized Hamming distance between Eudex hashes of two terms. This is a wrapper for :py:meth:`Eudex.dist`. Parameters ---------- src : str Source string for comparison tar : str Target string for comparison weights : str, iterable, or generator function The weights or weights generator function max_length : int The number of characters to encode as a eudex hash Returns ------- int The normalized Eudex Hamming distance Examples -------- >>> round(dist_eudex('cat', 'hat'), 12) 0.062745098039 >>> round(dist_eudex('Niall', 'Neil'), 12) 0.000980392157 >>> round(dist_eudex('Colin', 'Cuilen'), 12) 0.004901960784 >>> round(dist_eudex('ATCG', 'TAGC'), 12) 0.197549019608 .. versionadded:: 0.3.0 """ return Eudex(weights, max_length).dist(src, tar)
[docs]@deprecated( deprecated_in='0.4.0', removed_in='0.6.0', current_version=__version__, details='Use the Eudex.sim method instead.', ) def sim_eudex(src, tar, weights='exponential', max_length=8): """Return normalized Hamming similarity between Eudex hashes of two terms. This is a wrapper for :py:meth:`Eudex.sim`. Parameters ---------- src : str Source string for comparison tar : str Target string for comparison weights : str, iterable, or generator function The weights or weights generator function max_length : int The number of characters to encode as a eudex hash Returns ------- int The normalized Eudex Hamming similarity Examples -------- >>> round(sim_eudex('cat', 'hat'), 12) 0.937254901961 >>> round(sim_eudex('Niall', 'Neil'), 12) 0.999019607843 >>> round(sim_eudex('Colin', 'Cuilen'), 12) 0.995098039216 >>> round(sim_eudex('ATCG', 'TAGC'), 12) 0.802450980392 .. versionadded:: 0.3.0 """ return Eudex(weights, max_length).sim(src, tar)
if __name__ == '__main__': import doctest doctest.testmod()