Source code for abydos.distance._eudex

# -*- coding: utf-8 -*-

# Copyright 2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance.eudex.

The distance.eudex module implements eudex distance functions.
"""

from __future__ import division, unicode_literals

from types import GeneratorType

from six.moves import range

from ..phonetic import eudex

__all__ = ['dist_eudex', 'eudex_hamming', 'sim_eudex']


[docs]def eudex_hamming( src, tar, weights='exponential', max_length=8, normalized=False ): """Calculate the Hamming distance between the Eudex hashes of two terms. Cf. :cite:`Ticki:2016`. - If weights is set to None, a simple Hamming distance is calculated. - If weights is set to 'exponential', weight decays by powers of 2, as proposed in the eudex specification: https://github.com/ticki/eudex. - If weights is set to 'fibonacci', weight decays through the Fibonacci series, as in the eudex reference implementation. - If weights is set to a callable function, this assumes it creates a generator and the generator is used to populate a series of weights. - If weights is set to an iterable, the iterable's values should be integers and will be used as the weights. :param str src: source string for comparison :param str tar: target string for comparison :param str, iterable, or generator function weights: the weights or weights generator function :param max_length: the number of characters to encode as a eudex hash :param bool normalized: normalizes to [0, 1] if True :returns: the Eudex Hamming distance :rtype: int >>> eudex_hamming('cat', 'hat') 128 >>> eudex_hamming('Niall', 'Neil') 2 >>> eudex_hamming('Colin', 'Cuilen') 10 >>> eudex_hamming('ATCG', 'TAGC') 403 >>> eudex_hamming('cat', 'hat', weights='fibonacci') 34 >>> eudex_hamming('Niall', 'Neil', weights='fibonacci') 2 >>> eudex_hamming('Colin', 'Cuilen', weights='fibonacci') 7 >>> eudex_hamming('ATCG', 'TAGC', weights='fibonacci') 117 >>> eudex_hamming('cat', 'hat', weights=None) 1 >>> eudex_hamming('Niall', 'Neil', weights=None) 1 >>> eudex_hamming('Colin', 'Cuilen', weights=None) 2 >>> eudex_hamming('ATCG', 'TAGC', weights=None) 9 >>> # Using the OEIS A000142: >>> eudex_hamming('cat', 'hat', [1, 1, 2, 6, 24, 120, 720, 5040]) 1 >>> eudex_hamming('Niall', 'Neil', [1, 1, 2, 6, 24, 120, 720, 5040]) 720 >>> eudex_hamming('Colin', 'Cuilen', [1, 1, 2, 6, 24, 120, 720, 5040]) 744 >>> eudex_hamming('ATCG', 'TAGC', [1, 1, 2, 6, 24, 120, 720, 5040]) 6243 """ def _gen_fibonacci(): """Yield the next Fibonacci number. Based on https://www.python-course.eu/generators.php Starts at Fibonacci number 3 (the second 1) :returns: the next Fibonacci number :rtype: int """ num_a, num_b = 1, 2 while True: yield num_a num_a, num_b = num_b, num_a + num_b def _gen_exponential(base=2): """Yield the next value in an exponential series of the base. Starts at base**0 :param int base: the base to exponentiate :returns: the next power of `base` :rtype: int """ exp = 0 while True: yield base ** exp exp += 1 # Calculate the eudex hashes and XOR them xored = eudex(src, max_length=max_length) ^ eudex( tar, max_length=max_length ) # Simple hamming distance (all bits are equal) if not weights: binary = bin(xored) distance = binary.count('1') if normalized: return distance / (len(binary) - 2) return distance # If weights is a function, it should create a generator, # which we now use to populate a list if callable(weights): weights = weights() elif weights == 'exponential': weights = _gen_exponential() elif weights == 'fibonacci': weights = _gen_fibonacci() if isinstance(weights, GeneratorType): weights = [next(weights) for _ in range(max_length)][::-1] # Sum the weighted hamming distance distance = 0 max_distance = 0 while (xored or normalized) and weights: max_distance += 8 * weights[-1] distance += bin(xored & 0xFF).count('1') * weights.pop() xored >>= 8 if normalized: distance /= max_distance return distance
[docs]def dist_eudex(src, tar, weights='exponential', max_length=8): """Return normalized Hamming distance between Eudex hashes of two terms. This is Eudex distance normalized to [0, 1]. :param str src: source string for comparison :param str tar: target string for comparison :param str, iterable, or generator function weights: the weights or weights generator function :param max_length: the number of characters to encode as a eudex hash :returns: the normalized Eudex distance :rtype: float >>> round(dist_eudex('cat', 'hat'), 12) 0.062745098039 >>> round(dist_eudex('Niall', 'Neil'), 12) 0.000980392157 >>> round(dist_eudex('Colin', 'Cuilen'), 12) 0.004901960784 >>> round(dist_eudex('ATCG', 'TAGC'), 12) 0.197549019608 """ return eudex_hamming(src, tar, weights, max_length, True)
[docs]def sim_eudex(src, tar, weights='exponential', max_length=8): """Return normalized Hamming similarity between Eudex hashes of two terms. Normalized Eudex similarity is the complement of normalized Eudex distance: :math:`sim_{Eudex} = 1 - dist_{Eudex}`. :param str src: source string for comparison :param str tar: target string for comparison :param str, iterable, or generator function weights: the weights or weights generator function :param max_length: the number of characters to encode as a eudex hash :returns: the normalized Eudex similarity :rtype: float >>> round(sim_eudex('cat', 'hat'), 12) 0.937254901961 >>> round(sim_eudex('Niall', 'Neil'), 12) 0.999019607843 >>> round(sim_eudex('Colin', 'Cuilen'), 12) 0.995098039216 >>> round(sim_eudex('ATCG', 'TAGC'), 12) 0.802450980392 """ return 1 - dist_eudex(src, tar, weights, max_length)
if __name__ == '__main__': import doctest doctest.testmod()