Source code for abydos.distance._hamming

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._hamming.

Hamming distance
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from deprecation import deprecated

from ._distance import _Distance
from .. import __version__

__all__ = ['Hamming', 'dist_hamming', 'hamming', 'sim_hamming']


[docs]class Hamming(_Distance): """Hamming distance. Hamming distance :cite:`Hamming:1950` equals the number of character positions at which two strings differ. For strings of unequal lengths, it is not normally defined. By default, this implementation calculates the Hamming distance of the first n characters where n is the lesser of the two strings' lengths and adds to this the difference in string lengths. .. versionadded:: 0.3.6 """ def __init__(self, diff_lens=True, **kwargs): """Initialize Hamming instance. Parameters ---------- diff_lens : bool If True (default), this returns the Hamming distance for those characters that have a matching character in both strings plus the difference in the strings' lengths. This is equivalent to extending the shorter string with obligatorily non-matching characters. If False, an exception is raised in the case of strings of unequal lengths. **kwargs Arbitrary keyword arguments .. versionadded:: 0.4.0 """ super(Hamming, self).__init__(**kwargs) self._diff_lens = diff_lens
[docs] def dist_abs(self, src, tar): """Return the Hamming distance between two strings. Parameters ---------- src : str Source string for comparison tar : str Target string for comparison Returns ------- int The Hamming distance between src & tar Raises ------ ValueError Undefined for sequences of unequal length; set diff_lens to True for Hamming distance between strings of unequal lengths. Examples -------- >>> cmp = Hamming() >>> cmp.dist_abs('cat', 'hat') 1 >>> cmp.dist_abs('Niall', 'Neil') 3 >>> cmp.dist_abs('aluminum', 'Catalan') 8 >>> cmp.dist_abs('ATCG', 'TAGC') 4 .. versionadded:: 0.1.0 .. versionchanged:: 0.3.6 Encapsulated in class """ if not self._diff_lens and len(src) != len(tar): raise ValueError( 'Undefined for sequences of unequal length; set diff_lens ' + 'to True for Hamming distance between strings of unequal ' + 'lengths.' ) hdist = 0 if self._diff_lens: hdist += abs(len(src) - len(tar)) hdist += sum(c1 != c2 for c1, c2 in zip(src, tar)) return hdist
[docs] def dist(self, src, tar): """Return the normalized Hamming distance between two strings. Hamming distance normalized to the interval [0, 1]. The Hamming distance is normalized by dividing it by the greater of the number of characters in src & tar (unless diff_lens is set to False, in which case an exception is raised). The arguments are identical to those of the hamming() function. Parameters ---------- src : str Source string for comparison tar : str Target string for comparison Returns ------- float Normalized Hamming distance Examples -------- >>> cmp = Hamming() >>> round(cmp.dist('cat', 'hat'), 12) 0.333333333333 >>> cmp.dist('Niall', 'Neil') 0.6 >>> cmp.dist('aluminum', 'Catalan') 1.0 >>> cmp.dist('ATCG', 'TAGC') 1.0 .. versionadded:: 0.1.0 .. versionchanged:: 0.3.6 Encapsulated in class """ if src == tar: return 0.0 return self.dist_abs(src, tar) / max(len(src), len(tar))
[docs]@deprecated( deprecated_in='0.4.0', removed_in='0.6.0', current_version=__version__, details='Use the Hamming.dist_abs method instead.', ) def hamming(src, tar, diff_lens=True): """Return the Hamming distance between two strings. This is a wrapper for :py:meth:`Hamming.dist_abs`. Parameters ---------- src : str Source string for comparison tar : str Target string for comparison diff_lens : bool If True (default), this returns the Hamming distance for those characters that have a matching character in both strings plus the difference in the strings' lengths. This is equivalent to extending the shorter string with obligatorily non-matching characters. If False, an exception is raised in the case of strings of unequal lengths. Returns ------- int The Hamming distance between src & tar Examples -------- >>> hamming('cat', 'hat') 1 >>> hamming('Niall', 'Neil') 3 >>> hamming('aluminum', 'Catalan') 8 >>> hamming('ATCG', 'TAGC') 4 .. versionadded:: 0.1.0 """ return Hamming(diff_lens).dist_abs(src, tar)
[docs]@deprecated( deprecated_in='0.4.0', removed_in='0.6.0', current_version=__version__, details='Use the Hamming.dist method instead.', ) def dist_hamming(src, tar, diff_lens=True): """Return the normalized Hamming distance between two strings. This is a wrapper for :py:meth:`Hamming.dist`. Parameters ---------- src : str Source string for comparison tar : str Target string for comparison diff_lens : bool If True (default), this returns the Hamming distance for those characters that have a matching character in both strings plus the difference in the strings' lengths. This is equivalent to extending the shorter string with obligatorily non-matching characters. If False, an exception is raised in the case of strings of unequal lengths. Returns ------- float The normalized Hamming distance Examples -------- >>> round(dist_hamming('cat', 'hat'), 12) 0.333333333333 >>> dist_hamming('Niall', 'Neil') 0.6 >>> dist_hamming('aluminum', 'Catalan') 1.0 >>> dist_hamming('ATCG', 'TAGC') 1.0 .. versionadded:: 0.1.0 """ return Hamming(diff_lens).dist(src, tar)
[docs]@deprecated( deprecated_in='0.4.0', removed_in='0.6.0', current_version=__version__, details='Use the Hamming.sim method instead.', ) def sim_hamming(src, tar, diff_lens=True): """Return the normalized Hamming similarity of two strings. This is a wrapper for :py:meth:`Hamming.sim`. Parameters ---------- src : str Source string for comparison tar : str Target string for comparison diff_lens : bool If True (default), this returns the Hamming distance for those characters that have a matching character in both strings plus the difference in the strings' lengths. This is equivalent to extending the shorter string with obligatorily non-matching characters. If False, an exception is raised in the case of strings of unequal lengths. Returns ------- float The normalized Hamming similarity Examples -------- >>> round(sim_hamming('cat', 'hat'), 12) 0.666666666667 >>> sim_hamming('Niall', 'Neil') 0.4 >>> sim_hamming('aluminum', 'Catalan') 0.0 >>> sim_hamming('ATCG', 'TAGC') 0.0 .. versionadded:: 0.1.0 """ return Hamming(diff_lens).sim(src, tar)
if __name__ == '__main__': import doctest doctest.testmod()