# Source code for abydos.distance._sokal_sneath_i

# -*- coding: utf-8 -*-

# Copyright 2018-2019 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._sokal_sneath_i.

Sokal & Sneath I similarity
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from ._token_distance import _TokenDistance

__all__ = ['SokalSneathI']


class SokalSneathI(_TokenDistance):
    r"""Sokal & Sneath I similarity.

    Given two sets X and Y drawn from a population N, the Sokal & Sneath I
    similarity :cite:`Sokal:1963` is defined as

        .. math::

            sim_{SokalSneathI}(X, Y) =
            \frac{2(|X \cap Y| + |(N \setminus X) \setminus Y|)}
            {|X \cap Y| + |(N \setminus X) \setminus Y| + |N|}

    It is the first of the five "Unnamed coefficients" listed in
    :cite:`Sokal:1963`: the variant in which "Matched pairs carry twice the
    weight of unmatched pairs in the Denominator" with "Negative Matches in
    Numerator Included". The counterpart with "Negative Matches in Numerator
    Excluded" is the Sørensen–Dice coefficient, :class:`.Dice`.

    Expressed in :ref:`2x2 confusion table terms <confusion_table>`, where
    a+b+c+d=n, the measure is

        .. math::

            sim_{SokalSneathI} =
            \frac{2(a+d)}{a+d+n}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet=None,
        tokenizer=None,
        intersection_type='crisp',
        **kwargs
    ):
        """Initialize SokalSneathI instance.

        All arguments are forwarded unchanged to the ``_TokenDistance``
        initializer.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens. See the
            :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            The intersection type, which in turn determines the set type.
            See the :ref:`intersection_type <intersection_type>` description
            in :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Supplying this together with
            tokenizer=None makes the instance use the QGram tokenizer with
            this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value; similarities above it are counted as members
            of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super(SokalSneathI, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim(self, src, tar):
        """Return the Sokal & Sneath I similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Sokal & Sneath I similarity

        Examples
        --------
        >>> cmp = SokalSneathI()
        >>> cmp.sim('cat', 'hat')
        0.9974424552429667
        >>> cmp.sim('Niall', 'Neil')
        0.9955156950672646
        >>> cmp.sim('aluminum', 'Catalan')
        0.9903536977491961
        >>> cmp.sim('ATCG', 'TAGC')
        0.993581514762516


        .. versionadded:: 0.4.0

        """
        # Equal inputs short-circuit to the maximum score without tokenizing.
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # The three terms of the confusion-table formula 2(a+d) / (a+d+n).
        matches = self._intersection_card()
        neg_matches = self._total_complement_card()
        population = self._population_unique_card()

        # a + d appears in both numerator and denominator; compute it once.
        agreement = matches + neg_matches
        return (2 * agreement) / (agreement + population)
if __name__ == '__main__':
    # Running this module as a script executes the doctest examples
    # embedded in its docstrings.
    import doctest

    doctest.testmod()