Source code for abydos.distance._token

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance.token.

The distance.token module implements token-based distance measures (excluding
those that are Minkowski distance measures):

    - Tversky index
    - Sørensen–Dice coefficient & distance
    - Jaccard similarity coefficient & distance
    - overlap similarity & distance
    - Tanimoto coefficient & distance
    - cosine similarity & distance
    - Bag similarity & distance
    - Monge-Elkan similarity & distance
"""

from __future__ import division, unicode_literals

from collections import Counter
from math import log, sqrt

from ._levenshtein import sim_levenshtein
from ._util import _get_qgrams
from ..tokenizer import QGrams

__all__ = [
    'bag',
    'dist_bag',
    'dist_cosine',
    'dist_dice',
    'dist_jaccard',
    'dist_monge_elkan',
    'dist_overlap',
    'dist_tversky',
    'sim_bag',
    'sim_cosine',
    'sim_dice',
    'sim_jaccard',
    'sim_monge_elkan',
    'sim_overlap',
    'sim_tanimoto',
    'sim_tversky',
    'tanimoto',
]


[docs]def sim_tversky(src, tar, qval=2, alpha=1, beta=1, bias=None): r"""Return the Tversky index of two strings. The Tversky index :cite:`Tversky:1977` is defined as: For two sets X and Y: :math:`sim_{Tversky}(X, Y) = \\frac{|X \\cap Y|} {|X \\cap Y| + \\alpha|X - Y| + \\beta|Y - X|}`. :math:`\\alpha = \\beta = 1` is equivalent to the Jaccard & Tanimoto similarity coefficients. :math:`\\alpha = \\beta = 0.5` is equivalent to the Sørensen-Dice similarity coefficient :cite:`Dice:1945,Sorensen:1948`. Unequal α and β will tend to emphasize one or the other set's contributions: - :math:`\\alpha > \\beta` emphasizes the contributions of X over Y - :math:`\\alpha < \\beta` emphasizes the contributions of Y over X) Parameter values' relation to 1 emphasizes different types of contributions: - :math:`\\alpha and \\beta > 1` emphsize unique contributions over the intersection - :math:`\\alpha and \\beta < 1` emphsize the intersection over unique contributions The symmetric variant is defined in :cite:`Jiminez:2013`. This is activated by specifying a bias parameter. :param str src: source string (or QGrams/Counter objects) for comparison :param str tar: target string (or QGrams/Counter objects) for comparison :param int qval: the length of each q-gram; 0 for non-q-gram version :param float alpha: Tversky index parameter as described above :param float beta: Tversky index parameter as described above :param float bias: The symmetric Tversky index bias parameter :returns: Tversky similarity :rtype: float >>> sim_tversky('cat', 'hat') 0.3333333333333333 >>> sim_tversky('Niall', 'Neil') 0.2222222222222222 >>> sim_tversky('aluminum', 'Catalan') 0.0625 >>> sim_tversky('ATCG', 'TAGC') 0.0 """ if alpha < 0 or beta < 0: raise ValueError( 'Unsupported weight assignment; alpha and beta ' + 'must be greater than or equal to 0.' ) if src == tar: return 1.0 elif not src or not tar: return 0.0 q_src, q_tar = _get_qgrams(src, tar, qval) q_src_mag = sum(q_src.values()) q_tar_mag = sum(q_tar.values()) q_intersection_mag = sum((q_src & q_tar).values()) if not q_src or not q_tar: return 0.0 if bias is None: return q_intersection_mag / ( q_intersection_mag + alpha * (q_src_mag - q_intersection_mag) + beta * (q_tar_mag - q_intersection_mag) ) a_val = min(q_src_mag - q_intersection_mag, q_tar_mag - q_intersection_mag) b_val = max(q_src_mag - q_intersection_mag, q_tar_mag - q_intersection_mag) c_val = q_intersection_mag + bias return c_val / (beta * (alpha * a_val + (1 - alpha) * b_val) + c_val)
[docs]def dist_tversky(src, tar, qval=2, alpha=1, beta=1, bias=None): """Return the Tversky distance between two strings. Tversky distance is the complement of the Tvesrsky index (similarity): :math:`dist_{Tversky} = 1-sim_{Tversky}`. :param str src: source string (or QGrams/Counter objects) for comparison :param str tar: target string (or QGrams/Counter objects) for comparison :param int qval: the length of each q-gram; 0 for non-q-gram version :param float alpha: the Tversky index's alpha parameter :param float beta: the Tversky index's beta parameter :param float bias: The symmetric Tversky index bias parameter :returns: Tversky distance :rtype: float >>> dist_tversky('cat', 'hat') 0.6666666666666667 >>> dist_tversky('Niall', 'Neil') 0.7777777777777778 >>> dist_tversky('aluminum', 'Catalan') 0.9375 >>> dist_tversky('ATCG', 'TAGC') 1.0 """ return 1 - sim_tversky(src, tar, qval, alpha, beta, bias)
[docs]def sim_dice(src, tar, qval=2): r"""Return the Sørensen–Dice coefficient of two strings. For two sets X and Y, the Sørensen–Dice coefficient :cite:`Dice:1945,Sorensen:1948` is :math:`sim_{dice}(X, Y) = \\frac{2 \\cdot |X \\cap Y|}{|X| + |Y|}`. This is identical to the Tanimoto similarity coefficient :cite:`Tanimoto:1958` and the Tversky index :cite:`Tversky:1977` for :math:`\\alpha = \\beta = 0.5`. :param str src: source string (or QGrams/Counter objects) for comparison :param str tar: target string (or QGrams/Counter objects) for comparison :param int qval: the length of each q-gram; 0 for non-q-gram version :returns: Sørensen–Dice similarity :rtype: float >>> sim_dice('cat', 'hat') 0.5 >>> sim_dice('Niall', 'Neil') 0.36363636363636365 >>> sim_dice('aluminum', 'Catalan') 0.11764705882352941 >>> sim_dice('ATCG', 'TAGC') 0.0 """ return sim_tversky(src, tar, qval, 0.5, 0.5)
[docs]def dist_dice(src, tar, qval=2): """Return the Sørensen–Dice distance between two strings. Sørensen–Dice distance is the complemenjt of the Sørensen–Dice coefficient: :math:`dist_{dice} = 1 - sim_{dice}`. :param str src: source string (or QGrams/Counter objects) for comparison :param str tar: target string (or QGrams/Counter objects) for comparison :param int qval: the length of each q-gram; 0 for non-q-gram version :returns: Sørensen–Dice distance :rtype: float >>> dist_dice('cat', 'hat') 0.5 >>> dist_dice('Niall', 'Neil') 0.6363636363636364 >>> dist_dice('aluminum', 'Catalan') 0.8823529411764706 >>> dist_dice('ATCG', 'TAGC') 1.0 """ return 1 - sim_dice(src, tar, qval)
[docs]def sim_jaccard(src, tar, qval=2): r"""Return the Jaccard similarity of two strings. For two sets X and Y, the Jaccard similarity coefficient :cite:`Jaccard:1901` is :math:`sim_{jaccard}(X, Y) = \\frac{|X \\cap Y|}{|X \\cup Y|}`. This is identical to the Tanimoto similarity coefficient :cite:`Tanimoto:1958` and the Tversky index :cite:`Tversky:1977` for :math:`\\alpha = \\beta = 1`. :param str src: source string (or QGrams/Counter objects) for comparison :param str tar: target string (or QGrams/Counter objects) for comparison :param int qval: the length of each q-gram; 0 for non-q-gram version :returns: Jaccard similarity :rtype: float >>> sim_jaccard('cat', 'hat') 0.3333333333333333 >>> sim_jaccard('Niall', 'Neil') 0.2222222222222222 >>> sim_jaccard('aluminum', 'Catalan') 0.0625 >>> sim_jaccard('ATCG', 'TAGC') 0.0 """ return sim_tversky(src, tar, qval, 1, 1)
[docs]def dist_jaccard(src, tar, qval=2): """Return the Jaccard distance between two strings. Jaccard distance is the complement of the Jaccard similarity coefficient: :math:`dist_{Jaccard} = 1 - sim_{Jaccard}`. :param str src: source string (or QGrams/Counter objects) for comparison :param str tar: target string (or QGrams/Counter objects) for comparison :param int qval: the length of each q-gram; 0 for non-q-gram version :returns: Jaccard distance :rtype: float >>> dist_jaccard('cat', 'hat') 0.6666666666666667 >>> dist_jaccard('Niall', 'Neil') 0.7777777777777778 >>> dist_jaccard('aluminum', 'Catalan') 0.9375 >>> dist_jaccard('ATCG', 'TAGC') 1.0 """ return 1 - sim_jaccard(src, tar, qval)
[docs]def sim_overlap(src, tar, qval=2): r"""Return the overlap coefficient of two strings. For two sets X and Y, the overlap coefficient :cite:`Szymkiewicz:1934,Simpson:1949`, also called the Szymkiewicz-Simpson coefficient, is :math:`sim_{overlap}(X, Y) = \\frac{|X \\cap Y|}{min(|X|, |Y|)}`. :param str src: source string (or QGrams/Counter objects) for comparison :param str tar: target string (or QGrams/Counter objects) for comparison :param int qval: the length of each q-gram; 0 for non-q-gram version :returns: overlap similarity :rtype: float >>> sim_overlap('cat', 'hat') 0.5 >>> sim_overlap('Niall', 'Neil') 0.4 >>> sim_overlap('aluminum', 'Catalan') 0.125 >>> sim_overlap('ATCG', 'TAGC') 0.0 """ if src == tar: return 1.0 elif not src or not tar: return 0.0 q_src, q_tar = _get_qgrams(src, tar, qval) q_src_mag = sum(q_src.values()) q_tar_mag = sum(q_tar.values()) q_intersection_mag = sum((q_src & q_tar).values()) return q_intersection_mag / min(q_src_mag, q_tar_mag)
[docs]def dist_overlap(src, tar, qval=2): """Return the overlap distance between two strings. Overlap distance is the complement of the overlap coefficient: :math:`sim_{overlap} = 1 - dist_{overlap}`. :param str src: source string (or QGrams/Counter objects) for comparison :param str tar: target string (or QGrams/Counter objects) for comparison :param int qval: the length of each q-gram; 0 for non-q-gram version :returns: overlap distance :rtype: float >>> dist_overlap('cat', 'hat') 0.5 >>> dist_overlap('Niall', 'Neil') 0.6 >>> dist_overlap('aluminum', 'Catalan') 0.875 >>> dist_overlap('ATCG', 'TAGC') 1.0 """ return 1 - sim_overlap(src, tar, qval)
[docs]def sim_tanimoto(src, tar, qval=2): r"""Return the Tanimoto similarity of two strings. For two sets X and Y, the Tanimoto similarity coefficient :cite:`Tanimoto:1958` is :math:`sim_{Tanimoto}(X, Y) = \\frac{|X \\cap Y|}{|X \\cup Y|}`. This is identical to the Jaccard similarity coefficient :cite:`Jaccard:1901` and the Tversky index :cite:`Tversky:1977` for :math:`\\alpha = \\beta = 1`. :param str src: source string (or QGrams/Counter objects) for comparison :param str tar: target string (or QGrams/Counter objects) for comparison :param int qval: the length of each q-gram; 0 for non-q-gram version :returns: Tanimoto similarity :rtype: float >>> sim_tanimoto('cat', 'hat') 0.3333333333333333 >>> sim_tanimoto('Niall', 'Neil') 0.2222222222222222 >>> sim_tanimoto('aluminum', 'Catalan') 0.0625 >>> sim_tanimoto('ATCG', 'TAGC') 0.0 """ return sim_jaccard(src, tar, qval)
[docs]def tanimoto(src, tar, qval=2): """Return the Tanimoto distance between two strings. Tanimoto distance is :math:`-log_{2}sim_{Tanimoto}`. :param str src: source string (or QGrams/Counter objects) for comparison :param str tar: target string (or QGrams/Counter objects) for comparison :param int qval: the length of each q-gram; 0 for non-q-gram version :returns: Tanimoto distance :rtype: float >>> tanimoto('cat', 'hat') -1.5849625007211563 >>> tanimoto('Niall', 'Neil') -2.1699250014423126 >>> tanimoto('aluminum', 'Catalan') -4.0 >>> tanimoto('ATCG', 'TAGC') -inf """ coeff = sim_jaccard(src, tar, qval) if coeff != 0: return log(coeff, 2) return float('-inf')
[docs]def sim_cosine(src, tar, qval=2): r"""Return the cosine similarity of two strings. For two sets X and Y, the cosine similarity, Otsuka-Ochiai coefficient, or Ochiai coefficient :cite:`Otsuka:1936,Ochiai:1957` is: :math:`sim_{cosine}(X, Y) = \\frac{|X \\cap Y|}{\\sqrt{|X| \\cdot |Y|}}`. :param str src: source string (or QGrams/Counter objects) for comparison :param str tar: target string (or QGrams/Counter objects) for comparison :param int qval: the length of each q-gram; 0 for non-q-gram version :returns: cosine similarity :rtype: float >>> sim_cosine('cat', 'hat') 0.5 >>> sim_cosine('Niall', 'Neil') 0.3651483716701107 >>> sim_cosine('aluminum', 'Catalan') 0.11785113019775793 >>> sim_cosine('ATCG', 'TAGC') 0.0 """ if src == tar: return 1.0 if not src or not tar: return 0.0 q_src, q_tar = _get_qgrams(src, tar, qval) q_src_mag = sum(q_src.values()) q_tar_mag = sum(q_tar.values()) q_intersection_mag = sum((q_src & q_tar).values()) return q_intersection_mag / sqrt(q_src_mag * q_tar_mag)
[docs]def dist_cosine(src, tar, qval=2): """Return the cosine distance between two strings. Cosine distance is the complement of cosine similarity: :math:`dist_{cosine} = 1 - sim_{cosine}`. :param str src: source string (or QGrams/Counter objects) for comparison :param str tar: target string (or QGrams/Counter objects) for comparison :param int qval: the length of each q-gram; 0 for non-q-gram version :returns: cosine distance :rtype: float >>> dist_cosine('cat', 'hat') 0.5 >>> dist_cosine('Niall', 'Neil') 0.6348516283298893 >>> dist_cosine('aluminum', 'Catalan') 0.882148869802242 >>> dist_cosine('ATCG', 'TAGC') 1.0 """ return 1 - sim_cosine(src, tar, qval)
[docs]def bag(src, tar): """Return the bag distance between two strings. Bag distance is proposed in :cite:`Bartolini:2002`. It is defined as: :math:`max(|multiset(src)-multiset(tar)|, |multiset(tar)-multiset(src)|)`. :param str src: source string for comparison :param str tar: target string for comparison :returns: bag distance :rtype: int >>> bag('cat', 'hat') 1 >>> bag('Niall', 'Neil') 2 >>> bag('aluminum', 'Catalan') 5 >>> bag('ATCG', 'TAGC') 0 >>> bag('abcdefg', 'hijklm') 7 >>> bag('abcdefg', 'hijklmno') 8 """ if tar == src: return 0 elif not src: return len(tar) elif not tar: return len(src) src_bag = Counter(src) tar_bag = Counter(tar) return max( sum((src_bag - tar_bag).values()), sum((tar_bag - src_bag).values()) )
[docs]def dist_bag(src, tar): """Return the normalized bag distance between two strings. Bag distance is normalized by dividing by :math:`max( |src|, |tar| )`. :param str src: source string for comparison :param str tar: target string for comparison :returns: normalized bag distance :rtype: float >>> dist_bag('cat', 'hat') 0.3333333333333333 >>> dist_bag('Niall', 'Neil') 0.4 >>> dist_bag('aluminum', 'Catalan') 0.625 >>> dist_bag('ATCG', 'TAGC') 0.0 """ if tar == src: return 0.0 if not src or not tar: return 1.0 max_length = max(len(src), len(tar)) return bag(src, tar) / max_length
[docs]def sim_bag(src, tar): """Return the normalized bag similarity of two strings. Normalized bag similarity is the complement of normalized bag distance: :math:`sim_{bag} = 1 - dist_{bag}`. :param str src: source string for comparison :param str tar: target string for comparison :returns: normalized bag similarity :rtype: float >>> round(sim_bag('cat', 'hat'), 12) 0.666666666667 >>> sim_bag('Niall', 'Neil') 0.6 >>> sim_bag('aluminum', 'Catalan') 0.375 >>> sim_bag('ATCG', 'TAGC') 1.0 """ return 1 - dist_bag(src, tar)
[docs]def sim_monge_elkan(src, tar, sim_func=sim_levenshtein, symmetric=False): """Return the Monge-Elkan similarity of two strings. Monge-Elkan is defined in :cite:`Monge:1996`. Note: Monge-Elkan is NOT a symmetric similarity algorithm. Thus, the similarity of src to tar is not necessarily equal to the similarity of tar to src. If the symmetric argument is True, a symmetric value is calculated, at the cost of doubling the computation time (since :math:`sim_{Monge-Elkan}(src, tar)` and :math:`sim_{Monge-Elkan}(tar, src)` are both calculated and then averaged). :param str src: source string for comparison :param str tar: target string for comparison :param function sim_func: the internal similarity metric to employ :param bool symmetric: return a symmetric similarity measure :returns: Monge-Elkan similarity :rtype: float >>> sim_monge_elkan('cat', 'hat') 0.75 >>> round(sim_monge_elkan('Niall', 'Neil'), 12) 0.666666666667 >>> round(sim_monge_elkan('aluminum', 'Catalan'), 12) 0.388888888889 >>> sim_monge_elkan('ATCG', 'TAGC') 0.5 """ if src == tar: return 1.0 q_src = sorted(QGrams(src).elements()) q_tar = sorted(QGrams(tar).elements()) if not q_src or not q_tar: return 0.0 sum_of_maxes = 0 for q_s in q_src: max_sim = float('-inf') for q_t in q_tar: max_sim = max(max_sim, sim_func(q_s, q_t)) sum_of_maxes += max_sim sim_em = sum_of_maxes / len(q_src) if symmetric: sim_em = (sim_em + sim_monge_elkan(tar, src, sim_func, False)) / 2 return sim_em
[docs]def dist_monge_elkan(src, tar, sim_func=sim_levenshtein, symmetric=False): """Return the Monge-Elkan distance between two strings. Monge-Elkan distance is the complement of Monge-Elkan similarity: :math:`dist_{Monge-Elkan} = 1 - sim_{Monge-Elkan}`. :param str src: source string for comparison :param str tar: target string for comparison :param function sim_func: the internal similarity metric to employ :param bool symmetric: return a symmetric similarity measure :returns: Monge-Elkan distance :rtype: float >>> dist_monge_elkan('cat', 'hat') 0.25 >>> round(dist_monge_elkan('Niall', 'Neil'), 12) 0.333333333333 >>> round(dist_monge_elkan('aluminum', 'Catalan'), 12) 0.611111111111 >>> dist_monge_elkan('ATCG', 'TAGC') 0.5 """ return 1 - sim_monge_elkan(src, tar, sim_func, symmetric)
if __name__ == '__main__': import doctest doctest.testmod()