Source code for abydos.distance._soft_cosine

# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._softy_cosine.

Soft Cosine similarity & distance
"""

from ._levenshtein import Levenshtein
from ._token_distance import _TokenDistance

__all__ = ['SoftCosine']


class SoftCosine(_TokenDistance):
    r"""Soft Cosine similarity.

    As described in :cite:`Sidorov:2014`, soft cosine similarity of two
    multi-sets X and Y, drawn from an alphabet S, is

        .. math::

            sim_{soft cosine}(X, Y) =
            \frac{\sum_{i \in S}\sum_{j \in S} s_{ij} X_i Y_j}
            {\sqrt{\sum_{i \in S}\sum_{j \in S} s_{ij} X_i X_j}
            \sqrt{\sum_{i \in S}\sum_{j \in S} s_{ij} Y_i Y_j}}

    where :math:`s_{ij}` is the similarity of two tokens, by default a
    function of Levenshtein distance:
    :math:`\frac{1}{1+Levenshtein\_distance(i, j)}`.

    Notes
    -----
    This class implements soft cosine similarity, as defined by
    :cite:`Sidorov:2014`. An alternative formulation of soft cosine
    similarity using soft (multi-)sets is provided by the :class:`Cosine`
    class with ``intersection_type='soft'``, based on the soft intersection
    defined in :cite:`Russ:2014`.

    .. versionadded:: 0.4.0

    """

    def __init__(self, tokenizer=None, metric=None, sim_method='a', **kwargs):
        r"""Initialize SoftCosine instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package,
            defaulting to the QGrams tokenizer with q=4
        threshold : float
            The minimum similarity for a pair of tokens to contribute to
            similarity
        metric : _Distance
            A distance instance from the abydos.distance package, defaulting
            to Levenshtein distance
        sim_method : str
            Selects the similarity method from the four given in
            :cite:`Sidorov:2014`:

                - ``a`` : :math:`\frac{1}{1+d}`
                - ``b`` : :math:`1-\frac{d}{m}`
                - ``c`` : :math:`\sqrt{1-\frac{d}{m}}`
                - ``d`` : :math:`\Big(1-\frac{d}{m}\Big)^2`

            where :math:`d` is the distance (Levenshtein by default) and
            :math:`m` is the maximum length of the two tokens. Option ``a``
            is the default, as suggested by the paper.
        **kwargs
            Arbitrary keyword arguments

        Raises
        ------
        ValueError
            sim_method must be one of 'a', 'b', 'c', or 'd'

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            ``tokenizer=None`` will cause the instance to use the QGram
            tokenizer with this q value.


        .. versionadded:: 0.4.0

        """
        super(SoftCosine, self).__init__(tokenizer, **kwargs)
        self.params['metric'] = (
            metric if metric is not None else Levenshtein()
        )
        if sim_method not in 'abcd':
            raise ValueError(
                "sim_method must be one of 'a', 'b', 'c', or 'd'"
            )
        self.params['sim_method'] = sim_method

    def sim(self, src, tar):
        r"""Return the Soft Cosine similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Soft Cosine similarity

        Examples
        --------
        >>> cmp = SoftCosine()
        >>> cmp.sim('cat', 'hat')
        0.8750000000000001
        >>> cmp.sim('Niall', 'Neil')
        0.8844691709074513
        >>> cmp.sim('aluminum', 'Catalan')
        0.831348688760277
        >>> cmp.sim('ATCG', 'TAGC')
        0.8571428571428572


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        if not self._src_card() or not self._tar_card():
            return 0.0

        # Token-level similarity functions, selected by sim_method
        similarity = {
            'a': lambda src, tar: 1
            / (1 + self.params['metric'].dist_abs(src, tar)),
            'b': lambda src, tar: 1
            - (
                self.params['metric'].dist_abs(src, tar)
                / max(len(src), len(tar))
            ),
            'c': lambda src, tar: (
                1
                - (
                    self.params['metric'].dist_abs(src, tar)
                    / max(len(src), len(tar))
                )
            )
            ** 0.5,
            'd': lambda src, tar: (
                1
                - (
                    self.params['metric'].dist_abs(src, tar)
                    / max(len(src), len(tar))
                )
            )
            ** 2,
        }

        nom = 0
        denom_left = 0
        denom_right = 0

        # Numerator: similarity-weighted products of source & target counts
        for src in self._src_tokens.keys():
            for tar in self._tar_tokens.keys():
                nom += (
                    self._src_tokens[src]
                    * self._tar_tokens[tar]
                    * similarity[self.params['sim_method']](src, tar)
                )

        # Denominator terms: each multiset's similarity-weighted self-product
        for src in self._src_tokens.keys():
            for tar in self._src_tokens.keys():
                denom_left += (
                    self._src_tokens[src]
                    * self._src_tokens[tar]
                    * similarity[self.params['sim_method']](src, tar)
                )

        for src in self._tar_tokens.keys():
            for tar in self._tar_tokens.keys():
                denom_right += (
                    self._tar_tokens[src]
                    * self._tar_tokens[tar]
                    * similarity[self.params['sim_method']](src, tar)
                )

        return nom / (denom_left ** 0.5 * denom_right ** 0.5)
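

# A minimal sketch (not part of the Abydos source) making the docstring's
# default token similarity concrete: under sim_method='a', a token pair
# (i, j) scores 1 / (1 + Levenshtein_distance(i, j)). The token pair below
# is chosen for illustration only.
if __name__ == '__main__':
    _lev = Levenshtein()
    # 'cat' vs. 'hat' differ by one substitution, so s_ij = 1 / (1 + 1) = 0.5
    print(1 / (1 + _lev.dist_abs('cat', 'hat')))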


if __name__ == '__main__':
    import doctest

    doctest.testmod()
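

# A usage sketch, under the assumption that this module is run directly:
# it compares the four sim_method variants ('a' through 'd') documented in
# __init__ on a single string pair. The printed values are illustrative,
# not verified reference outputs.
if __name__ == '__main__':
    for _method in ('a', 'b', 'c', 'd'):
        print(_method, SoftCosine(sim_method=_method).sim('Niall', 'Neil'))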