Source code for abydos.distance._saps

# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._saps_alignment.

Syllable Alignment Pattern Searching tokenizer
"""

from numpy import int_ as np_int
from numpy import zeros as np_zeros

from ._distance import _Distance
from ..tokenizer import SAPSTokenizer

__all__ = ['SAPS']


class SAPS(_Distance):
    """Syllable Alignment Pattern Searching similarity.

    This is the alignment and similarity calculation described on
    pp. 917-918 of :cite:`Ruibin:2005`.

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        cost=(1, -1, -4, 6, -2, -1, -3),
        normalizer=max,
        tokenizer=None,
        **kwargs
    ):
        """Initialize SAPS instance.

        Parameters
        ----------
        cost : tuple
            A 7-tuple representing the cost of the seven possible match,
            mismatch, and gap operations:

                - syllable-internal match
                - syllable-internal mis-match
                - syllable-initial match or mismatch with syllable-internal
                - syllable-initial match
                - syllable-initial mis-match
                - syllable-internal gap
                - syllable-initial gap

            (by default: (1, -1, -4, 6, -2, -1, -3))
        normalizer : function
            A function that takes a list and computes a normalization term
            by which the edit distance is divided (max by default). Another
            good option is the sum function.
        tokenizer : _Tokenizer
            A tokenizer instance to use in place of the default
            SAPSTokenizer
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.0

        """
        super(SAPS, self).__init__(**kwargs)
        self._s1, self._s2, self._s3, self._s4, self._s5 = cost[:5]
        self._g1, self._g2 = cost[5:]
        self._normalizer = normalizer

        if tokenizer is None:
            self._tokenizer = SAPSTokenizer()
        else:
            self._tokenizer = tokenizer

    def _s(self, src, tar):
        # Substitution score: uppercase characters mark syllable-initial
        # positions; lowercase characters are syllable-internal.
        if src.isupper():
            if tar.isupper():
                return self._s4 if src == tar else self._s5
            else:
                return self._s3
        else:
            if tar.islower():
                return self._s1 if src == tar else self._s2
            else:
                return self._s3

    def _g(self, ch):
        # Gap cost: syllable-initial (uppercase) gaps are penalized more
        # heavily than syllable-internal (lowercase) gaps.
        if ch.isupper():
            return self._g2
        else:
            return self._g1
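
    # For illustration (a reading of the default cost tuple per the
    # __init__ docstring above, not text from the original source), the
    # defaults unpack as:
    #
    #     _s1 = +1  syllable-internal match
    #     _s2 = -1  syllable-internal mis-match
    #     _s3 = -4  syllable-initial aligned with syllable-internal
    #     _s4 = +6  syllable-initial match
    #     _s5 = -2  syllable-initial mis-match
    #     _g1 = -1  syllable-internal gap
    #     _g2 = -3  syllable-initial gap
    #
    # i.e. aligned syllable onsets are rewarded heavily, while gaps and
    # mismatches at syllable onsets are penalized more than those inside
    # a syllable.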

    def sim_score(self, src, tar):
        """Return the SAPS similarity between two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        int
            The SAPS similarity between src & tar

        Examples
        --------
        >>> cmp = SAPS()
        >>> cmp.sim_score('cat', 'hat')
        0
        >>> cmp.sim_score('Niall', 'Neil')
        3
        >>> cmp.sim_score('aluminum', 'Catalan')
        -11
        >>> cmp.sim_score('ATCG', 'TAGC')
        -1
        >>> cmp.sim_score('Stevenson', 'Stinson')
        16


        .. versionadded:: 0.4.0

        """
        src = self._tokenizer.tokenize(src).get_list()
        tar = self._tokenizer.tokenize(tar).get_list()

        src = ''.join([_[0].upper() + _[1:].lower() for _ in src])
        tar = ''.join([_[0].upper() + _[1:].lower() for _ in tar])

        d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int)
        for i in range(len(src)):
            d_mat[i + 1, 0] = d_mat[i, 0] + self._g(src[i])
        for j in range(len(tar)):
            d_mat[0, j + 1] = d_mat[0, j] + self._g(tar[j])

        for i in range(len(src)):
            for j in range(len(tar)):
                d_mat[i + 1, j + 1] = max(
                    d_mat[i, j + 1] + self._g(src[i]),  # ins
                    d_mat[i + 1, j] + self._g(tar[j]),  # del
                    d_mat[i, j] + self._s(src[i], tar[j]),  # sub/==
                )

        return d_mat[len(src), len(tar)]

    def sim(self, src, tar):
        """Return the normalized SAPS similarity between two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            The normalized SAPS similarity between src & tar

        Examples
        --------
        >>> cmp = SAPS()
        >>> round(cmp.sim('cat', 'hat'), 12)
        0.0
        >>> round(cmp.sim('Niall', 'Neil'), 12)
        0.2
        >>> cmp.sim('aluminum', 'Catalan')
        0.0
        >>> cmp.sim('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.0

        """
        score = self.sim_score(src, tar)
        if score <= 0:
            return 0.0

        src = self._tokenizer.tokenize(src).get_list()
        src_max = sum(5 + len(_) for _ in src)
        tar = self._tokenizer.tokenize(tar).get_list()
        tar_max = sum(5 + len(_) for _ in tar)

        # Divide by the normalizer (max by default) applied to the maximum
        # possible alignment scores of src and tar.
        return score / self._normalizer([src_max, tar_max])
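
# A worked reading of the normalization in sim above (an inference from the
# doctest values, not text from the original source): sim_score('Niall',
# 'Neil') is 3, and the SAPS tokenizer splits 'Niall' (5 letters) into two
# syllables, so its maximum self-alignment score is 2 * 6 + 3 * 1 = 15
# (6 per syllable-initial match, 1 per internal match, i.e. sum(5 + len(syl))
# over syllables).  With the default max normalizer, sim = 3 / 15 = 0.2.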


if __name__ == '__main__':
    import doctest

    doctest.testmod()
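

# A minimal usage sketch (an illustration appended here, not part of the
# original module); it relies only on names defined or imported above,
# repeats the doctest values, and sketches overriding the normalizer and
# tokenizer options described in __init__.
if __name__ == '__main__':
    _cmp = SAPS()
    print(_cmp.sim_score('Stevenson', 'Stinson'))  # 16
    print(round(_cmp.sim('Niall', 'Neil'), 12))    # 0.2

    # The cost tuple, normalizer, and tokenizer may all be overridden;
    # sum-normalization divides by the combined maxima of both strings
    # rather than the larger one, so values never exceed those under max.
    _cmp_sum = SAPS(normalizer=sum, tokenizer=SAPSTokenizer())
    print(
        0.0 <= _cmp_sum.sim('Niall', 'Neil') <= _cmp.sim('Niall', 'Neil')
    )  # True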