# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._ratcliff_obershelp.
Ratcliff-Obershelp similarity
"""
from deprecation import deprecated
from numpy import int as np_int
from numpy import zeros as np_zeros
from ._distance import _Distance
from .. import __version__
__all__ = [
'RatcliffObershelp',
'dist_ratcliff_obershelp',
'sim_ratcliff_obershelp',
]
[docs]class RatcliffObershelp(_Distance):
"""Ratcliff-Obershelp similarity.
This follows the Ratcliff-Obershelp algorithm :cite:`Ratcliff:1988` to
derive a similarity measure:
1. Find the length of the longest common substring in src & tar.
2. Recurse on the strings to the left & right of each this substring
in src & tar. The base case is a 0 length common substring, in which
case, return 0. Otherwise, return the sum of the current longest
common substring and the left & right recursed sums.
3. Multiply this length by 2 and divide by the sum of the lengths of
src & tar.
Cf.
http://www.drdobbs.com/database/pattern-matching-the-gestalt-approach/184407970
.. versionadded:: 0.3.6
"""
[docs] def sim(self, src, tar):
"""Return the Ratcliff-Obershelp similarity of two strings.
Parameters
----------
src : str
Source string for comparison
tar : str
Target string for comparison
Returns
-------
float
Ratcliff-Obershelp similarity
Examples
--------
>>> cmp = RatcliffObershelp()
>>> round(cmp.sim('cat', 'hat'), 12)
0.666666666667
>>> round(cmp.sim('Niall', 'Neil'), 12)
0.666666666667
>>> round(cmp.sim('aluminum', 'Catalan'), 12)
0.4
>>> cmp.sim('ATCG', 'TAGC')
0.5
.. versionadded:: 0.1.0
.. versionchanged:: 0.3.6
Encapsulated in class
"""
def _lcsstr_stl(src, tar):
"""Return start positions & length for Ratcliff-Obershelp.
Parameters
----------
src : str
Source string for comparison
tar : str
Target string for comparison
Returns
-------
tuple
The start position in the source string, start position in the
target string, and length of the longest common substring of
strings src and tar.
.. versionadded:: 0.1.0
"""
lengths = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int)
longest, src_longest, tar_longest = 0, 0, 0
for i in range(1, len(src) + 1):
for j in range(1, len(tar) + 1):
if src[i - 1] == tar[j - 1]:
lengths[i, j] = lengths[i - 1, j - 1] + 1
if lengths[i, j] > longest:
longest = lengths[i, j]
src_longest = i
tar_longest = j
else:
lengths[i, j] = 0
return src_longest - longest, tar_longest - longest, longest
def _sstr_matches(src, tar):
"""Return the sum of substring match lengths.
This follows the Ratcliff-Obershelp algorithm
:cite:`Ratcliff:1988`:
1. Find the length of the longest common substring in src &
tar.
2. Recurse on the strings to the left & right of each this
substring in src & tar.
3. Base case is a 0 length common substring, in which case,
return 0.
4. Return the sum.
Parameters
----------
src : str
Source string for comparison
tar : str
Target string for comparison
Returns
-------
int
Sum of substring match lengths
.. versionadded:: 0.1.0
"""
src_start, tar_start, length = _lcsstr_stl(src, tar)
if length == 0:
return 0
return (
_sstr_matches(src[:src_start], tar[:tar_start])
+ length
+ _sstr_matches(
src[src_start + length :], tar[tar_start + length :]
)
)
if src == tar:
return 1.0
elif not src or not tar:
return 0.0
return 2 * _sstr_matches(src, tar) / (len(src) + len(tar))
[docs]@deprecated(
deprecated_in='0.4.0',
removed_in='0.6.0',
current_version=__version__,
details='Use the RatcliffObershelp.sim method instead.',
)
def sim_ratcliff_obershelp(src, tar):
"""Return the Ratcliff-Obershelp similarity of two strings.
This is a wrapper for :py:meth:`RatcliffObershelp.sim`.
Parameters
----------
src : str
Source string for comparison
tar : str
Target string for comparison
Returns
-------
float
Ratcliff-Obershelp similarity
Examples
--------
>>> round(sim_ratcliff_obershelp('cat', 'hat'), 12)
0.666666666667
>>> round(sim_ratcliff_obershelp('Niall', 'Neil'), 12)
0.666666666667
>>> round(sim_ratcliff_obershelp('aluminum', 'Catalan'), 12)
0.4
>>> sim_ratcliff_obershelp('ATCG', 'TAGC')
0.5
.. versionadded:: 0.1.0
"""
return RatcliffObershelp().sim(src, tar)
[docs]@deprecated(
deprecated_in='0.4.0',
removed_in='0.6.0',
current_version=__version__,
details='Use the RatcliffObershelp.dist method instead.',
)
def dist_ratcliff_obershelp(src, tar):
"""Return the Ratcliff-Obershelp distance between two strings.
This is a wrapper for :py:meth:`RatcliffObershelp.dist`.
Parameters
----------
src : str
Source string for comparison
tar : str
Target string for comparison
Returns
-------
float
Ratcliff-Obershelp distance
Examples
--------
>>> round(dist_ratcliff_obershelp('cat', 'hat'), 12)
0.333333333333
>>> round(dist_ratcliff_obershelp('Niall', 'Neil'), 12)
0.333333333333
>>> round(dist_ratcliff_obershelp('aluminum', 'Catalan'), 12)
0.6
>>> dist_ratcliff_obershelp('ATCG', 'TAGC')
0.5
.. versionadded:: 0.1.0
"""
return RatcliffObershelp().dist(src, tar)
if __name__ == '__main__':
import doctest
doctest.testmod()