Source code for abydos.distance._baystat

# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._baystat.

Baystat similarity.
"""

from deprecation import deprecated

from ._distance import _Distance
from .. import __version__

# Public names of this module: the Baystat class and the two deprecated
# function wrappers around its sim and dist methods.
__all__ = ['Baystat', 'dist_baystat', 'sim_baystat']


class Baystat(_Distance):
    """Baystat similarity and distance.

    Good results for shorter words are reported when setting min_ss_len to 1
    and either left_ext OR right_ext to 1.

    The Baystat similarity is defined in :cite:`Furnohr:2002`.

    This is ostensibly a port of the R module PPRL's implementation:
    https://github.com/cran/PPRL/blob/master/src/MTB_Baystat.cpp
    :cite:`Rukasz:2018`. As such, this could be made more pythonic.

    .. versionadded:: 0.3.6
    """

    def __init__(
        self, min_ss_len=None, left_ext=None, right_ext=None, **kwargs
    ):
        """Initialize Baystat instance.

        Parameters
        ----------
        min_ss_len : int
            Minimum substring length to be considered. If None (or 0), a
            default is chosen per comparison from the string lengths.
        left_ext : int
            Left-side extension length. If None, a default is chosen per
            comparison from the string lengths. An explicit 0 is honored.
        right_ext : int
            Right-side extension length. If None, a default is chosen per
            comparison from the string lengths. An explicit 0 is honored.
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.0

        """
        super(Baystat, self).__init__(**kwargs)
        self._min_ss_len = min_ss_len
        self._left_ext = left_ext
        self._right_ext = right_ext

    def _resolve_params(self, max_len):
        """Return the effective (min_ss_len, left_ext, right_ext) triple.

        Parameters
        ----------
        max_len : int
            Length of the longer of the two strings being compared

        Returns
        -------
        tuple
            The minimum substring length, left extension and right
            extension to use for this comparison

        """
        # Defaults are based on values from the article.
        if max_len >= 7:
            default_min_ss_len, default_ext = 2, 2
        else:
            # The paper suggests that for short names, (exclusively) one or
            # the other of left_ext and right_ext can be 1, with good
            # results. 0 & 0 are used as the default in this case.
            default_min_ss_len, default_ext = 1, 0

        # A zero-length minimum substring is meaningless, so both None and 0
        # select the default here.
        min_ss_len = self._min_ss_len or default_min_ss_len

        # The extensions must be compared against None rather than tested
        # for truthiness: 0 is a legitimate setting (e.g. left_ext=1,
        # right_ext=0) and must not be mistaken for "not configured".
        left_ext = default_ext if self._left_ext is None else self._left_ext
        right_ext = (
            default_ext if self._right_ext is None else self._right_ext
        )
        return min_ss_len, left_ext, right_ext

    def sim(self, src, tar):
        """Return the Baystat similarity.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            The Baystat similarity

        Examples
        --------
        >>> cmp = Baystat()
        >>> round(cmp.sim('cat', 'hat'), 12)
        0.666666666667
        >>> cmp.sim('Niall', 'Neil')
        0.4
        >>> round(cmp.sim('Colin', 'Cuilen'), 12)
        0.166666666667
        >>> cmp.sim('ATCG', 'TAGC')
        0.0

        An explicit extension of 0 is respected:

        >>> Baystat(min_ss_len=1, left_ext=1, right_ext=0).sim('ab', 'ba')
        0.5


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        if src == tar:
            return 1.0
        if not src or not tar:
            return 0.0

        src_len = len(src)
        # tar is rewritten below, but only by same-length replacement, so
        # its length is invariant for the whole comparison.
        tar_len = len(tar)
        max_len = max(src_len, tar_len)

        min_ss_len, left_ext, right_ext = self._resolve_params(max_len)

        pos = 0
        match_len = 0

        while pos + min_ss_len <= src_len:
            hit_len = 0
            ix = 1

            substring = src[pos : pos + min_ss_len]

            # Clamp the search window to the start of tar.
            search_begin = pos - left_ext
            if search_begin < 0:
                search_begin = 0
                left_ext_len = pos
            else:
                left_ext_len = left_ext

            # Clamp the search window to the end of tar.
            if pos + min_ss_len + right_ext >= tar_len:
                right_ext_len = tar_len - pos - min_ss_len
            else:
                right_ext_len = right_ext

            window_len = left_ext_len + min_ss_len + right_ext_len
            if window_len > 0:
                search_val = tar[search_begin : search_begin + window_len]
            else:
                search_val = ''

            flagged_tar = ''
            # Greedily lengthen the substring (and the window to its right)
            # for as long as it can still be found inside the window.
            while substring in search_val and pos + ix <= src_len:
                hit_len = len(substring)
                # Mask the matched characters so they cannot match again.
                flagged_tar = tar.replace(substring, '#' * hit_len)

                if pos + min_ss_len + ix <= src_len:
                    substring = src[pos : pos + min_ss_len + ix]

                if pos + min_ss_len + right_ext_len + 1 <= tar_len:
                    right_ext_len += 1

                # No bounds check is needed: slicing past the end of tar
                # simply yields a shorter string.
                search_val = tar[
                    search_begin : (
                        search_begin
                        + left_ext_len
                        + min_ss_len
                        + right_ext_len
                    )
                ]

                ix += 1

            if hit_len > 0:
                tar = flagged_tar
            match_len += hit_len
            pos += ix

        return match_len / max_len
@deprecated(
    deprecated_in='0.4.0',
    removed_in='0.6.0',
    current_version=__version__,
    details='Use the Baystat.sim method instead.',
)
def sim_baystat(src, tar, min_ss_len=None, left_ext=None, right_ext=None):
    """Return the Baystat similarity.

    This is a wrapper for :py:meth:`Baystat.sim`: it builds a
    :py:class:`Baystat` instance from the three tuning arguments and
    delegates the comparison to it.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison
    min_ss_len : int
        Minimum substring length to be considered
    left_ext : int
        Left-side extension length
    right_ext : int
        Right-side extension length

    Returns
    -------
    float
        The Baystat similarity

    Examples
    --------
    >>> round(sim_baystat('cat', 'hat'), 12)
    0.666666666667
    >>> sim_baystat('Niall', 'Neil')
    0.4
    >>> round(sim_baystat('Colin', 'Cuilen'), 12)
    0.166666666667
    >>> sim_baystat('ATCG', 'TAGC')
    0.0

    .. versionadded:: 0.3.0

    """
    measure = Baystat(
        min_ss_len=min_ss_len, left_ext=left_ext, right_ext=right_ext
    )
    return measure.sim(src, tar)
@deprecated(
    deprecated_in='0.4.0',
    removed_in='0.6.0',
    current_version=__version__,
    details='Use the Baystat.dist method instead.',
)
def dist_baystat(src, tar, min_ss_len=None, left_ext=None, right_ext=None):
    """Return the Baystat distance.

    This is a wrapper for :py:meth:`Baystat.dist`: it builds a
    :py:class:`Baystat` instance from the three tuning arguments and
    delegates the comparison to it.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison
    min_ss_len : int
        Minimum substring length to be considered
    left_ext : int
        Left-side extension length
    right_ext : int
        Right-side extension length

    Returns
    -------
    float
        The Baystat distance

    Examples
    --------
    >>> round(dist_baystat('cat', 'hat'), 12)
    0.333333333333
    >>> dist_baystat('Niall', 'Neil')
    0.6
    >>> round(dist_baystat('Colin', 'Cuilen'), 12)
    0.833333333333
    >>> dist_baystat('ATCG', 'TAGC')
    1.0

    .. versionadded:: 0.3.0

    """
    measure = Baystat(
        min_ss_len=min_ss_len, left_ext=left_ext, right_ext=right_ext
    )
    return measure.dist(src, tar)
if __name__ == '__main__':
    # Executed as a script: run the doctest examples embedded in this
    # module's docstrings.
    from doctest import testmod

    testmod()