Source code for abydos.distance._baystat

# -*- coding: utf-8 -*-

# Copyright 2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance.baystat.

The distance.baystat module implements BayStat similarity.
"""

from __future__ import division, unicode_literals

__all__ = ['dist_baystat', 'sim_baystat']


[docs]def sim_baystat(src, tar, min_ss_len=None, left_ext=None, right_ext=None): """Return the Baystat similarity. Good results for shorter words are reported when setting min_ss_len to 1 and either left_ext OR right_ext to 1. The Baystat similarity is defined in :cite:`Furnohr:2002`. This is ostensibly a port of the R module PPRL's implementation: https://github.com/cran/PPRL/blob/master/src/MTB_Baystat.cpp :cite:`Rukasz:2018`. As such, this could be made more pythonic. :param str src: source string for comparison :param str tar: target string for comparison :param int min_ss_len: minimum substring length to be considered :param int left_ext: left-side extension length :param int right_ext: right-side extension length :returns: the Baystat similarity :rtype: float >>> round(sim_baystat('cat', 'hat'), 12) 0.666666666667 >>> sim_baystat('Niall', 'Neil') 0.4 >>> round(sim_baystat('Colin', 'Cuilen'), 12) 0.166666666667 >>> sim_baystat('ATCG', 'TAGC') 0.0 """ if src == tar: return 1 if not src or not tar: return 0 max_len = max(len(src), len(tar)) if not (min_ss_len and left_ext and right_ext): # These can be set via arguments to the function. Otherwise they are # set automatically based on values from the article. if max_len >= 7: min_ss_len = 2 left_ext = 2 right_ext = 2 else: # The paper suggests that for short names, (exclusively) one or the # other of left_ext and right_ext can be 1, with good results. # I use 0 & 0 as the default in this case. min_ss_len = 1 left_ext = 0 right_ext = 0 pos = 0 match_len = 0 while True: if pos + min_ss_len > len(src): return match_len / max_len hit_len = 0 ix = 1 substring = src[pos : pos + min_ss_len] search_begin = pos - left_ext if search_begin < 0: search_begin = 0 left_ext_len = pos else: left_ext_len = left_ext if pos + min_ss_len + right_ext >= len(tar): right_ext_len = len(tar) - pos - min_ss_len else: right_ext_len = right_ext if ( search_begin + left_ext_len + min_ss_len + right_ext_len > search_begin ): search_val = tar[ search_begin : ( search_begin + left_ext_len + min_ss_len + right_ext_len ) ] else: search_val = '' flagged_tar = '' while substring in search_val and pos + ix <= len(src): hit_len = len(substring) flagged_tar = tar.replace(substring, '#' * hit_len) if pos + min_ss_len + ix <= len(src): substring = src[pos : pos + min_ss_len + ix] if pos + min_ss_len + right_ext_len + 1 <= len(tar): right_ext_len += 1 # The following is unnecessary, I think # if (search_begin + left_ext_len + min_ss_len + right_ext_len <= # len(tar)): search_val = tar[ search_begin : ( search_begin + left_ext_len + min_ss_len + right_ext_len ) ] ix += 1 if hit_len > 0: tar = flagged_tar match_len += hit_len pos += ix
[docs]def dist_baystat(src, tar, min_ss_len=None, left_ext=None, right_ext=None): """Return the Baystat distance. Normalized Baystat similarity is the complement of normalized Baystat distance: :math:`sim_{Baystat} = 1 - dist_{Baystat}`. :param str src: source string for comparison :param str tar: target string for comparison :param int min_ss_len: minimum substring length to be considered :param int left_ext: left-side extension length :param int right_ext: right-side extension length :returns: the Baystat distance :rtype: float >>> round(dist_baystat('cat', 'hat'), 12) 0.333333333333 >>> dist_baystat('Niall', 'Neil') 0.6 >>> round(dist_baystat('Colin', 'Cuilen'), 12) 0.833333333333 >>> dist_baystat('ATCG', 'TAGC') 1.0 """ return 1 - sim_baystat(src, tar, min_ss_len, left_ext, right_ext)
if __name__ == '__main__': import doctest doctest.testmod()