# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._shapira_storer_i.
Shapira & Storer I edit distance with block moves, greedy algorithm
"""
from collections import Counter
from numpy import int as np_int
from numpy import zeros as np_zeros
from ._distance import _Distance
from ._lcsstr import LCSstr
__all__ = ['ShapiraStorerI']
[docs]class ShapiraStorerI(_Distance):
"""Shapira & Storer I edit distance with block moves, greedy algorithm.
Shapira & Storer's greedy edit distance :cite:`Shapira:2007` is similar to
Levenshtein edit distance, but with two important distinctions:
- It considers blocks of characters, if they occur in both the source
and target strings, so the edit distance between 'abcab' and 'abc'
is only 1, since the substring 'ab' occurs in both and can be
inserted as a block into 'abc'.
- It allows three edit operations: insert, delete, and move (but not
substitute). Thus the distance between 'abcde' and 'deabc' is only 1
because the block 'abc' can be moved in 1 move operation, rather than
being deleted and inserted in 2 separate operations.
If prime is set to True at initialization, this employs the greedy'
algorithm, which limits replacements of blocks in the two strings to
matching occurrences of the LCS.
.. versionadded:: 0.4.0
"""
_lcs = LCSstr()
def __init__(self, cost=(1, 1), prime=False, **kwargs):
"""Initialize ShapiraStorerI instance.
Parameters
----------
prime : bool
If True, employs the greedy' algorithm rather than greedy
**kwargs
Arbitrary keyword arguments
.. versionadded:: 0.4.0
"""
self._cost = cost
self._prime = prime
super(ShapiraStorerI, self).__init__(**kwargs)
[docs] def dist_abs(self, src, tar):
"""Return the Shapira & Storer I edit distance between two strings.
Parameters
----------
src : str
Source string for comparison
tar : str
Target string for comparison
Returns
-------
int
The Shapira & Storer I edit distance between src & tar
Examples
--------
>>> cmp = ShapiraStorerI()
>>> cmp.dist_abs('cat', 'hat')
2
>>> cmp.dist_abs('Niall', 'Neil')
3
>>> cmp.dist_abs('aluminum', 'Catalan')
9
>>> cmp.dist_abs('ATCG', 'TAGC')
2
.. versionadded:: 0.4.0
"""
alphabet = set(src) | set(tar)
next_char = 'A'
lcs = self._lcs.lcsstr(src, tar)
while len(lcs) > 1:
while next_char in alphabet:
next_char = chr(ord(next_char) + 1)
if self._prime:
count = min(src.count(lcs), tar.count(lcs))
src = src.replace(lcs, next_char, count)
tar = tar.replace(lcs, next_char, count)
else:
src = src.replace(lcs, next_char)
tar = tar.replace(lcs, next_char)
alphabet |= {next_char}
lcs = self._lcs.lcsstr(src, tar)
d = self._edit_with_moves(src, tar)
return d
def _edit_with_moves(self, src, tar):
"""Return the edit distance between two strings using ins, del, & move.
Parameters
----------
src : str
Source string for comparison
tar : str
Target string for comparison
Returns
-------
int
The Levenshtein distance between src & tar
.. versionadded:: 0.4.0
"""
ins_cost, del_cost = self._cost[:2]
if src == tar:
return 0
if not src:
return len(tar) * ins_cost
if not tar:
return len(src) * del_cost
d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int)
for i in range(len(src) + 1):
d_mat[i, 0] = i * del_cost
for j in range(len(tar) + 1):
d_mat[0, j] = j * ins_cost
for i in range(len(src)):
for j in range(len(tar)):
d_mat[i + 1, j + 1] = min(
d_mat[i + 1, j] + ins_cost, # ins
d_mat[i, j + 1] + del_cost, # del
d_mat[i, j]
+ (float('inf') if src[i] != tar[j] else 0), # sub/==
)
distance = d_mat[len(src), len(tar)]
# Do a backtrace on d_mat to discover an optimal path & count
# inserted & deleted characters
i = len(src)
j = len(tar)
inserts = Counter()
deletes = Counter()
while i > 0 and j > 0:
ante = [d_mat[i - 1, j - 1], d_mat[i - 1, j], d_mat[i, j - 1]]
least = ante.index(min(ante))
old_dist = d_mat[i, j]
if least == 0:
i -= 1
j -= 1
if d_mat[i, j] < old_dist:
deletes[src[i]] += 1
inserts[tar[j]] += 1
elif least == 1:
i -= 1
if d_mat[i, j] < old_dist:
deletes[src[i]] += 1
else:
j -= 1
if d_mat[i, j] < old_dist:
inserts[tar[j]] += 1
while i > 0:
i -= 1
if d_mat[i, j] < old_dist:
deletes[src[i]] += 1
while j > 0:
j -= 1
if d_mat[i, j] < old_dist:
inserts[tar[j]] += 1
moves = sum((inserts & deletes).values())
return distance - moves
[docs] def dist(self, src, tar):
"""Return the normalized Shapira & Storer I distance.
Parameters
----------
src : str
Source string for comparison
tar : str
Target string for comparison
Returns
-------
float
The normalized Shapira & Storer I distance between src & tar
Examples
--------
>>> cmp = ShapiraStorerI()
>>> round(cmp.dist('cat', 'hat'), 12)
0.333333333333
>>> round(cmp.dist('Niall', 'Neil'), 12)
0.333333333333
>>> cmp.dist('aluminum', 'Catalan')
0.6
>>> cmp.dist('ATCG', 'TAGC')
0.25
.. versionadded:: 0.4.0
"""
if src == tar:
return 0.0
ins_cost, del_cost = self._cost[:2]
src_len = len(src)
tar_len = len(tar)
return self.dist_abs(src, tar) / (
sum([src_len * del_cost, tar_len * ins_cost])
)
if __name__ == '__main__':
import doctest
doctest.testmod()