Source code for abydos.distance._damerau_levenshtein

# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._damerau_levenshtein.

Damerau-Levenshtein distance
"""

from sys import maxsize

from deprecation import deprecated

from numpy import int as np_int
from numpy import zeros as np_zeros

from ._distance import _Distance
from .. import __version__

__all__ = [
'DamerauLevenshtein',
'damerau_levenshtein',
'dist_damerau',
'sim_damerau',
]

[docs]class DamerauLevenshtein(_Distance):
"""Damerau-Levenshtein distance.

This computes the Damerau-Levenshtein distance :cite:Damerau:1964.
Damerau-Levenshtein code is based on Java code by Kevin L. Stern
:cite:Stern:2014, under the MIT license:
https://github.com/KevinStern/software-and-algorithms/blob/master/src/main/java/blogspot/software_and_algorithms/stern_library/string/DamerauLevenshteinAlgorithm.java
"""

def __init__(self, cost=(1, 1, 1, 1), normalizer=max, **kwargs):
"""Initialize Levenshtein instance.

Parameters
----------
cost : tuple
A 4-tuple representing the cost of the four possible edits:
inserts, deletes, substitutions, and transpositions, respectively
(by default: (1, 1, 1, 1))
normalizer : function
A function that takes an list and computes a normalization term
by which the edit distance is divided (max by default). Another
good option is the sum function.
**kwargs
Arbitrary keyword arguments

"""
super(DamerauLevenshtein, self).__init__(**kwargs)
self._cost = cost
self._normalizer = normalizer

[docs]    def dist_abs(self, src, tar):
"""Return the Damerau-Levenshtein distance between two strings.

Parameters
----------
src : str
Source string for comparison
tar : str
Target string for comparison

Returns
-------
int (may return a float if cost has float values)
The Damerau-Levenshtein distance between src & tar

Raises
------
ValueError
Unsupported cost assignment; the cost of two transpositions must
not be less than the cost of an insert plus a delete.

Examples
--------
>>> cmp = DamerauLevenshtein()
>>> cmp.dist_abs('cat', 'hat')
1
>>> cmp.dist_abs('Niall', 'Neil')
3
>>> cmp.dist_abs('aluminum', 'Catalan')
7
>>> cmp.dist_abs('ATCG', 'TAGC')
2

.. versionchanged:: 0.3.6
Encapsulated in class

"""
ins_cost, del_cost, sub_cost, trans_cost = self._cost

if src == tar:
return 0
if not src:
return len(tar) * ins_cost
if not tar:
return len(src) * del_cost

if 2 * trans_cost < ins_cost + del_cost:
raise ValueError(
'Unsupported cost assignment; the cost of two transpositions '
+ 'must not be less than the cost of an insert plus a delete.'
)

d_mat = np_zeros((len(src), len(tar)), dtype=np_int)

if src[0] != tar[0]:
d_mat[0, 0] = min(sub_cost, ins_cost + del_cost)

src_index_by_character = {src[0]: 0}
for i in range(1, len(src)):
del_distance = d_mat[i - 1, 0] + del_cost
ins_distance = (i + 1) * del_cost + ins_cost
match_distance = i * del_cost + (
0 if src[i] == tar[0] else sub_cost
)
d_mat[i, 0] = min(del_distance, ins_distance, match_distance)

for j in range(1, len(tar)):
del_distance = (j + 1) * ins_cost + del_cost
ins_distance = d_mat[0, j - 1] + ins_cost
match_distance = j * ins_cost + (
0 if src[0] == tar[j] else sub_cost
)
d_mat[0, j] = min(del_distance, ins_distance, match_distance)

for i in range(1, len(src)):
max_src_letter_match_index = 0 if src[i] == tar[0] else -1
for j in range(1, len(tar)):
candidate_swap_index = (
-1
if tar[j] not in src_index_by_character
else src_index_by_character[tar[j]]
)
j_swap = max_src_letter_match_index
del_distance = d_mat[i - 1, j] + del_cost
ins_distance = d_mat[i, j - 1] + ins_cost
match_distance = d_mat[i - 1, j - 1]
if src[i] != tar[j]:
match_distance += sub_cost
else:
max_src_letter_match_index = j

if candidate_swap_index != -1 and j_swap != -1:
i_swap = candidate_swap_index

if i_swap == 0 and j_swap == 0:
pre_swap_cost = 0
else:
pre_swap_cost = d_mat[
max(0, i_swap - 1), max(0, j_swap - 1)
]
swap_distance = (
pre_swap_cost
+ (i - i_swap - 1) * del_cost
+ (j - j_swap - 1) * ins_cost
+ trans_cost
)
else:
swap_distance = maxsize

d_mat[i, j] = min(
del_distance, ins_distance, match_distance, swap_distance
)
src_index_by_character[src[i]] = i

return d_mat[len(src) - 1, len(tar) - 1]

[docs]    def dist(self, src, tar):
"""Return the Damerau-Levenshtein similarity of two strings.

Damerau-Levenshtein distance normalized to the interval [0, 1].

The Damerau-Levenshtein distance is normalized by dividing the
Damerau-Levenshtein distance by the greater of
the number of characters in src times the cost of a delete and
the number of characters in tar times the cost of an insert.
For the case in which all operations have :math:cost = 1, this is
equivalent to the greater of the length of the two strings src & tar.

Parameters
----------
src : str
Source string for comparison
tar : str
Target string for comparison

Returns
-------
float
The normalized Damerau-Levenshtein distance

Examples
--------
>>> cmp = DamerauLevenshtein()
>>> round(cmp.dist('cat', 'hat'), 12)
0.333333333333
>>> round(cmp.dist('Niall', 'Neil'), 12)
0.6
>>> cmp.dist('aluminum', 'Catalan')
0.875
>>> cmp.dist('ATCG', 'TAGC')
0.5

.. versionchanged:: 0.3.6
Encapsulated in class

"""
if src == tar:
return 0.0
ins_cost, del_cost = self._cost[:2]
return self.dist_abs(src, tar) / (
self._normalizer([len(src) * del_cost, len(tar) * ins_cost])
)

[docs]@deprecated(
deprecated_in='0.4.0',
removed_in='0.6.0',
current_version=__version__,
)
def damerau_levenshtein(src, tar, cost=(1, 1, 1, 1)):
"""Return the Damerau-Levenshtein distance between two strings.

This is a wrapper of :py:meth:DamerauLevenshtein.dist_abs.

Parameters
----------
src : str
Source string for comparison
tar : str
Target string for comparison
cost : tuple
A 4-tuple representing the cost of the four possible edits: inserts,
deletes, substitutions, and transpositions, respectively (by default:
(1, 1, 1, 1))

Returns
-------
int (may return a float if cost has float values)
The Damerau-Levenshtein distance between src & tar

Examples
--------
>>> damerau_levenshtein('cat', 'hat')
1
>>> damerau_levenshtein('Niall', 'Neil')
3
>>> damerau_levenshtein('aluminum', 'Catalan')
7
>>> damerau_levenshtein('ATCG', 'TAGC')
2

"""
return DamerauLevenshtein(cost).dist_abs(src, tar)

[docs]@deprecated(
deprecated_in='0.4.0',
removed_in='0.6.0',
current_version=__version__,
)
def dist_damerau(src, tar, cost=(1, 1, 1, 1)):
"""Return the Damerau-Levenshtein similarity of two strings.

This is a wrapper of :py:meth:DamerauLevenshtein.dist.

Parameters
----------
src : str
Source string for comparison
tar : str
Target string for comparison
cost : tuple
A 4-tuple representing the cost of the four possible edits: inserts,
deletes, substitutions, and transpositions, respectively (by default:
(1, 1, 1, 1))

Returns
-------
float
The normalized Damerau-Levenshtein distance

Examples
--------
>>> round(dist_damerau('cat', 'hat'), 12)
0.333333333333
>>> round(dist_damerau('Niall', 'Neil'), 12)
0.6
>>> dist_damerau('aluminum', 'Catalan')
0.875
>>> dist_damerau('ATCG', 'TAGC')
0.5

"""
return DamerauLevenshtein(cost).dist(src, tar)

[docs]@deprecated(
deprecated_in='0.4.0',
removed_in='0.6.0',
current_version=__version__,
)
def sim_damerau(src, tar, cost=(1, 1, 1, 1)):
"""Return the Damerau-Levenshtein similarity of two strings.

This is a wrapper of :py:meth:DamerauLevenshtein.sim.

Parameters
----------
src : str
Source string for comparison
tar : str
Target string for comparison
cost : tuple
A 4-tuple representing the cost of the four possible edits: inserts,
deletes, substitutions, and transpositions, respectively (by default:
(1, 1, 1, 1))

Returns
-------
float
The normalized Damerau-Levenshtein similarity

Examples
--------
>>> round(sim_damerau('cat', 'hat'), 12)
0.666666666667
>>> round(sim_damerau('Niall', 'Neil'), 12)
0.4
>>> sim_damerau('aluminum', 'Catalan')
0.125
>>> sim_damerau('ATCG', 'TAGC')
0.5