# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._editex.
editex
"""
from sys import float_info
from unicodedata import normalize as unicode_normalize
from deprecation import deprecated
from numpy import float as np_float
from numpy import zeros as np_zeros
from ._distance import _Distance
from .. import __version__
__all__ = ['Editex', 'dist_editex', 'editex', 'sim_editex']
[docs]class Editex(_Distance):
"""Editex.
As described on pages 3 & 4 of :cite:`Zobel:1996`.
The local variant is based on :cite:`Ring:2009`.
.. versionadded:: 0.3.6
.. versionchanged:: 0.4.0
Added taper option
"""
_letter_groups = (
frozenset('AEIOUY'),
frozenset('BP'),
frozenset('CKQ'),
frozenset('DT'),
frozenset('LR'),
frozenset('MN'),
frozenset('GJ'),
frozenset('FPV'),
frozenset('SXZ'),
)
_all_letters = frozenset('ABCDEFGIJKLMNOPQRSTUVXYZ')
def __init__(self, cost=(0, 1, 2), local=False, taper=False, **kwargs):
"""Initialize Editex instance.
Parameters
----------
cost : tuple
A 3-tuple representing the cost of the four possible edits: match,
same-group, and mismatch respectively (by default: (0, 1, 2))
local : bool
If True, the local variant of Editex is used
taper : bool
Enables cost tapering. Following :cite:`Zobel:1996`, it causes
edits at the start of the string to "just [exceed] twice the
minimum penalty for replacement or deletion at the end of the
string".
**kwargs
Arbitrary keyword arguments
.. versionadded:: 0.4.0
"""
super(Editex, self).__init__(**kwargs)
self._cost = cost
self._local = local
self._taper_enabled = taper
def _taper(self, pos, length):
return (
round(1 + ((length - pos) / length) * (1 + float_info.epsilon), 15)
if self._taper_enabled
else 1
)
[docs] def dist_abs(self, src, tar):
"""Return the Editex distance between two strings.
Parameters
----------
src : str
Source string for comparison
tar : str
Target string for comparison
Returns
-------
int
Editex distance
Examples
--------
>>> cmp = Editex()
>>> cmp.dist_abs('cat', 'hat')
2
>>> cmp.dist_abs('Niall', 'Neil')
2
>>> cmp.dist_abs('aluminum', 'Catalan')
12
>>> cmp.dist_abs('ATCG', 'TAGC')
6
.. versionadded:: 0.1.0
.. versionchanged:: 0.3.6
Encapsulated in class
"""
match_cost, group_cost, mismatch_cost = self._cost
def r_cost(ch1, ch2):
"""Return r(a,b) according to Zobel & Dart's definition.
Parameters
----------
ch1 : str
The first character to compare
ch2 : str
The second character to compare
Returns
-------
int
r(a,b) according to Zobel & Dart's definition
.. versionadded:: 0.1.0
"""
if ch1 == ch2:
return match_cost
if ch1 in self._all_letters and ch2 in self._all_letters:
for group in self._letter_groups:
if ch1 in group and ch2 in group:
return group_cost
return mismatch_cost
def d_cost(ch1, ch2):
"""Return d(a,b) according to Zobel & Dart's definition.
Parameters
----------
ch1 : str
The first character to compare
ch2 : str
The second character to compare
Returns
-------
int
d(a,b) according to Zobel & Dart's definition
.. versionadded:: 0.1.0
"""
if ch1 != ch2 and (ch1 == 'H' or ch1 == 'W'):
return group_cost
return r_cost(ch1, ch2)
# convert both src & tar to NFKD normalized unicode
src = unicode_normalize('NFKD', src.upper())
tar = unicode_normalize('NFKD', tar.upper())
src_len = len(src)
tar_len = len(tar)
max_len = max(src_len, tar_len)
if src == tar:
return 0.0
if not src:
return sum(
mismatch_cost * self._taper(pos, max_len)
for pos in range(tar_len)
)
if not tar:
return sum(
mismatch_cost * self._taper(pos, max_len)
for pos in range(src_len)
)
d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float)
src = ' ' + src
tar = ' ' + tar
if not self._local:
for i in range(1, src_len + 1):
d_mat[i, 0] = d_mat[i - 1, 0] + d_cost(
src[i - 1], src[i]
) * self._taper(i, max_len)
for j in range(1, tar_len + 1):
d_mat[0, j] = d_mat[0, j - 1] + d_cost(
tar[j - 1], tar[j]
) * self._taper(j, max_len)
for i in range(1, src_len + 1):
for j in range(1, tar_len + 1):
d_mat[i, j] = min(
d_mat[i - 1, j]
+ d_cost(src[i - 1], src[i])
* self._taper(max(i, j), max_len),
d_mat[i, j - 1]
+ d_cost(tar[j - 1], tar[j])
* self._taper(max(i, j), max_len),
d_mat[i - 1, j - 1]
+ r_cost(src[i], tar[j]) * self._taper(max(i, j), max_len),
)
if int(d_mat[src_len, tar_len]) == d_mat[src_len, tar_len]:
return int(d_mat[src_len, tar_len])
else:
return d_mat[src_len, tar_len]
[docs] def dist(self, src, tar):
"""Return the normalized Editex distance between two strings.
The Editex distance is normalized by dividing the Editex distance
(calculated by any of the three supported methods) by the greater of
the number of characters in src times the cost of a delete and
the number of characters in tar times the cost of an insert.
For the case in which all operations have :math:`cost = 1`, this is
equivalent to the greater of the length of the two strings src & tar.
Parameters
----------
src : str
Source string for comparison
tar : str
Target string for comparison
Returns
-------
int
Normalized Editex distance
Examples
--------
>>> cmp = Editex()
>>> round(cmp.dist('cat', 'hat'), 12)
0.333333333333
>>> round(cmp.dist('Niall', 'Neil'), 12)
0.2
>>> cmp.dist('aluminum', 'Catalan')
0.75
>>> cmp.dist('ATCG', 'TAGC')
0.75
.. versionadded:: 0.1.0
.. versionchanged:: 0.3.6
Encapsulated in class
"""
if src == tar:
return 0.0
match_cost, group_cost, mismatch_cost = self._cost
src_len = len(src)
tar_len = len(tar)
if self._taper_enabled:
normalize_term = max(
[
sum(
self._taper(pos, src_len) * mismatch_cost
for pos in range(src_len)
),
sum(
self._taper(pos, tar_len) * mismatch_cost
for pos in range(tar_len)
),
]
)
else:
normalize_term = max(
src_len * mismatch_cost, tar_len * mismatch_cost
)
return self.dist_abs(src, tar) / normalize_term
[docs]@deprecated(
deprecated_in='0.4.0',
removed_in='0.6.0',
current_version=__version__,
details='Use the Editex.dist_abs method instead.',
)
def editex(src, tar, cost=(0, 1, 2), local=False):
"""Return the Editex distance between two strings.
This is a wrapper for :py:meth:`Editex.dist_abs`.
Parameters
----------
src : str
Source string for comparison
tar : str
Target string for comparison
cost : tuple
A 3-tuple representing the cost of the four possible edits: match,
same-group, and mismatch respectively (by default: (0, 1, 2))
local : bool
If True, the local variant of Editex is used
Returns
-------
int
Editex distance
Examples
--------
>>> editex('cat', 'hat')
2
>>> editex('Niall', 'Neil')
2
>>> editex('aluminum', 'Catalan')
12
>>> editex('ATCG', 'TAGC')
6
.. versionadded:: 0.1.0
"""
return Editex(cost, local).dist_abs(src, tar)
[docs]@deprecated(
deprecated_in='0.4.0',
removed_in='0.6.0',
current_version=__version__,
details='Use the Editex.dist method instead.',
)
def dist_editex(src, tar, cost=(0, 1, 2), local=False):
"""Return the normalized Editex distance between two strings.
This is a wrapper for :py:meth:`Editex.dist`.
Parameters
----------
src : str
Source string for comparison
tar : str
Target string for comparison
cost : tuple
A 3-tuple representing the cost of the four possible edits: match,
same-group, and mismatch respectively (by default: (0, 1, 2))
local : bool
If True, the local variant of Editex is used
Returns
-------
int
Normalized Editex distance
Examples
--------
>>> round(dist_editex('cat', 'hat'), 12)
0.333333333333
>>> round(dist_editex('Niall', 'Neil'), 12)
0.2
>>> dist_editex('aluminum', 'Catalan')
0.75
>>> dist_editex('ATCG', 'TAGC')
0.75
.. versionadded:: 0.1.0
"""
return Editex(cost, local).dist(src, tar)
[docs]@deprecated(
deprecated_in='0.4.0',
removed_in='0.6.0',
current_version=__version__,
details='Use the Editex.sim method instead.',
)
def sim_editex(src, tar, cost=(0, 1, 2), local=False):
"""Return the normalized Editex similarity of two strings.
This is a wrapper for :py:meth:`Editex.sim`.
Parameters
----------
src : str
Source string for comparison
tar : str
Target string for comparison
cost : tuple
A 3-tuple representing the cost of the four possible edits: match,
same-group, and mismatch respectively (by default: (0, 1, 2))
local : bool
If True, the local variant of Editex is used
Returns
-------
int
Normalized Editex similarity
Examples
--------
>>> round(sim_editex('cat', 'hat'), 12)
0.666666666667
>>> round(sim_editex('Niall', 'Neil'), 12)
0.8
>>> sim_editex('aluminum', 'Catalan')
0.25
>>> sim_editex('ATCG', 'TAGC')
0.25
.. versionadded:: 0.1.0
"""
return Editex(cost, local).sim(src, tar)
if __name__ == '__main__':
import doctest
doctest.testmod()