# -*- coding: utf-8 -*-
# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance.
The distance module implements string edit distance functions including:

- Levenshtein distance
- Optimal String Alignment distance
- Levenshtein-Damerau distance
- Hamming distance
- Tversky index
- Sørensen–Dice coefficient & distance
- Jaccard similarity coefficient & distance
- overlap similarity & distance
- Tanimoto coefficient & distance
- Minkowski distance & similarity
- Manhattan distance & similarity
- Euclidean distance & similarity
- Chebyshev distance
- cosine similarity & distance
- Jaro distance
- Jaro-Winkler distance (incl. the strcmp95 algorithm variant)
- Longest common substring
- Ratcliff-Obershelp similarity & distance
- Match Rating Algorithm similarity
- Normalized Compression Distance (NCD) & similarity
- Monge-Elkan similarity & distance
- Matrix similarity
- Needleman-Wunsch score
- Smith-Waterman score
- Gotoh score
- Length similarity
- Prefix, Suffix, and Identity similarity & distance
- Modified Language-Independent Product Name Search (MLIPNS) similarity &
  distance
- Bag similarity & distance
- Editex distance
- Eudex distances
- Sift4 distance
- Baystat distance & similarity
- Typo distance
- Indel distance
- Synoname

Functions beginning with the prefixes 'sim' and 'dist' are guaranteed to be
in the range [0, 1], and sim_X = 1 - dist_X since the two are complements.
If a sim_X function is supplied identical src & tar arguments, it is guaranteed
to return 1; the corresponding dist_X function is guaranteed to return 0.
"""
from __future__ import unicode_literals
from ._basic import (
dist_ident,
dist_length,
dist_prefix,
dist_suffix,
sim_ident,
sim_length,
sim_prefix,
sim_suffix,
)
from ._baystat import dist_baystat, sim_baystat
from ._compression import (
dist_ncd_arith,
dist_ncd_bwtrle,
dist_ncd_bz2,
dist_ncd_lzma,
dist_ncd_rle,
dist_ncd_zlib,
sim_ncd_arith,
sim_ncd_bwtrle,
sim_ncd_bz2,
sim_ncd_lzma,
sim_ncd_rle,
sim_ncd_zlib,
)
from ._editex import dist_editex, editex, sim_editex
from ._eudex import dist_eudex, eudex_hamming, sim_eudex
from ._hamming import (
dist_hamming,
dist_mlipns,
hamming,
sim_hamming,
sim_mlipns,
)
from ._jaro import (
dist_jaro_winkler,
dist_strcmp95,
sim_jaro_winkler,
sim_strcmp95,
)
from ._levenshtein import (
damerau_levenshtein,
dist_damerau,
dist_indel,
dist_levenshtein,
levenshtein,
sim_damerau,
sim_indel,
sim_levenshtein,
)
from ._minkowski import (
chebyshev,
dist_euclidean,
dist_manhattan,
dist_minkowski,
euclidean,
manhattan,
minkowski,
sim_euclidean,
sim_manhattan,
sim_minkowski,
)
from ._mra import dist_mra, mra_compare, sim_mra
from ._seqalign import gotoh, needleman_wunsch, sim_matrix, smith_waterman
from ._sequence import (
dist_lcsseq,
dist_lcsstr,
dist_ratcliff_obershelp,
lcsseq,
lcsstr,
sim_lcsseq,
sim_lcsstr,
sim_ratcliff_obershelp,
)
from ._sift4 import dist_sift4, sift4_common, sift4_simplest, sim_sift4
from ._synoname import synoname
from ._token import (
bag,
dist_bag,
dist_cosine,
dist_dice,
dist_jaccard,
dist_monge_elkan,
dist_overlap,
dist_tversky,
sim_bag,
sim_cosine,
sim_dice,
sim_jaccard,
sim_monge_elkan,
sim_overlap,
sim_tanimoto,
sim_tversky,
tanimoto,
)
from ._typo import dist_typo, sim_typo, typo
# Public names exported by this module: the generic sim/dist dispatchers
# defined in this file, followed by the functions imported from the private
# submodules (grouped below by the submodule each name is imported from).
__all__ = [
    # generic dispatchers (defined in this file)
    'sim',
    'dist',
    # ._levenshtein
    'levenshtein',
    'dist_levenshtein',
    'sim_levenshtein',
    'damerau_levenshtein',
    'dist_damerau',
    'sim_damerau',
    'dist_indel',
    'sim_indel',
    # ._hamming
    'hamming',
    'dist_hamming',
    'sim_hamming',
    # ._jaro
    'dist_jaro_winkler',
    'sim_jaro_winkler',
    'dist_strcmp95',
    'sim_strcmp95',
    # ._minkowski
    'minkowski',
    'dist_minkowski',
    'sim_minkowski',
    'manhattan',
    'dist_manhattan',
    'sim_manhattan',
    'euclidean',
    'dist_euclidean',
    'sim_euclidean',
    'chebyshev',
    # ._token
    'dist_tversky',
    'sim_tversky',
    'dist_dice',
    'sim_dice',
    'dist_jaccard',
    'sim_jaccard',
    'dist_overlap',
    'sim_overlap',
    'tanimoto',
    'sim_tanimoto',
    'dist_cosine',
    'sim_cosine',
    'bag',
    'dist_bag',
    'sim_bag',
    'dist_monge_elkan',
    'sim_monge_elkan',
    # ._seqalign
    'needleman_wunsch',
    'smith_waterman',
    'gotoh',
    'sim_matrix',
    # ._sequence
    'lcsseq',
    'dist_lcsseq',
    'sim_lcsseq',
    'lcsstr',
    'dist_lcsstr',
    'sim_lcsstr',
    'dist_ratcliff_obershelp',
    'sim_ratcliff_obershelp',
    # ._basic
    'dist_ident',
    'sim_ident',
    'dist_length',
    'sim_length',
    'dist_prefix',
    'sim_prefix',
    'dist_suffix',
    'sim_suffix',
    # ._compression
    'dist_ncd_zlib',
    'sim_ncd_zlib',
    'dist_ncd_bz2',
    'sim_ncd_bz2',
    'dist_ncd_lzma',
    'sim_ncd_lzma',
    'dist_ncd_bwtrle',
    'sim_ncd_bwtrle',
    'dist_ncd_rle',
    'sim_ncd_rle',
    'dist_ncd_arith',
    'sim_ncd_arith',
    # ._mra
    'mra_compare',
    'dist_mra',
    'sim_mra',
    # ._editex
    'editex',
    'dist_editex',
    'sim_editex',
    # MLIPNS (imported from ._hamming)
    'dist_mlipns',
    'sim_mlipns',
    # ._baystat
    'dist_baystat',
    'sim_baystat',
    # ._eudex
    'eudex_hamming',
    'dist_eudex',
    'sim_eudex',
    # ._sift4
    'sift4_common',
    'sift4_simplest',
    'dist_sift4',
    'sim_sift4',
    # ._typo
    'typo',
    'dist_typo',
    'sim_typo',
    # ._synoname
    'synoname',
]
def sim(src, tar, method=sim_levenshtein):
    """Return a similarity of two strings.

    This is a generalized function for calling other similarity functions.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param function method: specifies the similarity metric (sim_levenshtein by
        default)
    :returns: similarity according to the specified function
    :rtype: float
    :raises AttributeError: if ``method`` is not callable

    >>> round(sim('cat', 'hat'), 12)
    0.666666666667
    >>> round(sim('Niall', 'Neil'), 12)
    0.4
    >>> sim('aluminum', 'Catalan')
    0.125
    >>> sim('ATCG', 'TAGC')
    0.25
    """
    if callable(method):
        # Delegate directly; the result is whatever the metric returns.
        return method(src, tar)
    else:
        raise AttributeError('Unknown similarity function: ' + str(method))
def dist(src, tar, method=sim_levenshtein):
    """Return a distance between two strings.

    This is a generalized function for calling other distance functions.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param function method: specifies the similarity metric (sim_levenshtein by
        default) -- Note that this takes a similarity metric function, not
        a distance metric function.
    :returns: distance according to the specified function
    :rtype: float
    :raises AttributeError: if ``method`` is not callable

    >>> round(dist('cat', 'hat'), 12)
    0.333333333333
    >>> round(dist('Niall', 'Neil'), 12)
    0.6
    >>> dist('aluminum', 'Catalan')
    0.875
    >>> dist('ATCG', 'TAGC')
    0.75
    """
    if callable(method):
        # method is a *similarity* function, so the distance is its
        # complement.
        return 1 - method(src, tar)
    else:
        raise AttributeError('Unknown distance function: ' + str(method))
if __name__ == '__main__':
    # When executed as a script, run the doctest examples embedded in this
    # module's docstrings.
    import doctest

    doctest.testmod()