# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._aline.
ALINE alignment, similarity, and distance
"""
from copy import deepcopy
from numpy import NINF
from numpy import float as np_float
from numpy import zeros as np_zeros
from ._distance import _Distance
__all__ = ['ALINE']
class ALINE(_Distance):
    r"""ALINE alignment, similarity, and distance.

    ALINE alignment was developed by
    :cite:`Kondrak:2000,Kondrak:2002,Downey:2008`, and establishes an
    alignment algorithm based on multivalued phonetic features and feature
    salience weights. Along with the alignment itself, the algorithm produces
    a term similarity score.

    :cite:`Downey:2008` develops ALINE's similarity score into a similarity
    measure & distance measure:

        .. math::

            sim_{ALINE} = \frac{2 \cdot score_{ALINE}(src, tar)}
            {score_{ALINE}(src, src) + score_{ALINE}(tar, tar)}

    However, because the average of the two self-similarity scores is not
    guaranteed to be greater than or equal to the similarity score between
    the two strings, by default, this formula is not used here in order to
    guarantee that the similarity measure is bounded to [0, 1]. Instead,
    Kondrak's similarity measure is employed:

        .. math::

            sim_{ALINE} = \frac{score_{ALINE}(src, tar)}
            {max(score_{ALINE}(src, src), score_{ALINE}(tar, tar))}

    .. versionadded:: 0.4.0
    """
# Feature tables. They are mostly copied from NLTK's implementation
# (https://www.nltk.org/_modules/nltk/metrics/aline.html), with the values
# restored, as far as possible, to the reference values supplied in
# Kondrak's paper.

# Numeric weight of every feature value that a phone description may use.
feature_weights = {
    # place
    'bilabial': 1.0, 'labiodental': 0.95, 'dental': 0.9,
    'alveolar': 0.85, 'retroflex': 0.8, 'palato-alveolar': 0.75,
    'palatal': 0.7, 'velar': 0.6, 'uvular': 0.5,
    'pharyngeal': 0.3, 'glottal': 0.1,
    # manner ('trill' and 'tap' are not in the original)
    'stop': 1.0, 'affricate': 0.9, 'fricative': 0.8,
    'approximant': 0.6, 'trill': 0.55, 'tap': 0.5,
    'high vowel': 0.4, 'mid vowel': 0.2, 'low vowel': 0.0,
    # high
    'high': 1.0, 'mid': 0.5, 'low': 0.0,
    # back
    'front': 1.0, 'central': 0.5, 'back': 0.0,
    # binary features
    'plus': 1.0, 'minus': 0.0,
}

# Features that are compared when both segments are vowels.
v_features = {
    'back', 'high', 'long', 'nasal', 'retroflex', 'round', 'syllabic',
}

# Features that are compared when at least one segment is a consonant.
c_features = {
    'aspirated', 'lateral', 'manner', 'nasal',
    'place', 'retroflex', 'syllabic', 'voice',
}

# Salience (multiplier) applied to the difference in each feature.
salience = dict(
    syllabic=5,
    voice=10,
    lateral=10,
    high=5,
    manner=50,
    long=1,
    place=40,
    nasal=10,
    aspirated=5,
    back=5,
    retroflex=10,
    round=5,
)
phones_ipa = dict(
    # Consonants, one row each: (symbol, place, manner, features set to
    # 'plus'). Every consonant is 'minus' for syllabic and aspirated, and
    # 'minus' for voice/nasal/retroflex/lateral unless the row lists it.
    [
        (
            symbol,
            {
                'place': place,
                'manner': manner,
                'syllabic': 'minus',
                'voice': 'plus' if 'voice' in marked else 'minus',
                'nasal': 'plus' if 'nasal' in marked else 'minus',
                'retroflex': 'plus' if 'retroflex' in marked else 'minus',
                'lateral': 'plus' if 'lateral' in marked else 'minus',
                'aspirated': 'minus',
            },
        )
        for symbol, place, manner, marked in (
            # stops
            ('p', 'bilabial', 'stop', ()),
            ('b', 'bilabial', 'stop', ('voice',)),
            ('t', 'alveolar', 'stop', ()),
            ('d', 'alveolar', 'stop', ('voice',)),
            ('ʈ', 'retroflex', 'stop', ('retroflex',)),
            ('ɖ', 'retroflex', 'stop', ('voice', 'retroflex')),
            ('c', 'palatal', 'stop', ()),
            ('ɟ', 'palatal', 'stop', ('voice',)),
            ('k', 'velar', 'stop', ()),
            ('g', 'velar', 'stop', ('voice',)),
            ('q', 'uvular', 'stop', ()),
            ('ɢ', 'uvular', 'stop', ('voice',)),
            ('ʔ', 'glottal', 'stop', ()),
            # nasals (encoded with manner 'stop')
            ('m', 'bilabial', 'stop', ('voice', 'nasal')),
            ('ɱ', 'labiodental', 'stop', ('voice', 'nasal')),
            ('n', 'alveolar', 'stop', ('voice', 'nasal')),
            ('ɳ', 'retroflex', 'stop', ('voice', 'nasal', 'retroflex')),
            ('ɲ', 'palatal', 'stop', ('voice', 'nasal')),
            ('ŋ', 'velar', 'stop', ('voice', 'nasal')),
            ('ɴ', 'uvular', 'stop', ('voice', 'nasal')),
            # trills and taps
            ('ʙ', 'bilabial', 'trill', ('voice',)),
            ('r', 'alveolar', 'trill', ('voice', 'retroflex')),
            ('ʀ', 'uvular', 'trill', ('voice',)),
            ('ɾ', 'alveolar', 'tap', ('voice',)),
            ('ɽ', 'retroflex', 'tap', ('voice', 'retroflex')),
            # fricatives
            ('ɸ', 'bilabial', 'fricative', ()),
            ('β', 'bilabial', 'fricative', ('voice',)),
            ('f', 'labiodental', 'fricative', ()),
            ('v', 'labiodental', 'fricative', ('voice',)),
            ('θ', 'dental', 'fricative', ()),
            ('ð', 'dental', 'fricative', ('voice',)),
            ('s', 'alveolar', 'fricative', ()),
            ('z', 'alveolar', 'fricative', ('voice',)),
            ('ʃ', 'palato-alveolar', 'fricative', ()),
            ('ʒ', 'palato-alveolar', 'fricative', ('voice',)),
            ('ʂ', 'retroflex', 'fricative', ('retroflex',)),
            ('ʐ', 'retroflex', 'fricative', ('voice', 'retroflex')),
            ('ç', 'palatal', 'fricative', ()),
            ('ʝ', 'palatal', 'fricative', ('voice',)),
            ('x', 'velar', 'fricative', ()),
            ('ɣ', 'velar', 'fricative', ('voice',)),
            ('χ', 'uvular', 'fricative', ()),
            ('ʁ', 'uvular', 'fricative', ('voice',)),
            ('ħ', 'pharyngeal', 'fricative', ()),
            ('ʕ', 'pharyngeal', 'fricative', ('voice',)),
            ('h', 'glottal', 'fricative', ()),
            ('ɦ', 'glottal', 'fricative', ('voice',)),
            ('ɬ', 'alveolar', 'fricative', ('lateral',)),
            ('ɮ', 'alveolar', 'fricative', ('voice', 'lateral')),
            # approximants
            ('ʋ', 'labiodental', 'approximant', ('voice',)),
            ('ɹ', 'alveolar', 'approximant', ('voice',)),
            ('ɻ', 'retroflex', 'approximant', ('voice', 'retroflex')),
            ('j', 'palatal', 'approximant', ('voice',)),
            ('ɰ', 'velar', 'approximant', ('voice',)),
            ('l', 'alveolar', 'approximant', ('voice', 'lateral')),
            ('w', 'velar', 'approximant', ('voice',)),
        )
    ]
    # Vowels, one row each: (symbol, height, backness, rounding). The manner
    # is always '<height> vowel'; every vowel is syllabic and voiced, and
    # 'minus' for nasal, retroflex, lateral, long and aspirated.
    + [
        (
            symbol,
            {
                'manner': height + ' vowel',
                'syllabic': 'plus',
                'voice': 'plus',
                'nasal': 'minus',
                'retroflex': 'minus',
                'lateral': 'minus',
                'high': height,
                'back': backness,
                'round': rounding,
                'long': 'minus',
                'aspirated': 'minus',
            },
        )
        for symbol, height, backness, rounding in (
            ('i', 'high', 'front', 'minus'),
            ('y', 'high', 'front', 'plus'),
            ('e', 'mid', 'front', 'minus'),
            ('ø', 'mid', 'front', 'plus'),
            ('ɛ', 'mid', 'front', 'minus'),
            ('œ', 'mid', 'front', 'plus'),
            ('æ', 'low', 'front', 'minus'),
            ('a', 'low', 'front', 'minus'),
            ('ɨ', 'high', 'central', 'minus'),
            ('ʉ', 'high', 'central', 'plus'),
            ('ə', 'mid', 'central', 'minus'),
            ('u', 'high', 'back', 'plus'),
            ('o', 'mid', 'back', 'plus'),
            ('ɔ', 'mid', 'back', 'plus'),
            ('ɒ', 'low', 'back', 'minus'),
        )
    ]
    # Supplemental symbols: they modify the preceding segment instead of
    # standing for a segment of their own.
    + [
        ('ː', {'long': 'plus', 'supplemental': True}),
        ('ʰ', {'aspirated': 'plus', 'supplemental': True}),
    ]
)
# 'w' is the only entry that also carries a 'double' place value.
phones_ipa['w']['double'] = 'bilabial'
phones_kondrak = dict(
    # Base symbols, one row each: (symbol, place, manner, binary features
    # set to 'plus', extra (feature, value) pairs). The binary features
    # syllabic/voice/nasal/retroflex/lateral are 'minus' unless listed; only
    # vowel-manner rows carry the extra high/back/round entries.
    [
        (
            symbol,
            dict(
                [('place', place), ('manner', manner)]
                + [
                    (name, 'plus' if name in marked else 'minus')
                    for name in (
                        'syllabic',
                        'voice',
                        'nasal',
                        'retroflex',
                        'lateral',
                    )
                ]
                + list(extras)
            ),
        )
        for symbol, place, manner, marked, extras in (
            ('a', 'velar', 'low vowel', ('syllabic', 'voice'),
             (('high', 'low'), ('back', 'central'), ('round', 'minus'))),
            ('b', 'bilabial', 'stop', ('voice',), ()),
            ('c', 'alveolar', 'affricate', (), ()),
            ('d', 'alveolar', 'stop', ('voice',), ()),
            ('e', 'palatal', 'mid vowel', ('syllabic', 'voice'),
             (('high', 'mid'), ('back', 'front'), ('round', 'minus'))),
            ('f', 'labiodental', 'fricative', (), ()),
            ('g', 'velar', 'stop', ('voice',), ()),
            ('h', 'glottal', 'fricative', (), ()),
            ('i', 'palatal', 'high vowel', ('syllabic', 'voice'),
             (('high', 'high'), ('back', 'front'), ('round', 'plus'))),
            ('j', 'alveolar', 'affricate', ('voice',), ()),
            ('k', 'velar', 'stop', (), ()),
            ('l', 'alveolar', 'approximant', ('voice', 'lateral'), ()),
            ('m', 'bilabial', 'stop', ('voice', 'nasal'), ()),
            ('n', 'alveolar', 'stop', ('voice', 'nasal'), ()),
            ('o', 'velar', 'mid vowel', ('syllabic', 'voice'),
             (('high', 'mid'), ('back', 'back'), ('round', 'plus'))),
            ('p', 'bilabial', 'stop', (), ()),
            ('q', 'glottal', 'stop', (), ()),
            ('r', 'retroflex', 'approximant', ('voice', 'retroflex'), ()),
            ('s', 'alveolar', 'fricative', (), ()),
            ('t', 'alveolar', 'stop', (), ()),
            ('u', 'velar', 'high vowel', ('syllabic', 'voice'),
             (('high', 'high'), ('back', 'back'), ('round', 'plus'))),
            # NOTE(review): 'v' is the only non-vowel row marked syllabic;
            # confirm against Kondrak's reference table.
            ('v', 'labiodental', 'fricative', ('syllabic', 'voice'), ()),
            ('w', 'velar', 'high vowel', ('syllabic', 'voice'),
             (('high', 'high'), ('back', 'back'), ('round', 'plus'),
              ('double', 'bilabial'))),
            ('x', 'velar', 'fricative', (), ()),
            ('y', 'velar', 'high vowel', ('syllabic', 'voice'),
             (('high', 'high'), ('back', 'front'), ('round', 'minus'))),
            ('z', 'alveolar', 'fricative', ('voice',), ()),
        )
    ]
    # Supplemental (upper-case) symbols: each overrides one feature of the
    # preceding base symbol.
    + [
        ('A', {'aspirated': 'plus', 'supplemental': True}),
        ('B', {'back': 'back', 'supplemental': True}),
        ('C', {'back': 'central', 'supplemental': True}),
        ('D', {'place': 'dental', 'supplemental': True}),
        ('F', {'back': 'front', 'supplemental': True}),
        ('H', {'long': 'plus', 'supplemental': True}),
        ('N', {'nasal': 'plus', 'supplemental': True}),
        ('P', {'place': 'palatal', 'supplemental': True}),
        ('R', {'round': 'plus', 'supplemental': True}),
        ('S', {'manner': 'fricative', 'supplemental': True}),
        ('V', {'place': 'palato-alveolar', 'supplemental': True}),
    ]
)
def __init__(
    self,
    epsilon=0,
    c_skip=-10,
    c_sub=35,
    c_exp=45,
    c_vwl=10,
    mode='local',
    phones='aline',
    normalizer=max,
    **kwargs
):
    """Initialize ALINE instance.

    Parameters
    ----------
    epsilon : float
        The portion (out of 1.0) of the maximum ALINE score, above which
        alignments are returned. If set to 0, only the alignments matching
        the maximum alignment score are returned. If set to 1, all
        alignments scoring 0 or higher are returned.
    c_skip : int
        The score of an insertion or deletion
    c_sub : int
        The maximum score of a substitution
    c_exp : int
        The maximum score of an expansion or contraction
    c_vwl : int
        The penalty subtracted for each vowel taking part in a
        substitution, expansion, or contraction
    mode : str
        Alignment mode, which can be ``local`` (default), ``global``,
        ``half-local``, or ``semi-global``. Any other value silently falls
        back to ``local``.
    phones : str
        Phonetic symbol set, which can be:
            - ``aline`` selects Kondrak's original symbols set
            - ``ipa`` selects IPA symbols

        Any value other than ``ipa`` selects Kondrak's symbol set.
    normalizer : function
        A function that takes a list and computes a normalization term
        by which the similarity score is divided (max by default). For the
        normalization proposed by Downey, et al. (2008), set this to:
        ``lambda x: sum(x)/len(x)``
    **kwargs
        Arbitrary keyword arguments


    .. versionadded:: 0.4.0

    """
    super(ALINE, self).__init__(**kwargs)
    self._epsilon = epsilon
    self._c_skip = c_skip
    self._c_sub = c_sub
    self._c_exp = c_exp
    self._c_vwl = c_vwl

    # Unrecognized modes are not an error: they degrade to local alignment.
    if mode not in {'local', 'global', 'half-local', 'semi-global'}:
        mode = 'local'
    self._mode = mode

    # Only 'ipa' switches tables; everything else uses Kondrak's symbols.
    if phones == 'ipa':
        self._phones = self.phones_ipa
    else:
        self._phones = self.phones_kondrak

    self._normalizer = normalizer
def alignment(self, src, tar):
    """Return the top ALINE alignment of two strings.

    The `top` ALINE alignment is the first alignment with the best score.
    The purpose of this function is to have a single tuple as a return
    value.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison

    Returns
    -------
    tuple(float, str, str)
        ALINE alignment and its score. When no alignment exists (for
        example, when either string is empty or contains no symbol of the
        selected phone set), ``(0.0, '', '')`` is returned.

    Examples
    --------
    >>> cmp = ALINE()
    >>> cmp.alignment('cat', 'hat')
    (50.0, 'c ‖ a t ‖', 'h ‖ a t ‖')
    >>> cmp.alignment('niall', 'neil')
    (90.0, '‖ n i a ll ‖', '‖ n e i l ‖')
    >>> cmp.alignment('aluminum', 'catalan')
    (81.5, '‖ a l u m ‖ inum', 'cat ‖ a l a n ‖')
    >>> cmp.alignment('atcg', 'tagc')
    (65.0, '‖ a t c ‖ g', 't ‖ a g c ‖')


    .. versionadded:: 0.4.1

    """
    candidates = self.alignments(src, tar)
    if not candidates:
        # Nothing could be aligned; indexing the empty list would raise
        # IndexError, so report an empty alignment with a zero score.
        return (0.0, '', '')
    return candidates[0]
def alignments(self, src, tar, score_only=False):
    """Return the ALINE alignments of two strings.

    Symbols that are not in the selected phone table are ignored, and
    supplemental symbols are folded into the segment that precedes them.

    In ``semi-global`` mode the score is the best value on the last row or
    last column of the score matrix, i.e. among the cells from which
    alignments are actually retrieved.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison
    score_only : bool
        Return the score only, not the alignments

    Returns
    -------
    list(tuple(float, str, str)) or float
        ALINE alignments and their scores, best first (the list is empty
        when nothing can be aligned), or only the top score if
        ``score_only`` is set

    Examples
    --------
    >>> cmp = ALINE()
    >>> cmp.alignments('cat', 'hat')
    [(50.0, 'c ‖ a t ‖', 'h ‖ a t ‖')]
    >>> cmp.alignments('niall', 'neil')
    [(90.0, '‖ n i a ll ‖', '‖ n e i l ‖')]
    >>> cmp.alignments('aluminum', 'catalan')
    [(81.5, '‖ a l u m ‖ inum', 'cat ‖ a l a n ‖')]
    >>> cmp.alignments('atcg', 'tagc')
    [(65.0, '‖ a t c ‖ g', 't ‖ a g c ‖'), (65.0, 'a ‖ tc - g ‖',
    '‖ t a g ‖ c')]


    .. versionadded:: 0.4.0
    .. versionchanged:: 0.4.1
        Renamed from .alignment to .alignments

    """
    mode = self._mode
    skip = self._c_skip
    vowel_limit = self.feature_weights['high vowel']
    # Plain Python equivalents of numpy.NINF / numpy.float, which newer
    # NumPy releases no longer provide.
    neg_inf = float('-inf')

    def _sig_vwl(seg):
        # Segments whose manner weight does not exceed 'high vowel' are
        # vowels and carry the vowel penalty.
        return 0.0 if seg['manner'] > vowel_limit else self._c_vwl

    def _delta(seg1, seg2):
        # Salience-weighted feature difference; consonant features are
        # used as soon as either segment is a consonant.
        if max(seg1['manner'], seg2['manner']) > vowel_limit:
            features = self.c_features
        else:
            features = self.v_features
        diff = 0.0
        for feat in features:
            diff += (
                abs(seg1.get(feat, 0.0) - seg2.get(feat, 0.0))
                * self.salience[feat]
            )
        return diff

    def _sig_sub(seg1, seg2):
        return (
            self._c_sub
            - _delta(seg1, seg2)
            - _sig_vwl(seg1)
            - _sig_vwl(seg2)
        )

    def _sig_exp(seg1, seg2a, seg2b):
        return (
            self._c_exp
            - _delta(seg1, seg2a)
            - _delta(seg1, seg2b)
            - _sig_vwl(seg1)
            - max(_sig_vwl(seg2a), _sig_vwl(seg2b))
        )

    def _featurize(word):
        # Turn a string into a list of feature dicts with numeric weights.
        segments = []
        for symbol in word:
            if symbol not in self._phones:
                continue  # unknown symbols are dropped
            phone = self._phones[symbol]
            if 'supplemental' in phone:
                # A supplemental symbol modifies the nearest preceding
                # base segment; with no such segment it is discarded.
                if segments:
                    base = segments[-1]
                    base['segment'] += symbol
                    for key, value in phone.items():
                        if key != 'supplemental':
                            base[key] = value
                continue
            seg = dict(phone)
            seg['segment'] = symbol
            segments.append(seg)
        for seg in segments:
            for key in seg:
                if key != 'segment':
                    seg[key] = self.feature_weights[seg[key]]
        return segments

    def _record(i, j, score, out):
        # Finish one alignment whose traceback stopped at cell (i, j) and
        # store it in ``found``. ``out`` is in reverse (end-to-start) order.
        out = out + [('‖', '‖')]
        # Whatever precedes cell (i, j) is the unaligned leading part.
        out.extend((src[i1]['segment'], '') for i1 in range(i - 1, -1, -1))
        out.extend(('', tar[j1]['segment']) for j1 in range(j - 1, -1, -1))
        if mode == 'global':
            score += (i + j) * skip
        out = out[::-1]
        out.append(('‖', '‖'))

        src_parts = []
        tar_parts = []
        # The ‖ markers toggle between unaligned stretches (concatenated
        # strings) and the aligned stretch (space-separated, padded cells).
        aligned = False
        s_buf = ''
        t_buf = ''
        for s_sym, t_sym in out:
            if s_sym == '‖':
                if aligned:
                    src_parts.append(' '.join(s_buf))
                    tar_parts.append(' '.join(t_buf))
                    s_buf = ''
                    t_buf = ''
                else:
                    src_parts.append(s_buf)
                    tar_parts.append(t_buf)
                    s_buf = []
                    t_buf = []
                aligned = not aligned
            elif aligned:
                s_buf.append(s_sym + ' ' * (len(t_sym) - len(s_sym)))
                t_buf.append(t_sym + ' ' * (len(s_sym) - len(t_sym)))
            else:
                s_buf += s_sym
                t_buf += t_sym

        found.append(
            (
                score,
                ' ‖ '.join(src_parts).strip(),
                ' ‖ '.join(tar_parts).strip(),
            )
        )

    def _retrieve(i, j, score, out):
        # Recursive traceback from cell (i, j); every move that can still
        # reach ``threshold`` is followed, so ties yield several results.
        if s_mat[i, j] == 0:
            _record(i, j, score, out)
            return

        # substitution
        if i > 0 and j > 0:
            gain = _sig_sub(src[i - 1], tar[j - 1])
            if s_mat[i - 1, j - 1] + gain + score >= threshold:
                pair = (src[i - 1]['segment'], tar[j - 1]['segment'])
                _retrieve(i - 1, j - 1, score + gain, out + [pair])

        # skip a target segment
        if j > 0 and s_mat[i, j - 1] + skip + score >= threshold:
            pair = ('-', tar[j - 1]['segment'])
            _retrieve(i, j - 1, score + skip, out + [pair])

        # expansion: one source segment against two target segments
        if i > 0 and j > 1:
            gain = _sig_exp(src[i - 1], tar[j - 2], tar[j - 1])
            if s_mat[i - 1, j - 2] + gain + score >= threshold:
                pair = (
                    src[i - 1]['segment'],
                    tar[j - 2]['segment'] + tar[j - 1]['segment'],
                )
                _retrieve(i - 1, j - 2, score + gain, out + [pair])

        # skip a source segment
        if i > 0 and s_mat[i - 1, j] + skip + score >= threshold:
            pair = (src[i - 1]['segment'], '-')
            _retrieve(i - 1, j, score + skip, out + [pair])

        # contraction: two source segments against one target segment
        if i > 1 and j > 0:
            gain = _sig_exp(tar[j - 1], src[i - 2], src[i - 1])
            if s_mat[i - 2, j - 1] + gain + score >= threshold:
                pair = (
                    src[i - 2]['segment'] + src[i - 1]['segment'],
                    tar[j - 1]['segment'],
                )
                _retrieve(i - 2, j - 1, score + gain, out + [pair])

    src = _featurize(src)
    tar = _featurize(tar)
    src_len = len(src)
    tar_len = len(tar)

    s_mat = np_zeros((src_len + 1, tar_len + 1), dtype=float)

    if mode == 'global':
        # Only global alignment charges for leading skips.
        for i in range(1, src_len + 1):
            s_mat[i, 0] = s_mat[i - 1, 0] + skip
        for j in range(1, tar_len + 1):
            s_mat[0, j] = s_mat[0, j - 1] + skip

    # Local and half-local alignments may restart from zero at any cell.
    floor = 0 if mode in {'local', 'half-local'} else neg_inf
    for i in range(1, src_len + 1):
        for j in range(1, tar_len + 1):
            s_mat[i, j] = max(
                s_mat[i - 1, j] + skip,
                s_mat[i, j - 1] + skip,
                s_mat[i - 1, j - 1] + _sig_sub(src[i - 1], tar[j - 1]),
                s_mat[i - 1, j - 2]
                + _sig_exp(src[i - 1], tar[j - 2], tar[j - 1])
                if j > 1
                else neg_inf,
                s_mat[i - 2, j - 1]
                + _sig_exp(tar[j - 1], src[i - 2], src[i - 1])
                if i > 1
                else neg_inf,
                floor,
            )

    if mode in {'global', 'half-local'}:
        dp_score = s_mat[src_len, tar_len]
    elif mode == 'semi-global':
        # A semi-global alignment has to end on the last row or column, so
        # the score is the best value found there, not the overall maximum
        # (an interior maximum has no retrievable alignment).
        dp_score = max(s_mat[src_len, :].max(), s_mat[:, tar_len].max())
    else:
        dp_score = s_mat.max()

    if score_only:
        return dp_score

    threshold = (1 - self._epsilon) * dp_score

    found = []
    for i in range(1, src_len + 1):
        for j in range(1, tar_len + 1):
            # Restrict the end cells according to the alignment mode.
            if mode in {'global', 'half-local'} and (
                i < src_len or j < tar_len
            ):
                continue
            if mode == 'semi-global' and (i < src_len and j < tar_len):
                continue
            if s_mat[i, j] >= threshold:
                # Segments after cell (i, j) form the unaligned tail.
                out = [
                    ('', tar[j1]['segment'])
                    for j1 in range(tar_len - 1, j - 1, -1)
                ]
                out.extend(
                    (src[i1]['segment'], '')
                    for i1 in range(src_len - 1, i - 1, -1)
                )
                out.append(('‖', '‖'))
                _retrieve(i, j, 0, out)

    # Stable sort: equally scored alignments keep their discovery order.
    return sorted(found, key=lambda entry: entry[0], reverse=True)
def sim_score(self, src, tar):
    """Return the ALINE alignment score of two strings.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison

    Returns
    -------
    float
        ALINE alignment score; two empty strings score 1.0 by definition

    Examples
    --------
    >>> cmp = ALINE()
    >>> cmp.sim_score('cat', 'hat')
    50.0
    >>> cmp.sim_score('niall', 'neil')
    90.0
    >>> cmp.sim_score('aluminum', 'catalan')
    81.5
    >>> cmp.sim_score('atcg', 'tagc')
    65.0


    .. versionadded:: 0.4.0

    """
    # Two empty strings are treated as a perfect match rather than being
    # sent through the alignment.
    if src == '' and tar == '':
        return 1.0
    return self.alignments(src, tar, score_only=True)
def sim(self, src, tar):
    """Return the normalized ALINE similarity of two strings.

    The alignment score of ``src`` and ``tar`` is divided by the
    normalizer applied to the two self-alignment scores.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison

    Returns
    -------
    float
        Normalized ALINE similarity; 0.0 when the alignment score is zero

    Examples
    --------
    >>> cmp = ALINE()
    >>> cmp.sim('cat', 'hat')
    0.5882352941176471
    >>> cmp.sim('niall', 'neil')
    0.6666666666666666
    >>> cmp.sim('aluminum', 'catalan')
    0.4075
    >>> cmp.sim('atcg', 'tagc')
    0.5416666666666666


    .. versionadded:: 0.4.0

    """
    num = self.sim_score(src, tar)
    if not num:
        # A zero score needs no normalization (and skips two alignments).
        return 0.0
    return num / self._normalizer(
        [self.sim_score(src, src), self.sim_score(tar, tar)]
    )
# Running the module directly executes its doctests; whitespace is
# normalized so that examples wrapped over several lines still match.
if __name__ == '__main__':
    import doctest

    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)