Source code for abydos.phonetic._daitch_mokotoff

# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._daitch_mokotoff.

Daitch-Mokotoff Soundex
"""

from unicodedata import normalize as unicode_normalize

from deprecation import deprecated

from ._phonetic import _Phonetic
from .. import __version__

__all__ = ['DaitchMokotoff', 'dm_soundex']


class DaitchMokotoff(_Phonetic):
    """Daitch-Mokotoff Soundex.

    Based on Daitch-Mokotoff Soundex :cite:`Mokotoff:1997`, this returns values
    of a word as a set. A collection is necessary since there can be multiple
    values for a single word.

    .. versionadded:: 0.3.6
    """

    # Letter sequence -> (code at word start, code before a vowel, code
    # elsewhere). A nested pair such as (5, 4) means the sequence is ambiguous
    # and both codes are emitted as separate branches. '_' is a placeholder
    # that is stripped from the final code. '6_6' presumably keeps the two 6s
    # apart so they survive the consecutive-repeat deletion done in encode().
    _dms_table = {
        'STCH': (2, 4, 4),
        'DRZ': (4, 4, 4),
        'ZH': (4, 4, 4),
        'ZHDZH': (2, 4, 4),
        'DZH': (4, 4, 4),
        'DRS': (4, 4, 4),
        'DZS': (4, 4, 4),
        'SCHTCH': (2, 4, 4),
        'SHTSH': (2, 4, 4),
        'SZCZ': (2, 4, 4),
        'TZS': (4, 4, 4),
        'SZCS': (2, 4, 4),
        'STSH': (2, 4, 4),
        'SHCH': (2, 4, 4),
        'D': (3, 3, 3),
        'H': (5, 5, '_'),
        'TTSCH': (4, 4, 4),
        'THS': (4, 4, 4),
        'L': (8, 8, 8),
        'P': (7, 7, 7),
        'CHS': (5, 54, 54),
        'T': (3, 3, 3),
        'X': (5, 54, 54),
        'OJ': (0, 1, '_'),
        'OI': (0, 1, '_'),
        'SCHTSH': (2, 4, 4),
        'OY': (0, 1, '_'),
        'Y': (1, '_', '_'),
        'TSH': (4, 4, 4),
        'ZDZ': (2, 4, 4),
        'TSZ': (4, 4, 4),
        'SHT': (2, 43, 43),
        'SCHTSCH': (2, 4, 4),
        'TTSZ': (4, 4, 4),
        'TTZ': (4, 4, 4),
        'SCH': (4, 4, 4),
        'TTS': (4, 4, 4),
        'SZD': (2, 43, 43),
        'AI': (0, 1, '_'),
        'PF': (7, 7, 7),
        'TCH': (4, 4, 4),
        'PH': (7, 7, 7),
        'TTCH': (4, 4, 4),
        'SZT': (2, 43, 43),
        'ZDZH': (2, 4, 4),
        'EI': (0, 1, '_'),
        'G': (5, 5, 5),
        'EJ': (0, 1, '_'),
        'ZD': (2, 43, 43),
        'IU': (1, '_', '_'),
        'K': (5, 5, 5),
        'O': (0, '_', '_'),
        'SHTCH': (2, 4, 4),
        'S': (4, 4, 4),
        'TRZ': (4, 4, 4),
        'SHD': (2, 43, 43),
        'DSH': (4, 4, 4),
        'CSZ': (4, 4, 4),
        'EU': (1, 1, '_'),
        'TRS': (4, 4, 4),
        'ZS': (4, 4, 4),
        'STRZ': (2, 4, 4),
        'UY': (0, 1, '_'),
        'STRS': (2, 4, 4),
        'CZS': (4, 4, 4),
        'MN': ('6_6', '6_6', '6_6'),
        'UI': (0, 1, '_'),
        'UJ': (0, 1, '_'),
        'UE': (0, '_', '_'),
        'EY': (0, 1, '_'),
        'W': (7, 7, 7),
        'IA': (1, '_', '_'),
        'FB': (7, 7, 7),
        'STSCH': (2, 4, 4),
        'SCHT': (2, 43, 43),
        'NM': ('6_6', '6_6', '6_6'),
        'SCHD': (2, 43, 43),
        'B': (7, 7, 7),
        'DSZ': (4, 4, 4),
        'F': (7, 7, 7),
        'N': (6, 6, 6),
        'CZ': (4, 4, 4),
        'R': (9, 9, 9),
        'U': (0, '_', '_'),
        'V': (7, 7, 7),
        'CS': (4, 4, 4),
        'Z': (4, 4, 4),
        'SZ': (4, 4, 4),
        'TSCH': (4, 4, 4),
        'KH': (5, 5, 5),
        'ST': (2, 43, 43),
        'KS': (5, 54, 54),
        'SH': (4, 4, 4),
        'SC': (2, 4, 4),
        'SD': (2, 43, 43),
        'DZ': (4, 4, 4),
        'ZHD': (2, 43, 43),
        'DT': (3, 3, 3),
        'ZSH': (4, 4, 4),
        'DS': (4, 4, 4),
        'TZ': (4, 4, 4),
        'TS': (4, 4, 4),
        'TH': (3, 3, 3),
        'TC': (4, 4, 4),
        'A': (0, '_', '_'),
        'E': (0, '_', '_'),
        'I': (0, '_', '_'),
        'AJ': (0, 1, '_'),
        'M': (6, 6, 6),
        'Q': (5, 5, 5),
        'AU': (0, 7, '_'),
        'IO': (1, '_', '_'),
        'AY': (0, 1, '_'),
        'IE': (1, '_', '_'),
        'ZSCH': (4, 4, 4),
        'CH': ((5, 4), (5, 4), (5, 4)),
        'CK': ((5, 45), (5, 45), (5, 45)),
        'C': ((5, 4), (5, 4), (5, 4)),
        'J': ((1, 4), ('_', 4), ('_', 4)),
        'RZ': ((94, 4), (94, 4), (94, 4)),
        'RS': ((94, 4), (94, 4), (94, 4)),
    }

    # Initial letter -> candidate sequences from _dms_table, longest first, so
    # that the first match found in encode() is the longest possible one.
    _dms_order = {
        'A': ('AI', 'AJ', 'AU', 'AY', 'A'),
        'B': ('B',),
        'C': ('CHS', 'CSZ', 'CZS', 'CH', 'CK', 'CS', 'CZ', 'C'),
        'D': ('DRS', 'DRZ', 'DSH', 'DSZ', 'DZH', 'DZS', 'DS', 'DT', 'DZ', 'D'),
        'E': ('EI', 'EJ', 'EU', 'EY', 'E'),
        'F': ('FB', 'F'),
        'G': ('G',),
        'H': ('H',),
        'I': ('IA', 'IE', 'IO', 'IU', 'I'),
        'J': ('J',),
        'K': ('KH', 'KS', 'K'),
        'L': ('L',),
        'M': ('MN', 'M'),
        'N': ('NM', 'N'),
        'O': ('OI', 'OJ', 'OY', 'O'),
        'P': ('PF', 'PH', 'P'),
        'Q': ('Q',),
        'R': ('RS', 'RZ', 'R'),
        'S': (
            'SCHTSCH',
            'SCHTCH',
            'SCHTSH',
            'SHTCH',
            'SHTSH',
            'STSCH',
            'SCHD',
            'SCHT',
            'SHCH',
            'STCH',
            'STRS',
            'STRZ',
            'STSH',
            'SZCS',
            'SZCZ',
            'SCH',
            'SHD',
            'SHT',
            'SZD',
            'SZT',
            'SC',
            'SD',
            'SH',
            'ST',
            'SZ',
            'S',
        ),
        'T': (
            'TTSCH',
            'TSCH',
            'TTCH',
            'TTSZ',
            'TCH',
            'THS',
            'TRS',
            'TRZ',
            'TSH',
            'TSZ',
            'TTS',
            'TTZ',
            'TZS',
            'TC',
            'TH',
            'TS',
            'TZ',
            'T',
        ),
        'U': ('UE', 'UI', 'UJ', 'UY', 'U'),
        'V': ('V',),
        'W': ('W',),
        'X': ('X',),
        'Y': ('Y',),
        'Z': (
            'ZHDZH',
            'ZDZH',
            'ZSCH',
            'ZDZ',
            'ZHD',
            'ZSH',
            'ZD',
            'ZH',
            'ZS',
            'Z',
        ),
    }

    # Letters that count as vowels when choosing the pre-vocalic code variant.
    _uc_v_set = set('AEIJOUY')

    # Digit -> letter table used by encode_alpha(). The target alphabet must
    # have exactly one character per digit: zip() silently truncates, so an
    # extra character shifts every later digit onto the wrong letter (the
    # previous 11-character alphabet turned 463 into 'TKt' instead of 'SNT').
    # NOTE(review): the previous literal was 'AYstTSKNPLR'; digit 2 keeps the
    # 's' it produced before. Confirm whether 'st' was intended for digit 2.
    _alphabetic = dict(zip((ord(_) for _ in '0123456789'), 'AYsTSKNPLR'))
    _alphabetic_non_initials = dict(
        zip((ord(_) for _ in '0123456789'), ' A TSKNPLR')
    )

    def __init__(self, max_length=6, zero_pad=True):
        """Initialize DaitchMokotoff instance.

        Parameters
        ----------
        max_length : int
            The length of the code returned (defaults to 6; must be between 6
            and 64). Values outside that range are clamped to it, and -1
            selects the maximum of 64.
        zero_pad : bool
            Pad the end of the return value with 0s to achieve a max_length
            string


        .. versionadded:: 0.4.0

        """
        # Require a max_length of at least 6 and not more than 64
        if max_length != -1:
            self._max_length = min(max(6, max_length), 64)
        else:
            self._max_length = 64
        self._zero_pad = zero_pad

    def encode_alpha(self, word):
        """Return the alphabetic Daitch-Mokotoff Soundex code for a word.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        set
            The alphabetic Daitch-Mokotoff Soundex values (one str per
            possible reading of the word)

        Examples
        --------
        >>> pe = DaitchMokotoff()
        >>> sorted(pe.encode_alpha('Christopher'))
        ['KRSTPR', 'SRSTPR']
        >>> pe.encode_alpha('Niall')
        {'NL'}
        >>> pe.encode_alpha('Smith')
        {'SNT'}
        >>> pe.encode_alpha('Schmidt')
        {'SNT'}

        >>> sorted(DaitchMokotoff(max_length=20,
        ...    zero_pad=False).encode_alpha('The quick brown fox'))
        ['TKKPRPNPKS', 'TKSKPRPNPKS']


        .. versionadded:: 0.4.0

        """
        # Drop the zero padding, then swap each digit for its letter.
        alphas = {
            code.rstrip('0').translate(self._alphabetic)
            for code in self.encode(word)
        }
        # 'Y' (digit 1) is kept only in initial position; elsewhere it is
        # rendered as 'A'.
        return {code[:1] + code[1:].replace('Y', 'A') for code in alphas}

    def encode(self, word):
        """Return the Daitch-Mokotoff Soundex code for a word.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        set
            The Daitch-Mokotoff Soundex values (one str per possible reading
            of the word). A word with no A-Z letters yields a single all-zero
            code.

        Examples
        --------
        >>> pe = DaitchMokotoff()
        >>> sorted(pe.encode('Christopher'))
        ['494379', '594379']
        >>> pe.encode('Niall')
        {'680000'}
        >>> pe.encode('Smith')
        {'463000'}
        >>> pe.encode('Schmidt')
        {'463000'}

        >>> sorted(DaitchMokotoff(max_length=20,
        ...    zero_pad=False).encode('The quick brown fox'))
        ['35457976754', '3557976754']


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        dms = ['']  # initialize empty code list

        # uppercase, normalize, decompose, and filter non-A-Z
        word = unicode_normalize('NFKD', word.upper())
        word = ''.join(c for c in word if c in self._uc_set)

        # Nothing to convert, return base case
        if not word:
            if self._zero_pad:
                return {'0' * self._max_length}
            return {'0'}

        word_len = len(word)
        pos = 0
        while pos < word_len:
            # Iterate through _dms_order, which specifies the possible
            # substrings for which codes exist in the Daitch-Mokotoff coding
            for sstr in self._dms_order[word[pos]]:  # pragma: no branch
                # Match in place rather than slicing the tail on every probe
                if word.startswith(sstr, pos):
                    # Having determined a valid substring start, retrieve the
                    # code
                    dm_val = self._dms_table[sstr]
                    next_pos = pos + len(sstr)

                    # Having retrieved the code (triple), determine the
                    # correct positional variant (first, pre-vocalic,
                    # elsewhere)
                    if pos == 0:
                        dm_val = dm_val[0]
                    elif (
                        next_pos < word_len
                        and word[next_pos] in self._uc_v_set
                    ):
                        dm_val = dm_val[1]
                    else:
                        dm_val = dm_val[2]

                    # Build the code strings; an ambiguous sequence (a pair of
                    # codes) forks every partial code into two branches
                    if isinstance(dm_val, tuple):
                        dms = [_ + str(dm_val[0]) for _ in dms] + [
                            _ + str(dm_val[1]) for _ in dms
                        ]
                    else:
                        dms = [_ + str(dm_val) for _ in dms]
                    pos = next_pos
                    break

        # Filter out double letters and _ placeholders (in that order: the
        # placeholders are removed only after repeats have been collapsed)
        dms = (
            ''.join(c for c in self._delete_consecutive_repeats(_) if c != '_')
            for _ in dms
        )

        # Trim codes and return set
        if self._zero_pad:
            dms = (
                (_ + ('0' * self._max_length))[: self._max_length] for _ in dms
            )
        else:
            dms = (_[: self._max_length] for _ in dms)
        return set(dms)
@deprecated(
    deprecated_in='0.4.0',
    removed_in='0.6.0',
    current_version=__version__,
    details='Use the DaitchMokotoff.encode method instead.',
)
def dm_soundex(word, max_length=6, zero_pad=True):
    """Return the Daitch-Mokotoff Soundex code for a word.

    Deprecated functional shortcut: it builds a :py:class:`DaitchMokotoff`
    encoder from the given options and delegates to
    :py:meth:`DaitchMokotoff.encode`.

    Parameters
    ----------
    word : str
        The word to transform
    max_length : int
        The length of the code returned (defaults to 6; must be between 6
        and 64)
    zero_pad : bool
        Pad the end of the return value with 0s to achieve a max_length string

    Returns
    -------
    set
        The Daitch-Mokotoff Soundex values, exactly as returned by
        :py:meth:`DaitchMokotoff.encode`

    Examples
    --------
    >>> sorted(dm_soundex('Christopher'))
    ['494379', '594379']
    >>> dm_soundex('Niall')
    {'680000'}
    >>> dm_soundex('Smith')
    {'463000'}
    >>> dm_soundex('Schmidt')
    {'463000'}

    >>> sorted(dm_soundex('The quick brown fox', max_length=20,
    ...    zero_pad=False))
    ['35457976754', '3557976754']

    .. versionadded:: 0.1.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    encoder = DaitchMokotoff(max_length=max_length, zero_pad=zero_pad)
    return encoder.encode(word)
if __name__ == '__main__':
    # When the module is executed directly, run its docstring examples as a
    # self-test.
    from doctest import testmod

    testmod()