Source code for abydos.phonetic._nysiis

# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._nysiis.

New York State Identification and Intelligence System (NYSIIS) phonetic
encoding
"""

from deprecation import deprecated

from ._phonetic import _Phonetic
from .. import __version__

__all__ = ['NYSIIS', 'nysiis']


[docs]class NYSIIS(_Phonetic): """NYSIIS Code. The New York State Identification and Intelligence System algorithm is defined in :cite:`Taft:1970`. The modified version of this algorithm is described in Appendix B of :cite:`Lynch:1977`. .. versionadded:: 0.3.6 """ def __init__(self, max_length=6, modified=False): """Initialize AlphaSIS instance. Parameters ---------- max_length : int The maximum length (default 6) of the code to return modified : bool Indicates whether to use USDA modified NYSIIS .. versionadded:: 0.4.0 """ self._max_length = max_length # Require a max_length of at least 6 if self._max_length > -1: self._max_length = max(6, self._max_length) self._modified = modified
[docs] def encode(self, word): """Return the NYSIIS code for a word. Parameters ---------- word : str The word to transform Returns ------- str The NYSIIS value Examples -------- >>> pe = NYSIIS() >>> pe.encode('Christopher') 'CRASTA' >>> pe.encode('Niall') 'NAL' >>> pe.encode('Smith') 'SNAT' >>> pe.encode('Schmidt') 'SNAD' >>> NYSIIS(max_length=-1).encode('Christopher') 'CRASTAFAR' >>> pe_8m = NYSIIS(max_length=8, modified=True) >>> pe_8m.encode('Christopher') 'CRASTAFA' >>> pe_8m.encode('Niall') 'NAL' >>> pe_8m.encode('Smith') 'SNAT' >>> pe_8m.encode('Schmidt') 'SNAD' .. versionadded:: 0.1.0 .. versionchanged:: 0.3.6 Encapsulated in class """ word = ''.join(c for c in word.upper() if c.isalpha()) # exit early if there are no alphas if not word: return '' original_first_char = word[0] if word[:3] == 'MAC': word = 'MCC' + word[3:] elif word[:2] == 'KN': word = 'NN' + word[2:] elif word[:1] == 'K': word = 'C' + word[1:] elif word[:2] in {'PH', 'PF'}: word = 'FF' + word[2:] elif word[:3] == 'SCH': word = 'SSS' + word[3:] elif self._modified: if word[:2] == 'WR': word = 'RR' + word[2:] elif word[:2] == 'RH': word = 'RR' + word[2:] elif word[:2] == 'DG': word = 'GG' + word[2:] elif word[:1] in self._uc_v_set: word = 'A' + word[1:] if self._modified and word[-1:] in {'S', 'Z'}: word = word[:-1] if ( word[-2:] == 'EE' or word[-2:] == 'IE' or (self._modified and word[-2:] == 'YE') ): word = word[:-2] + 'Y' elif word[-2:] in {'DT', 'RT', 'RD'}: word = word[:-2] + 'D' elif word[-2:] in {'NT', 'ND'}: word = word[:-2] + ('N' if self._modified else 'D') elif self._modified: if word[-2:] == 'IX': word = word[:-2] + 'ICK' elif word[-2:] == 'EX': word = word[:-2] + 'ECK' elif word[-2:] in {'JR', 'SR'}: return 'ERROR' key = word[:1] skip = 0 for i in range(1, len(word)): if i >= len(word): continue elif skip: skip -= 1 continue elif word[i : i + 2] == 'EV': word = word[:i] + 'AF' + word[i + 2 :] skip = 1 elif word[i] in self._uc_v_set: word = word[:i] + 'A' + word[i + 1 :] elif self._modified and i != len(word) - 1 and word[i] == 'Y': word = word[:i] + 'A' + word[i + 1 :] elif word[i] == 'Q': word = word[:i] + 'G' + word[i + 1 :] elif word[i] == 'Z': word = word[:i] + 'S' + word[i + 1 :] elif word[i] == 'M': word = word[:i] + 'N' + word[i + 1 :] elif word[i : i + 2] == 'KN': word = word[:i] + 'N' + word[i + 2 :] elif word[i] == 'K': word = word[:i] + 'C' + word[i + 1 :] elif ( self._modified and i == len(word) - 3 and word[i : i + 3] == 'SCH' ): word = word[:i] + 'SSA' skip = 2 elif word[i : i + 3] == 'SCH': word = word[:i] + 'SSS' + word[i + 3 :] skip = 2 elif ( self._modified and i == len(word) - 2 and word[i : i + 2] == 'SH' ): word = word[:i] + 'SA' skip = 1 elif word[i : i + 2] == 'SH': word = word[:i] + 'SS' + word[i + 2 :] skip = 1 elif word[i : i + 2] == 'PH': word = word[:i] + 'FF' + word[i + 2 :] skip = 1 elif self._modified and word[i : i + 3] == 'GHT': word = word[:i] + 'TTT' + word[i + 3 :] skip = 2 elif self._modified and word[i : i + 2] == 'DG': word = word[:i] + 'GG' + word[i + 2 :] skip = 1 elif self._modified and word[i : i + 2] == 'WR': word = word[:i] + 'RR' + word[i + 2 :] skip = 1 elif word[i] == 'H' and ( word[i - 1] not in self._uc_v_set or word[i + 1 : i + 2] not in self._uc_v_set ): word = word[:i] + word[i - 1] + word[i + 1 :] elif word[i] == 'W' and word[i - 1] in self._uc_v_set: word = word[:i] + word[i - 1] + word[i + 1 :] if word[i : i + skip + 1] != key[-1:]: key += word[i : i + skip + 1] key = self._delete_consecutive_repeats(key) if key[-1:] == 'S': key = key[:-1] if key[-2:] == 'AY': key = key[:-2] + 'Y' if key[-1:] == 'A': key = key[:-1] if self._modified and key[:1] == 'A': key = original_first_char + key[1:] if self._max_length > 0: key = key[: self._max_length] return key
[docs]@deprecated( deprecated_in='0.4.0', removed_in='0.6.0', current_version=__version__, details='Use the NYSIIS.encode method instead.', ) def nysiis(word, max_length=6, modified=False): """Return the NYSIIS code for a word. This is a wrapper for :py:meth:`NYSIIS.encode`. Parameters ---------- word : str The word to transform max_length : int The maximum length (default 6) of the code to return modified : bool Indicates whether to use USDA modified NYSIIS Returns ------- str The NYSIIS value Examples -------- >>> nysiis('Christopher') 'CRASTA' >>> nysiis('Niall') 'NAL' >>> nysiis('Smith') 'SNAT' >>> nysiis('Schmidt') 'SNAD' >>> nysiis('Christopher', max_length=-1) 'CRASTAFAR' >>> nysiis('Christopher', max_length=8, modified=True) 'CRASTAFA' >>> nysiis('Niall', max_length=8, modified=True) 'NAL' >>> nysiis('Smith', max_length=8, modified=True) 'SNAT' >>> nysiis('Schmidt', max_length=8, modified=True) 'SNAD' .. versionadded:: 0.1.0 """ return NYSIIS(max_length, modified).encode(word)
if __name__ == '__main__': import doctest doctest.testmod()