# Source code for abydos.phonetic._pshp_soundex_last

# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._pshp_soundex_last.

PSHP Soundex/Viewex Coding for last names
"""

from unicodedata import normalize as unicode_normalize

from deprecation import deprecated

from ._phonetic import _Phonetic
from .. import __version__

__all__ = ['PSHPSoundexLast', 'pshp_soundex_last']


class PSHPSoundexLast(_Phonetic):
    """PSHP Soundex/Viewex Coding of a last name.

    This coding is based on :cite:`Hershberg:1976`.

    Reference was also made to the German version of the same:
    :cite:`Hershberg:1979`.

    A separate class, :py:class:`PSHPSoundexFirst` is used for first names.

    .. versionadded:: 0.3.6
    """

    # Letter (by code point) -> Soundex digit, for use with str.translate.
    # Codes for X & Y are unspecified, but presumably are 2 & 0.
    _trans = dict(
        zip(
            (ord(_) for _ in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
            '01230120022455012523010202',
        )
    )

    # Digit (by code point) -> representative letter, used by encode_alpha.
    _alphabetic = dict(zip((ord(_) for _ in '12345'), 'PKTLN'))

    # Non-German suffix rewrites, keyed by the trailing 5 / 4 letters.
    _suffix5_repl = {'STOWN': 'SAWON', 'MPSON': 'MASON'}
    _suffix4_repl = {
        'NSEN': 'ASEN',
        'MSON': 'ASON',
        'STEN': 'SAEN',
        'STON': 'SAON',
    }

    # Infix rewrites; applied sequentially, so the order is significant.
    _infix_repl = (
        ('CK', 'C'),
        ('SCH', 'S'),
        ('DT', 'T'),
        ('ND', 'N'),
        ('NG', 'N'),
        ('LM', 'M'),
        ('MN', 'M'),
        ('WIE', 'VIE'),
        ('WEI', 'VEI'),
    )

    def __init__(self, max_length=4, german=False):
        """Initialize PSHPSoundexLast instance.

        Parameters
        ----------
        max_length : int
            The length of the code returned (defaults to 4); -1 disables
            both padding and truncation
        german : bool
            Set to True if the name is German (different rules apply)


        .. versionadded:: 0.4.0

        """
        self._max_length = max_length
        self._german = german

    def encode_alpha(self, lname):
        """Calculate the alphabetic PSHP Soundex/Viewex Coding of a last name.

        The numeric code from :py:meth:`encode` has its trailing zero padding
        removed, and every digit after the first character is replaced by a
        letter (1-5 become P, K, T, L, N).

        Parameters
        ----------
        lname : str
            The last name to encode

        Returns
        -------
        str
            The PSHP alphabetic Soundex/Viewex Coding

        Examples
        --------
        >>> pe = PSHPSoundexLast()
        >>> pe.encode_alpha('Smith')
        'SNT'
        >>> pe.encode_alpha('Waters')
        'WTN'
        >>> pe.encode_alpha('James')
        'JN'
        >>> pe.encode_alpha('Schmidt')
        'SNT'
        >>> pe.encode_alpha('Ashcroft')
        'AKKN'


        .. versionadded:: 0.4.0

        """
        code = self.encode(lname).rstrip('0')
        return code[:1] + code[1:].translate(self._alphabetic)

    def encode(self, lname):
        """Calculate the PSHP Soundex/Viewex Coding of a last name.

        Parameters
        ----------
        lname : str
            The last name to encode

        Returns
        -------
        str
            The PSHP Soundex/Viewex Coding

        Examples
        --------
        >>> pe = PSHPSoundexLast()
        >>> pe.encode('Smith')
        'S530'
        >>> pe.encode('Waters')
        'W350'
        >>> pe.encode('James')
        'J500'
        >>> pe.encode('Schmidt')
        'S530'
        >>> pe.encode('Ashcroft')
        'A225'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # Uppercase, decompose, and keep only characters in the base
        # class's uppercase set.
        lname = unicode_normalize('NFKD', lname.upper())
        lname = ''.join(c for c in lname if c in self._uc_set)

        # A. Prefix treatment
        lname = self._apply_prefix_rules(lname)

        # The leading letter of the code is fixed after prefix treatment,
        # before any suffix or infix rewriting.
        code = lname[:1]

        # B. Postfix treatment
        lname = self._apply_postfix_rules(lname)

        # C. Infix Treatment
        lname = self._apply_infix_rules(lname)

        # D. Soundexing
        digits = lname.translate(self._trans)
        digits = self._delete_consecutive_repeats(digits)

        # The first position is already represented by the leading letter.
        code += digits[1:]
        code = code.replace('0', '')  # rule 1

        if self._max_length != -1:
            if len(code) < self._max_length:
                code += '0' * (self._max_length - len(code))
            else:
                code = code[: self._max_length]

        return code

    def _apply_prefix_rules(self, lname):
        """Return lname after the prefix rules (stage A) are applied."""
        if lname[:3] in {'VON', 'VAN'}:
            lname = lname[3:].strip()

        # The rule implemented below says "MC, MAC become 1". I believe it
        # meant to say they become M except in German data (where
        # superscripted 1 indicates "except in German data"). It doesn't make
        # sense for them to become 1 (BPFV -> 1) or to apply outside German.
        # Unfortunately, both articles have this error(?).
        if not self._german:
            if lname[:3] == 'MAC':
                lname = 'M' + lname[3:]
            elif lname[:2] == 'MC':
                lname = 'M' + lname[2:]

        # The non-German-only rule to strip ' is unnecessary due to filtering

        if lname[:1] in {'E', 'I', 'O', 'U'}:
            lname = 'A' + lname[1:]
        elif lname[:2] in {'GE', 'GI', 'GY'}:
            lname = 'J' + lname[1:]
        elif lname[:2] in {'CE', 'CI', 'CY'}:
            lname = 'S' + lname[1:]
        elif lname[:3] == 'CHR':
            lname = 'K' + lname[1:]
        elif lname[:1] == 'C' and lname[:2] != 'CH':
            lname = 'K' + lname[1:]

        # Evaluated independently of the chain above, so it also sees names
        # whose first letter was just rewritten (e.g. C -> K).
        if lname[:2] == 'KN':
            lname = 'N' + lname[1:]
        elif lname[:2] == 'PH':
            lname = 'F' + lname[1:]
        elif lname[:3] in {'WIE', 'WEI'}:
            lname = 'V' + lname[1:]

        if self._german and lname[:1] in {'W', 'M', 'Y', 'Z'}:
            german_initial = {'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}
            lname = german_initial[lname[0]] + lname[1:]

        return lname

    def _apply_postfix_rules(self, lname):
        """Return lname after the postfix rules (stage B) are applied."""
        if self._german:
            # moved from end of postfix treatment due to blocking
            if lname[-3:] == 'TES':
                lname = lname[:-3]
            elif lname[-2:] == 'TS':
                lname = lname[:-2]
            if lname[-3:] == 'TZE':
                lname = lname[:-3]
            elif lname[-2:] == 'ZE':
                lname = lname[:-2]
            if lname[-1:] == 'Z':
                lname = lname[:-1]
            elif lname[-2:] == 'TE':
                lname = lname[:-2]

        if lname[-1:] == 'R':
            lname = lname[:-1] + 'N'
        elif lname[-2:] in {'SE', 'CE'}:
            lname = lname[:-2]
        if lname[-2:] == 'SS':
            lname = lname[:-2]
        elif lname[-1:] == 'S':
            lname = lname[:-1]

        if not self._german:
            # The 5-letter suffixes take precedence over the 4-letter ones.
            if lname[-5:] in self._suffix5_repl:
                lname = lname[:-5] + self._suffix5_repl[lname[-5:]]
            elif lname[-4:] in self._suffix4_repl:
                lname = lname[:-4] + self._suffix4_repl[lname[-4:]]

        if lname[-2:] in {'NG', 'ND'}:
            lname = lname[:-1]
        if not self._german and lname[-3:] in {'GAN', 'GEN'}:
            lname = lname[:-3] + 'A' + lname[-2:]

        return lname

    def _apply_infix_rules(self, lname):
        """Return lname after the infix rules (stage C) are applied."""
        for old, new in self._infix_repl:
            lname = lname.replace(old, new)
        return lname
@deprecated(
    deprecated_in='0.4.0',
    removed_in='0.6.0',
    current_version=__version__,
    details='Use the PSHPSoundexLast.encode method instead.',
)
def pshp_soundex_last(lname, max_length=4, german=False):
    """Calculate the PSHP Soundex/Viewex Coding of a last name.

    This is a wrapper for :py:meth:`PSHPSoundexLast.encode`: it builds a
    one-off encoder from the given options and encodes ``lname`` with it.

    Parameters
    ----------
    lname : str
        The last name to encode
    max_length : int
        The length of the code returned (defaults to 4)
    german : bool
        Set to True if the name is German (different rules apply)

    Returns
    -------
    str
        The PSHP Soundex/Viewex Coding

    Examples
    --------
    >>> pshp_soundex_last('Smith')
    'S530'
    >>> pshp_soundex_last('Waters')
    'W350'
    >>> pshp_soundex_last('James')
    'J500'
    >>> pshp_soundex_last('Schmidt')
    'S530'
    >>> pshp_soundex_last('Ashcroft')
    'A225'

    .. versionadded:: 0.3.0

    """
    encoder = PSHPSoundexLast(max_length=max_length, german=german)
    return encoder.encode(lname)
if __name__ == '__main__':
    # When executed as a script, run the doctest examples embedded in the
    # docstrings of this module.
    import doctest

    doctest.testmod()