Source code for abydos.phonetic._phonex

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._phonex.

Phonex
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from unicodedata import normalize as unicode_normalize

from six import text_type
from six.moves import range

from ._phonetic import _Phonetic

__all__ = ['Phonex', 'phonex']


[docs]class Phonex(_Phonetic): """Phonex code. Phonex is an algorithm derived from Soundex, defined in :cite:`Lait:1996`. """
[docs] def encode(self, word, max_length=4, zero_pad=True): """Return the Phonex code for a word. Parameters ---------- word : str The word to transform max_length : int The length of the code returned (defaults to 4) zero_pad : bool Pad the end of the return value with 0s to achieve a max_length string Returns ------- str The Phonex value Examples -------- >>> pe = Phonex() >>> pe.encode('Christopher') 'C623' >>> pe.encode('Niall') 'N400' >>> pe.encode('Schmidt') 'S253' >>> pe.encode('Smith') 'S530' """ name = unicode_normalize('NFKD', text_type(word.upper())) name = name.replace('ß', 'SS') # Clamp max_length to [4, 64] if max_length != -1: max_length = min(max(4, max_length), 64) else: max_length = 64 name_code = last = '' # Deletions effected by replacing with next letter which # will be ignored due to duplicate handling of Soundex code. # This is faster than 'moving' all subsequent letters. # Remove any trailing Ss while name[-1:] == 'S': name = name[:-1] # Phonetic equivalents of first 2 characters # Works since duplicate letters are ignored if name[:2] == 'KN': name = 'N' + name[2:] # KN.. == N.. elif name[:2] == 'PH': name = 'F' + name[2:] # PH.. == F.. (H ignored anyway) elif name[:2] == 'WR': name = 'R' + name[2:] # WR.. == R.. if name: # Special case, ignore H first letter (subsequent Hs ignored # anyway) # Works since duplicate letters are ignored if name[0] == 'H': name = name[1:] if name: # Phonetic equivalents of first character if name[0] in self._uc_vy_set: name = 'A' + name[1:] elif name[0] in {'B', 'P'}: name = 'B' + name[1:] elif name[0] in {'V', 'F'}: name = 'F' + name[1:] elif name[0] in {'C', 'K', 'Q'}: name = 'C' + name[1:] elif name[0] in {'G', 'J'}: name = 'G' + name[1:] elif name[0] in {'S', 'Z'}: name = 'S' + name[1:] name_code = last = name[0] # Modified Soundex code for i in range(1, len(name)): code = '0' if name[i] in {'B', 'F', 'P', 'V'}: code = '1' elif name[i] in {'C', 'G', 'J', 'K', 'Q', 'S', 'X', 'Z'}: code = '2' elif name[i] in {'D', 'T'}: if name[i + 1 : i + 2] != 'C': code = '3' elif name[i] == 'L': if name[i + 1 : i + 2] in self._uc_vy_set or i + 1 == len( name ): code = '4' elif name[i] in {'M', 'N'}: if name[i + 1 : i + 2] in {'D', 'G'}: name = name[: i + 1] + name[i] + name[i + 2 :] code = '5' elif name[i] == 'R': if name[i + 1 : i + 2] in self._uc_vy_set or i + 1 == len( name ): code = '6' if code != last and code != '0' and i != 0: name_code += code last = name_code[-1] if zero_pad: name_code += '0' * max_length if not name_code: name_code = '0' return name_code[:max_length]
[docs]def phonex(word, max_length=4, zero_pad=True): """Return the Phonex code for a word. This is a wrapper for :py:meth:`Phonex.encode`. Parameters ---------- word : str The word to transform max_length : int The length of the code returned (defaults to 4) zero_pad : bool Pad the end of the return value with 0s to achieve a max_length string Returns ------- str The Phonex value Examples -------- >>> phonex('Christopher') 'C623' >>> phonex('Niall') 'N400' >>> phonex('Schmidt') 'S253' >>> phonex('Smith') 'S530' """ return Phonex().encode(word, max_length, zero_pad)
if __name__ == '__main__': import doctest doctest.testmod()