# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.phonetic._phonex.
Phonex
"""
from unicodedata import normalize as unicode_normalize
from deprecation import deprecated
from ._phonetic import _Phonetic
from .. import __version__
# Public names exported by this module: the Phonex class and the phonex()
# wrapper function.
__all__ = ['Phonex', 'phonex']
class Phonex(_Phonetic):
    """Phonex code.

    Phonex is an algorithm derived from Soundex, defined in :cite:`Lait:1996`.

    .. versionadded:: 0.3.6
    """

    # Translation table (code point of digit -> letter) used by encode_alpha
    # to turn the numeric part of a code into its alphabetic form.
    _alphabetic = {
        ord(digit): letter for digit, letter in zip('123456', 'PSTLNR')
    }

    def __init__(self, max_length: int = 4, zero_pad: bool = True) -> None:
        """Initialize Phonex instance.

        Parameters
        ----------
        max_length : int
            The length of the code returned (defaults to 4). Values are
            clamped to the range [4, 64]; -1 selects the maximum of 64.
        zero_pad : bool
            Pad the end of the return value with 0s to achieve a max_length
            string


        .. versionadded:: 0.4.0

        """
        # Clamp max_length to [4, 64]; -1 means "as long as allowed".
        if max_length != -1:
            self._max_length = min(max(4, max_length), 64)
        else:
            self._max_length = 64
        self._zero_pad = zero_pad

    def encode_alpha(self, word: str) -> str:
        """Return the alphabetic Phonex code for a word.

        The numeric code from :py:meth:`encode` is stripped of its trailing
        zeros and every digit after the first character is replaced by a
        representative letter.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The alphabetic Phonex value

        Examples
        --------
        >>> pe = Phonex()
        >>> pe.encode_alpha('Christopher')
        'CRST'
        >>> pe.encode_alpha('Niall')
        'NL'
        >>> pe.encode_alpha('Smith')
        'SNT'
        >>> pe.encode_alpha('Schmidt')
        'SSNT'


        .. versionadded:: 0.4.0

        """
        code = self.encode(word).rstrip('0')
        # The first character is already a letter; only the tail is digits.
        return code[:1] + code[1:].translate(self._alphabetic)

    def encode(self, word: str) -> str:
        """Return the Phonex code for a word.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The Phonex value: a leading letter followed by digits, at most
            ``max_length`` characters long (and zero-padded to exactly that
            length when ``zero_pad`` is set). A word with nothing left to
            encode yields only zeros.

        Examples
        --------
        >>> pe = Phonex()
        >>> pe.encode('Christopher')
        'C623'
        >>> pe.encode('Niall')
        'N400'
        >>> pe.encode('Schmidt')
        'S253'
        >>> pe.encode('Smith')
        'S530'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        name = unicode_normalize('NFKD', word.upper())

        # Remove any trailing Ss
        name = name.rstrip('S')

        # Phonetic equivalents of the first 2 characters
        if name[:2] == 'KN':
            name = 'N' + name[2:]  # KN.. == N..
        elif name[:2] == 'PH':
            name = 'F' + name[2:]  # PH.. == F..
        elif name[:2] == 'WR':
            name = 'R' + name[2:]  # WR.. == R..

        # Special case: drop a leading H (later Hs get no code anyway)
        if name[:1] == 'H':
            name = name[1:]

        name_code = ''
        if name:
            # Phonetic equivalents of the first character
            first = name[0]
            if first in self._uc_vy_set:
                first = 'A'
            elif first in {'B', 'P'}:
                first = 'B'
            elif first in {'V', 'F'}:
                first = 'F'
            elif first in {'C', 'K', 'Q'}:
                first = 'C'
            elif first in {'G', 'J'}:
                first = 'G'
            elif first in {'S', 'Z'}:
                first = 'S'
            name = first + name[1:]
            name_code = first

        # Modified Soundex code. An index loop is required because the M/N
        # rule below rewrites the following letter of name in place.
        last = name_code
        for i in range(1, len(name)):
            letter = name[i]
            following = name[i + 1 : i + 2]  # '' when letter is the last one
            code = '0'
            if letter in {'B', 'F', 'P', 'V'}:
                code = '1'
            elif letter in {'C', 'G', 'J', 'K', 'Q', 'S', 'X', 'Z'}:
                code = '2'
            elif letter in {'D', 'T'}:
                # D/T directly before C is left uncoded
                if following != 'C':
                    code = '3'
            elif letter == 'L':
                # L is coded only before a vowel/Y or at the end of the name
                if following in self._uc_vy_set or not following:
                    code = '4'
            elif letter in {'M', 'N'}:
                # A D or G after M/N is overwritten with the M/N itself so
                # that it is swallowed by the duplicate check on the next
                # iteration.
                if following in {'D', 'G'}:
                    name = name[: i + 1] + letter + name[i + 2 :]
                code = '5'
            elif letter == 'R':
                # R is coded only before a vowel/Y or at the end of the name
                if following in self._uc_vy_set or not following:
                    code = '6'

            # Skip uncoded letters and repeats of the last emitted symbol
            if code != last and code != '0':
                name_code += code

            # The loop only runs when name (and hence name_code) is non-empty
            last = name_code[-1]

        if self._zero_pad:
            name_code += '0' * self._max_length
        if not name_code:
            name_code = '0'
        return name_code[: self._max_length]
@deprecated(
    deprecated_in='0.4.0',
    removed_in='0.6.0',
    current_version=__version__,
    details='Use the Phonex.encode method instead.',
)
def phonex(word: str, max_length: int = 4, zero_pad: bool = True) -> str:
    """Return the Phonex code for a word.

    This is a wrapper for :py:meth:`Phonex.encode`.

    Parameters
    ----------
    word : str
        The word to transform
    max_length : int
        The length of the code returned (defaults to 4)
    zero_pad : bool
        Pad the end of the return value with 0s to achieve a max_length string

    Returns
    -------
    str
        The Phonex value

    Examples
    --------
    >>> phonex('Christopher')
    'C623'
    >>> phonex('Niall')
    'N400'
    >>> phonex('Schmidt')
    'S253'
    >>> phonex('Smith')
    'S530'

    .. versionadded:: 0.1.0

    """
    # A fresh encoder is built per call; the wrapper keeps no state.
    return Phonex(max_length, zero_pad).encode(word)
# When executed as a script, run the doctest examples embedded in this
# module's docstrings.
if __name__ == '__main__':
    from doctest import testmod

    testmod()