# -*- coding: utf-8 -*-
# Copyright 2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.phonetic._roger_root.
Roger Root phonetic algorithm
"""
from __future__ import (
absolute_import,
division,
print_function,
unicode_literals,
)
from unicodedata import normalize as unicode_normalize
from deprecation import deprecated
from six import text_type
from six.moves import range
from ._phonetic import _Phonetic
from .. import __version__
__all__ = ['RogerRoot', 'roger_root']
class RogerRoot(_Phonetic):
    """Roger Root code.

    This is Roger Root name coding, described in :cite:`Moore:1977`.

    .. versionadded:: 0.3.6
    """

    # Codes emitted for a pattern found at the very start of the word.
    # The outer key is the pattern length, which lets encode() try the
    # longest pattern first.
    # '*' is used to prevent combining by _delete_consecutive_repeats()
    _init_patterns = {
        4: {'TSCH': '06'},
        3: {'TSH': '06', 'SCH': '06'},
        2: {
            'CE': '0*0',
            'CH': '06',
            'CI': '0*0',
            'CY': '0*0',
            'DG': '07',
            'GF': '08',
            'GM': '03',
            'GN': '02',
            'KN': '02',
            'PF': '08',
            'PH': '08',
            'PN': '02',
            'SH': '06',
            'TS': '0*0',
            'WR': '04',
        },
        1: {
            'A': '1',
            'B': '09',
            'C': '07',
            'D': '01',
            'E': '1',
            'F': '08',
            'G': '07',
            'H': '2',
            'I': '1',
            'J': '3',
            'K': '07',
            'L': '05',
            'M': '03',
            'N': '02',
            'O': '1',
            'P': '09',
            'Q': '07',
            'R': '04',
            'S': '0*0',
            'T': '01',
            'U': '1',
            'V': '08',
            'W': '4',
            'X': '07',
            'Y': '5',
            'Z': '0*0',
        },
    }

    # Codes emitted for patterns after the initial one, again keyed by
    # pattern length. Letters mapped to '*' contribute no digit: every '*'
    # is removed from the code once consecutive repeats have been deleted.
    _med_patterns = {
        4: {'TSCH': '6'},
        3: {'TSH': '6', 'SCH': '6'},
        2: {
            'CE': '0',
            'CH': '6',
            'CI': '0',
            'CY': '0',
            'DG': '7',
            'PH': '8',
            'SH': '6',
            'TS': '0',
        },
        1: {
            'B': '9',
            'C': '7',
            'D': '1',
            'F': '8',
            'G': '7',
            'J': '6',
            'K': '7',
            'L': '5',
            'M': '3',
            'N': '2',
            'P': '9',
            'Q': '7',
            'R': '4',
            'S': '0',
            'T': '1',
            'V': '8',
            'X': '7',
            'Z': '0',
            'A': '*',
            'E': '*',
            'H': '*',
            'I': '*',
            'O': '*',
            'U': '*',
            'W': '*',
            'Y': '*',
        },
    }

    # Translation tables (code point -> letter) used by encode_alpha().
    # The first digit of a code has its own table; its '0' becomes a space
    # that encode_alpha() strips away.
    _alphabetic_initial = dict(zip((ord(_) for _ in '012345'), ' AHJWY'))
    _alphabetic = dict(zip((ord(_) for _ in '0123456789'), 'STNMRLJKFP'))

    def __init__(self, max_length=5, zero_pad=True):
        """Initialize RogerRoot instance.

        Parameters
        ----------
        max_length : int
            The maximum length (default 5) of the code to return
        zero_pad : bool
            Pad the end of the return value with 0s to achieve a max_length
            string


        .. versionadded:: 0.4.0

        """
        self._max_length = max_length
        self._zero_pad = zero_pad

    def encode_alpha(self, word):
        """Return the alphabetic Roger Root code for a word.

        The numeric code from :py:meth:`encode` is stripped of trailing
        zeros and then translated digit by digit into letters.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The alphabetic Roger Root code

        Examples
        --------
        >>> pe = RogerRoot()
        >>> pe.encode_alpha('Christopher')
        'JRST'
        >>> pe.encode_alpha('Niall')
        'NL'
        >>> pe.encode_alpha('Smith')
        'SMT'
        >>> pe.encode_alpha('Schmidt')
        'JMT'


        .. versionadded:: 0.4.0

        """
        # NOTE(review): rstrip('0') removes padding zeros, but it also
        # removes a genuine trailing '0' (the code for S/Z) -- confirm that
        # dropping a final S/Z from the alphabetic code is intended.
        code = self.encode(word).rstrip('0')
        initial = code[:1].translate(self._alphabetic_initial).strip()
        return initial + code[1:].translate(self._alphabetic)

    def encode(self, word):
        """Return the Roger Root code for a word.

        The word is uppercased, NFKD-normalized and filtered against
        ``self._uc_set``. The start of the word is coded using the longest
        matching initial pattern, and the remainder is coded left to right
        using the longest matching medial pattern at each position.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The Roger Root code, truncated to ``max_length`` characters and,
            if ``zero_pad`` is set, right-padded with 0s to that length

        Examples
        --------
        >>> pe = RogerRoot()
        >>> pe.encode('Christopher')
        '06401'
        >>> pe.encode('Niall')
        '02500'
        >>> pe.encode('Smith')
        '00310'
        >>> pe.encode('Schmidt')
        '06310'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # uppercase, normalize, decompose, and filter non-A-Z out
        word = unicode_normalize('NFKD', text_type(word.upper()))
        # expand any ß that is still present after uppercasing
        word = word.replace('ß', 'SS')
        word = ''.join(c for c in word if c in self._uc_set)

        # Code fragments are collected here and joined once at the end.
        pieces = []
        pos = 0

        # Do first digit(s) first, preferring the longest initial pattern
        for num in range(4, 0, -1):
            if word[:num] in self._init_patterns[num]:
                pieces.append(self._init_patterns[num][word[:num]])
                pos += num
                break

        # Then code subsequent digits, longest medial pattern first
        while pos < len(word):
            for num in range(4, 0, -1):
                chunk = word[pos : pos + num]
                if chunk in self._med_patterns[num]:
                    pieces.append(self._med_patterns[num][chunk])
                    pos += num
                    break
            else:  # pragma: no cover
                # No pattern covers this character. Skip it so that the
                # scan always advances instead of looping forever.
                pos += 1

        code = self._delete_consecutive_repeats(''.join(pieces))
        # '*' only served as a separator between repeated digits
        code = code.replace('*', '')

        if self._zero_pad:
            # Over-pad, then let the slice below trim to max_length
            code += '0' * self._max_length

        return code[: self._max_length]
@deprecated(
    deprecated_in='0.4.0',
    removed_in='0.6.0',
    current_version=__version__,
    details='Use the RogerRoot.encode method instead.',
)
def roger_root(word, max_length=5, zero_pad=True):
    """Return the Roger Root code for a word.

    This is a wrapper for :py:meth:`RogerRoot.encode`: it builds a
    :py:class:`RogerRoot` instance from the given options and encodes
    ``word`` with it.

    Parameters
    ----------
    word : str
        The word to transform
    max_length : int
        The maximum length (default 5) of the code to return
    zero_pad : bool
        Pad the end of the return value with 0s to achieve a max_length string

    Returns
    -------
    str
        The Roger Root code

    Examples
    --------
    >>> roger_root('Christopher')
    '06401'
    >>> roger_root('Niall')
    '02500'
    >>> roger_root('Smith')
    '00310'
    >>> roger_root('Schmidt')
    '06310'

    .. versionadded:: 0.3.0

    """
    encoder = RogerRoot(max_length=max_length, zero_pad=zero_pad)
    return encoder.encode(word)
if __name__ == '__main__':
    # When executed as a script, run the doctest examples embedded in this
    # module's docstrings.
    from doctest import testmod

    testmod()