# Source code for abydos.phonetic._russell

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._russell.

The phonetic._russell module implements Robert C. Russell's Index.
"""

from __future__ import unicode_literals

from unicodedata import normalize as unicode_normalize

from six import text_type

from ._util import _delete_consecutive_repeats

__all__ = [
    'russell_index',
    'russell_index_alpha',
    'russell_index_num_to_alpha',
]


def russell_index(word):
    """Compute the numeric Russell Index of a word.

    Implements Robert C. Russell's Index algorithm, as described in
    :cite:`Russell:1917`.

    The word is upper-cased and NFKD-normalized, every 'GH' is dropped,
    trailing S/Z characters are stripped, and each remaining letter that
    has a Russell code is replaced by its digit (H, J and W have no code
    and are discarded, as is any non-letter). Only the first '1' is kept,
    runs of identical digits are collapsed, and the digit string is
    returned as an integer.

    :param str word: the word to transform
    :returns: the Russell Index value; ``float('NaN')`` when no codable
        letter remains
    :rtype: int

    >>> russell_index('Christopher')
    3813428
    >>> russell_index('Niall')
    715
    >>> russell_index('Smith')
    3614
    >>> russell_index('Schmidt')
    3614
    """
    # Letter -> digit table; letters absent from it carry no code.
    letters = 'ABCDEFGIKLMNOPQRSTUVXYZ'
    digits = '12341231356712383412313'
    code_of = dict(zip(letters, digits))

    name = unicode_normalize('NFKD', text_type(word.upper()))
    name = name.replace('ß', 'SS')
    # rule 3: discard every 'GH', then any run of trailing S/Z
    name = name.replace('GH', '').rstrip('SZ')

    # Drop uncoded characters and translate the rest in a single pass.
    coded = ''.join(code_of[char] for char in name if char in code_of)

    # Keep the first '1' only; all later occurrences are removed.
    head, first_one, tail = coded.partition('1')
    coded = head + first_one + tail.replace('1', '')

    # Collapse runs of the same digit.
    coded = _delete_consecutive_repeats(coded)

    if not coded:
        return float('NaN')
    return int(coded)


def russell_index_num_to_alpha(num):
    """Render a Russell Index integer as its alphabetic form.

    Implements Robert C. Russell's Index algorithm, as described in
    :cite:`Russell:1917`.

    Each digit 1-8 in the textual form of ``num`` is mapped to one letter
    of 'ABCDLMNR'; every other character is ignored.

    :param int num: a Russell Index integer value
    :returns: the Russell Index as an alphabetic string (empty when
        ``num`` contains no digit in the range 1-8)
    :rtype: str

    >>> russell_index_num_to_alpha(3813428)
    'CRACDBR'
    >>> russell_index_num_to_alpha(715)
    'NAL'
    >>> russell_index_num_to_alpha(3614)
    'CMAD'
    """
    letter_for = dict(zip('12345678', 'ABCDLMNR'))

    # Joining over no surviving digits already yields the empty string.
    return ''.join(
        letter_for[digit] for digit in text_type(num) if digit in letter_for
    )


def russell_index_alpha(word):
    """Compute the alphabetic Russell Index of a word.

    Implements Robert C. Russell's Index algorithm, as described in
    :cite:`Russell:1917`, by converting the result of
    :func:`russell_index` with :func:`russell_index_num_to_alpha`.

    :param str word: the word to transform
    :returns: the Russell Index value as an alphabetic string (empty for
        a falsy ``word``)
    :rtype: str

    >>> russell_index_alpha('Christopher')
    'CRACDBR'
    >>> russell_index_alpha('Niall')
    'NAL'
    >>> russell_index_alpha('Smith')
    'CMAD'
    >>> russell_index_alpha('Schmidt')
    'CMAD'
    """
    # Falsy input short-circuits before any encoding is attempted.
    if not word:
        return ''

    numeric_index = russell_index(word)
    return russell_index_num_to_alpha(numeric_index)


# When executed as a script, run the doctest examples embedded in this
# module's docstrings.
if __name__ == '__main__':
    import doctest

    doctest.testmod()