Source code for abydos.phonetic._russell

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._russell.

The phonetic._russell module implements Robert C. Russell's Index.
"""

from __future__ import unicode_literals

from unicodedata import normalize as unicode_normalize

from six import text_type

from ._util import _delete_consecutive_repeats

__all__ = [
    'russell_index',
    'russell_index_alpha',
    'russell_index_num_to_alpha',
]


[docs]def russell_index(word):
    """Return the Russell Index (integer output) of a word.

    This follows Robert C. Russell's Index algorithm, as described in
    :cite:`Russell:1917`.

    :param str word: the word to transform
    :returns: the Russell Index value
    :rtype: int

    >>> russell_index('Christopher')
    3813428
    >>> russell_index('Niall')
    715
    >>> russell_index('Smith')
    3614
    >>> russell_index('Schmidt')
    3614
    """
    _russell_translation = dict(
        zip(
            (ord(_) for _ in 'ABCDEFGIKLMNOPQRSTUVXYZ'),
            '12341231356712383412313',
        )
    )

    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = word.replace('GH', '')  # discard gh (rule 3)
    word = word.rstrip('SZ')  # discard /[sz]$/ (rule 3)

    # translate according to Russell's mapping
    word = ''.join(
        c
        for c in word
        if c
        in {
            'A',
            'B',
            'C',
            'D',
            'E',
            'F',
            'G',
            'I',
            'K',
            'L',
            'M',
            'N',
            'O',
            'P',
            'Q',
            'R',
            'S',
            'T',
            'U',
            'V',
            'X',
            'Y',
            'Z',
        }
    )
    sdx = word.translate(_russell_translation)

    # remove any 1s after the first occurrence
    one = sdx.find('1') + 1
    if one:
        sdx = sdx[:one] + ''.join(c for c in sdx[one:] if c != '1')

    # remove repeating characters
    sdx = _delete_consecutive_repeats(sdx)

    # return as an int
    return int(sdx) if sdx else float('NaN')


[docs]def russell_index_num_to_alpha(num):
    """Convert the Russell Index integer to an alphabetic string.

    This follows Robert C. Russell's Index algorithm, as described in
    :cite:`Russell:1917`.

    :param int num: a Russell Index integer value
    :returns: the Russell Index as an alphabetic string
    :rtype: str

    >>> russell_index_num_to_alpha(3813428)
    'CRACDBR'
    >>> russell_index_num_to_alpha(715)
    'NAL'
    >>> russell_index_num_to_alpha(3614)
    'CMAD'
    """
    _russell_num_translation = dict(
        zip((ord(_) for _ in '12345678'), 'ABCDLMNR')
    )
    num = ''.join(
        c
        for c in text_type(num)
        if c in {'1', '2', '3', '4', '5', '6', '7', '8'}
    )
    if num:
        return num.translate(_russell_num_translation)
    return ''


[docs]def russell_index_alpha(word):
    """Return the Russell Index (alphabetic output) for the word.

    This follows Robert C. Russell's Index algorithm, as described in
    :cite:`Russell:1917`.

    :param str word: the word to transform
    :returns: the Russell Index value as an alphabetic string
    :rtype: str

    >>> russell_index_alpha('Christopher')
    'CRACDBR'
    >>> russell_index_alpha('Niall')
    'NAL'
    >>> russell_index_alpha('Smith')
    'CMAD'
    >>> russell_index_alpha('Schmidt')
    'CMAD'
    """
    if word:
        return russell_index_num_to_alpha(russell_index(word))
    return ''


if __name__ == '__main__':
    import doctest

    doctest.testmod()