# Source code for abydos.phonetic._dolby

# -*- coding: utf-8 -*-

# Copyright 2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._dolby.

The phonetic._dolby module implements the Dolby Code algorithm.
"""

from __future__ import unicode_literals

from unicodedata import normalize as unicode_normalize

from six import text_type

from ._util import _delete_consecutive_repeats

__all__ = ['dolby']


def dolby(word, max_length=-1, keep_vowels=False, vowel_char='*'):
    r"""Return the Dolby Code of a name.

    This follows "A Spelling Equivalent Abbreviation Algorithm For Personal
    Names" from :cite:`Dolby:1970` and :cite:`Cunningham:1969`.

    The input is uppercased, NFKD-normalized and stripped of everything but
    the letters A-Z, then rewritten by the numbered rules below. When
    ``max_length`` is greater than 0 the additional fixed-length ("FL") rules
    are applied and the result is truncated or right-padded with spaces to
    exactly ``max_length`` characters.

    :param word: the word to encode
    :param max_length: maximum length of the returned Dolby code -- this also
        activates the fixed-length code mode if it is greater than 0
    :param keep_vowels: if True, retains all vowel markers
    :param vowel_char: the vowel marker character (default to \*)
    :returns: the Dolby Code
    :rtype: str

    >>> dolby('Hansen')
    'H*NSN'
    >>> dolby('Larsen')
    'L*RSN'
    >>> dolby('Aagaard')
    '*GR'
    >>> dolby('Braaten')
    'BR*DN'
    >>> dolby('Sandvik')
    'S*NVK'
    >>> dolby('Hansen', max_length=6)
    'H*NS*N'
    >>> dolby('Larsen', max_length=6)
    'L*RS*N'
    >>> dolby('Aagaard', max_length=6)
    '*G*R  '
    >>> dolby('Braaten', max_length=6)
    'BR*D*N'
    >>> dolby('Sandvik', max_length=6)
    'S*NF*K'

    >>> dolby('Smith')
    'SM*D'
    >>> dolby('Waters')
    'W*DRS'
    >>> dolby('James')
    'J*MS'
    >>> dolby('Schmidt')
    'SM*D'
    >>> dolby('Ashcroft')
    '*SKRFD'
    >>> dolby('Smith', max_length=6)
    'SM*D  '
    >>> dolby('Waters', max_length=6)
    'W*D*RS'
    >>> dolby('James', max_length=6)
    'J*M*S '
    >>> dolby('Schmidt', max_length=6)
    'SM*D  '
    >>> dolby('Ashcroft', max_length=6)
    '*SKRFD'
    """
    _vowels = {'A', 'E', 'I', 'O', 'U', 'Y'}
    # Letters after which a K leaves its predecessor alone (Rule 4); built
    # once here instead of on every pass through the Rule 4 loop.
    _k_predecessors = _vowels | {'L', 'N', 'R'}
    _alphabet = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    # Letter pairs whose second letter is dropped by Rule 2.
    _reducible_pairs = {
        'DT',
        'LD',
        'ND',
        'NT',
        'RC',
        'RD',
        'RT',
        'SC',
        'SK',
        'ST',
    }
    # The FL-only rules apply exactly when a positive max_length is given.
    fixed_length = max_length > 0

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in _alphabet)

    # Rule 1 (FL2)
    if word[:3] in {'MCG', 'MAG', 'MAC'}:
        word = 'MK' + word[3:]
    elif word[:2] == 'MC':
        word = 'MK' + word[2:]

    # Rule 2 (FL3)
    # Scan right-to-left; after dropping a letter the same position is
    # examined again (the += 1 cancels the -= 1) because the new pair formed
    # there may itself be reducible.
    pos = len(word) - 2
    while pos > -1:
        if word[pos : pos + 2] in _reducible_pairs:
            word = word[: pos + 1] + word[pos + 2 :]
            pos += 1
        pos -= 1

    # Rule 3 (FL4)
    # Although the rule indicates "after the first letter", the test cases make
    # it clear that these apply to the first letter also.
    # The soft-C replacements must run before the blanket C -> K below.
    word = word.replace('X', 'KS')
    word = word.replace('CE', 'SE')
    word = word.replace('CI', 'SI')
    word = word.replace('CY', 'SI')

    # not in the rule set, but they seem to have intended it
    word = word.replace('TCH', 'CH')

    # CH not at the start and not preceded by a vowel becomes SH.
    pos = word.find('CH', 1)
    while pos != -1:
        if word[pos - 1 : pos] not in _vowels:
            word = word[:pos] + 'S' + word[pos + 1 :]
        pos = word.find('CH', pos + 1)

    word = word.replace('C', 'K')
    word = word.replace('Z', 'S')

    word = word.replace('WR', 'R')
    word = word.replace('DG', 'G')
    word = word.replace('QU', 'K')
    word = word.replace('T', 'D')
    word = word.replace('PH', 'F')

    # Rule 4 (FL5)
    # Drop the letter immediately before a K unless that letter is a vowel,
    # L, N, or R.
    # NOTE(review): the earlier comment on this rule said it applies to the
    # first letter also, but the `pos > 1` guard means the letter at index 0
    # is never removed here -- confirm which is intended.
    pos = word.find('K', 0)
    while pos != -1:
        if pos > 1 and word[pos - 1 : pos] not in _k_predecessors:
            word = word[: pos - 1] + word[pos:]
            pos -= 1
        pos = word.find('K', pos + 1)

    # Rule FL6
    if fixed_length and word[-1:] == 'E':
        word = word[:-1]

    # Rule 5 (FL7)
    word = _delete_consecutive_repeats(word)

    # Rule 6 (FL8)
    if word[:2] == 'PF':
        word = word[1:]
    if word[-2:] == 'PF':
        word = word[:-1]
    elif word[-2:] == 'GH':
        # Final GH becomes F after a vowel, otherwise G.
        if word[-3:-2] in _vowels:
            word = word[:-2] + 'F'
        else:
            word = word[:-2] + 'G'
    word = word.replace('GH', '')

    # Rule FL9
    if fixed_length:
        word = word.replace('V', 'F')

    # Rules 7-9 (FL10-FL12)
    # Only the first vowel (first two in fixed-length mode) is marked unless
    # keep_vowels is set; W and H are dropped everywhere but position 0.
    vowels_left = 2 if fixed_length else 1
    pieces = []
    for pos, char in enumerate(word):
        if char in _vowels:
            if keep_vowels or vowels_left > 0:
                pieces.append(vowel_char)
                vowels_left -= 1
        elif pos > 0 and char in {'W', 'H'}:
            continue
        else:
            pieces.append(char)
    code = ''.join(pieces)

    if fixed_length:
        # Rule FL13
        if len(code) > max_length and code[-1:] == 'S':
            code = code[:-1]
        if keep_vowels:
            code = code[:max_length]
        else:
            # Rule FL14
            code = code[: max_length + 2]
            # Rule FL15
            # Each pass keeps only as many vowel markers as the code is over
            # length, then trims one character off the allowed overshoot, so
            # the loop always terminates at max_length.
            while len(code) > max_length:
                vowels = len(code) - max_length
                excess = vowels - 1
                kept = []
                for char in code:
                    if char == vowel_char:
                        if vowels:
                            kept.append(char)
                            vowels -= 1
                    else:
                        kept.append(char)
                code = ''.join(kept)[: max_length + excess]

        # Rule FL16
        code += ' ' * (max_length - len(code))

    return code


if __name__ == '__main__':
    # Run as a script: execute the doctest examples embedded in this module.
    from doctest import testmod

    testmod()