Source code for abydos.phonetic._spfc

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._spfc.

The phonetic._spfc module implements the Standardized Phonetic Frequency Code
(SPFC) algorithm.
"""

from __future__ import unicode_literals

from unicodedata import normalize as unicode_normalize

from six import text_type
from six.moves import range

from ._util import _delete_consecutive_repeats

__all__ = ['spfc']


[docs]def spfc(word):
    """Return the Standardized Phonetic Frequency Code (SPFC) of a word.

    Standardized Phonetic Frequency Code is roughly Soundex-like.
    This implementation is based on page 19-21 of :cite:`Moore:1977`.

    :param str word: the word to transform
    :returns: the SPFC value
    :rtype: str

    >>> spfc('Christopher Smith')
    '01160'
    >>> spfc('Christopher Schmidt')
    '01160'
    >>> spfc('Niall Smith')
    '01660'
    >>> spfc('Niall Schmidt')
    '01660'

    >>> spfc('L.Smith')
    '01960'
    >>> spfc('R.Miller')
    '65490'

    >>> spfc(('L', 'Smith'))
    '01960'
    >>> spfc(('R', 'Miller'))
    '65490'
    """
    _pf1 = dict(
        zip(
            (ord(_) for _ in 'SZCKQVFPUWABLORDHIEMNXGJT'),
            '0011112222334445556666777',
        )
    )
    _pf2 = dict(
        zip(
            (ord(_) for _ in 'SZCKQFPXABORDHIMNGJTUVWEL'),
            '0011122233445556677788899',
        )
    )
    _pf3 = dict(
        zip(
            (ord(_) for _ in 'BCKQVDTFLPGJXMNRSZAEHIOUWY'),
            '00000112223334456677777777',
        )
    )

    _substitutions = (
        ('DK', 'K'),
        ('DT', 'T'),
        ('SC', 'S'),
        ('KN', 'N'),
        ('MN', 'N'),
    )

    def _raise_word_ex():
        """Raise an AttributeError."""
        raise AttributeError(
            'word attribute must be a string with a space '
            + 'or period dividing the first and last names '
            + 'or a tuple/list consisting of the first and '
            + 'last names'
        )

    if not word:
        return ''

    names = []
    if isinstance(word, (str, text_type)):
        names = word.split('.', 1)
        if len(names) != 2:
            names = word.split(' ', 1)
            if len(names) != 2:
                _raise_word_ex()
    elif hasattr(word, '__iter__'):
        if len(word) != 2:
            _raise_word_ex()
        names = word
    else:
        _raise_word_ex()

    names = [
        unicode_normalize(
            'NFKD', text_type(_.strip().replace('ß', 'SS').upper())
        )
        for _ in names
    ]
    code = ''

    def steps_one_to_three(name):
        """Perform the first three steps of SPFC."""
        # filter out non A-Z
        name = ''.join(
            _
            for _ in name
            if _
            in {
                'A',
                'B',
                'C',
                'D',
                'E',
                'F',
                'G',
                'H',
                'I',
                'J',
                'K',
                'L',
                'M',
                'N',
                'O',
                'P',
                'Q',
                'R',
                'S',
                'T',
                'U',
                'V',
                'W',
                'X',
                'Y',
                'Z',
            }
        )

        # 1. In the field, convert DK to K, DT to T, SC to S, KN to N,
        # and MN to N
        for subst in _substitutions:
            name = name.replace(subst[0], subst[1])

        # 2. In the name field, replace multiple letters with a single letter
        name = _delete_consecutive_repeats(name)

        # 3. Remove vowels, W, H, and Y, but keep the first letter in the name
        # field.
        if name:
            name = name[0] + ''.join(
                _
                for _ in name[1:]
                if _ not in {'A', 'E', 'H', 'I', 'O', 'U', 'W', 'Y'}
            )
        return name

    names = [steps_one_to_three(_) for _ in names]

    # 4. The first digit of the code is obtained using PF1 and the first letter
    # of the name field. Remove this letter after coding.
    if names[1]:
        code += names[1][0].translate(_pf1)
        names[1] = names[1][1:]

    # 5. Using the last letters of the name, use Table PF3 to obtain the
    # second digit of the code. Use as many letters as possible and remove
    # after coding.
    if names[1]:
        if names[1][-3:] == 'STN' or names[1][-3:] == 'PRS':
            code += '8'
            names[1] = names[1][:-3]
        elif names[1][-2:] == 'SN':
            code += '8'
            names[1] = names[1][:-2]
        elif names[1][-3:] == 'STR':
            code += '9'
            names[1] = names[1][:-3]
        elif names[1][-2:] in {'SR', 'TN', 'TD'}:
            code += '9'
            names[1] = names[1][:-2]
        elif names[1][-3:] == 'DRS':
            code += '7'
            names[1] = names[1][:-3]
        elif names[1][-2:] in {'TR', 'MN'}:
            code += '7'
            names[1] = names[1][:-2]
        else:
            code += names[1][-1].translate(_pf3)
            names[1] = names[1][:-1]

    # 6. The third digit is found using Table PF2 and the first character of
    # the first name. Remove after coding.
    if names[0]:
        code += names[0][0].translate(_pf2)
        names[0] = names[0][1:]

    # 7. The fourth digit is found using Table PF2 and the first character of
    # the name field. If no letters remain use zero. After coding remove the
    # letter.
    # 8. The fifth digit is found in the same manner as the fourth using the
    # remaining characters of the name field if any.
    for _ in range(2):
        if names[1]:
            code += names[1][0].translate(_pf2)
            names[1] = names[1][1:]
        else:
            code += '0'

    return code


if __name__ == '__main__':
    import doctest

    doctest.testmod()