# Source code for abydos.phonetic._waahlin

# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._waahlin.

Wåhlin phonetic encoding
"""

from unicodedata import normalize as unicode_normalize

from ._phonetic import _Phonetic

# Names exported by ``from ... import *``; only the encoder class is public.
__all__ = ['Waahlin']


class Waahlin(_Phonetic):
    """Wåhlin code.

    Wåhlin's first-letter coding is based on the description in
    :cite:`Erikson:1997`.

    The rules rewrite the spelling at the front of a word to a short code:
    ordinary letters or one of the placeholders ``+`` and ``*``, which
    :meth:`encode_alpha` renders as ``Ç`` and ``Š``. Without a secondary
    encoder the rules are applied repeatedly until the whole word has been
    consumed; with one, only the leading code comes from these rules and the
    rest of the word is delegated to the secondary encoder.

    .. versionadded:: 0.3.6
    """

    def __init__(self, encoder=None):
        """Initialize Waahlin instance.

        Parameters
        ----------
        encoder : _Phonetic
            An initialized phonetic algorithm object. If supplied, it encodes
            everything that follows the leading Wåhlin code. If None (the
            default), the Wåhlin rules are applied to the entire word.


        .. versionadded:: 0.4.0

        """
        self._encoder = encoder

    # Unconditional rewrites, keyed by prefix length and then by prefix.
    _transforms = {
        3: {'SCH': '*', 'STJ': '*', 'SKJ': '*'},
        2: {
            'AE': 'E',
            'CH': 'K',
            'DJ': 'J',
            'GJ': 'J',
            'HJ': 'J',
            'HV': 'V',
            'HW': 'V',
            'HR': 'R',
            'KJ': '+',
            'LJ': 'J',
            'PH': 'F',
            'QU': 'KV',
            'SJ': '*',
            'TJ': '+',
        },
        1: {'Q': 'K', 'W': 'V', 'Z': 'S', 'Ä': 'E'},
    }

    # Letters that trigger the context-sensitive rules below. These are real
    # sets (not strings) so that an empty "next letter" never matches.
    _following_vowels = frozenset(
        ('A', 'E', 'I', 'O', 'U', 'Y', 'Å', 'Ä', 'Ö')
    )
    _following_soft = frozenset(('E', 'I', 'Y', 'Ä', 'Ö'))

    # Context-sensitive rewrites, keyed by prefix length. Each entry is
    # (prefix, letters that must immediately follow it, code). Within one
    # length the entries are tried in order, and always before the
    # unconditional table of the same length.
    _context_rules = {
        3: (('STI', frozenset(('E', 'Ä')), '*'),),
        2: (
            ('HI', _following_vowels, 'J'),
            ('SK', _following_soft, '*'),
        ),
        1: (
            ('C', frozenset(('E', 'I', 'Y', 'Ä')), 'S'),
            ('G', _following_soft, 'J'),
            ('I', _following_vowels, 'J'),
            ('K', _following_soft, '+'),
        ),
    }

    def _encode_next(self, word):
        """Encode the leading sound of a word.

        Prefixes are tried from three letters down to one. At each length
        the context-sensitive rules are checked first, then the
        unconditional table. If nothing matches, the first character is
        passed through unchanged.

        Parameters
        ----------
        word : str
            The text still to be encoded (already uppercased and normalized
            by :meth:`encode`)

        Returns
        -------
        tuple
            The code for the leading sound and the unconsumed remainder of
            the word. The letter that triggers a context-sensitive rule is
            not consumed; it stays at the front of the remainder. An empty
            word yields ``('', '')``.

        """
        for size in (3, 2, 1):
            head = word[:size]
            following = word[size : size + 1]
            for prefix, triggers, code in self._context_rules[size]:
                if head == prefix and following in triggers:
                    return code, word[size:]
            table = self._transforms[size]
            if head in table:
                return table[head], word[size:]

        return word[:1], word[1:]

    def encode_alpha(self, word):
        """Return the alphabetic Wåhlin code for a word.

        This is :meth:`encode` with ``alphabetic=True``, after which the
        placeholder codes ``+`` and ``*`` are replaced by ``Ç`` and ``Š``
        throughout the result.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The alphabetic Wåhlin code value

        Examples
        --------
        >>> pe = Waahlin()
        >>> pe.encode_alpha('Christopher')
        'KRISTOFER'
        >>> pe.encode_alpha('Niall')
        'NJALL'
        >>> pe.encode_alpha('Smith')
        'SMITH'
        >>> pe.encode_alpha('Schmidt')
        'ŠMIDT'


        .. versionadded:: 0.4.0

        """
        return (
            self.encode(word, alphabetic=True)
            .replace('+', 'Ç')
            .replace('*', 'Š')
        )

    def encode(self, word, alphabetic=False):
        """Return the Wåhlin code for a word.

        Parameters
        ----------
        word : str
            The word to transform
        alphabetic : bool
            If True, the secondary encoder's alphabetic form
            (``.encode_alpha`` rather than ``.encode``) is used for the part
            of the word after the leading code. It has no effect when no
            secondary encoder was supplied.

        Returns
        -------
        str
            The Wåhlin code value; the empty string for empty input

        Examples
        --------
        >>> pe = Waahlin()
        >>> pe.encode('Christopher')
        'KRISTOFER'
        >>> pe.encode('Niall')
        'NJALL'
        >>> pe.encode('Smith')
        'SMITH'
        >>> pe.encode('Schmidt')
        '*MIDT'


        .. versionadded:: 0.4.0

        """
        # Uppercase, then compose to NFC so that Å, Ä and Ö are single code
        # points the rule tables can match. No characters are filtered out.
        word = unicode_normalize('NFC', word.upper())
        if not word:
            return ''

        if self._encoder is None:
            # No secondary encoder: apply the rules until nothing is left.
            parts = []
            while word:
                part, word = self._encode_next(word)
                parts.append(part)
            return ''.join(parts)

        # Secondary encoder: only the leading sound uses Wåhlin's rules.
        code, word = self._encode_next(word)
        if alphabetic:
            return code + self._encoder.encode_alpha(word)
        return code + self._encoder.encode(word)


if __name__ == '__main__':
    # Run as a script: execute the docstring examples in this module.
    from doctest import testmod

    testmod()