# Source code for abydos.phonetic._sv

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._sv.

The phonetic._sv module implements phonetic algorithms for Scandinavian names
& languages (currently Swedish & Norwegian), including:

    - SfinxBis
    - Norphone
"""

from __future__ import unicode_literals

from unicodedata import normalize as unicode_normalize

from six import text_type

from ._util import _delete_consecutive_repeats

__all__ = ['norphone', 'sfinxbis']


def sfinxbis(word, max_length=-1):
    """Return the SfinxBis code for a word.

    SfinxBis is a Soundex-like algorithm defined in :cite:`Axelsson:2009`.

    This implementation follows the reference implementation:
    :cite:`Sjoo:2009`.

    SfinxBis is intended chiefly for Swedish names.

    The input is upper-cased and NFC-normalized, nobility particles are
    removed, and the remainder is split on whitespace (hyphens count as
    whitespace). Each resulting token is encoded separately, so the result
    holds one code per token. Input without any tokens yields ``('',)``.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to
        unlimited); values that are not positive disable truncation
    :returns: the SfinxBis value, one code per name token
    :rtype: tuple

    >>> sfinxbis('Christopher')
    ('K68376',)
    >>> sfinxbis('Niall')
    ('N4',)
    >>> sfinxbis('Smith')
    ('S53',)
    >>> sfinxbis('Schmidt')
    ('S53',)

    >>> sfinxbis('Johansson')
    ('J585',)
    >>> sfinxbis('Sjöberg')
    ('#162',)
    """
    # Nobility particles removed in step 2, listed with the multi-word
    # forms first. Each is space-delimited so only whole words match.
    adelstitler = (
        ' DE LA ',
        ' DE LAS ',
        ' DE LOS ',
        ' VAN DE ',
        ' VAN DEN ',
        ' VAN DER ',
        ' VON DEM ',
        ' VON DER ',
        ' AF ',
        ' AV ',
        ' DA ',
        ' DE ',
        ' DEL ',
        ' DEN ',
        ' DES ',
        ' DI ',
        ' DO ',
        ' DON ',
        ' DOS ',
        ' DU ',
        ' E ',
        ' IN ',
        ' LA ',
        ' LE ',
        ' MAC ',
        ' MC ',
        ' VAN ',
        ' VON ',
        ' Y ',
        ' S:T ',
    )

    # The vowel classes are declared as ordered tuples and only then turned
    # into sets. Sets of strings iterate in a hash-dependent order that can
    # differ between interpreter runs, and the soft-vowel rewrites in
    # _foersvensker overlap ('I' and 'Y' are both a leading vowel and a
    # rewritten second letter, e.g. in 'EIY'), so iterating the sets there
    # could give different codes for the same input.
    _harde_vokaler_ordnade = ('A', 'O', 'U', 'Å')
    _mjuka_vokaler_ordnade = ('E', 'I', 'Y', 'Ä', 'Ö')
    _harde_vokaler = frozenset(_harde_vokaler_ordnade)
    _mjuka_vokaler = frozenset(_mjuka_vokaler_ordnade)
    _vokaler = _harde_vokaler | _mjuka_vokaler
    _konsonanter = frozenset('BCDFGHJKLMNPQRSTVWXZ')
    _alfabet = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZÄÅÖ')

    # Step 9 table: consonants map to digit classes, every vowel maps to
    # the placeholder '9', which is deleted again in step 11.
    _sfinxbis_translation = dict(
        zip(
            (ord(_) for _ in 'BCDFGHJKLMNPQRSTVZAOUÅEIYÄÖ'),
            '123729224551268378999999999',
        )
    )

    # Step 4 table: folds W, Z and accented/foreign letters onto the
    # letters of the Swedish alphabet.
    _sfinxbis_substitutions = dict(
        zip(
            (ord(_) for _ in 'WZÀÁÂÃÆÇÈÉÊËÌÍÎÏÑÒÓÔÕØÙÚÛÜÝ'),
            'VSAAAAÄCEEEEIIIINOOOOÖUUUYY',
        )
    )

    def _foersvensker(lokal_ordet):
        """Return the Swedish-ized form of the word."""
        # Fixed spelling rewrites, applied in this exact order.
        for gammal, ny in (
            ('STIERN', 'STJÄRN'),
            ('HIE', 'HJ'),
            ('SIÖ', 'SJÖ'),
            ('SCH', 'SH'),
            ('QU', 'KV'),
            ('IO', 'JO'),
            ('PH', 'F'),
        ):
            lokal_ordet = lokal_ordet.replace(gammal, ny)

        # A vowel followed by Ü, Y or I: the second letter becomes J.
        # Hard vowels are handled before soft ones, each class in its
        # declared order, which keeps the result reproducible.
        for vokal in _harde_vokaler_ordnade + _mjuka_vokaler_ordnade:
            for andra in ('Ü', 'Y', 'I'):
                lokal_ordet = lokal_ordet.replace(
                    vokal + andra, vokal + 'J'
                )

        # Drop an H that directly precedes a consonant.
        if 'H' in lokal_ordet:
            for konsonant in sorted(_konsonanter):
                lokal_ordet = lokal_ordet.replace('H' + konsonant, konsonant)

        lokal_ordet = lokal_ordet.translate(_sfinxbis_substitutions)

        # Letters that expand to more than one character cannot go through
        # the one-to-one translation table above.
        lokal_ordet = lokal_ordet.replace('Ð', 'ETH')
        lokal_ordet = lokal_ordet.replace('Þ', 'TH')
        lokal_ordet = lokal_ordet.replace('ß', 'SS')

        return lokal_ordet

    def _koda_foersta_ljudet(lokal_ordet):
        """Return the word with the first sound coded.

        The first matching rule wins, so the order of the branches matters.
        Slices are used instead of indexing so that empty or very short
        tokens simply match no rule.
        """
        if lokal_ordet[0:1] in _vokaler:
            # Any initial vowel is coded as '$'.
            lokal_ordet = '$' + lokal_ordet[1:]
        elif lokal_ordet[0:2] in ('DJ', 'GJ', 'HJ', 'LJ'):
            lokal_ordet = 'J' + lokal_ordet[2:]
        elif lokal_ordet[0:1] == 'G' and lokal_ordet[1:2] in _mjuka_vokaler:
            lokal_ordet = 'J' + lokal_ordet[1:]
        elif lokal_ordet[0:1] == 'Q':
            lokal_ordet = 'K' + lokal_ordet[1:]
        elif lokal_ordet[0:2] == 'CH' and lokal_ordet[2:3] in _vokaler:
            lokal_ordet = '#' + lokal_ordet[2:]
        elif lokal_ordet[0:1] == 'C' and lokal_ordet[1:2] in _harde_vokaler:
            lokal_ordet = 'K' + lokal_ordet[1:]
        elif lokal_ordet[0:1] == 'C' and lokal_ordet[1:2] in _konsonanter:
            lokal_ordet = 'K' + lokal_ordet[1:]
        elif lokal_ordet[0:1] == 'X':
            lokal_ordet = 'S' + lokal_ordet[1:]
        elif lokal_ordet[0:1] == 'C' and lokal_ordet[1:2] in _mjuka_vokaler:
            lokal_ordet = 'S' + lokal_ordet[1:]
        elif lokal_ordet[0:3] in ('SKJ', 'STJ', 'SCH'):
            lokal_ordet = '#' + lokal_ordet[3:]
        elif lokal_ordet[0:2] in ('SH', 'KJ', 'TJ', 'SJ'):
            lokal_ordet = '#' + lokal_ordet[2:]
        elif lokal_ordet[0:2] == 'SK' and lokal_ordet[2:3] in _mjuka_vokaler:
            lokal_ordet = '#' + lokal_ordet[2:]
        elif lokal_ordet[0:1] == 'K' and lokal_ordet[1:2] in _mjuka_vokaler:
            lokal_ordet = '#' + lokal_ordet[1:]
        return lokal_ordet

    # Step 1: upper-case (and NFC-normalize so that accented letters are
    # single code points matching the tables above)
    word = unicode_normalize('NFC', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = word.replace('-', ' ')

    # Step 2: remove nobility prefixes, both inside the name and at its
    # very start (where there is no leading space to match on)
    for adelstitel in adelstitler:
        while adelstitel in word:
            word = word.replace(adelstitel, ' ')
        if word.startswith(adelstitel[1:]):
            word = word[len(adelstitel) - 1 :]

    # Split word into tokens
    ordlista = word.split()

    # Step 3: collapse runs of repeated characters in each token
    ordlista = [_delete_consecutive_repeats(ordet) for ordet in ordlista]
    if not ordlista:
        # noinspection PyRedundantParentheses
        return ('',)

    # Step 4: Swedish-ize the spelling
    ordlista = [_foersvensker(ordet) for ordet in ordlista]

    # Step 5: remove every character that is not in A-Ö
    # (code points 65-90, 196, 197, 214)
    ordlista = [
        ''.join(c for c in ordet if c in _alfabet) for ordet in ordlista
    ]

    # Step 6: code the first sound
    ordlista = [_koda_foersta_ljudet(ordet) for ordet in ordlista]

    # Step 7: split each name into its first character and the rest
    rest = [ordet[1:] for ordet in ordlista]

    # Step 8: phonetic transformation of the rest
    rest = [ordet.replace('DT', 'T') for ordet in rest]
    rest = [ordet.replace('X', 'KS') for ordet in rest]

    # Step 9: code the rest as digits; C before a soft vowel is coded '8'
    # first, since the translation table would otherwise code it '2'
    for vokal in _mjuka_vokaler_ordnade:
        rest = [ordet.replace('C' + vokal, '8' + vokal) for ordet in rest]
    rest = [ordet.translate(_sfinxbis_translation) for ordet in rest]

    # Step 10: remove adjacent duplicates
    rest = [_delete_consecutive_repeats(ordet) for ordet in rest]

    # Step 11: remove every '9' (the vowel placeholder)
    rest = [ordet.replace('9', '') for ordet in rest]

    # Step 12: put the first character and the coded rest back together
    ordlista = [
        ''.join(ordet) for ordet in zip((_[0:1] for _ in ordlista), rest)
    ]

    # truncate, if max_length is set
    if max_length > 0:
        ordlista = [ordet[:max_length] for ordet in ordlista]

    return tuple(ordlista)


def norphone(word):
    """Return the Norphone code.

    The reference implementation by Lars Marius Garshol is available in
    :cite:`Garshol:2015`.

    Norphone was designed for Norwegian, but this implementation has been
    extended to support Swedish vowels as well. This function incorporates
    the "not implemented" rules from the above file's rule set.

    The word is upper-cased and NFC-normalized before encoding, so that a
    letter such as 'Å' is recognized whether it arrives precomposed or as
    a base letter plus combining mark.

    :param str word: the word to transform
    :returns: the Norphone code
    :rtype: str

    >>> norphone('Hansen')
    'HNSN'
    >>> norphone('Larsen')
    'LRSN'
    >>> norphone('Aagaard')
    'ÅKRT'
    >>> norphone('Braaten')
    'BRTN'
    >>> norphone('Sandvik')
    'SNVK'
    """
    _vowels = {'A', 'E', 'I', 'O', 'U', 'Y', 'Å', 'Æ', 'Ø', 'Ä', 'Ö'}

    # Rules that apply only at the very start of the word, as
    # (prefix, code) pairs. The first matching prefix wins.
    initial_rules = (
        ('AA', 'Å'),
        ('GI', 'J'),
        ('SKY', 'X'),
        ('EI', 'Æ'),
        ('KY', 'X'),
        ('C', 'K'),
        ('Ä', 'Æ'),
        ('Ö', 'Ø'),
    )

    # Substring rewrites grouped by substring length and stored longest
    # first, so that the longest match at a position is always preferred
    # without re-sorting the lengths for every character.
    replacements = (
        (4, {'SKEI': 'X'}),
        (3, {'SKJ': 'X', 'KEI': 'X'}),
        (
            2,
            {
                'CH': 'K',
                'CK': 'K',
                'GJ': 'J',
                'GH': 'K',
                'HG': 'K',
                'HJ': 'J',
                'HL': 'L',
                'HR': 'R',
                'KJ': 'X',
                'KI': 'X',
                'LD': 'L',
                'ND': 'N',
                'PH': 'F',
                'TH': 'T',
                'SJ': 'X',
            },
        ),
        (1, {'W': 'V', 'X': 'KS', 'Z': 'S', 'D': 'T', 'G': 'K'}),
    )

    word = unicode_normalize('NFC', text_type(word.upper()))

    code = ''
    # Number of upcoming characters already consumed by an earlier rule.
    skip = 0

    for prefix, initial_code in initial_rules:
        if word.startswith(prefix):
            code = initial_code
            skip = len(prefix)
            break

    if word[-2:] == 'DT':
        word = word[:-2] + 'T'
    # Though the rules indicate this rule applies in all positions, the
    # reference implementation indicates it applies only in final position.
    elif word[-2:-1] in _vowels and word[-1:] == 'D':
        # Both the final D and the vowel before it are dropped.
        word = word[:-2]

    for pos, char in enumerate(word):
        if skip:
            skip -= 1
            continue

        for length, table in replacements:
            chunk = word[pos : pos + length]
            if chunk in table:
                code += table[chunk]
                # The remaining characters of the matched chunk are spent.
                skip = length - 1
                break
        else:
            # No rewrite matched: keep the character, except that vowels
            # are kept only in the first position of the word.
            if not pos or char not in _vowels:
                code += char

    code = _delete_consecutive_repeats(code)

    return code


# When executed as a script, run the examples embedded in the docstrings.
if __name__ == '__main__':
    from doctest import testmod

    testmod()