Source code for abydos.phonetic._sv

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._sv.

The phonetic._sv module implements phonetic algorithms for Scandinavian names
& languages (currently Swedish & Norwegian), including:

    - SfinxBis
    - Norphone
"""

from __future__ import unicode_literals

from unicodedata import normalize as unicode_normalize

from six import text_type

from ._util import _delete_consecutive_repeats

__all__ = ['norphone', 'sfinxbis']


[docs]def sfinxbis(word, max_length=-1): """Return the SfinxBis code for a word. SfinxBis is a Soundex-like algorithm defined in :cite:`Axelsson:2009`. This implementation follows the reference implementation: :cite:`Sjoo:2009`. SfinxBis is intended chiefly for Swedish names. :param str word: the word to transform :param int max_length: the length of the code returned (defaults to unlimited) :returns: the SfinxBis value :rtype: tuple >>> sfinxbis('Christopher') ('K68376',) >>> sfinxbis('Niall') ('N4',) >>> sfinxbis('Smith') ('S53',) >>> sfinxbis('Schmidt') ('S53',) >>> sfinxbis('Johansson') ('J585',) >>> sfinxbis('Sjöberg') ('#162',) """ adelstitler = ( ' DE LA ', ' DE LAS ', ' DE LOS ', ' VAN DE ', ' VAN DEN ', ' VAN DER ', ' VON DEM ', ' VON DER ', ' AF ', ' AV ', ' DA ', ' DE ', ' DEL ', ' DEN ', ' DES ', ' DI ', ' DO ', ' DON ', ' DOS ', ' DU ', ' E ', ' IN ', ' LA ', ' LE ', ' MAC ', ' MC ', ' VAN ', ' VON ', ' Y ', ' S:T ', ) _harde_vokaler = {'A', 'O', 'U', 'Å'} _mjuka_vokaler = {'E', 'I', 'Y', 'Ä', 'Ö'} _konsonanter = { 'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Z', } _alfabet = { 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'Ä', 'Å', 'Ö', } _sfinxbis_translation = dict( zip( (ord(_) for _ in 'BCDFGHJKLMNPQRSTVZAOUÅEIYÄÖ'), '123729224551268378999999999', ) ) _sfinxbis_substitutions = dict( zip( (ord(_) for _ in 'WZÀÁÂÃÆÇÈÉÊËÌÍÎÏÑÒÓÔÕØÙÚÛÜÝ'), 'VSAAAAÄCEEEEIIIINOOOOÖUUUYY', ) ) def _foersvensker(lokal_ordet): """Return the Swedish-ized form of the word.""" lokal_ordet = lokal_ordet.replace('STIERN', 'STJÄRN') lokal_ordet = lokal_ordet.replace('HIE', 'HJ') lokal_ordet = lokal_ordet.replace('SIÖ', 'SJÖ') lokal_ordet = lokal_ordet.replace('SCH', 'SH') lokal_ordet = lokal_ordet.replace('QU', 'KV') lokal_ordet = lokal_ordet.replace('IO', 'JO') lokal_ordet = lokal_ordet.replace('PH', 'F') for i in _harde_vokaler: lokal_ordet = lokal_ordet.replace(i + 'Ü', i + 'J') lokal_ordet = lokal_ordet.replace(i + 'Y', i + 'J') lokal_ordet = lokal_ordet.replace(i + 'I', i + 'J') for i in _mjuka_vokaler: lokal_ordet = lokal_ordet.replace(i + 'Ü', i + 'J') lokal_ordet = lokal_ordet.replace(i + 'Y', i + 'J') lokal_ordet = lokal_ordet.replace(i + 'I', i + 'J') if 'H' in lokal_ordet: for i in _konsonanter: lokal_ordet = lokal_ordet.replace('H' + i, i) lokal_ordet = lokal_ordet.translate(_sfinxbis_substitutions) lokal_ordet = lokal_ordet.replace('Ð', 'ETH') lokal_ordet = lokal_ordet.replace('Þ', 'TH') lokal_ordet = lokal_ordet.replace('ß', 'SS') return lokal_ordet def _koda_foersta_ljudet(lokal_ordet): """Return the word with the first sound coded.""" if ( lokal_ordet[0:1] in _mjuka_vokaler or lokal_ordet[0:1] in _harde_vokaler ): lokal_ordet = '$' + lokal_ordet[1:] elif lokal_ordet[0:2] in ('DJ', 'GJ', 'HJ', 'LJ'): lokal_ordet = 'J' + lokal_ordet[2:] elif lokal_ordet[0:1] == 'G' and lokal_ordet[1:2] in _mjuka_vokaler: lokal_ordet = 'J' + lokal_ordet[1:] elif lokal_ordet[0:1] == 'Q': lokal_ordet = 'K' + lokal_ordet[1:] elif lokal_ordet[0:2] == 'CH' and lokal_ordet[2:3] in frozenset( _mjuka_vokaler | _harde_vokaler ): lokal_ordet = '#' + lokal_ordet[2:] elif lokal_ordet[0:1] == 'C' and lokal_ordet[1:2] in _harde_vokaler: lokal_ordet = 'K' + lokal_ordet[1:] elif lokal_ordet[0:1] == 'C' and lokal_ordet[1:2] in _konsonanter: lokal_ordet = 'K' + lokal_ordet[1:] elif lokal_ordet[0:1] == 'X': lokal_ordet = 'S' + lokal_ordet[1:] elif lokal_ordet[0:1] == 'C' and lokal_ordet[1:2] in _mjuka_vokaler: lokal_ordet = 'S' + lokal_ordet[1:] elif lokal_ordet[0:3] in ('SKJ', 'STJ', 'SCH'): lokal_ordet = '#' + lokal_ordet[3:] elif lokal_ordet[0:2] in ('SH', 'KJ', 'TJ', 'SJ'): lokal_ordet = '#' + lokal_ordet[2:] elif lokal_ordet[0:2] == 'SK' and lokal_ordet[2:3] in _mjuka_vokaler: lokal_ordet = '#' + lokal_ordet[2:] elif lokal_ordet[0:1] == 'K' and lokal_ordet[1:2] in _mjuka_vokaler: lokal_ordet = '#' + lokal_ordet[1:] return lokal_ordet # Steg 1, Versaler word = unicode_normalize('NFC', text_type(word.upper())) word = word.replace('ß', 'SS') word = word.replace('-', ' ') # Steg 2, Ta bort adelsprefix for adelstitel in adelstitler: while adelstitel in word: word = word.replace(adelstitel, ' ') if word.startswith(adelstitel[1:]): word = word[len(adelstitel) - 1 :] # Split word into tokens ordlista = word.split() # Steg 3, Ta bort dubbelteckning i början på namnet ordlista = [_delete_consecutive_repeats(ordet) for ordet in ordlista] if not ordlista: # noinspection PyRedundantParentheses return ('',) # Steg 4, Försvenskning ordlista = [_foersvensker(ordet) for ordet in ordlista] # Steg 5, Ta bort alla tecken som inte är A-Ö (65-90,196,197,214) ordlista = [ ''.join(c for c in ordet if c in _alfabet) for ordet in ordlista ] # Steg 6, Koda första ljudet ordlista = [_koda_foersta_ljudet(ordet) for ordet in ordlista] # Steg 7, Dela upp namnet i två delar rest = [ordet[1:] for ordet in ordlista] # Steg 8, Utför fonetisk transformation i resten rest = [ordet.replace('DT', 'T') for ordet in rest] rest = [ordet.replace('X', 'KS') for ordet in rest] # Steg 9, Koda resten till en sifferkod for vokal in _mjuka_vokaler: rest = [ordet.replace('C' + vokal, '8' + vokal) for ordet in rest] rest = [ordet.translate(_sfinxbis_translation) for ordet in rest] # Steg 10, Ta bort intilliggande dubbletter rest = [_delete_consecutive_repeats(ordet) for ordet in rest] # Steg 11, Ta bort alla "9" rest = [ordet.replace('9', '') for ordet in rest] # Steg 12, Sätt ihop delarna igen ordlista = [ ''.join(ordet) for ordet in zip((_[0:1] for _ in ordlista), rest) ] # truncate, if max_length is set if max_length > 0: ordlista = [ordet[:max_length] for ordet in ordlista] return tuple(ordlista)
[docs]def norphone(word): """Return the Norphone code. The reference implementation by Lars Marius Garshol is available in :cite:`Garshol:2015`. Norphone was designed for Norwegian, but this implementation has been extended to support Swedish vowels as well. This function incorporates the "not implemented" rules from the above file's rule set. :param str word: the word to transform :returns: the Norphone code :rtype: str >>> norphone('Hansen') 'HNSN' >>> norphone('Larsen') 'LRSN' >>> norphone('Aagaard') 'ÅKRT' >>> norphone('Braaten') 'BRTN' >>> norphone('Sandvik') 'SNVK' """ _vowels = {'A', 'E', 'I', 'O', 'U', 'Y', 'Å', 'Æ', 'Ø', 'Ä', 'Ö'} replacements = { 4: {'SKEI': 'X'}, 3: {'SKJ': 'X', 'KEI': 'X'}, 2: { 'CH': 'K', 'CK': 'K', 'GJ': 'J', 'GH': 'K', 'HG': 'K', 'HJ': 'J', 'HL': 'L', 'HR': 'R', 'KJ': 'X', 'KI': 'X', 'LD': 'L', 'ND': 'N', 'PH': 'F', 'TH': 'T', 'SJ': 'X', }, 1: {'W': 'V', 'X': 'KS', 'Z': 'S', 'D': 'T', 'G': 'K'}, } word = word.upper() code = '' skip = 0 if word[0:2] == 'AA': code = 'Å' skip = 2 elif word[0:2] == 'GI': code = 'J' skip = 2 elif word[0:3] == 'SKY': code = 'X' skip = 3 elif word[0:2] == 'EI': code = 'Æ' skip = 2 elif word[0:2] == 'KY': code = 'X' skip = 2 elif word[:1] == 'C': code = 'K' skip = 1 elif word[:1] == 'Ä': code = 'Æ' skip = 1 elif word[:1] == 'Ö': code = 'Ø' skip = 1 if word[-2:] == 'DT': word = word[:-2] + 'T' # Though the rules indicate this rule applies in all positions, the # reference implementation indicates it applies only in final position. elif word[-2:-1] in _vowels and word[-1:] == 'D': word = word[:-2] for pos, char in enumerate(word): if skip: skip -= 1 else: for length in sorted(replacements, reverse=True): if word[pos : pos + length] in replacements[length]: code += replacements[length][word[pos : pos + length]] skip = length - 1 break else: if not pos or char not in _vowels: code += char code = _delete_consecutive_repeats(code) return code
if __name__ == '__main__': import doctest doctest.testmod()