# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.phonetic._sfinx_bis.
SfinxBis
"""
from unicodedata import normalize as unicode_normalize
from deprecation import deprecated
from ._phonetic import _Phonetic
from .. import __version__
__all__ = ['SfinxBis', 'sfinxbis']
class SfinxBis(_Phonetic):
    """SfinxBis code.

    SfinxBis is a Soundex-like algorithm defined in :cite:`Axelsson:2009`.

    This implementation follows the reference implementation:
    :cite:`Sjoo:2009`.

    SfinxBis is intended chiefly for Swedish names.

    .. versionadded:: 0.3.6
    """

    # Nobility titles/particles stripped from names before encoding. Each
    # entry is space-delimited so only whole tokens match; multi-word titles
    # come first so they are removed before their single-word components.
    _adelstitler = (
        ' DE LA ',
        ' DE LAS ',
        ' DE LOS ',
        ' VAN DE ',
        ' VAN DEN ',
        ' VAN DER ',
        ' VON DEM ',
        ' VON DER ',
        ' AF ',
        ' AV ',
        ' DA ',
        ' DE ',
        ' DEL ',
        ' DEN ',
        ' DES ',
        ' DI ',
        ' DO ',
        ' DON ',
        ' DOS ',
        ' DU ',
        ' E ',
        ' IN ',
        ' LA ',
        ' LE ',
        ' MAC ',
        ' MC ',
        ' VAN ',
        ' VON ',
        ' Y ',
        ' S:T ',
    )

    # "Hard" and "soft" vowels (Swedish: harde/mjuka vokaler).
    _harde_vokaler = {'A', 'O', 'U', 'Å'}
    _mjuka_vokaler = {'E', 'I', 'Y', 'Ä', 'Ö'}

    # Uppercase consonants, and the full uppercase alphabet retained by
    # step 5 of encode() (A-Z plus Ä, Å and Ö).
    _uc_c_set = set('BCDFGHJKLMNPQRSTVWXZ')
    _uc_set = set('ABCDEFGHIJKLMNOPQRSTUVWXYZÄÅÖ')

    # Letter -> digit table used for everything after the first sound.
    # All vowels (and H) map to '9', which is deleted in step 11.
    _trans = dict(
        zip(
            (ord(_) for _ in 'BCDFGHJKLMNPQRSTVZAOUÅEIYÄÖ'),
            '123729224551268378999999999',
        )
    )

    # Folding of W, Z and accented letters onto the letters handled above.
    _substitutions = dict(
        zip(
            (ord(_) for _ in 'WZÀÁÂÃÆÇÈÉÊËÌÍÎÏÑÒÓÔÕØÙÚÛÜÝ'),
            'VSAAAAÄCEEEEIIIINOOOOÖUUUYY',
        )
    )

    # Digit (and '#') -> letter table used by encode_alpha().
    _alphabetic = dict(zip((ord(_) for _ in '123456789#'), 'PKTLNRFSAŠ'))

    def __init__(self, max_length=-1):
        """Initialize SfinxBis instance.

        Parameters
        ----------
        max_length : int
            The length of the code returned (defaults to unlimited). Codes
            are truncated only when this value is greater than 0.


        .. versionadded:: 0.4.0

        """
        self._max_length = max_length

    def encode_alpha(self, word):
        """Return the alphabetic SfinxBis code for a word.

        The numeric code from :py:meth:`encode` is computed first and its
        digits (and ``#``) are then mapped to letters.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        tuple
            The alphabetic SfinxBis value

        Examples
        --------
        >>> pe = SfinxBis()
        >>> pe.encode_alpha('Christopher')
        ('KRSTFR',)
        >>> pe.encode_alpha('Niall')
        ('NL',)
        >>> pe.encode_alpha('Smith')
        ('SNT',)
        >>> pe.encode_alpha('Schmidt')
        ('SNT',)

        >>> pe.encode_alpha('Johansson')
        ('JNSN',)
        >>> pe.encode_alpha('Sjöberg')
        ('ŠPRK',)


        .. versionadded:: 0.4.0

        """
        return tuple(
            code.translate(self._alphabetic) for code in self.encode(word)
        )

    def encode(self, word):
        """Return the SfinxBis code for a word.

        The input is upper-cased, stripped of nobility particles and split
        on whitespace; one code is produced per remaining token.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        tuple
            The SfinxBis value: one code string per token, or ``('',)``
            when no tokens remain

        Examples
        --------
        >>> pe = SfinxBis()
        >>> pe.encode('Christopher')
        ('K68376',)
        >>> pe.encode('Niall')
        ('N4',)
        >>> pe.encode('Smith')
        ('S53',)
        >>> pe.encode('Schmidt')
        ('S53',)

        >>> pe.encode('Johansson')
        ('J585',)
        >>> pe.encode('Sjöberg')
        ('#162',)


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # The replacements in _foersvensker are order-sensitive for
        # overlapping pairs (e.g. 'EYI' becomes 'EJI' or 'EJJ' depending on
        # whether E or Y is handled first). Iterating the sets directly
        # would make the order -- and therefore the result -- depend on
        # string hash randomisation, so a fixed sorted order is used.
        harde_ordning = sorted(self._harde_vokaler)
        mjuka_ordning = sorted(self._mjuka_vokaler)
        konsonant_ordning = sorted(self._uc_c_set)
        # Union of all vowels, built once instead of on every helper call.
        vokaler = self._mjuka_vokaler | self._harde_vokaler

        def _foersvensker(lokal_ordet):
            """Return the Swedish-ized form of the word.

            Parameters
            ----------
            lokal_ordet : str
                Word to transform

            Returns
            -------
            str
                Transformed word


            .. versionadded:: 0.1.0

            """
            # Fixed spelling rewrites, applied in this exact order.
            for gammal, ny in (
                ('STIERN', 'STJÄRN'),
                ('HIE', 'HJ'),
                ('SIÖ', 'SJÖ'),
                ('SCH', 'SH'),
                ('QU', 'KV'),
                ('IO', 'JO'),
                ('PH', 'F'),
            ):
                lokal_ordet = lokal_ordet.replace(gammal, ny)

            # Ü, Y or I directly after a vowel becomes J (hard vowels are
            # processed before soft vowels).
            for vokal in harde_ordning + mjuka_ordning:
                for efterljud in ('Ü', 'Y', 'I'):
                    lokal_ordet = lokal_ordet.replace(
                        vokal + efterljud, vokal + 'J'
                    )

            # An H directly before a consonant is dropped.
            if 'H' in lokal_ordet:
                for konsonant in konsonant_ordning:
                    lokal_ordet = lokal_ordet.replace(
                        'H' + konsonant, konsonant
                    )

            lokal_ordet = lokal_ordet.translate(self._substitutions)
            lokal_ordet = lokal_ordet.replace('Ð', 'ETH')
            lokal_ordet = lokal_ordet.replace('Þ', 'TH')
            return lokal_ordet

        def _koda_foersta_ljudet(lokal_ordet):
            """Return the word with the first sound coded.

            The first matching rule wins; a word matching no rule (including
            the empty string) is returned unchanged.

            Parameters
            ----------
            lokal_ordet : str
                Word to transform

            Returns
            -------
            str
                Transformed word


            .. versionadded:: 0.1.0

            """
            # Slices (not indexing) so short/empty words never raise.
            ett = lokal_ordet[0:1]
            andra = lokal_ordet[1:2]
            tredje = lokal_ordet[2:3]
            tva = lokal_ordet[0:2]

            if ett in vokaler:
                return '$' + lokal_ordet[1:]
            if tva in ('DJ', 'GJ', 'HJ', 'LJ'):
                return 'J' + lokal_ordet[2:]
            if ett == 'G' and andra in self._mjuka_vokaler:
                return 'J' + lokal_ordet[1:]
            if ett == 'Q':
                return 'K' + lokal_ordet[1:]
            if tva == 'CH' and tredje in vokaler:
                return '#' + lokal_ordet[2:]
            if ett == 'C' and (
                andra in self._harde_vokaler or andra in self._uc_c_set
            ):
                return 'K' + lokal_ordet[1:]
            if ett == 'X':
                return 'S' + lokal_ordet[1:]
            if ett == 'C' and andra in self._mjuka_vokaler:
                return 'S' + lokal_ordet[1:]
            if lokal_ordet[0:3] in ('SKJ', 'STJ', 'SCH'):
                return '#' + lokal_ordet[3:]
            if tva in ('SH', 'KJ', 'TJ', 'SJ'):
                return '#' + lokal_ordet[2:]
            if tva == 'SK' and tredje in self._mjuka_vokaler:
                return '#' + lokal_ordet[2:]
            if ett == 'K' and andra in self._mjuka_vokaler:
                return '#' + lokal_ordet[1:]
            return lokal_ordet

        # Step 1: uppercase (and treat hyphens as token separators)
        word = unicode_normalize('NFC', word.upper())
        word = word.replace('-', ' ')

        # Step 2: remove nobility prefixes
        for adelstitel in self._adelstitler:
            # Repeat until stable: removing one occurrence can expose
            # another that shared a delimiting space with it.
            while adelstitel in word:
                word = word.replace(adelstitel, ' ')
            # A title at the very start has no leading space to match on.
            if word.startswith(adelstitel[1:]):
                word = word[len(adelstitel) - 1 :]

        # Split word into tokens
        ordlista = word.split()

        # Step 3: remove doubled letters
        ordlista = [
            self._delete_consecutive_repeats(ordet) for ordet in ordlista
        ]
        if not ordlista:
            # noinspection PyRedundantParentheses
            return ('',)

        # Step 4: Swedification
        ordlista = [_foersvensker(ordet) for ordet in ordlista]

        # Step 5: remove all characters that are not A-Ö (65-90,196,197,214)
        ordlista = [
            ''.join(c for c in ordet if c in self._uc_set)
            for ordet in ordlista
        ]

        # Step 6: encode the first sound
        ordlista = [_koda_foersta_ljudet(ordet) for ordet in ordlista]

        # Step 7: split the name into two parts (first sound / rest)
        rest = [ordet[1:] for ordet in ordlista]

        # Step 8: perform phonetic transformation on the rest
        rest = [ordet.replace('DT', 'T') for ordet in rest]
        rest = [ordet.replace('X', 'KS') for ordet in rest]

        # Step 9: encode the rest as a numeric code
        # (C before a soft vowel is coded '8'; the C+vowel pairs are
        # disjoint, so iteration order does not matter here)
        for vokal in self._mjuka_vokaler:
            rest = [ordet.replace('C' + vokal, '8' + vokal) for ordet in rest]
        rest = [ordet.translate(self._trans) for ordet in rest]

        # Step 10: remove adjacent duplicates
        rest = [self._delete_consecutive_repeats(ordet) for ordet in rest]

        # Step 11: remove all "9"
        rest = [ordet.replace('9', '') for ordet in rest]

        # Step 12: put the parts back together
        ordlista = [
            ordet[0:1] + kod for ordet, kod in zip(ordlista, rest)
        ]

        # truncate, if max_length is set
        if self._max_length > 0:
            ordlista = [ordet[: self._max_length] for ordet in ordlista]

        return tuple(ordlista)
@deprecated(
    deprecated_in='0.4.0',
    removed_in='0.6.0',
    current_version=__version__,
    details='Use the SfinxBis.encode method instead.',
)
def sfinxbis(word, max_length=-1):
    """Return the SfinxBis code for a word.

    This is a deprecated convenience wrapper: it builds a
    :py:class:`SfinxBis` instance with the given ``max_length`` and returns
    the result of :py:meth:`SfinxBis.encode` for ``word``.

    Parameters
    ----------
    word : str
        The word to transform
    max_length : int
        The length of the code returned (defaults to unlimited)

    Returns
    -------
    tuple
        The SfinxBis value

    Examples
    --------
    >>> sfinxbis('Christopher')
    ('K68376',)
    >>> sfinxbis('Niall')
    ('N4',)
    >>> sfinxbis('Smith')
    ('S53',)
    >>> sfinxbis('Schmidt')
    ('S53',)

    >>> sfinxbis('Johansson')
    ('J585',)
    >>> sfinxbis('Sjöberg')
    ('#162',)

    .. versionadded:: 0.1.0

    """
    encoder = SfinxBis(max_length)
    return encoder.encode(word)
if __name__ == '__main__':
    # When executed as a script, run the doctest examples embedded in this
    # module's docstrings.
    from doctest import testmod

    testmod()