# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.phonetic._haase.
Haase Phonetik
"""
from itertools import product
from unicodedata import normalize as unicode_normalize
from deprecation import deprecated
from ._phonetic import _Phonetic
from .. import __version__
# Names exported by a star-import of this module.
__all__ = [
    'Haase',
    'haase_phonetik',
]
class Haase(_Phonetic):
    """Haase Phonetik.

    Based on the algorithm described at :cite:`Prante:2015`.

    Based on the original :cite:`Haase:2000`.

    .. versionadded:: 0.3.6
    """

    # Letters that are encoded as vowels (code digit 9).
    _uc_v_set = set('AEIJOUY')

    # Translation table (code point of digit -> letter) used by encode_alpha
    # to turn a numeric code into its alphabetic form.
    _alphabetic = dict(zip((ord(_) for _ in '123456789'), 'PTFKLNRSA'))

    # Three-letter sequences paired with the alternative spelling that is
    # encoded in addition to the original one.
    _len_3_vars = {
        'OWN': 'AUN',
        'WSK': 'RSK',
        'SCH': 'CH',
        'GLI': 'LI',
        'AUX': 'O',
        'EUX': 'O',
    }

    # Following letters that make a C code as 4 instead of 8; the set is
    # larger for a word-initial C (it additionally contains L and R).
    _c_initial_hard = frozenset('AHKLOQRUX')
    _c_medial_hard = frozenset('AHKOQUX')

    def __init__(self, primary_only=False):
        """Initialize Haase instance.

        Parameters
        ----------
        primary_only : bool
            If True, only the primary code is returned


        .. versionadded:: 0.4.0

        """
        self._primary_only = primary_only

    def encode_alpha(self, word):
        """Return the alphabetic Haase Phonetik code for a word.

        Each numeric code produced by :py:meth:`encode` is translated digit
        by digit (1-9 become P, T, F, K, L, N, R, S, A).

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        tuple
            The alphabetic Haase Phonetik value

        Examples
        --------
        >>> pe = Haase()
        >>> pe.encode_alpha('Joachim')
        ('AKAN',)
        >>> pe.encode_alpha('Christoph')
        ('KRASTAF', 'SRASTAF')
        >>> pe.encode_alpha('Jörg')
        ('ARK',)
        >>> pe.encode_alpha('Smith')
        ('SNAT',)
        >>> pe.encode_alpha('Schmidt')
        ('SNAT', 'KNAT')


        .. versionadded:: 0.4.0

        """
        return tuple(
            code.translate(self._alphabetic) for code in self.encode(word)
        )

    def encode(self, word):
        """Return the Haase Phonetik (numeric output) code for a word.

        While the output code is numeric, it is nevertheless a str.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        tuple
            The Haase Phonetik value as a numeric string. Unless the
            instance was created with ``primary_only=True``, one code per
            spelling variant is returned, with duplicates removed and the
            order of first occurrence preserved.

        Examples
        --------
        >>> pe = Haase()
        >>> pe.encode('Joachim')
        ('9496',)
        >>> pe.encode('Christoph')
        ('4798293', '8798293')
        >>> pe.encode('Jörg')
        ('974',)
        >>> pe.encode('Smith')
        ('8692',)
        >>> pe.encode('Schmidt')
        ('8692', '4692')


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        word = unicode_normalize('NFKD', word.upper())
        # NFKD splits a precomposed umlaut into its base vowel followed by
        # U+0308 (combining diaeresis), so the umlaut has to be matched in
        # that decomposed form; searching for the precomposed character
        # after normalization can never succeed.
        for vowel in 'AOU':
            word = word.replace(vowel + '\u0308', vowel + 'E')
        # Drop everything that is not in _uc_set (defined outside this
        # class; presumably the upper-case letters), which also removes any
        # remaining combining marks.
        word = ''.join(c for c in word if c in self._uc_set)

        seen = set()
        unique_codes = []
        for variant in self._spelling_variants(word):
            code = self._encode_variant(variant)
            if code not in seen:
                seen.add(code)
                unique_codes.append(code)
        return tuple(unique_codes)

    def _spelling_variants(self, word):
        """Return the spellings of a cleaned word that are to be encoded.

        Parameters
        ----------
        word : str
            The upper-cased, filtered word

        Returns
        -------
        list
            Only the word itself if the instance is primary-only; otherwise
            every combination of the alternative spellings found in the
            word, starting with the unmodified spelling

        """
        if self._primary_only:
            return [word]

        # Each entry holds the alternatives for one segment of the word; a
        # segment without alternatives is a 1-tuple.
        segments = []
        pos = 0
        # The initial-CH rule is applied once, before the general scan.
        if word.startswith('CH'):
            segments.append(('CH', 'SCH'))
            pos = 2

        while pos < len(word):
            trigram = word[pos : pos + 3]
            rest = word[pos:]
            if word.startswith('ILLE', pos):
                segments.append(('ILLE', 'I'))
                pos += 4
            elif trigram in self._len_3_vars:
                segments.append((trigram, self._len_3_vars[trigram]))
                pos += 3
            elif word.startswith('RB', pos):
                segments.append(('RB', 'RW'))
                pos += 2
            elif rest == 'EAU':  # only at the very end of the word
                segments.append(('EAU', 'O'))
                pos += 3
            elif rest == 'O':  # word-final O
                segments.append(('O', 'OW'))
                pos += 1
            elif rest == 'A':  # word-final A
                segments.append(('A', 'AR'))
                pos += 1
            else:
                segments.append((word[pos],))
                pos += 1

        return [''.join(choice) for choice in product(*segments)]

    def _encode_variant(self, word):
        """Return the numeric code of a single spelling variant.

        Letters that match no rule (e.g. H) contribute nothing to the code.

        Parameters
        ----------
        word : str
            One spelling variant of the cleaned word

        Returns
        -------
        str
            The numeric code with runs of identical digits collapsed by
            ``_delete_consecutive_repeats``

        """
        digits = []
        last = len(word) - 1
        for i, char in enumerate(word):
            # Neighbouring letters; '' at the word boundaries, which is
            # never a member of any of the letter sets tested below.
            prev_char = word[i - 1] if i > 0 else ''
            next_char = word[i + 1] if i < last else ''

            if char in self._uc_v_set:
                digits.append('9')
            elif char == 'B':
                digits.append('1')
            elif char == 'P':
                digits.append('3' if next_char == 'H' else '1')
            elif char in {'D', 'T'}:
                digits.append('8' if next_char in {'C', 'S', 'Z'} else '2')
            elif char in {'F', 'V', 'W'}:
                digits.append('3')
            elif char in {'G', 'K', 'Q'}:
                digits.append('4')
            elif char == 'C':
                if prev_char in {'S', 'Z'}:
                    digits.append('8')
                elif i == 0:
                    digits.append(
                        '4' if next_char in self._c_initial_hard else '8'
                    )
                elif next_char in self._c_medial_hard:
                    digits.append('4')
                else:
                    digits.append('8')
            elif char == 'X':
                digits.append('8' if prev_char in {'C', 'K', 'Q'} else '48')
            elif char == 'L':
                digits.append('5')
            elif char in {'M', 'N'}:
                digits.append('6')
            elif char == 'R':
                digits.append('7')
            elif char in {'S', 'Z'}:
                digits.append('8')

        return self._delete_consecutive_repeats(''.join(digits))
@deprecated(
    deprecated_in='0.4.0',
    removed_in='0.6.0',
    current_version=__version__,
    details='Use the Haase.encode method instead.',
)
def haase_phonetik(word, primary_only=False):
    """Return the Haase Phonetik code for a word.

    This is a wrapper for :py:meth:`Haase.encode`.

    Parameters
    ----------
    word : str
        The word to transform
    primary_only : bool
        If True, only the primary code is returned

    Returns
    -------
    tuple
        The Haase Phonetik value as a numeric string

    Examples
    --------
    >>> haase_phonetik('Joachim')
    ('9496',)
    >>> haase_phonetik('Christoph')
    ('4798293', '8798293')
    >>> haase_phonetik('Jörg')
    ('974',)
    >>> haase_phonetik('Smith')
    ('8692',)
    >>> haase_phonetik('Schmidt')
    ('8692', '4692')

    .. versionadded:: 0.3.0

    """
    # A fresh encoder is built per call; primary_only is forwarded as-is.
    return Haase(primary_only).encode(word)
# When executed as a script, run the doctests embedded in this module.
if __name__ == '__main__':
    from doctest import testmod

    testmod()