# -*- coding: utf-8 -*-
# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.phonetic._de.
The phonetic._de module implements the Kölner Phonetik and related
algorithms for German:
- Kölner Phonetik
- Phonem
- Haase Phonetik
- Reth-Schek Phonetik
"""
from __future__ import unicode_literals
from itertools import product
from unicodedata import normalize as unicode_normalize
from six import text_type
from six.moves import range
from ._util import _delete_consecutive_repeats
# Public API of this module: the names exported by ``from ... import *``.
__all__ = [
'haase_phonetik',
'koelner_phonetik',
'koelner_phonetik_alpha',
'koelner_phonetik_num_to_alpha',
'phonem',
'reth_schek_phonetik',
]
def koelner_phonetik(word):
    """Return the Kölner Phonetik (numeric output) code for a word.

    Based on the algorithm defined by :cite:`Postel:1969`.

    While the output code is numeric, it is still a str because 0s can lead
    the code.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as a numeric string
    :rtype: str

    >>> koelner_phonetik('Christopher')
    '478237'
    >>> koelner_phonetik('Niall')
    '65'
    >>> koelner_phonetik('Smith')
    '862'
    >>> koelner_phonetik('Schmidt')
    '862'
    >>> koelner_phonetik('Müller')
    '657'
    >>> koelner_phonetik('Zimmermann')
    '86766'
    """

    def _after(word, pos, letters):
        """Return True if word[pos] follows one of the supplied letters."""
        return pos > 0 and word[pos - 1] in letters

    def _before(word, pos, letters):
        """Return True if word[pos] precedes one of the supplied letters."""
        return pos + 1 < len(word) and word[pos + 1] in letters

    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}
    _uc_letters = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    # NFKD has already split every umlaut into its base letter followed by
    # U+0308 (combining diaeresis), so a precomposed umlaut can no longer
    # occur here; the expansion has to match the decomposed pair instead.
    word = word.replace('A\u0308', 'AE')
    word = word.replace('O\u0308', 'OE')
    word = word.replace('U\u0308', 'UE')
    # Drop everything that is not a plain A-Z letter (including any
    # remaining combining marks).
    word = ''.join(c for c in word if c in _uc_letters)

    # Nothing to convert, return base case
    if not word:
        return ''

    codes = []
    for i, char in enumerate(word):
        if char in _vowels:
            codes.append('0')
        elif char == 'B':
            codes.append('1')
        elif char == 'P':
            # 'PH' sounds like 'F'
            codes.append('3' if _before(word, i, {'H'}) else '1')
        elif char in {'D', 'T'}:
            codes.append('8' if _before(word, i, {'C', 'S', 'Z'}) else '2')
        elif char in {'F', 'V', 'W'}:
            codes.append('3')
        elif char in {'G', 'K', 'Q'}:
            codes.append('4')
        elif char == 'C':
            if _after(word, i, {'S', 'Z'}):
                codes.append('8')
            elif i == 0:
                # Word-initial C has a larger set of "hard" followers
                # (additionally L and R) than C elsewhere in the word.
                if _before(
                    word, i, {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}
                ):
                    codes.append('4')
                else:
                    codes.append('8')
            elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
                codes.append('4')
            else:
                codes.append('8')
        elif char == 'X':
            codes.append('8' if _after(word, i, {'C', 'K', 'Q'}) else '48')
        elif char == 'L':
            codes.append('5')
        elif char in {'M', 'N'}:
            codes.append('6')
        elif char == 'R':
            codes.append('7')
        elif char in {'S', 'Z'}:
            codes.append('8')
        # Any other letter (i.e. 'H') contributes no code.

    sdx = _delete_consecutive_repeats(''.join(codes))

    # Vowel codes ('0') are only kept in the leading position.
    if sdx:
        sdx = sdx[:1] + sdx[1:].replace('0', '')

    return sdx
def koelner_phonetik_num_to_alpha(num):
    """Convert a Kölner Phonetik code from numeric to alphabetic.

    Every character of the input other than the digits 0-8 is discarded
    before the remaining digits are mapped to letters.

    :param str num: a numeric Kölner Phonetik representation (can be a str or
        an int)
    :returns: an alphabetic representation of the same word
    :rtype: str

    >>> koelner_phonetik_num_to_alpha('862')
    'SNT'
    >>> koelner_phonetik_num_to_alpha('657')
    'NLR'
    >>> koelner_phonetik_num_to_alpha('86766')
    'SNRNN'
    """
    code_digits = '012345678'
    # str.translate() looks characters up by code point, hence ord().
    digit_to_letter = {
        ord(digit): letter for digit, letter in zip(code_digits, 'APTFKLNRS')
    }
    accepted = set(code_digits)

    digits_only = ''.join(
        char for char in text_type(num) if char in accepted
    )
    return digits_only.translate(digit_to_letter)
def koelner_phonetik_alpha(word):
    """Return the Kölner Phonetik (alphabetic output) code for a word.

    The word is first encoded numerically and the resulting digits are then
    converted to their alphabetic equivalents.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as an alphabetic string
    :rtype: str

    >>> koelner_phonetik_alpha('Smith')
    'SNT'
    >>> koelner_phonetik_alpha('Schmidt')
    'SNT'
    >>> koelner_phonetik_alpha('Müller')
    'NLR'
    >>> koelner_phonetik_alpha('Zimmermann')
    'SNRNN'
    """
    numeric_code = koelner_phonetik(word)
    return koelner_phonetik_num_to_alpha(numeric_code)
def phonem(word):
    """Return the Phonem code for a word.

    Phonem is defined in :cite:`Wilde:1988`.

    This version is based on the Perl implementation documented at
    :cite:`Wilz:2005`.
    It includes some enhancements presented in the Java port at
    :cite:`dcm4che:2011`.

    Phonem is intended chiefly for German names/words.

    :param str word: the word to transform
    :returns: the Phonem value
    :rtype: str

    >>> phonem('Christopher')
    'CRYSDOVR'
    >>> phonem('Niall')
    'NYAL'
    >>> phonem('Smith')
    'SMYD'
    >>> phonem('Schmidt')
    'CMYD'
    """
    # Two-letter rewrites, applied one after another in exactly this order.
    # '§' is a temporary placeholder that the character map below turns
    # into 'U'.
    digraph_rules = (
        ('SC', 'C'),
        ('SZ', 'C'),
        ('CZ', 'C'),
        ('TZ', 'C'),
        ('TS', 'C'),
        ('KS', 'X'),
        ('PF', 'V'),
        ('QU', 'KW'),
        ('PH', 'V'),
        ('UE', 'Y'),
        ('AE', 'E'),
        ('OE', 'Ö'),
        ('EI', 'AY'),
        ('EY', 'AY'),
        ('EU', 'OY'),
        ('AU', 'A§'),
        ('OU', '§'),
    )
    # Single-character map, keyed by code point as str.translate() requires.
    char_map = {
        ord(source): target
        for source, target in zip(
            'ZKGQÇÑßFWPTÁÀÂÃÅÄÆÉÈÊËIJÌÍÎÏÜݧÚÙÛÔÒÓÕØ',
            'CCCCCNSVVBDAAAAAEEEEEEYYYYYYYYUUUUOOOOÖ',
        )
    }
    # Only these characters may appear in the final code.
    output_alphabet = {
        'A',
        'B',
        'C',
        'D',
        'L',
        'M',
        'N',
        'O',
        'R',
        'S',
        'U',
        'V',
        'W',
        'X',
        'Y',
        'Ö',
    }

    text = unicode_normalize('NFC', text_type(word.upper()))
    for pattern, replacement in digraph_rules:
        text = text.replace(pattern, replacement)
    text = text.translate(char_map)

    collapsed = _delete_consecutive_repeats(text)
    return ''.join(char for char in collapsed if char in output_alphabet)
def haase_phonetik(word, primary_only=False):
    """Return the Haase Phonetik (numeric output) code for a word.

    Based on the algorithm described at :cite:`Prante:2015`.

    Based on the original :cite:`Haase:2000`.

    While the output code is numeric, it is nevertheless a str.

    Unless ``primary_only`` is set, the word is split into segments, some of
    which have an alternative spelling; every combination of alternatives is
    encoded and the distinct codes are returned in order of first appearance.

    :param str word: the word to transform
    :param bool primary_only: if True, only the primary code is returned
    :returns: the Haase Phonetik value as a numeric string
    :rtype: tuple

    >>> haase_phonetik('Joachim')
    ('9496',)
    >>> haase_phonetik('Christoph')
    ('4798293', '8798293')
    >>> haase_phonetik('Jörg')
    ('974',)
    >>> haase_phonetik('Smith')
    ('8692',)
    >>> haase_phonetik('Schmidt')
    ('8692', '4692')
    """
    vowel_letters = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}
    kept_letters = {
        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
        'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    }
    # Letters whose code does not depend on their neighbours.
    plain_codes = {
        'B': '1',
        'F': '3',
        'V': '3',
        'W': '3',
        'G': '4',
        'K': '4',
        'Q': '4',
        'L': '5',
        'M': '6',
        'N': '6',
        'R': '7',
        'S': '8',
        'Z': '8',
    }
    # Followers that make a 'C' hard (code 4); the word-initial set is
    # larger than the one used elsewhere in the word.
    hard_c_initial = {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}
    hard_c_medial = {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}

    def _prev_in(text, idx, letters):
        """Return True if text[idx] follows one of the supplied letters."""
        return idx > 0 and text[idx - 1] in letters

    def _next_in(text, idx, letters):
        """Return True if text[idx] precedes one of the supplied letters."""
        return idx + 1 < len(text) and text[idx + 1] in letters

    def _encode(text):
        """Return the numeric code of one spelling variant."""
        digits = []
        for idx, letter in enumerate(text):
            if letter in vowel_letters:
                digits.append('9')
            elif letter in plain_codes:
                digits.append(plain_codes[letter])
            elif letter == 'P':
                digits.append('3' if _next_in(text, idx, {'H'}) else '1')
            elif letter in {'D', 'T'}:
                digits.append(
                    '8' if _next_in(text, idx, {'C', 'S', 'Z'}) else '2'
                )
            elif letter == 'C':
                if _prev_in(text, idx, {'S', 'Z'}):
                    digits.append('8')
                else:
                    hard_followers = (
                        hard_c_initial if idx == 0 else hard_c_medial
                    )
                    digits.append(
                        '4' if _next_in(text, idx, hard_followers) else '8'
                    )
            elif letter == 'X':
                digits.append(
                    '8' if _prev_in(text, idx, {'C', 'K', 'Q'}) else '48'
                )
            # Remaining letters (i.e. 'H') produce no digit.
        return _delete_consecutive_repeats(''.join(digits))

    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    # NOTE(review): NFKD above decomposes precomposed umlauts into base
    # letter + combining mark, so if these literals are the precomposed
    # characters the three replacements below presumably never match and the
    # umlauts are reduced to plain A/O/U by the letter filter instead.
    # Left unchanged because altering it could affect variant generation --
    # confirm the intended behaviour.
    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')
    word = ''.join(char for char in word if char in kept_letters)

    if primary_only:
        candidates = [word]
    else:
        trigram_alternatives = {
            'OWN': 'AUN',
            'WSK': 'RSK',
            'SCH': 'CH',
            'GLI': 'LI',
            'AUX': 'O',
            'EUX': 'O',
        }
        # Each segment is a tuple of interchangeable spellings.
        segments = []
        cursor = 0
        # A leading 'CH' is handled once, before the general scan.
        if word.startswith('CH'):
            segments.append(('CH', 'SCH'))
            cursor = 2

        while cursor < len(word):
            rest = word[cursor:]
            trigram = rest[:3]
            if rest.startswith('ILLE'):
                segments.append(('ILLE', 'I'))
                cursor += 4
            elif trigram in trigram_alternatives:
                segments.append((trigram, trigram_alternatives[trigram]))
                cursor += 3
            elif rest.startswith('RB'):
                segments.append(('RB', 'RW'))
                cursor += 2
            elif rest == 'EAU':
                # Only a word-final 'EAU' has an alternative.
                segments.append(('EAU', 'O'))
                cursor += 3
            elif rest == 'O':
                # Word-final 'O'
                segments.append(('O', 'OW'))
                cursor += 1
            elif rest == 'A':
                # Word-final 'A'
                segments.append(('A', 'AR'))
                cursor += 1
            else:
                segments.append((rest[0],))
                cursor += 1

        candidates = [''.join(parts) for parts in product(*segments)]

    # Encode every candidate, keeping each distinct code once in the order
    # it first occurs.
    seen_codes = set()
    ordered_codes = []
    for candidate in candidates:
        code = _encode(candidate)
        if code not in seen_codes:
            seen_codes.add(code)
            ordered_codes.append(code)
    return tuple(ordered_codes)
def reth_schek_phonetik(word):
    """Return Reth-Schek Phonetik code for a word.

    This algorithm is proposed in :cite:`Reth:1977`.

    Since I couldn't secure a copy of that document (maybe I'll look for it
    next time I'm in Germany), this implementation is based on what I could
    glean from the implementations published by German Record Linkage
    Center (www.record-linkage.de):

    - Privacy-preserving Record Linkage (PPRL) (in R) :cite:`Rukasz:2018`
    - Merge ToolBox (in Java) :cite:`Schnell:2004`

    Rules that are unclear:

    - Should 'C' become 'G' or 'Z'? (PPRL has both, 'Z' rule blocked)
    - Should 'CC' become 'G'? (PPRL has blocked 'CK' that may be typo)
    - Should 'TUI' -> 'ZUI' rule exist? (PPRL has rule, but I can't
      think of a German word with '-tui-' in it.)
    - Should we really change 'SCH' -> 'CH' and then 'CH' -> 'SCH'?

    The input is upper-cased and normalized to NFC first, so umlauts are
    expanded whether they arrive precomposed or as base letter plus
    combining diaeresis.

    :param str word: the word to transform
    :returns: the Reth-Schek Phonetik code
    :rtype: str

    >>> reth_schek_phonetik('Joachim')
    'JOAGHIM'
    >>> reth_schek_phonetik('Christoph')
    'GHRISDOF'
    >>> reth_schek_phonetik('Jörg')
    'JOERG'
    >>> reth_schek_phonetik('Smith')
    'SMID'
    >>> reth_schek_phonetik('Schmidt')
    'SCHMID'
    """
    # Replacement tables keyed by the length of the matched sequence; at
    # each position the longest sequences are tried first.
    replacements = {
        3: {
            'AEH': 'E',
            'IEH': 'I',
            'OEH': 'OE',
            'UEH': 'UE',
            'SCH': 'CH',
            'ZIO': 'TIO',
            'TIU': 'TIO',
            'ZIU': 'TIO',
            'CHS': 'X',
            'CKS': 'X',
            'AEU': 'OI',
        },
        2: {
            'LL': 'L',
            'AA': 'A',
            'AH': 'A',
            'BB': 'B',
            'PP': 'B',
            'BP': 'B',
            'PB': 'B',
            'DD': 'D',
            'DT': 'D',
            'TT': 'D',
            'TH': 'D',
            'EE': 'E',
            'EH': 'E',
            'AE': 'E',
            'FF': 'F',
            'PH': 'F',
            'KK': 'K',
            'GG': 'G',
            'GK': 'G',
            'KG': 'G',
            'CK': 'G',
            'CC': 'C',
            'IE': 'I',
            'IH': 'I',
            'MM': 'M',
            'NN': 'N',
            'OO': 'O',
            'OH': 'O',
            'SZ': 'S',
            'UH': 'U',
            'GS': 'X',
            'KS': 'X',
            'TZ': 'Z',
            'AY': 'AI',
            'EI': 'AI',
            'EY': 'AI',
            'EU': 'OI',
            'RR': 'R',
            'SS': 'S',
            'KW': 'QU',
        },
        1: {
            'P': 'B',
            'T': 'D',
            'V': 'F',
            'W': 'F',
            'C': 'G',
            'K': 'G',
            'Y': 'I',
        },
    }

    # Uppercase, then compose to NFC so that an umlaut supplied as base
    # letter + combining diaeresis matches the precomposed literals below.
    word = unicode_normalize('NFC', text_type(word.upper()))

    # Replace umlauts/eszett
    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')
    word = word.replace('ß', 'SS')

    # Main loop, using above replacements table.  The cursor always moves
    # forward by exactly one character, whether or not a rule fired, so the
    # tail of a multi-character replacement is examined on later passes.
    pos = 0
    while pos < len(word):
        for num in range(3, 0, -1):
            chunk = word[pos : pos + num]
            if chunk in replacements[num]:
                word = word[:pos] + replacements[num][chunk] + word[pos + num :]
                break
        pos += 1

    # Change 'CH' back(?) to 'SCH'
    word = word.replace('CH', 'SCH')

    # Replace final sequences
    if word[-2:] == 'ER':
        word = word[:-2] + 'R'
    elif word[-2:] == 'EL':
        word = word[:-2] + 'L'
    elif word[-1:] == 'H':
        word = word[:-1]

    return word
# When executed as a script, run the doctests embedded in this module.
if __name__ == '__main__':
    from doctest import testmod

    testmod()