# Source code for abydos.phonetic._de

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._de.

The phonetic._de module implements the Kölner Phonetik and related
algorithms for German:

    - Kölner Phonetik
    - Phonem
    - Haase Phonetik
    - Reth-Schek Phonetik
"""

from __future__ import unicode_literals

from itertools import product
from unicodedata import normalize as unicode_normalize

from six import text_type
from six.moves import range

from ._util import _delete_consecutive_repeats

# Names exported by ``from abydos.phonetic._de import *``: the six public
# encoder/conversion functions listed here.
__all__ = [
    'haase_phonetik',
    'koelner_phonetik',
    'koelner_phonetik_alpha',
    'koelner_phonetik_num_to_alpha',
    'phonem',
    'reth_schek_phonetik',
]


def koelner_phonetik(word):
    """Encode a word with Kölner Phonetik, producing a string of digits.

    The rules follow :cite:`Postel:1969`. The code is returned as a str
    rather than an int because a code may start with 0.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as a numeric string
    :rtype: str

    >>> koelner_phonetik('Christopher')
    '478237'
    >>> koelner_phonetik('Niall')
    '65'
    >>> koelner_phonetik('Smith')
    '862'
    >>> koelner_phonetik('Schmidt')
    '862'
    >>> koelner_phonetik('Müller')
    '657'
    >>> koelner_phonetik('Zimmermann')
    '86766'
    """
    # Letters whose digit never depends on the neighbouring letters.
    # 'H' is deliberately absent everywhere: it produces no digit at all.
    context_free = {
        'A': '0', 'E': '0', 'I': '0', 'J': '0', 'O': '0', 'U': '0', 'Y': '0',
        'B': '1',
        'F': '3', 'V': '3', 'W': '3',
        'G': '4', 'K': '4', 'Q': '4',
        'L': '5',
        'M': '6', 'N': '6',
        'R': '7',
        'S': '8', 'Z': '8',
    }
    # Letters after which a 'C' is coded 4; the word-initial list is longer.
    c_hard_initial = {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}
    c_hard_medial = {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}
    uppercase = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    letters = unicode_normalize('NFKD', text_type(word.upper()))
    # NOTE(review): NFKD decomposes precomposed Ä/Ö/Ü into base letter plus
    # combining mark, so the umlaut expansions below may never match -- confirm
    # whether they were meant to run before normalisation.
    for special, expansion in (
        ('ß', 'SS'),
        ('Ä', 'AE'),
        ('Ö', 'OE'),
        ('Ü', 'UE'),
    ):
        letters = letters.replace(special, expansion)
    letters = ''.join(ch for ch in letters if ch in uppercase)

    # Nothing left to encode
    if not letters:
        return ''

    final = len(letters) - 1
    digits = []
    for idx, ch in enumerate(letters):
        # Neighbours of the current letter; '' at a word boundary is never a
        # member of any of the letter sets tested below.
        previous = letters[idx - 1] if idx > 0 else ''
        following = letters[idx + 1] if idx < final else ''

        if ch in context_free:
            digits.append(context_free[ch])
        elif ch == 'P':
            digits.append('3' if following == 'H' else '1')
        elif ch in {'D', 'T'}:
            digits.append('8' if following in {'C', 'S', 'Z'} else '2')
        elif ch == 'C':
            hard_followers = c_hard_initial if idx == 0 else c_hard_medial
            if previous in {'S', 'Z'} or following not in hard_followers:
                digits.append('8')
            else:
                digits.append('4')
        elif ch == 'X':
            digits.append('8' if previous in {'C', 'K', 'Q'} else '48')

    code = _delete_consecutive_repeats(''.join(digits))

    # Zeros (vowels) are only significant in the leading position.
    return code[:1] + code[1:].replace('0', '')
def koelner_phonetik_num_to_alpha(num):
    """Translate a numeric Kölner Phonetik code into its alphabetic form.

    Each digit 0-8 is replaced by a representative letter; any character
    that is not one of those digits is dropped.

    :param str num: a numeric Kölner Phonetik representation (can be a str or
        an int)
    :returns: an alphabetic representation of the same word
    :rtype: str

    >>> koelner_phonetik_num_to_alpha('862')
    'SNT'
    >>> koelner_phonetik_num_to_alpha('657')
    'NLR'
    >>> koelner_phonetik_num_to_alpha('86766')
    'SNRNN'
    """
    digit_to_letter = {
        '0': 'A',
        '1': 'P',
        '2': 'T',
        '3': 'F',
        '4': 'K',
        '5': 'L',
        '6': 'N',
        '7': 'R',
        '8': 'S',
    }
    return ''.join(
        digit_to_letter[digit]
        for digit in text_type(num)
        if digit in digit_to_letter
    )
def koelner_phonetik_alpha(word):
    """Encode a word with Kölner Phonetik and return the alphabetic form.

    The word is first encoded numerically with :func:`koelner_phonetik` and
    the digits are then mapped to letters with
    :func:`koelner_phonetik_num_to_alpha`.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as an alphabetic string
    :rtype: str

    >>> koelner_phonetik_alpha('Smith')
    'SNT'
    >>> koelner_phonetik_alpha('Schmidt')
    'SNT'
    >>> koelner_phonetik_alpha('Müller')
    'NLR'
    >>> koelner_phonetik_alpha('Zimmermann')
    'SNRNN'
    """
    numeric_code = koelner_phonetik(word)
    return koelner_phonetik_num_to_alpha(numeric_code)
def phonem(word):
    """Compute the Phonem code of a word.

    Phonem was defined in :cite:`Wilde:1988`. This implementation follows the
    Perl version documented at :cite:`Wilz:2005` and adds some enhancements
    from the Java port at :cite:`dcm4che:2011`.

    Phonem is intended chiefly for German names/words.

    :param str word: the word to transform
    :returns: the Phonem value
    :rtype: str

    >>> phonem('Christopher')
    'CRYSDOVR'
    >>> phonem('Niall')
    'NYAL'
    >>> phonem('Smith')
    'SMYD'
    >>> phonem('Schmidt')
    'CMYD'
    """
    # Two-letter rewrites, applied one after another in exactly this order.
    # '§' is a temporary placeholder that the single-letter table turns
    # into 'U'.
    digraph_rules = (
        ('SC', 'C'),
        ('SZ', 'C'),
        ('CZ', 'C'),
        ('TZ', 'C'),
        ('TS', 'C'),
        ('KS', 'X'),
        ('PF', 'V'),
        ('QU', 'KW'),
        ('PH', 'V'),
        ('UE', 'Y'),
        ('AE', 'E'),
        ('OE', 'Ö'),
        ('EI', 'AY'),
        ('EY', 'AY'),
        ('EU', 'OY'),
        ('AU', 'A§'),
        ('OU', '§'),
    )
    # Single-letter rewrites, grouped by the letter they collapse to.
    letter_groups = (
        ('ZKGQÇ', 'C'),
        ('Ñ', 'N'),
        ('ß', 'S'),
        ('FW', 'V'),
        ('P', 'B'),
        ('T', 'D'),
        ('ÁÀÂÃÅ', 'A'),
        ('ÄÆÉÈÊË', 'E'),
        ('IJÌÍÎÏÜÝ', 'Y'),
        ('§ÚÙÛ', 'U'),
        ('ÔÒÓÕ', 'O'),
        ('Ø', 'Ö'),
    )
    letter_table = {
        ord(source): target
        for sources, target in letter_groups
        for source in sources
    }
    # Only these letters survive into the final code.
    retained = set('ABCDLMNORSUVWXYÖ')

    text = unicode_normalize('NFC', text_type(word.upper()))
    for pattern, replacement in digraph_rules:
        text = text.replace(pattern, replacement)
    text = text.translate(letter_table)

    collapsed = _delete_consecutive_repeats(text)
    return ''.join(ch for ch in collapsed if ch in retained)
def haase_phonetik(word, primary_only=False):
    """Encode a word with Haase Phonetik, producing digit-string codes.

    Follows the algorithm described at :cite:`Prante:2015`, which is based on
    the original :cite:`Haase:2000`.

    Unless ``primary_only`` is set, alternative spellings of certain letter
    sequences are generated and every resulting spelling is encoded; duplicate
    codes are dropped while keeping first-seen order. Although the codes
    consist of digits, each one is a str.

    :param str word: the word to transform
    :param bool primary_only: if True, only the primary code is returned
    :returns: the Haase Phonetik value as a numeric string
    :rtype: tuple

    >>> haase_phonetik('Joachim')
    ('9496',)
    >>> haase_phonetik('Christoph')
    ('4798293', '8798293')
    >>> haase_phonetik('Jörg')
    ('974',)
    >>> haase_phonetik('Smith')
    ('8692',)
    >>> haase_phonetik('Schmidt')
    ('8692', '4692')
    """
    # Letters whose digit never depends on the neighbouring letters.
    # 'H' is deliberately absent everywhere: it produces no digit at all.
    context_free = {
        'A': '9', 'E': '9', 'I': '9', 'J': '9', 'O': '9', 'U': '9', 'Y': '9',
        'B': '1',
        'F': '3', 'V': '3', 'W': '3',
        'G': '4', 'K': '4', 'Q': '4',
        'L': '5',
        'M': '6', 'N': '6',
        'R': '7',
        'S': '8', 'Z': '8',
    }
    # Letters after which a 'C' is coded 4; the word-initial list is longer.
    c_hard_initial = {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}
    c_hard_medial = {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}
    # Three-letter sequences that have an alternative spelling.
    trigram_variants = {
        'OWN': 'AUN',
        'WSK': 'RSK',
        'SCH': 'CH',
        'GLI': 'LI',
        'AUX': 'O',
        'EUX': 'O',
    }
    uppercase = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    def _encode(spelling):
        """Return the digit code of one cleaned, uppercase spelling."""
        final = len(spelling) - 1
        digits = []
        for idx, ch in enumerate(spelling):
            # '' at a word boundary is never a member of the sets below.
            previous = spelling[idx - 1] if idx > 0 else ''
            following = spelling[idx + 1] if idx < final else ''

            if ch in context_free:
                digits.append(context_free[ch])
            elif ch == 'P':
                digits.append('3' if following == 'H' else '1')
            elif ch in {'D', 'T'}:
                digits.append('8' if following in {'C', 'S', 'Z'} else '2')
            elif ch == 'C':
                hard_followers = c_hard_initial if idx == 0 else c_hard_medial
                if previous in {'S', 'Z'} or following not in hard_followers:
                    digits.append('8')
                else:
                    digits.append('4')
            elif ch == 'X':
                digits.append('8' if previous in {'C', 'K', 'Q'} else '48')
        return _delete_consecutive_repeats(''.join(digits))

    letters = unicode_normalize('NFKD', text_type(word.upper()))
    # NOTE(review): NFKD decomposes precomposed Ä/Ö/Ü into base letter plus
    # combining mark, so the umlaut expansions below may never match -- confirm
    # whether they were meant to run before normalisation (this can affect
    # which variants are generated for words ending in an umlaut).
    for special, expansion in (
        ('ß', 'SS'),
        ('Ä', 'AE'),
        ('Ö', 'OE'),
        ('Ü', 'UE'),
    ):
        letters = letters.replace(special, expansion)
    letters = ''.join(ch for ch in letters if ch in uppercase)

    if primary_only:
        spellings = [letters]
    else:
        # One tuple of alternatives per consumed segment of the word.
        options = []
        cursor = 0
        if letters.startswith('CH'):
            options.append(('CH', 'SCH'))
            cursor = 2

        while cursor < len(letters):
            rest = letters[cursor:]
            trigram = rest[:3]
            if rest.startswith('ILLE'):
                options.append(('ILLE', 'I'))
                cursor += 4
            elif trigram in trigram_variants:
                options.append((trigram, trigram_variants[trigram]))
                cursor += 3
            elif rest.startswith('RB'):
                options.append(('RB', 'RW'))
                cursor += 2
            elif rest == 'EAU':
                # Only a word-final 'EAU' has a variant
                options.append(('EAU', 'O'))
                cursor += 3
            elif rest == 'O':
                # Word-final 'O'
                options.append(('O', 'OW'))
                cursor += 1
            elif rest == 'A':
                # Word-final 'A'
                options.append(('A', 'AR'))
                cursor += 1
            else:
                options.append((rest[0],))
                cursor += 1

        spellings = [''.join(choice) for choice in product(*options)]

    # Encode every spelling, keeping only the first occurrence of each code.
    seen = set()
    unique_codes = []
    for spelling in spellings:
        code = _encode(spelling)
        if code not in seen:
            seen.add(code)
            unique_codes.append(code)
    return tuple(unique_codes)
def reth_schek_phonetik(word):
    """Compute the Reth-Schek Phonetik code of a word.

    The algorithm was proposed in :cite:`Reth:1977`. The original document
    was not available to the author, so this implementation is reconstructed
    from the implementations published by the German Record Linkage Center
    (www.record-linkage.de):

    - Privacy-preserving Record Linkage (PPRL) (in R) :cite:`Rukasz:2018`
    - Merge ToolBox (in Java) :cite:`Schnell:2004`

    Rules that are unclear:

    - Should 'C' become 'G' or 'Z'? (PPRL has both, 'Z' rule blocked)
    - Should 'CC' become 'G'? (PPRL has blocked 'CK' that may be typo)
    - Should 'TUI' -> 'ZUI' rule exist? (PPRL has rule, but it is hard to
      think of a German word with '-tui-' in it.)
    - Should we really change 'SCH' -> 'CH' and then 'CH' -> 'SCH'?

    :param str word: the word to transform
    :returns: the Reth-Schek Phonetik code
    :rtype: str

    >>> reth_schek_phonetik('Joachim')
    'JOAGHIM'
    >>> reth_schek_phonetik('Christoph')
    'GHRISDOF'
    >>> reth_schek_phonetik('Jörg')
    'JOERG'
    >>> reth_schek_phonetik('Smith')
    'SMID'
    >>> reth_schek_phonetik('Schmidt')
    'SCHMID'
    """
    # Substitution tables, keyed by the length of the sequence they match.
    substitutions = {
        3: {
            'AEH': 'E',
            'IEH': 'I',
            'OEH': 'OE',
            'UEH': 'UE',
            'SCH': 'CH',
            'ZIO': 'TIO',
            'TIU': 'TIO',
            'ZIU': 'TIO',
            'CHS': 'X',
            'CKS': 'X',
            'AEU': 'OI',
        },
        2: {
            'LL': 'L',
            'AA': 'A',
            'AH': 'A',
            'BB': 'B',
            'PP': 'B',
            'BP': 'B',
            'PB': 'B',
            'DD': 'D',
            'DT': 'D',
            'TT': 'D',
            'TH': 'D',
            'EE': 'E',
            'EH': 'E',
            'AE': 'E',
            'FF': 'F',
            'PH': 'F',
            'KK': 'K',
            'GG': 'G',
            'GK': 'G',
            'KG': 'G',
            'CK': 'G',
            'CC': 'C',
            'IE': 'I',
            'IH': 'I',
            'MM': 'M',
            'NN': 'N',
            'OO': 'O',
            'OH': 'O',
            'SZ': 'S',
            'UH': 'U',
            'GS': 'X',
            'KS': 'X',
            'TZ': 'Z',
            'AY': 'AI',
            'EI': 'AI',
            'EY': 'AI',
            'EU': 'OI',
            'RR': 'R',
            'SS': 'S',
            'KW': 'QU',
        },
        1: {
            'P': 'B',
            'T': 'D',
            'V': 'F',
            'W': 'F',
            'C': 'G',
            'K': 'G',
            'Y': 'I',
        },
    }

    # Uppercase, then spell out umlauts and eszett
    word = word.upper()
    for special, spelled_out in (
        ('Ä', 'AE'),
        ('Ö', 'OE'),
        ('Ü', 'UE'),
        ('ß', 'SS'),
    ):
        word = word.replace(special, spelled_out)

    # Scan left to right. At each position apply the longest matching
    # substitution (if any), then move on by exactly one character whether or
    # not something was replaced.
    idx = 0
    while idx < len(word):
        for width in (3, 2, 1):
            chunk = word[idx : idx + width]
            if chunk in substitutions[width]:
                word = (
                    word[:idx] + substitutions[width][chunk] + word[idx + width :]
                )
                break
        idx += 1

    # Change 'CH' back(?) to 'SCH'
    word = word.replace('CH', 'SCH')

    # Simplify the word ending; only the first matching rule applies
    if word.endswith('ER'):
        return word[:-2] + 'R'
    if word.endswith('EL'):
        return word[:-2] + 'L'
    if word.endswith('H'):
        return word[:-1]
    return word
# When executed as a script, run the doctests embedded in the docstrings above.
if __name__ == '__main__':
    from doctest import testmod

    testmod()