Source code for abydos.phonetic._reth_schek

# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._reth_schek.

Reth-Schek Phonetik
"""

from deprecation import deprecated

from ._phonetic import _Phonetic
from .. import __version__

__all__ = ['RethSchek', 'reth_schek_phonetik']


class RethSchek(_Phonetic):
    """Reth-Schek Phonetik.

    The algorithm was proposed in :cite:`Reth:1977`. That document was not
    available when this code was written, so the rule set below was
    reconstructed from the implementations published by the German Record
    Linkage Center (www.record-linkage.de):

    - Privacy-preserving Record Linkage (PPRL) (in R) :cite:`Rukasz:2018`
    - Merge ToolBox (in Java) :cite:`Schnell:2004`

    Open questions about the rules:

    - Should 'C' become 'G' or 'Z'? (PPRL has both, 'Z' rule blocked)
    - Should 'CC' become 'G'? (PPRL has blocked 'CK' that may be typo)
    - Should 'TUI' -> 'ZUI' rule exist? (PPRL has rule, but no German
      word containing '-tui-' comes to mind.)
    - Should 'SCH' -> 'CH' really be applied, only for 'CH' -> 'SCH' to be
      applied afterwards?

    .. versionadded:: 0.3.6
    """

    # Substitution rules, keyed first by the length of the matched letter
    # group and then by the group itself. ``encode`` consults the longest
    # groups first.
    _replacements = {
        3: {
            'AEH': 'E',
            'IEH': 'I',
            'OEH': 'OE',
            'UEH': 'UE',
            'SCH': 'CH',
            'ZIO': 'TIO',
            'TIU': 'TIO',
            'ZIU': 'TIO',
            'CHS': 'X',
            'CKS': 'X',
            'AEU': 'OI',
        },
        2: {
            'LL': 'L',
            'AA': 'A',
            'AH': 'A',
            'BB': 'B',
            'PP': 'B',
            'BP': 'B',
            'PB': 'B',
            'DD': 'D',
            'DT': 'D',
            'TT': 'D',
            'TH': 'D',
            'EE': 'E',
            'EH': 'E',
            'AE': 'E',
            'FF': 'F',
            'PH': 'F',
            'KK': 'K',
            'GG': 'G',
            'GK': 'G',
            'KG': 'G',
            'CK': 'G',
            'CC': 'C',
            'IE': 'I',
            'IH': 'I',
            'MM': 'M',
            'NN': 'N',
            'OO': 'O',
            'OH': 'O',
            'SZ': 'S',
            'UH': 'U',
            'GS': 'X',
            'KS': 'X',
            'TZ': 'Z',
            'AY': 'AI',
            'EI': 'AI',
            'EY': 'AI',
            'EU': 'OI',
            'RR': 'R',
            'SS': 'S',
            'KW': 'QU',
        },
        1: {
            'P': 'B',
            'T': 'D',
            'V': 'F',
            'W': 'F',
            'C': 'G',
            'K': 'G',
            'Y': 'I',
        },
    }

    def encode(self, word):
        """Return Reth-Schek Phonetik code for a word.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The Reth-Schek Phonetik code

        Examples
        --------
        >>> pe = RethSchek()
        >>> pe.encode('Joachim')
        'JOAGHIM'
        >>> pe.encode('Christoph')
        'GHRISDOF'
        >>> pe.encode('Jörg')
        'JOERG'
        >>> pe.encode('Smith')
        'SMID'
        >>> pe.encode('Schmidt')
        'SCHMID'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # Work on the uppercase form; every rule key is uppercase.
        word = word.upper()

        # Expand the uppercase umlauts to their two-letter spellings.
        for umlaut, digraph in (('Ä', 'AE'), ('Ö', 'OE'), ('Ü', 'UE')):
            word = word.replace(umlaut, digraph)

        # Scan left to right. At each position try the 3-letter rules, then
        # the 2-letter rules, then the 1-letter rules, and apply the first
        # one that matches. The cursor moves forward by exactly one
        # character whether or not a rule fired, so the tail of a
        # multi-letter replacement is itself examined on later iterations.
        idx = 0
        while idx < len(word):
            for width in (3, 2, 1):
                chunk = word[idx : idx + width]
                substitute = self._replacements[width].get(chunk)
                if substitute is not None:
                    word = word[:idx] + substitute + word[idx + width :]
                    break
            idx += 1

        # Every remaining 'CH' (including those produced by the 'SCH' rule)
        # is written out as 'SCH'.
        word = word.replace('CH', 'SCH')

        # Final-sequence rules: at most one of them is applied.
        if word.endswith('ER'):
            return word[:-2] + 'R'
        if word.endswith('EL'):
            return word[:-2] + 'L'
        if word.endswith('H'):
            return word[:-1]
        return word
@deprecated(
    deprecated_in='0.4.0',
    removed_in='0.6.0',
    current_version=__version__,
    details='Use the RethSchek.encode method instead.',
)
def reth_schek_phonetik(word):
    """Return Reth-Schek Phonetik code for a word.

    Deprecated convenience function that builds a fresh
    :py:class:`RethSchek` instance and delegates to
    :py:meth:`RethSchek.encode`.

    Parameters
    ----------
    word : str
        The word to transform

    Returns
    -------
    str
        The Reth-Schek Phonetik code

    Examples
    --------
    >>> reth_schek_phonetik('Joachim')
    'JOAGHIM'
    >>> reth_schek_phonetik('Christoph')
    'GHRISDOF'
    >>> reth_schek_phonetik('Jörg')
    'JOERG'
    >>> reth_schek_phonetik('Smith')
    'SMID'
    >>> reth_schek_phonetik('Schmidt')
    'SCHMID'

    .. versionadded:: 0.3.0

    """
    encoder = RethSchek()
    return encoder.encode(word)
# When executed directly as a script, run the doctest examples embedded in
# this module's docstrings.
if __name__ == '__main__':
    from doctest import testmod

    testmod()