# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.phonetic._reth_schek.
Reth-Schek Phonetik
"""
from deprecation import deprecated
from ._phonetic import _Phonetic
from .. import __version__
__all__ = ['RethSchek', 'reth_schek_phonetik']
class RethSchek(_Phonetic):
    """Reth-Schek Phonetik.

    This algorithm is proposed in :cite:`Reth:1977`.

    Since I couldn't secure a copy of that document (maybe I'll look for it
    next time I'm in Germany), this implementation is based on what I could
    glean from the implementations published by German Record Linkage
    Center (www.record-linkage.de):

    - Privacy-preserving Record Linkage (PPRL) (in R) :cite:`Rukasz:2018`
    - Merge ToolBox (in Java) :cite:`Schnell:2004`

    Rules that are unclear:

    - Should 'C' become 'G' or 'Z'? (PPRL has both, 'Z' rule blocked)
    - Should 'CC' become 'G'? (PPRL has blocked 'CK' that may be typo)
    - Should 'TUI' -> 'ZUI' rule exist? (PPRL has rule, but I can't
      think of a German word with '-tui-' in it.)
    - Should we really change 'SCH' -> 'CH' and then 'CH' -> 'SCH'?

    .. versionadded:: 0.3.6
    """

    # Substitution rules, keyed first by the length of the matched substring
    # and then by the substring itself. encode() tries the longest length
    # first at every position.
    _replacements = {
        3: {
            'AEH': 'E',
            'IEH': 'I',
            'OEH': 'OE',
            'UEH': 'UE',
            'SCH': 'CH',
            'ZIO': 'TIO',
            'TIU': 'TIO',
            'ZIU': 'TIO',
            'CHS': 'X',
            'CKS': 'X',
            'AEU': 'OI',
        },
        2: {
            'LL': 'L',
            'AA': 'A',
            'AH': 'A',
            'BB': 'B',
            'PP': 'B',
            'BP': 'B',
            'PB': 'B',
            'DD': 'D',
            'DT': 'D',
            'TT': 'D',
            'TH': 'D',
            'EE': 'E',
            'EH': 'E',
            'AE': 'E',
            'FF': 'F',
            'PH': 'F',
            'KK': 'K',
            'GG': 'G',
            'GK': 'G',
            'KG': 'G',
            'CK': 'G',
            'CC': 'C',
            'IE': 'I',
            'IH': 'I',
            'MM': 'M',
            'NN': 'N',
            'OO': 'O',
            'OH': 'O',
            'SZ': 'S',
            'UH': 'U',
            'GS': 'X',
            'KS': 'X',
            'TZ': 'Z',
            'AY': 'AI',
            'EI': 'AI',
            'EY': 'AI',
            'EU': 'OI',
            'RR': 'R',
            'SS': 'S',
            'KW': 'QU',
        },
        1: {
            'P': 'B',
            'T': 'D',
            'V': 'F',
            'W': 'F',
            'C': 'G',
            'K': 'G',
            'Y': 'I',
        },
    }

    def encode(self, word):
        """Return Reth-Schek Phonetik code for a word.

        The word is uppercased, umlauts and eszett are expanded to their
        ASCII digraphs, the substitution table is applied in a single
        left-to-right pass, every remaining 'CH' is turned into 'SCH', and
        finally a trailing 'ER'/'EL' is shortened to 'R'/'L' or a trailing
        'H' is dropped.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The Reth-Schek Phonetik code

        Examples
        --------
        >>> pe = RethSchek()
        >>> pe.encode('Joachim')
        'JOAGHIM'
        >>> pe.encode('Christoph')
        'GHRISDOF'
        >>> pe.encode('Jörg')
        'JOERG'
        >>> pe.encode('Smith')
        'SMID'
        >>> pe.encode('Schmidt')
        'SCHMID'
        >>> pe.encode('STRAẞE')
        'SDRASE'

        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # Uppercase
        word = word.upper()

        # Replace umlauts/eszett. str.upper() already expands lowercase 'ß'
        # to 'SS', but capital sharp s 'ẞ' (U+1E9E) is its own uppercase
        # form and has to be expanded explicitly.
        word = word.replace('Ä', 'AE')
        word = word.replace('Ö', 'OE')
        word = word.replace('Ü', 'UE')
        word = word.replace('ẞ', 'SS')

        # Main loop, using above replacements table
        pos = 0
        while pos < len(word):
            # Longest match wins: try 3-, then 2-, then 1-character rules.
            for num in range(3, 0, -1):
                chunk = word[pos : pos + num]
                if chunk in self._replacements[num]:
                    word = (
                        word[:pos]
                        + self._replacements[num][chunk]
                        + word[pos + num :]
                    )
                    break
            # Advance exactly one character whether or not a rule fired.
            # The first character of a replacement is never re-examined
            # (so 'SCH' -> 'CH' is not followed by 'C' -> 'G'), but any
            # further characters of a multi-character replacement are.
            pos += 1

        # Change 'CH' back(?) to 'SCH'
        word = word.replace('CH', 'SCH')

        # Replace final sequences
        if word.endswith('ER'):
            word = word[:-2] + 'R'
        elif word.endswith('EL'):
            word = word[:-2] + 'L'
        elif word.endswith('H'):
            word = word[:-1]

        return word
@deprecated(
    deprecated_in='0.4.0',
    removed_in='0.6.0',
    current_version=__version__,
    details='Use the RethSchek.encode method instead.',
)
def reth_schek_phonetik(word):
    """Return Reth-Schek Phonetik code for a word.

    This is a deprecated convenience wrapper: it builds a fresh
    :py:class:`RethSchek` instance and delegates to
    :py:meth:`RethSchek.encode`.

    Parameters
    ----------
    word : str
        The word to transform

    Returns
    -------
    str
        The Reth-Schek Phonetik code

    Examples
    --------
    >>> reth_schek_phonetik('Joachim')
    'JOAGHIM'
    >>> reth_schek_phonetik('Christoph')
    'GHRISDOF'
    >>> reth_schek_phonetik('Jörg')
    'JOERG'
    >>> reth_schek_phonetik('Smith')
    'SMID'
    >>> reth_schek_phonetik('Schmidt')
    'SCHMID'

    .. versionadded:: 0.3.0

    """
    encoder = RethSchek()
    return encoder.encode(word)
if __name__ == '__main__':
    # Run the docstring examples of this module when executed as a script.
    from doctest import testmod

    testmod()