# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.phonetic._phonet.
phonet algorithm (a.k.a. Hannoveraner Phonetik), intended chiefly for German
"""
from collections import Counter
from unicodedata import normalize as unicode_normalize
from deprecation import deprecated
from ._phonetic import _Phonetic
from .. import __version__
__all__ = ['Phonet', 'phonet']
[docs]class Phonet(_Phonetic):
"""Phonet code.
phonet ("Hannoveraner Phonetik") was developed by Jörg Michael and
documented in :cite:`Michael:1999`.
This is a port of Jesper Zedlitz's code, which is licensed LGPL
:cite:`Zedlitz:2015`.
That is, in turn, based on Michael's C code, which is also licensed LGPL
:cite:`Michael:2007`.
.. versionadded:: 0.3.6
"""
# Rule table selected when ``lang == 'none'`` (see ``_initialize_phonet`` and
# ``_phonet`` inside ``encode``).
#
# The tuple is flat and is read in groups of three consecutive entries:
#     pattern, entry used for mode 1, entry used for mode 2
# (the pattern slot is the one where ``i % 3 == 0``; the matcher indexes
# ``pos + mode`` to pick the per-mode entry and treats ``None`` there as
# "no conversion rule available").
# Patterns may carry trailing control characters such as '^' and '$' that are
# interpreted by the matcher in ``encode``.
# NOTE(review): the final (None, None, None) triple presumably acts as the
# end-of-table marker -- confirm against the rest of ``encode``.
_rules_no_lang = (  # separator chars
    # fmt: off
    '´', ' ', ' ',
    '"', ' ', ' ',
    '`$', '', '',
    "'", ' ', ' ',
    ',', ',', ',',
    ';', ',', ',',
    '-', ' ', ' ',
    ' ', ' ', ' ',
    '.', '.', '.',
    ':', '.', '.',
    # German umlauts
    'Ä', 'AE', 'AE',
    'Ö', 'OE', 'OE',
    'Ü', 'UE', 'UE',
    'ß', 'S', 'S',
    # international umlauts
    'À', 'A', 'A',
    'Á', 'A', 'A',
    'Â', 'A', 'A',
    'Ã', 'A', 'A',
    'Å', 'A', 'A',
    'Æ', 'AE', 'AE',
    'Ç', 'C', 'C',
    'Ð', 'DJ', 'DJ',
    'È', 'E', 'E',
    'É', 'E', 'E',
    'Ê', 'E', 'E',
    'Ë', 'E', 'E',
    'Ì', 'I', 'I',
    'Í', 'I', 'I',
    'Î', 'I', 'I',
    'Ï', 'I', 'I',
    'Ñ', 'NH', 'NH',
    'Ò', 'O', 'O',
    'Ó', 'O', 'O',
    'Ô', 'O', 'O',
    'Õ', 'O', 'O',
    'Œ', 'OE', 'OE',
    'Ø', 'OE', 'OE',
    'Š', 'SH', 'SH',
    'Þ', 'TH', 'TH',
    'Ù', 'U', 'U',
    'Ú', 'U', 'U',
    'Û', 'U', 'U',
    'Ý', 'Y', 'Y',
    'Ÿ', 'Y', 'Y',
    # 'normal' letters (A-Z)
    'MC^', 'MAC', 'MAC',
    # NOTE(review): the next row repeats the previous one verbatim --
    # presumably inherited from the upstream rule list; confirm before removing
    'MC^', 'MAC', 'MAC',
    'M´^', 'MAC', 'MAC',
    "M'^", 'MAC', 'MAC',
    'O´^', 'O', 'O',
    "O'^", 'O', 'O',
    'VAN DEN ^', 'VANDEN', 'VANDEN',
    None, None, None
    # fmt: on
)
_rules_german = ( # separator chars
# fmt: off
'´', ' ', ' ',
'"', ' ', ' ',
'`$', '', '',
"'", ' ', ' ',
',', ' ', ' ',
';', ' ', ' ',
'-', ' ', ' ',
' ', ' ', ' ',
'.', '.', '.',
':', '.', '.',
# German umlauts
'ÄE', 'E', 'E',
'ÄU<', 'EU', 'EU',
'ÄV(AEOU)-<', 'EW', None,
'Ä$', 'Ä', None,
'Ä<', None, 'E',
'Ä', 'E', None,
'ÖE', 'Ö', 'Ö',
'ÖU', 'Ö', 'Ö',
'ÖVER--<', 'ÖW', None,
'ÖV(AOU)-', 'ÖW', None,
'ÜBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
'ÜBER^^', 'ÜBA', 'IBA',
'ÜE', 'Ü', 'I',
'ÜVER--<', 'ÜW', None,
'ÜV(AOU)-', 'ÜW', None,
'Ü', None, 'I',
'ßCH<', None, 'Z',
'ß<', 'S', 'Z',
# international umlauts
'À<', 'A', 'A',
'Á<', 'A', 'A',
'Â<', 'A', 'A',
'Ã<', 'A', 'A',
'Å<', 'A', 'A',
'ÆER-', 'E', 'E',
'ÆU<', 'EU', 'EU',
'ÆV(AEOU)-<', 'EW', None,
'Æ$', 'Ä', None,
'Æ<', None, 'E',
'Æ', 'E', None,
'Ç', 'Z', 'Z',
'ÐÐ-', '', '',
'Ð', 'DI', 'TI',
'È<', 'E', 'E',
'É<', 'E', 'E',
'Ê<', 'E', 'E',
'Ë', 'E', 'E',
'Ì<', 'I', 'I',
'Í<', 'I', 'I',
'Î<', 'I', 'I',
'Ï', 'I', 'I',
'ÑÑ-', '', '',
'Ñ', 'NI', 'NI',
'Ò<', 'O', 'U',
'Ó<', 'O', 'U',
'Ô<', 'O', 'U',
'Õ<', 'O', 'U',
'Œ<', 'Ö', 'Ö',
'Ø(IJY)-<', 'E', 'E',
'Ø<', 'Ö', 'Ö',
'Š', 'SH', 'Z',
'Þ', 'T', 'T',
'Ù<', 'U', 'U',
'Ú<', 'U', 'U',
'Û<', 'U', 'U',
'Ý<', 'I', 'I',
'Ÿ<', 'I', 'I',
# 'normal' letters (A-Z)
'ABELLE$', 'ABL', 'ABL',
'ABELL$', 'ABL', 'ABL',
'ABIENNE$', 'ABIN', 'ABIN',
'ACHME---^', 'ACH', 'AK',
'ACEY$', 'AZI', 'AZI',
'ADV', 'ATW', None,
'AEGL-', 'EK', None,
'AEU<', 'EU', 'EU',
'AE2', 'E', 'E',
'AFTRAUBEN------', 'AFT ', 'AFT ',
'AGL-1', 'AK', None,
'AGNI-^', 'AKN', 'AKN',
'AGNIE-', 'ANI', 'ANI',
'AGN(AEOU)-$', 'ANI', 'ANI',
'AH(AIOÖUÜY)-', 'AH', None,
'AIA2', 'AIA', 'AIA',
'AIE$', 'E', 'E',
'AILL(EOU)-', 'ALI', 'ALI',
'AINE$', 'EN', 'EN',
'AIRE$', 'ER', 'ER',
'AIR-', 'E', 'E',
'AISE$', 'ES', 'EZ',
'AISSANCE$', 'ESANS', 'EZANZ',
'AISSE$', 'ES', 'EZ',
'AIX$', 'EX', 'EX',
'AJ(AÄEÈÉÊIOÖUÜ)--', 'A', 'A',
'AKTIE', 'AXIE', 'AXIE',
'AKTUEL', 'AKTUEL', None,
'ALOI^', 'ALOI', 'ALUI', # Don't merge these rules
'ALOY^', 'ALOI', 'ALUI', # needed by 'check_rules'
'AMATEU(RS)-', 'AMATÖ', 'ANATÖ',
'ANCH(OEI)-', 'ANSH', 'ANZ',
'ANDERGEGANG----', 'ANDA GE', 'ANTA KE',
'ANDERGEHE----', 'ANDA ', 'ANTA ',
'ANDERGESETZ----', 'ANDA GE', 'ANTA KE',
'ANDERGING----', 'ANDA ', 'ANTA ',
'ANDERSETZ(ET)-----', 'ANDA ', 'ANTA ',
'ANDERZUGEHE----', 'ANDA ZU ', 'ANTA ZU ',
'ANDERZUSETZE-----', 'ANDA ZU ', 'ANTA ZU ',
'ANER(BKO)---^^', 'AN', None,
'ANHAND---^$', 'AN H', 'AN ',
'ANH(AÄEIOÖUÜY)--^^', 'AN', None,
'ANIELLE$', 'ANIEL', 'ANIL',
'ANIEL', 'ANIEL', None,
'ANSTELLE----^$', 'AN ST', 'AN ZT',
'ANTI^^', 'ANTI', 'ANTI',
'ANVER^^', 'ANFA', 'ANFA',
'ATIA$', 'ATIA', 'ATIA',
'ATIA(NS)--', 'ATI', 'ATI',
'ATI(AÄOÖUÜ)-', 'AZI', 'AZI',
'AUAU--', '', '',
'AUERE$', 'AUERE', None,
'AUERE(NS)-$', 'AUERE', None,
'AUERE(AIOUY)--', 'AUER', None,
'AUER(AÄIOÖUÜY)-', 'AUER', None,
'AUER<', 'AUA', 'AUA',
'AUF^^', 'AUF', 'AUF',
'AULT$', 'O', 'U',
'AUR(BCDFGKLMNQSTVWZ)-', 'AUA', 'AUA',
'AUR$', 'AUA', 'AUA',
'AUSSE$', 'OS', 'UZ',
'AUS(ST)-^', 'AUS', 'AUS',
'AUS^^', 'AUS', 'AUS',
'AUTOFAHR----', 'AUTO ', 'AUTU ',
'AUTO^^', 'AUTO', 'AUTU',
'AUX(IY)-', 'AUX', 'AUX',
'AUX', 'O', 'U',
'AU', 'AU', 'AU',
'AVER--<', 'AW', None,
'AVIER$', 'AWIE', 'AFIE',
'AV(EÈÉÊI)-^', 'AW', None,
'AV(AOU)-', 'AW', None,
'AYRE$', 'EIRE', 'EIRE',
'AYRE(NS)-$', 'EIRE', 'EIRE',
'AYRE(AIOUY)--', 'EIR', 'EIR',
'AYR(AÄIOÖUÜY)-', 'EIR', 'EIR',
'AYR<', 'EIA', 'EIA',
'AYER--<', 'EI', 'EI',
'AY(AÄEIOÖUÜY)--', 'A', 'A',
'AË', 'E', 'E',
'A(IJY)<', 'EI', 'EI',
'BABY^$', 'BEBI', 'BEBI',
'BAB(IY)^', 'BEBI', 'BEBI',
'BEAU^$', 'BO', None,
'BEA(BCMNRU)-^', 'BEA', 'BEA',
'BEAT(AEIMORU)-^', 'BEAT', 'BEAT',
'BEE$', 'BI', 'BI',
'BEIGE^$', 'BESH', 'BEZ',
'BENOIT--', 'BENO', 'BENU',
'BER(DT)-', 'BER', None,
'BERN(DT)-', 'BERN', None,
'BE(LMNRST)-^', 'BE', 'BE',
'BETTE$', 'BET', 'BET',
'BEVOR^$', 'BEFOR', None,
'BIC$', 'BIZ', 'BIZ',
'BOWL(EI)-', 'BOL', 'BUL',
'BP(AÄEÈÉÊIÌÍÎOÖRUÜY)-', 'B', 'B',
'BRINGEND-----^', 'BRI', 'BRI',
'BRINGEND-----', ' BRI', ' BRI',
'BROW(NS)-', 'BRAU', 'BRAU',
'BUDGET7', 'BÜGE', 'BIKE',
'BUFFET7', 'BÜFE', 'BIFE',
'BYLLE$', 'BILE', 'BILE',
'BYLL$', 'BIL', 'BIL',
'BYPA--^', 'BEI', 'BEI',
'BYTE<', 'BEIT', 'BEIT',
'BY9^', 'BÜ', None,
'B(SßZ)$', 'BS', None,
'CACH(EI)-^', 'KESH', 'KEZ',
'CAE--', 'Z', 'Z',
'CA(IY)$', 'ZEI', 'ZEI',
'CE(EIJUY)--', 'Z', 'Z',
'CENT<', 'ZENT', 'ZENT',
'CERST(EI)----^', 'KE', 'KE',
'CER$', 'ZA', 'ZA',
'CE3', 'ZE', 'ZE',
"CH'S$", 'X', 'X',
'CH´S$', 'X', 'X',
'CHAO(ST)-', 'KAO', 'KAU',
'CHAMPIO-^', 'SHEMPI', 'ZENBI',
'CHAR(AI)-^', 'KAR', 'KAR',
'CHAU(CDFSVWXZ)-', 'SHO', 'ZU',
'CHÄ(CF)-', 'SHE', 'ZE',
'CHE(CF)-', 'SHE', 'ZE',
'CHEM-^', 'KE', 'KE', # or: 'CHE', 'KE'
'CHEQUE<', 'SHEK', 'ZEK',
'CHI(CFGPVW)-', 'SHI', 'ZI',
'CH(AEUY)-<^', 'SH', 'Z',
'CHK-', '', '',
'CHO(CKPS)-^', 'SHO', 'ZU',
'CHRIS-', 'KRI', None,
'CHRO-', 'KR', None,
'CH(LOR)-<^', 'K', 'K',
'CHST-', 'X', 'X',
'CH(SßXZ)3', 'X', 'X',
'CHTNI-3', 'CHN', 'KN',
'CH^', 'K', 'K', # or: 'CH', 'K'
'CH', 'CH', 'K',
'CIC$', 'ZIZ', 'ZIZ',
'CIENCEFICT----', 'EIENS ', 'EIENZ ',
'CIENCE$', 'EIENS', 'EIENZ',
'CIER$', 'ZIE', 'ZIE',
'CYB-^', 'ZEI', 'ZEI',
'CY9^', 'ZÜ', 'ZI',
'C(IJY)-<3', 'Z', 'Z',
'CLOWN-', 'KLAU', 'KLAU',
'CCH', 'Z', 'Z',
'CCE-', 'X', 'X',
'C(CK)-', '', '',
'CLAUDET---', 'KLO', 'KLU',
'CLAUDINE^$', 'KLODIN', 'KLUTIN',
'COACH', 'KOSH', 'KUZ',
'COLE$', 'KOL', 'KUL',
'COUCH', 'KAUSH', 'KAUZ',
'COW', 'KAU', 'KAU',
'CQUES$', 'K', 'K',
'CQUE', 'K', 'K',
'CRASH--9', 'KRE', 'KRE',
'CREAT-^', 'KREA', 'KREA',
'CST', 'XT', 'XT',
'CS<^', 'Z', 'Z',
'C(SßX)', 'X', 'X',
"CT'S$", 'X', 'X',
'CT(SßXZ)', 'X', 'X',
'CZ<', 'Z', 'Z',
'C(ÈÉÊÌÍÎÝ)3', 'Z', 'Z',
'C.^', 'C.', 'C.',
'CÄ-', 'Z', 'Z',
'CÜ$', 'ZÜ', 'ZI',
"C'S$", 'X', 'X',
'C<', 'K', 'K',
'DAHER^$', 'DAHER', None,
'DARAUFFOLGE-----', 'DARAUF ', 'TARAUF ',
'DAVO(NR)-^$', 'DAFO', 'TAFU',
'DD(SZ)--<', '', '',
'DD9', 'D', None,
'DEPOT7', 'DEPO', 'TEBU',
'DESIGN', 'DISEIN', 'TIZEIN',
'DE(LMNRST)-3^', 'DE', 'TE',
'DETTE$', 'DET', 'TET',
'DH$', 'T', None,
'DIC$', 'DIZ', 'TIZ',
'DIDR-^', 'DIT', None,
'DIEDR-^', 'DIT', None,
'DJ(AEIOU)-^', 'I', 'I',
'DMITR-^', 'DIMIT', 'TINIT',
'DRY9^', 'DRÜ', None,
'DT-', '', '',
'DUIS-^', 'DÜ', 'TI',
'DURCH^^', 'DURCH', 'TURK',
'DVA$', 'TWA', None,
'DY9^', 'DÜ', None,
'DYS$', 'DIS', None,
'DS(CH)--<', 'T', 'T',
'DST', 'ZT', 'ZT',
'DZS(CH)--', 'T', 'T',
'D(SßZ)', 'Z', 'Z',
'D(AÄEIOÖRUÜY)-', 'D', None,
'D(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'D', None,
"D'H^", 'D', 'T',
'D´H^', 'D', 'T',
'D`H^', 'D', 'T',
"D'S3$", 'Z', 'Z',
'D´S3$', 'Z', 'Z',
'D^', 'D', None,
'D', 'T', 'T',
'EAULT$', 'O', 'U',
'EAUX$', 'O', 'U',
'EAU', 'O', 'U',
'EAV', 'IW', 'IF',
'EAS3$', 'EAS', None,
'EA(AÄEIOÖÜY)-3', 'EA', 'EA',
'EA3$', 'EA', 'EA',
'EA3', 'I', 'I',
'EBENSO^$', 'EBNSO', 'EBNZU',
'EBENSO^^', 'EBNSO ', 'EBNZU ',
'EBEN^^', 'EBN', 'EBN',
'EE9', 'E', 'E',
'EGL-1', 'EK', None,
'EHE(IUY)--1', 'EH', None,
'EHUNG---1', 'E', None,
'EH(AÄIOÖUÜY)-1', 'EH', None,
'EIEI--', '', '',
'EIERE^$', 'EIERE', None,
'EIERE$', 'EIERE', None,
'EIERE(NS)-$', 'EIERE', None,
'EIERE(AIOUY)--', 'EIER', None,
'EIER(AÄIOÖUÜY)-', 'EIER', None,
'EIER<', 'EIA', None,
'EIGL-1', 'EIK', None,
'EIGH$', 'EI', 'EI',
'EIH--', 'E', 'E',
'EILLE$', 'EI', 'EI',
'EIR(BCDFGKLMNQSTVWZ)-', 'EIA', 'EIA',
'EIR$', 'EIA', 'EIA',
'EITRAUBEN------', 'EIT ', 'EIT ',
'EI', 'EI', 'EI',
'EJ$', 'EI', 'EI',
'ELIZ^', 'ELIS', None,
'ELZ^', 'ELS', None,
'EL-^', 'E', 'E',
'ELANG----1', 'E', 'E',
'EL(DKL)--1', 'E', 'E',
'EL(MNT)--1$', 'E', 'E',
'ELYNE$', 'ELINE', 'ELINE',
'ELYN$', 'ELIN', 'ELIN',
'EL(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'EL', 'EL',
'EL-1', 'L', 'L',
'EM-^', None, 'E',
'EM(DFKMPQT)--1', None, 'E',
'EM(AÄEÈÉÊIÌÍÎOÖUÜY)--1', None, 'E',
'EM-1', None, 'N',
'ENGAG-^', 'ANGA', 'ANKA',
'EN-^', 'E', 'E',
'ENTUEL', 'ENTUEL', None,
'EN(CDGKQSTZ)--1', 'E', 'E',
'EN(AÄEÈÉÊIÌÍÎNOÖUÜY)-1', 'EN', 'EN',
'EN-1', '', '',
'ERH(AÄEIOÖUÜ)-^', 'ERH', 'ER',
'ER-^', 'E', 'E',
'ERREGEND-----', ' ER', ' ER',
'ERT1$', 'AT', None,
'ER(DGLKMNRQTZß)-1', 'ER', None,
'ER(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'ER', 'A',
'ER1$', 'A', 'A',
'ER<1', 'A', 'A',
'ETAT7', 'ETA', 'ETA',
'ETI(AÄOÖÜU)-', 'EZI', 'EZI',
'EUERE$', 'EUERE', None,
'EUERE(NS)-$', 'EUERE', None,
'EUERE(AIOUY)--', 'EUER', None,
'EUER(AÄIOÖUÜY)-', 'EUER', None,
'EUER<', 'EUA', None,
'EUEU--', '', '',
'EUILLE$', 'Ö', 'Ö',
'EUR$', 'ÖR', 'ÖR',
'EUX', 'Ö', 'Ö',
'EUSZ$', 'EUS', None,
'EUTZ$', 'EUS', None,
'EUYS$', 'EUS', 'EUZ',
'EUZ$', 'EUS', None,
'EU', 'EU', 'EU',
'EVER--<1', 'EW', None,
'EV(ÄOÖUÜ)-1', 'EW', None,
'EYER<', 'EIA', 'EIA',
'EY<', 'EI', 'EI',
'FACETTE', 'FASET', 'FAZET',
'FANS--^$', 'FE', 'FE',
'FAN-^$', 'FE', 'FE',
'FAULT-', 'FOL', 'FUL',
'FEE(DL)-', 'FI', 'FI',
'FEHLER', 'FELA', 'FELA',
'FE(LMNRST)-3^', 'FE', 'FE',
'FOERDERN---^', 'FÖRD', 'FÖRT',
'FOERDERN---', ' FÖRD', ' FÖRT',
'FOND7', 'FON', 'FUN',
'FRAIN$', 'FRA', 'FRA',
'FRISEU(RS)-', 'FRISÖ', 'FRIZÖ',
'FY9^', 'FÜ', None,
'FÖRDERN---^', 'FÖRD', 'FÖRT',
'FÖRDERN---', ' FÖRD', ' FÖRT',
'GAGS^$', 'GEX', 'KEX',
'GAG^$', 'GEK', 'KEK',
'GD', 'KT', 'KT',
'GEGEN^^', 'GEGN', 'KEKN',
'GEGENGEKOM-----', 'GEGN ', 'KEKN ',
'GEGENGESET-----', 'GEGN ', 'KEKN ',
'GEGENKOMME-----', 'GEGN ', 'KEKN ',
'GEGENZUKOM---', 'GEGN ZU ', 'KEKN ZU ',
'GENDETWAS-----$', 'GENT ', 'KENT ',
'GENRE', 'IORE', 'IURE',
'GE(LMNRST)-3^', 'GE', 'KE',
'GER(DKT)-', 'GER', None,
'GETTE$', 'GET', 'KET',
'GGF.', 'GF.', None,
'GG-', '', '',
'GH', 'G', None,
'GI(AOU)-^', 'I', 'I',
'GION-3', 'KIO', 'KIU',
'G(CK)-', '', '',
'GJ(AEIOU)-^', 'I', 'I',
'GMBH^$', 'GMBH', 'GMBH',
'GNAC$', 'NIAK', 'NIAK',
'GNON$', 'NION', 'NIUN',
'GN$', 'N', 'N',
'GONCAL-^', 'GONZA', 'KUNZA',
'GRY9^', 'GRÜ', None,
'G(SßXZ)-<', 'K', 'K',
'GUCK-', 'KU', 'KU',
'GUISEP-^', 'IUSE', 'IUZE',
'GUI-^', 'G', 'K',
'GUTAUSSEH------^', 'GUT ', 'KUT ',
'GUTGEHEND------^', 'GUT ', 'KUT ',
'GY9^', 'GÜ', None,
'G(AÄEILOÖRUÜY)-', 'G', None,
'G(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'G', None,
"G'S$", 'X', 'X',
'G´S$', 'X', 'X',
'G^', 'G', None,
'G', 'K', 'K',
'HA(HIUY)--1', 'H', None,
'HANDVOL---^', 'HANT ', 'ANT ',
'HANNOVE-^', 'HANOF', None,
'HAVEN7$', 'HAFN', None,
'HEAD-', 'HE', 'E',
'HELIEGEN------', 'E ', 'E ',
'HESTEHEN------', 'E ', 'E ',
'HE(LMNRST)-3^', 'HE', 'E',
'HE(LMN)-1', 'E', 'E',
'HEUR1$', 'ÖR', 'ÖR',
'HE(HIUY)--1', 'H', None,
'HIH(AÄEIOÖUÜY)-1', 'IH', None,
'HLH(AÄEIOÖUÜY)-1', 'LH', None,
'HMH(AÄEIOÖUÜY)-1', 'MH', None,
'HNH(AÄEIOÖUÜY)-1', 'NH', None,
'HOBBY9^', 'HOBI', None,
'HOCHBEGAB-----^', 'HOCH ', 'UK ',
'HOCHTALEN-----^', 'HOCH ', 'UK ',
'HOCHZUFRI-----^', 'HOCH ', 'UK ',
'HO(HIY)--1', 'H', None,
'HRH(AÄEIOÖUÜY)-1', 'RH', None,
'HUH(AÄEIOÖUÜY)-1', 'UH', None,
'HUIS^^', 'HÜS', 'IZ',
'HUIS$', 'ÜS', 'IZ',
'HUI--1', 'H', None,
'HYGIEN^', 'HÜKIEN', None,
'HY9^', 'HÜ', None,
'HY(BDGMNPST)-', 'Ü', None,
'H.^', None, 'H.',
'HÄU--1', 'H', None,
'H^', 'H', '',
'H', '', '',
'ICHELL---', 'ISH', 'IZ',
'ICHI$', 'ISHI', 'IZI',
'IEC$', 'IZ', 'IZ',
'IEDENSTELLE------', 'IDN ', 'ITN ',
'IEI-3', '', '',
'IELL3', 'IEL', 'IEL',
'IENNE$', 'IN', 'IN',
'IERRE$', 'IER', 'IER',
'IERZULAN---', 'IR ZU ', 'IR ZU ',
'IETTE$', 'IT', 'IT',
'IEU', 'IÖ', 'IÖ',
'IE<4', 'I', 'I',
'IGL-1', 'IK', None,
'IGHT3$', 'EIT', 'EIT',
'IGNI(EO)-', 'INI', 'INI',
'IGN(AEOU)-$', 'INI', 'INI',
'IHER(DGLKRT)--1', 'IHE', None,
'IHE(IUY)--', 'IH', None,
'IH(AIOÖUÜY)-', 'IH', None,
'IJ(AOU)-', 'I', 'I',
'IJ$', 'I', 'I',
'IJ<', 'EI', 'EI',
'IKOLE$', 'IKOL', 'IKUL',
'ILLAN(STZ)--4', 'ILIA', 'ILIA',
'ILLAR(DT)--4', 'ILIA', 'ILIA',
'IMSTAN----^', 'IM ', 'IN ',
'INDELERREGE------', 'INDL ', 'INTL ',
'INFRAGE-----^$', 'IN ', 'IN ',
'INTERN(AOU)-^', 'INTAN', 'INTAN',
'INVER-', 'INWE', 'INFE',
'ITI(AÄIOÖUÜ)-', 'IZI', 'IZI',
'IUSZ$', 'IUS', None,
'IUTZ$', 'IUS', None,
'IUZ$', 'IUS', None,
'IVER--<', 'IW', None,
'IVIER$', 'IWIE', 'IFIE',
'IV(ÄOÖUÜ)-', 'IW', None,
'IV<3', 'IW', None,
'IY2', 'I', None,
'I(ÈÉÊ)<4', 'I', 'I',
'JAVIE---<^', 'ZA', 'ZA',
'JEANS^$', 'JINS', 'INZ',
'JEANNE^$', 'IAN', 'IAN',
'JEAN-^', 'IA', 'IA',
'JER-^', 'IE', 'IE',
'JE(LMNST)-', 'IE', 'IE',
'JI^', 'JI', None,
'JOR(GK)^$', 'IÖRK', 'IÖRK',
'J', 'I', 'I',
'KC(ÄEIJ)-', 'X', 'X',
'KD', 'KT', None,
'KE(LMNRST)-3^', 'KE', 'KE',
'KG(AÄEILOÖRUÜY)-', 'K', None,
'KH<^', 'K', 'K',
'KIC$', 'KIZ', 'KIZ',
'KLE(LMNRST)-3^', 'KLE', 'KLE',
'KOTELE-^', 'KOTL', 'KUTL',
'KREAT-^', 'KREA', 'KREA',
'KRÜS(TZ)--^', 'KRI', None,
'KRYS(TZ)--^', 'KRI', None,
'KRY9^', 'KRÜ', None,
'KSCH---', 'K', 'K',
'KSH--', 'K', 'K',
'K(SßXZ)7', 'X', 'X', # implies 'KST' -> 'XT'
"KT'S$", 'X', 'X',
'KTI(AIOU)-3', 'XI', 'XI',
'KT(SßXZ)', 'X', 'X',
'KY9^', 'KÜ', None,
"K'S$", 'X', 'X',
'K´S$', 'X', 'X',
'LANGES$', ' LANGES', ' LANKEZ',
'LANGE$', ' LANGE', ' LANKE',
'LANG$', ' LANK', ' LANK',
'LARVE-', 'LARF', 'LARF',
'LD(SßZ)$', 'LS', 'LZ',
"LD'S$", 'LS', 'LZ',
'LD´S$', 'LS', 'LZ',
'LEAND-^', 'LEAN', 'LEAN',
'LEERSTEHE-----^', 'LER ', 'LER ',
'LEICHBLEIB-----', 'LEICH ', 'LEIK ',
'LEICHLAUTE-----', 'LEICH ', 'LEIK ',
'LEIDERREGE------', 'LEIT ', 'LEIT ',
'LEIDGEPR----^', 'LEIT ', 'LEIT ',
'LEINSTEHE-----', 'LEIN ', 'LEIN ',
'LEL-', 'LE', 'LE',
'LE(MNRST)-3^', 'LE', 'LE',
'LETTE$', 'LET', 'LET',
'LFGNAG-', 'LFGAN', 'LFKAN',
'LICHERWEIS----', 'LICHA ', 'LIKA ',
'LIC$', 'LIZ', 'LIZ',
'LIVE^$', 'LEIF', 'LEIF',
'LT(SßZ)$', 'LS', 'LZ',
"LT'S$", 'LS', 'LZ',
'LT´S$', 'LS', 'LZ',
'LUI(GS)--', 'LU', 'LU',
'LV(AIO)-', 'LW', None,
'LY9^', 'LÜ', None,
'LSTS$', 'LS', 'LZ',
'LZ(BDFGKLMNPQRSTVWX)-', 'LS', None,
'L(SßZ)$', 'LS', None,
'MAIR-<', 'MEI', 'NEI',
'MANAG-', 'MENE', 'NENE',
'MANUEL', 'MANUEL', None,
'MASSEU(RS)-', 'MASÖ', 'NAZÖ',
'MATCH', 'MESH', 'NEZ',
'MAURICE', 'MORIS', 'NURIZ',
'MBH^$', 'MBH', 'MBH',
'MB(ßZ)$', 'MS', None,
'MB(SßTZ)-', 'M', 'N',
'MCG9^', 'MAK', 'NAK',
'MC9^', 'MAK', 'NAK',
'MEMOIR-^', 'MEMOA', 'NENUA',
'MERHAVEN$', 'MAHAFN', None,
'ME(LMNRST)-3^', 'ME', 'NE',
'MEN(STZ)--3', 'ME', None,
'MEN$', 'MEN', None,
'MIGUEL-', 'MIGE', 'NIKE',
'MIKE^$', 'MEIK', 'NEIK',
'MITHILFE----^$', 'MIT H', 'NIT ',
'MN$', 'M', None,
'MN', 'N', 'N',
'MPJUTE-', 'MPUT', 'NBUT',
'MP(ßZ)$', 'MS', None,
'MP(SßTZ)-', 'M', 'N',
'MP(BDJLMNPQVW)-', 'MB', 'NB',
'MY9^', 'MÜ', None,
'M(ßZ)$', 'MS', None,
'M´G7^', 'MAK', 'NAK',
"M'G7^", 'MAK', 'NAK',
'M´^', 'MAK', 'NAK',
"M'^", 'MAK', 'NAK',
'M', None, 'N',
'NACH^^', 'NACH', 'NAK',
'NADINE', 'NADIN', 'NATIN',
'NAIV--', 'NA', 'NA',
'NAISE$', 'NESE', 'NEZE',
'NAUGENOMM------', 'NAU ', 'NAU ',
'NAUSOGUT$', 'NAUSO GUT', 'NAUZU KUT',
'NCH$', 'NSH', 'NZ',
'NCOISE$', 'SOA', 'ZUA',
'NCOIS$', 'SOA', 'ZUA',
'NDAR$', 'NDA', 'NTA',
'NDERINGEN------', 'NDE ', 'NTE ',
'NDRO(CDKTZ)-', 'NTRO', None,
'ND(BFGJLMNPQVW)-', 'NT', None,
'ND(SßZ)$', 'NS', 'NZ',
"ND'S$", 'NS', 'NZ',
'ND´S$', 'NS', 'NZ',
'NEBEN^^', 'NEBN', 'NEBN',
'NENGELERN------', 'NEN ', 'NEN ',
'NENLERN(ET)---', 'NEN LE', 'NEN LE',
'NENZULERNE---', 'NEN ZU LE', 'NEN ZU LE',
'NE(LMNRST)-3^', 'NE', 'NE',
'NEN-3', 'NE', 'NE',
'NETTE$', 'NET', 'NET',
'NGU^^', 'NU', 'NU',
'NG(BDFJLMNPQRTVW)-', 'NK', 'NK',
'NH(AUO)-$', 'NI', 'NI',
'NICHTSAHNEN-----', 'NIX ', 'NIX ',
'NICHTSSAGE----', 'NIX ', 'NIX ',
'NICHTS^^', 'NIX', 'NIX',
'NICHT^^', 'NICHT', 'NIKT',
'NINE$', 'NIN', 'NIN',
'NON^^', 'NON', 'NUN',
'NOTLEIDE-----^', 'NOT ', 'NUT ',
'NOT^^', 'NOT', 'NUT',
'NTI(AIOU)-3', 'NZI', 'NZI',
'NTIEL--3', 'NZI', 'NZI',
'NT(SßZ)$', 'NS', 'NZ',
"NT'S$", 'NS', 'NZ',
'NT´S$', 'NS', 'NZ',
'NYLON', 'NEILON', 'NEILUN',
'NY9^', 'NÜ', None,
'NSTZUNEH---', 'NST ZU ', 'NZT ZU ',
'NSZ-', 'NS', None,
'NSTS$', 'NS', 'NZ',
'NZ(BDFGKLMNPQRSTVWX)-', 'NS', None,
'N(SßZ)$', 'NS', None,
'OBERE-', 'OBER', None,
'OBER^^', 'OBA', 'UBA',
'OEU2', 'Ö', 'Ö',
'OE<2', 'Ö', 'Ö',
'OGL-', 'OK', None,
'OGNIE-', 'ONI', 'UNI',
'OGN(AEOU)-$', 'ONI', 'UNI',
'OH(AIOÖUÜY)-', 'OH', None,
'OIE$', 'Ö', 'Ö',
'OIRE$', 'OA', 'UA',
'OIR$', 'OA', 'UA',
'OIX', 'OA', 'UA',
'OI<3', 'EU', 'EU',
'OKAY^$', 'OKE', 'UKE',
'OLYN$', 'OLIN', 'ULIN',
'OO(DLMZ)-', 'U', None,
'OO$', 'U', None,
'OO-', '', '',
'ORGINAL-----', 'ORI', 'URI',
'OTI(AÄOÖUÜ)-', 'OZI', 'UZI',
'OUI^', 'WI', 'FI',
'OUILLE$', 'ULIE', 'ULIE',
'OU(DT)-^', 'AU', 'AU',
'OUSE$', 'AUS', 'AUZ',
'OUT-', 'AU', 'AU',
'OU', 'U', 'U',
'O(FV)$', 'AU', 'AU', # due to 'OW$' -> 'AU'
'OVER--<', 'OW', None,
'OV(AOU)-', 'OW', None,
'OW$', 'AU', 'AU',
'OWS$', 'OS', 'UZ',
'OJ(AÄEIOÖUÜ)--', 'O', 'U',
'OYER', 'OIA', None,
'OY(AÄEIOÖUÜ)--', 'O', 'U',
'O(JY)<', 'EU', 'EU',
'OZ$', 'OS', None,
'O´^', 'O', 'U',
"O'^", 'O', 'U',
'O', None, 'U',
'PATIEN--^', 'PAZI', 'PAZI',
'PENSIO-^', 'PANSI', 'PANZI',
'PE(LMNRST)-3^', 'PE', 'PE',
'PFER-^', 'FE', 'FE',
'P(FH)<', 'F', 'F',
'PIC^$', 'PIK', 'PIK',
'PIC$', 'PIZ', 'PIZ',
'PIPELINE', 'PEIBLEIN', 'PEIBLEIN',
'POLYP-', 'POLÜ', None,
'POLY^^', 'POLI', 'PULI',
'PORTRAIT7', 'PORTRE', 'PURTRE',
'POWER7', 'PAUA', 'PAUA',
'PP(FH)--<', 'B', 'B',
'PP-', '', '',
'PRODUZ-^', 'PRODU', 'BRUTU',
'PRODUZI--', ' PRODU', ' BRUTU',
'PRIX^$', 'PRI', 'PRI',
'PS-^^', 'P', None,
'P(SßZ)^', None, 'Z',
'P(SßZ)$', 'BS', None,
'PT-^', '', '',
'PTI(AÄOÖUÜ)-3', 'BZI', 'BZI',
'PY9^', 'PÜ', None,
'P(AÄEIOÖRUÜY)-', 'P', 'P',
'P(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'P', None,
'P.^', None, 'P.',
'P^', 'P', None,
'P', 'B', 'B',
'QI-', 'Z', 'Z',
'QUARANT--', 'KARA', 'KARA',
'QUE(LMNRST)-3', 'KWE', 'KFE',
'QUE$', 'K', 'K',
'QUI(NS)$', 'KI', 'KI',
'QUIZ7', 'KWIS', None,
'Q(UV)7', 'KW', 'KF',
'Q<', 'K', 'K',
'RADFAHR----', 'RAT ', 'RAT ',
'RAEFTEZEHRE-----', 'REFTE ', 'REFTE ',
'RCH', 'RCH', 'RK',
'REA(DU)---3^', 'R', None,
'REBSERZEUG------', 'REBS ', 'REBZ ',
'RECHERCH^', 'RESHASH', 'REZAZ',
'RECYCL--', 'RIZEI', 'RIZEI',
'RE(ALST)-3^', 'RE', None,
'REE$', 'RI', 'RI',
'RER$', 'RA', 'RA',
'RE(MNR)-4', 'RE', 'RE',
'RETTE$', 'RET', 'RET',
'REUZ$', 'REUZ', None,
'REW$', 'RU', 'RU',
'RH<^', 'R', 'R',
'RJA(MN)--', 'RI', 'RI',
'ROWD-^', 'RAU', 'RAU',
'RTEMONNAIE-', 'RTMON', 'RTNUN',
'RTI(AÄOÖUÜ)-3', 'RZI', 'RZI',
'RTIEL--3', 'RZI', 'RZI',
'RV(AEOU)-3', 'RW', None,
'RY(KN)-$', 'RI', 'RI',
'RY9^', 'RÜ', None,
'RÄFTEZEHRE-----', 'REFTE ', 'REFTE ',
'SAISO-^', 'SES', 'ZEZ',
'SAFE^$', 'SEIF', 'ZEIF',
'SAUCE-^', 'SOS', 'ZUZ',
'SCHLAGGEBEN-----<', 'SHLAK ', 'ZLAK ',
'SCHSCH---7', '', '',
'SCHTSCH', 'SH', 'Z',
'SC(HZ)<', 'SH', 'Z',
'SC', 'SK', 'ZK',
'SELBSTST--7^^', 'SELB', 'ZELB',
'SELBST7^^', 'SELBST', 'ZELBZT',
'SERVICE7^', 'SÖRWIS', 'ZÖRFIZ',
'SERVI-^', 'SERW', None,
'SE(LMNRST)-3^', 'SE', 'ZE',
'SETTE$', 'SET', 'ZET',
'SHP-^', 'S', 'Z',
'SHST', 'SHT', 'ZT',
'SHTSH', 'SH', 'Z',
'SHT', 'ST', 'Z',
'SHY9^', 'SHÜ', None,
'SH^^', 'SH', None,
'SH3', 'SH', 'Z',
'SICHERGEGAN-----^', 'SICHA ', 'ZIKA ',
'SICHERGEHE----^', 'SICHA ', 'ZIKA ',
'SICHERGESTEL------^', 'SICHA ', 'ZIKA ',
'SICHERSTELL-----^', 'SICHA ', 'ZIKA ',
'SICHERZU(GS)--^', 'SICHA ZU ', 'ZIKA ZU ',
'SIEGLI-^', 'SIKL', 'ZIKL',
'SIGLI-^', 'SIKL', 'ZIKL',
'SIGHT', 'SEIT', 'ZEIT',
'SIGN', 'SEIN', 'ZEIN',
'SKI(NPZ)-', 'SKI', 'ZKI',
'SKI<^', 'SHI', 'ZI',
'SODASS^$', 'SO DAS', 'ZU TAZ',
'SODAß^$', 'SO DAS', 'ZU TAZ',
'SOGENAN--^', 'SO GEN', 'ZU KEN',
'SOUND-', 'SAUN', 'ZAUN',
'STAATS^^', 'STAZ', 'ZTAZ',
'STADT^^', 'STAT', 'ZTAT',
'STANDE$', ' STANDE', ' ZTANTE',
'START^^', 'START', 'ZTART',
'STAURANT7', 'STORAN', 'ZTURAN',
'STEAK-', 'STE', 'ZTE',
'STEPHEN-^$', 'STEW', None,
'STERN', 'STERN', None,
'STRAF^^', 'STRAF', 'ZTRAF',
"ST'S$", 'Z', 'Z',
'ST´S$', 'Z', 'Z',
'STST--', '', '',
'STS(ACEÈÉÊHIÌÍÎOUÄÜÖ)--', 'ST', 'ZT',
'ST(SZ)', 'Z', 'Z',
'SPAREN---^', 'SPA', 'ZPA',
'SPAREND----', ' SPA', ' ZPA',
'S(PTW)-^^', 'S', None,
'SP', 'SP', None,
'STYN(AE)-$', 'STIN', 'ZTIN',
'ST', 'ST', 'ZT',
'SUITE<', 'SIUT', 'ZIUT',
'SUKE--$', 'S', 'Z',
'SURF(EI)-', 'SÖRF', 'ZÖRF',
'SV(AEÈÉÊIÌÍÎOU)-<^', 'SW', None,
'SYB(IY)--^', 'SIB', None,
'SYL(KVW)--^', 'SI', None,
'SY9^', 'SÜ', None,
'SZE(NPT)-^', 'ZE', 'ZE',
'SZI(ELN)-^', 'ZI', 'ZI',
'SZCZ<', 'SH', 'Z',
'SZT<', 'ST', 'ZT',
'SZ<3', 'SH', 'Z',
'SÜL(KVW)--^', 'SI', None,
'S', None, 'Z',
'TCH', 'SH', 'Z',
'TD(AÄEIOÖRUÜY)-', 'T', None,
'TD(ÀÁÂÃÅÈÉÊËÌÍÎÏÒÓÔÕØÙÚÛÝŸ)-', 'T', None,
'TEAT-^', 'TEA', 'TEA',
'TERRAI7^', 'TERA', 'TERA',
'TE(LMNRST)-3^', 'TE', 'TE',
'TH<', 'T', 'T',
'TICHT-', 'TIK', 'TIK',
'TICH$', 'TIK', 'TIK',
'TIC$', 'TIZ', 'TIZ',
'TIGGESTELL-------', 'TIK ', 'TIK ',
'TIGSTELL-----', 'TIK ', 'TIK ',
'TOAS-^', 'TO', 'TU',
'TOILET-', 'TOLE', 'TULE',
'TOIN-', 'TOA', 'TUA',
'TRAECHTI-^', 'TRECHT', 'TREKT',
'TRAECHTIG--', ' TRECHT', ' TREKT',
'TRAINI-', 'TREN', 'TREN',
'TRÄCHTI-^', 'TRECHT', 'TREKT',
'TRÄCHTIG--', ' TRECHT', ' TREKT',
'TSCH', 'SH', 'Z',
'TSH', 'SH', 'Z',
'TST', 'ZT', 'ZT',
'T(Sß)', 'Z', 'Z',
'TT(SZ)--<', '', '',
'TT9', 'T', 'T',
'TV^$', 'TV', 'TV',
'TX(AEIOU)-3', 'SH', 'Z',
'TY9^', 'TÜ', None,
'TZ-', '', '',
"T'S3$", 'Z', 'Z',
'T´S3$', 'Z', 'Z',
'UEBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
'UEBER^^', 'ÜBA', 'IBA',
'UE2', 'Ü', 'I',
'UGL-', 'UK', None,
'UH(AOÖUÜY)-', 'UH', None,
'UIE$', 'Ü', 'I',
'UM^^', 'UM', 'UN',
'UNTERE--3', 'UNTE', 'UNTE',
'UNTER^^', 'UNTA', 'UNTA',
'UNVER^^', 'UNFA', 'UNFA',
'UN^^', 'UN', 'UN',
'UTI(AÄOÖUÜ)-', 'UZI', 'UZI',
'UVE-4', 'UW', None,
'UY2', 'UI', None,
'UZZ', 'AS', 'AZ',
'VACL-^', 'WAZ', 'FAZ',
'VAC$', 'WAZ', 'FAZ',
'VAN DEN ^', 'FANDN', 'FANTN',
'VANES-^', 'WANE', None,
'VATRO-', 'WATR', None,
'VA(DHJNT)--^', 'F', None,
'VEDD-^', 'FE', 'FE',
'VE(BEHIU)--^', 'F', None,
'VEL(BDLMNT)-^', 'FEL', None,
'VENTZ-^', 'FEN', None,
'VEN(NRSZ)-^', 'FEN', None,
'VER(AB)-^$', 'WER', None,
'VERBAL^$', 'WERBAL', None,
'VERBAL(EINS)-^', 'WERBAL', None,
'VERTEBR--', 'WERTE', None,
'VEREIN-----', 'F', None,
'VEREN(AEIOU)-^', 'WEREN', None,
'VERIFI', 'WERIFI', None,
'VERON(AEIOU)-^', 'WERON', None,
'VERSEN^', 'FERSN', 'FAZN',
'VERSIERT--^', 'WERSI', None,
'VERSIO--^', 'WERS', None,
'VERSUS', 'WERSUS', None,
'VERTI(GK)-', 'WERTI', None,
'VER^^', 'FER', 'FA',
'VERSPRECHE-------', ' FER', ' FA',
'VER$', 'WA', None,
'VER', 'FA', 'FA',
'VET(HT)-^', 'FET', 'FET',
'VETTE$', 'WET', 'FET',
'VE^', 'WE', None,
'VIC$', 'WIZ', 'FIZ',
'VIELSAGE----', 'FIL ', 'FIL ',
'VIEL', 'FIL', 'FIL',
'VIEW', 'WIU', 'FIU',
'VILL(AE)-', 'WIL', None,
'VIS(ACEIKUVWZ)-<^', 'WIS', None,
'VI(ELS)--^', 'F', None,
'VILLON--', 'WILI', 'FILI',
'VIZE^^', 'FIZE', 'FIZE',
'VLIE--^', 'FL', None,
'VL(AEIOU)--', 'W', None,
'VOKA-^', 'WOK', None,
'VOL(ATUVW)--^', 'WO', None,
'VOR^^', 'FOR', 'FUR',
'VR(AEIOU)--', 'W', None,
'VV9', 'W', None,
'VY9^', 'WÜ', 'FI',
'V(ÜY)-', 'W', None,
'V(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'W', None,
'V(AEIJLRU)-<', 'W', None,
'V.^', 'V.', None,
'V<', 'F', 'F',
'WEITERENTWI-----^', 'WEITA ', 'FEITA ',
'WEITREICH-----^', 'WEIT ', 'FEIT ',
'WEITVER^', 'WEIT FER', 'FEIT FA',
'WE(LMNRST)-3^', 'WE', 'FE',
'WER(DST)-', 'WER', None,
'WIC$', 'WIZ', 'FIZ',
'WIEDERU--', 'WIDE', 'FITE',
'WIEDER^$', 'WIDA', 'FITA',
'WIEDER^^', 'WIDA ', 'FITA ',
'WIEVIEL', 'WI FIL', 'FI FIL',
'WISUEL', 'WISUEL', None,
'WR-^', 'W', None,
'WY9^', 'WÜ', 'FI',
'W(BDFGJKLMNPQRSTZ)-', 'F', None,
'W$', 'F', None,
'W', None, 'F',
'X<^', 'Z', 'Z',
'XHAVEN$', 'XAFN', None,
'X(CSZ)', 'X', 'X',
'XTS(CH)--', 'XT', 'XT',
'XT(SZ)', 'Z', 'Z',
'YE(LMNRST)-3^', 'IE', 'IE',
'YE-3', 'I', 'I',
'YOR(GK)^$', 'IÖRK', 'IÖRK',
'Y(AOU)-<7', 'I', 'I',
'Y(BKLMNPRSTX)-1', 'Ü', None,
'YVES^$', 'IF', 'IF',
'YVONNE^$', 'IWON', 'IFUN',
'Y.^', 'Y.', None,
'Y', 'I', 'I',
'ZC(AOU)-', 'SK', 'ZK',
'ZE(LMNRST)-3^', 'ZE', 'ZE',
'ZIEJ$', 'ZI', 'ZI',
'ZIGERJA(HR)-3', 'ZIGA IA', 'ZIKA IA',
'ZL(AEIOU)-', 'SL', None,
'ZS(CHT)--', '', '',
'ZS', 'SH', 'Z',
'ZUERST', 'ZUERST', 'ZUERST',
'ZUGRUNDE^$', 'ZU GRUNDE', 'ZU KRUNTE',
'ZUGRUNDE', 'ZU GRUNDE ', 'ZU KRUNTE ',
'ZUGUNSTEN', 'ZU GUNSTN', 'ZU KUNZTN',
'ZUHAUSE-', 'ZU HAUS', 'ZU AUZ',
'ZULASTEN^$', 'ZU LASTN', 'ZU LAZTN',
'ZURUECK^^', 'ZURÜK', 'ZURIK',
'ZURZEIT', 'ZUR ZEIT', 'ZUR ZEIT',
'ZURÜCK^^', 'ZURÜK', 'ZURIK',
'ZUSTANDE', 'ZU STANDE', 'ZU ZTANTE',
'ZUTAGE', 'ZU TAGE', 'ZU TAKE',
'ZUVER^^', 'ZUFA', 'ZUFA',
'ZUVIEL', 'ZU FIL', 'ZU FIL',
'ZUWENIG', 'ZU WENIK', 'ZU FENIK',
'ZY9^', 'ZÜ', None,
'ZYK3$', 'ZIK', None,
'Z(VW)7^', 'SW', None,
None, None, None
# fmt: on
)
# Table for ``str.translate``: maps the code point of each supported
# lower-case letter (ASCII a-z plus the accented letters listed below) to its
# upper-case character. 'ß' maps to itself.
_upper_trans = {
    ord(lower): upper
    for lower, upper in zip(
        'abcdefghijklmnopqrstuvwxyzàáâãåäæ'
        'çðèéêëìíîïñòóôõöøœšßþùúûüýÿ',
        'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÅÄÆ'
        'ÇÐÈÉÊËÌÍÎÏÑÒÓÔÕÖØŒŠßÞÙÚÛÜÝŸ',
    )
}
def __init__(self, mode=1, lang='de'):
    """Initialize Phonet instance.

    Parameters
    ----------
    mode : int
        The phonet variant to employ (1 or 2)
    lang : str
        ``de`` (default) for German, ``none`` for no language


    .. versionadded:: 0.4.0

    """
    # Stored unvalidated; both values are only read later by ``encode``.
    self._mode, self._lang = mode, lang
[docs] def encode(self, word):
"""Return the phonet code for a word.
Parameters
----------
word : str
The word to transform
Returns
-------
str
The phonet value
Examples
--------
>>> pe = Phonet()
>>> pe.encode('Christopher')
'KRISTOFA'
>>> pe.encode('Niall')
'NIAL'
>>> pe.encode('Smith')
'SMIT'
>>> pe.encode('Schmidt')
'SHMIT'
>>> pe2 = Phonet(mode=2)
>>> pe2.encode('Christopher')
'KRIZTUFA'
>>> pe2.encode('Niall')
'NIAL'
>>> pe2.encode('Smith')
'ZNIT'
>>> pe2.encode('Schmidt')
'ZNIT'
>>> pe_none = Phonet(lang='none')
>>> pe_none.encode('Christopher')
'CHRISTOPHER'
>>> pe_none.encode('Niall')
'NIAL'
>>> pe_none.encode('Smith')
'SMITH'
>>> pe_none.encode('Schmidt')
'SCHMIDT'
.. versionadded:: 0.1.0
.. versionchanged:: 0.3.6
Encapsulated in class
"""
phonet_hash = Counter()
alpha_pos = Counter()
phonet_hash_1 = Counter()
phonet_hash_2 = Counter()
def _initialize_phonet(lang):
    """Initialize phonet variables.

    Populates the lookup tables defined in the enclosing scope
    (``alpha_pos``, ``phonet_hash``, ``phonet_hash_1`` and
    ``phonet_hash_2``) from the rule table selected by ``lang``. Nothing
    is returned; the tables are modified in place.

    Parameters
    ----------
    lang : str
        Language to use for rules: ``'none'`` selects
        ``self._rules_no_lang``, any other value selects
        ``self._rules_german``


    .. versionadded:: 0.1.0

    """
    if lang == 'none':
        _phonet_rules = self._rules_no_lang
    else:
        _phonet_rules = self._rules_german

    # -1 is the "no rule index recorded" sentinel used throughout.
    # NOTE(review): ``phonet_hash`` is a Counter, so any key that is never
    # assigned here reads back as 0, not -1. First characters outside the
    # umlaut set and A-Z therefore fail the ``phonet_hash[k] < 0`` test
    # below and never get their own start index -- confirm this is intended.
    phonet_hash[''] = -1

    # German and international umlauts
    # (all of them share alpha_pos 1)
    for j in {
        'À',
        'Á',
        'Â',
        'Ã',
        'Ä',
        'Å',
        'Æ',
        'Ç',
        'È',
        'É',
        'Ê',
        'Ë',
        'Ì',
        'Í',
        'Î',
        'Ï',
        'Ð',
        'Ñ',
        'Ò',
        'Ó',
        'Ô',
        'Õ',
        'Ö',
        'Ø',
        'Ù',
        'Ú',
        'Û',
        'Ü',
        'Ý',
        'Þ',
        'ß',
        'Œ',
        'Š',
        'Ÿ',
    }:
        alpha_pos[j] = 1
        phonet_hash[j] = -1

    # "normal" letters ('A'-'Z')
    # (alpha_pos 2..27, so ``alpha_pos - 2`` is a 0-based letter index)
    for i, j in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
        alpha_pos[j] = i + 2
        phonet_hash[j] = -1

    # Two-character tables keyed by (letter index of the first character,
    # alpha_pos of the second character). Column 0 is the catch-all used
    # for second characters that have no alpha_pos of their own.
    for i in range(26):
        for j in range(28):
            phonet_hash_1[i, j] = -1
            phonet_hash_2[i, j] = -1

    # for each phonetic rule
    for i in range(len(_phonet_rules)):
        rule = _phonet_rules[i]

        # Only every third entry is a pattern; the None terminator is skipped.
        if rule and i % 3 == 0:
            # calculate first hash value
            k = _phonet_rules[i][0]

            # Record the index of the first rule for this leading character,
            # ignoring rules whose two per-mode entries are both empty/None.
            if phonet_hash[k] < 0 and (
                _phonet_rules[i + 1] or _phonet_rules[i + 2]
            ):
                phonet_hash[k] = i

            # calculate second hash values
            # (only for patterns that start with A-Z)
            if k and alpha_pos[k] >= 2:
                k = alpha_pos[k]

                j = k - 2
                rule = rule[1:]

                # Reduce ``rule`` to the candidate second characters:
                # a blank for one-character patterns, the contents of a
                # leading '(...)' group, or just the single next character.
                if not rule:
                    rule = ' '
                elif rule[0] == '(':
                    rule = rule[1:]
                else:
                    rule = rule[0]

                while rule and (rule[0] != ')'):
                    k = alpha_pos[rule[0]]

                    if k > 0:
                        # add hash value for this letter
                        # (phonet_hash_1 keeps the first rule index,
                        # phonet_hash_2 the last one)
                        if phonet_hash_1[j, k] < 0:
                            phonet_hash_1[j, k] = i
                            phonet_hash_2[j, k] = i

                        # Extend the recorded range only while the previous
                        # end lies within 30 entries of this rule; otherwise
                        # file the rule under the catch-all column instead.
                        if phonet_hash_2[j, k] >= (i - 30):
                            phonet_hash_2[j, k] = i
                        else:
                            k = -1

                    if k <= 0:
                        # add hash value for all letters
                        if phonet_hash_1[j, 0] < 0:
                            phonet_hash_1[j, 0] = i

                        phonet_hash_2[j, 0] = i

                    rule = rule[1:]
def _phonet(term, mode, lang):
"""Return the phonet coded form of a term.
Parameters
----------
term : str
Term to transform
mode : int
The phonet variant to employ (1 or 2)
lang : str
``de`` (default) for German, ``none`` for no language
Returns
-------
str
The phonet value
.. versionadded:: 0.1.0
"""
if lang == 'none':
_phonet_rules = self._rules_no_lang
else:
_phonet_rules = self._rules_german
char0 = ''
dest = term
if not term:
return ''
term_length = len(term)
# convert input string to upper-case
src = term.translate(self._upper_trans)
# check "src"
i = 0
j = 0
zeta = 0
while i < len(src):
char = src[i]
pos = alpha_pos[char]
if pos >= 2:
xpos = pos - 2
if i + 1 == len(src):
pos = alpha_pos['']
else:
pos = alpha_pos[src[i + 1]]
start1 = phonet_hash_1[xpos, pos]
start2 = phonet_hash_1[xpos, 0]
end1 = phonet_hash_2[xpos, pos]
end2 = phonet_hash_2[xpos, 0]
# preserve rule priorities
if (start2 >= 0) and ((start1 < 0) or (start2 < start1)):
pos = start1
start1 = start2
start2 = pos
pos = end1
end1 = end2
end2 = pos
if (end1 >= start2) and (start2 >= 0):
if end2 > end1:
end1 = end2
start2 = -1
end2 = -1
else:
pos = phonet_hash[char]
start1 = pos
end1 = 10000
start2 = -1
end2 = -1
pos = start1
zeta0 = 0
if pos >= 0:
# check rules for this char
while (_phonet_rules[pos] is None) or (
_phonet_rules[pos][0] == char
):
if pos > end1:
if start2 > 0:
pos = start2
start1 = start2
start2 = -1
end1 = end2
end2 = -1
continue
break
if (_phonet_rules[pos] is None) or (
_phonet_rules[pos + mode] is None
):
# no conversion rule available
pos += 3
continue
# check whole string
matches = 1 # number of matching letters
priority = 5 # default priority
rule = _phonet_rules[pos]
rule = rule[1:]
while (
rule
and (len(src) > (i + matches))
and (src[i + matches] == rule[0])
and not rule[0].isdigit()
and (rule not in '(-<^$')
):
matches += 1
rule = rule[1:]
if rule and (rule[0] == '('):
# check an array of letters
if (
(len(src) > (i + matches))
and src[i + matches].isalpha()
and (src[i + matches] in rule[1:])
):
matches += 1
while rule and rule[0] != ')':
rule = rule[1:]
# if rule[0] == ')':
rule = rule[1:]
if rule:
priority0 = ord(rule[0])
else:
priority0 = 0
matches0 = matches
while rule and rule[0] == '-' and matches > 1:
matches -= 1
rule = rule[1:]
if rule and rule[0] == '<':
rule = rule[1:]
if rule and rule[0].isdigit():
# read priority
priority = int(rule[0])
rule = rule[1:]
if rule and rule[0:2] == '^^':
rule = rule[1:]
if (
not rule
or (
(rule[0] == '^')
and ((i == 0) or not src[i - 1].isalpha())
and (
(rule[1:2] != '$')
or (
not (
src[
i + matches0 : i + matches0 + 1
].isalpha()
)
and (
src[
i + matches0 : i + matches0 + 1
]
!= '.'
)
)
)
)
or (
(rule[0] == '$')
and (i > 0)
and src[i - 1].isalpha()
and (
(
not src[
i + matches0 : i + matches0 + 1
].isalpha()
)
and (
src[i + matches0 : i + matches0 + 1]
!= '.'
)
)
)
):
# look for continuation, if:
# matches > 1 und NO '-' in first string */
pos0 = -1
start3 = 0
start4 = 0
end3 = 0
end4 = 0
if (
(matches > 1)
and src[i + matches : i + matches + 1]
and (priority0 != ord('-'))
):
char0 = src[i + matches - 1]
pos0 = alpha_pos[char0]
if pos0 >= 2 and src[i + matches]:
xpos = pos0 - 2
pos0 = alpha_pos[src[i + matches]]
start3 = phonet_hash_1[xpos, pos0]
start4 = phonet_hash_1[xpos, 0]
end3 = phonet_hash_2[xpos, pos0]
end4 = phonet_hash_2[xpos, 0]
# preserve rule priorities
if (start4 >= 0) and (
(start3 < 0) or (start4 < start3)
):
pos0 = start3
start3 = start4
start4 = pos0
pos0 = end3
end3 = end4
end4 = pos0
if (end3 >= start4) and (start4 >= 0):
if end4 > end3:
end3 = end4
start4 = -1
end4 = -1
else:
pos0 = phonet_hash[char0]
start3 = pos0
end3 = 10000
start4 = -1
end4 = -1
pos0 = start3
# check continuation rules for src[i+matches]
if pos0 >= 0:
while (_phonet_rules[pos0] is None) or (
_phonet_rules[pos0][0] == char0
):
if pos0 > end3:
if start4 > 0:
pos0 = start4
start3 = start4
start4 = -1
end3 = end4
end4 = -1
continue
priority0 = -1
# important
break
if (_phonet_rules[pos0] is None) or (
_phonet_rules[pos0 + mode] is None
):
# no conversion rule available
pos0 += 3
continue
# check whole string
matches0 = matches
priority0 = 5
rule = _phonet_rules[pos0]
rule = rule[1:]
while (
rule
and (
src[
i + matches0 : i + matches0 + 1
]
== rule[0]
)
and (
not rule[0].isdigit()
or (rule in '(-<^$')
)
):
matches0 += 1
rule = rule[1:]
if rule and rule[0] == '(':
# check an array of letters
if src[
i + matches0 : i + matches0 + 1
].isalpha() and (
src[i + matches0] in rule[1:]
):
matches0 += 1
while rule and rule[0] != ')':
rule = rule[1:]
# if rule[0] == ')':
rule = rule[1:]
while rule and rule[0] == '-':
# "matches0" is NOT decremented
# because of
# "if (matches0 == matches)"
rule = rule[1:]
if rule and rule[0] == '<':
rule = rule[1:]
if rule and rule[0].isdigit():
priority0 = int(rule[0])
rule = rule[1:]
if (
not rule
or
# rule == '^' is not possible here
(
(rule[0] == '$')
and not src[
i + matches0 : i + matches0 + 1
].isalpha()
and (
src[
i
+ matches0 : i
+ matches0
+ 1
]
!= '.'
)
)
):
if matches0 == matches:
# this is only a partial string
pos0 += 3
continue
if priority0 < priority:
# priority is too low
pos0 += 3
continue
# continuation rule found
break
pos0 += 3
# end of "while"
if (priority0 >= priority) and (
(_phonet_rules[pos0] is not None)
and (_phonet_rules[pos0][0] == char0)
):
pos += 3
continue
# replace string
if _phonet_rules[pos] and (
'<' in _phonet_rules[pos][1:]
):
priority0 = 1
else:
priority0 = 0
rule = _phonet_rules[pos + mode]
if (priority0 == 1) and (zeta == 0):
# rule with '<' is applied
if (
(j > 0)
and rule
and (
(dest[j - 1] == char)
or (dest[j - 1] == rule[0])
)
):
j -= 1
zeta0 = 1
zeta += 1
matches0 = 0
while rule and src[i + matches0]:
src = (
src[0 : i + matches0]
+ rule[0]
+ src[i + matches0 + 1 :]
)
matches0 += 1
rule = rule[1:]
if matches0 < matches:
src = (
src[0 : i + matches0]
+ src[i + matches :]
)
char = src[i]
else:
i = i + matches - 1
zeta = 0
while len(rule) > 1:
if (j == 0) or (dest[j - 1] != rule[0]):
dest = (
dest[0:j]
+ rule[0]
+ dest[min(len(dest), j + 1) :]
)
j += 1
rule = rule[1:]
# new "current char"
if not rule:
rule = ''
char = ''
else:
char = rule[0]
if (
_phonet_rules[pos]
and '^^' in _phonet_rules[pos][1:]
):
if char:
dest = (
dest[0:j]
+ char
+ dest[min(len(dest), j + 1) :]
)
j += 1
src = src[i + 1 :]
i = 0
zeta0 = 1
break
pos += 3
if pos > end1 and start2 > 0:
pos = start2
start1 = start2
end1 = end2
start2 = -1
end2 = -1
if zeta0 == 0:
if char and ((j == 0) or (dest[j - 1] != char)):
# delete multiple letters only
dest = (
dest[0:j] + char + dest[min(j + 1, term_length) :]
)
j += 1
i += 1
zeta = 0
dest = dest[0:j]
return dest
_initialize_phonet(self._lang)
word = unicode_normalize('NFKC', word)
return _phonet(word, self._mode, self._lang)
@deprecated(
    deprecated_in='0.4.0',
    removed_in='0.6.0',
    current_version=__version__,
    details='Use the Phonet.encode method instead.',
)
def phonet(word, mode=1, lang='de'):
    """Return the phonet code for a word.

    This is a wrapper for :py:meth:`Phonet.encode`: it builds a
    :py:class:`Phonet` instance from ``mode`` and ``lang`` and encodes
    ``word`` with it. It is deprecated; use the class directly.

    Parameters
    ----------
    word : str
        The word to transform
    mode : int
        The phonet variant to employ (1 or 2)
    lang : str
        ``de`` (default) for German, ``none`` for no language

    Returns
    -------
    str
        The phonet value

    Examples
    --------
    >>> phonet('Christopher')
    'KRISTOFA'
    >>> phonet('Niall')
    'NIAL'
    >>> phonet('Smith')
    'SMIT'
    >>> phonet('Schmidt')
    'SHMIT'

    >>> phonet('Christopher', mode=2)
    'KRIZTUFA'
    >>> phonet('Niall', mode=2)
    'NIAL'
    >>> phonet('Smith', mode=2)
    'ZNIT'
    >>> phonet('Schmidt', mode=2)
    'ZNIT'

    >>> phonet('Christopher', lang='none')
    'CHRISTOPHER'
    >>> phonet('Niall', lang='none')
    'NIAL'
    >>> phonet('Smith', lang='none')
    'SMITH'
    >>> phonet('Schmidt', lang='none')
    'SCHMIDT'

    .. versionadded:: 0.1.0

    """
    # Positional order (mode, lang) matches the call in the original wrapper.
    return Phonet(mode, lang).encode(word)
if __name__ == '__main__':
    # When executed as a script, run the examples embedded in this module's
    # docstrings as doctests.
    from doctest import testmod

    testmod()