# -*- coding: utf-8 -*-
# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.phonetic.
The phonetic module implements phonetic algorithms including:
- Robert C. Russell's Index
- American Soundex
- Refined Soundex
- Daitch-Mokotoff Soundex
- Kölner Phonetik
- NYSIIS
- Match Rating Algorithm
- Metaphone
- Double Metaphone
- Caverphone
- Alpha Search Inquiry System
- Fuzzy Soundex
- Phonex
- Phonem
- Phonix
- SfinxBis
- phonet
- Standardized Phonetic Frequency Code
- Statistics Canada
- Lein
- Roger Root
- Oxford Name Compression Algorithm (ONCA)
- Eudex phonetic hash
- Haase Phonetik
- Reth-Schek Phonetik
- FONEM
- Parmar-Kumbharana
- Davidson's Consonant Code
- SoundD
- PSHP Soundex/Viewex Coding
- an early version of Henry Code
- Norphone
- Dolby Code
- Phonetic Spanish
- Spanish Metaphone
- MetaSoundex
- SoundexBR
- NRL English-to-phoneme
- Beider-Morse Phonetic Matching
"""
from __future__ import division, unicode_literals
from collections import Counter
from itertools import groupby, product
from re import compile as re_compile
from re import match as re_match
from unicodedata import normalize as unicode_normalize
from six import text_type
from six.moves import range
from ._bm import _bmpm
# Explicit public API of this module: one entry per phonetic algorithm
# (plus the numeric<->alphabetic conversion helpers for Russell and Kölner).
__all__ = ['alpha_sis', 'bmpm', 'caverphone', 'davidson', 'dm_soundex',
           'dolby', 'double_metaphone', 'eudex', 'fonem', 'fuzzy_soundex',
           'haase_phonetik', 'henry_early', 'koelner_phonetik',
           'koelner_phonetik_alpha', 'koelner_phonetik_num_to_alpha', 'lein',
           'metaphone', 'metasoundex', 'mra', 'norphone', 'nrl', 'nysiis',
           'onca', 'parmar_kumbharana', 'phonem', 'phonet', 'phonetic_spanish',
           'phonex', 'phonix', 'pshp_soundex_first', 'pshp_soundex_last',
           'refined_soundex', 'reth_schek_phonetik', 'roger_root',
           'russell_index', 'russell_index_alpha',
           'russell_index_num_to_alpha', 'sfinxbis', 'sound_d', 'soundex',
           'soundex_br', 'spanish_metaphone', 'spfc', 'statistics_canada']
def _delete_consecutive_repeats(word):
"""Delete consecutive repeated characters in a word.
:param str word: the word to transform
:returns: word with consecutive repeating characters collapsed to
a single instance
:rtype: str
>>> _delete_consecutive_repeats('REDDEE')
'REDE'
>>> _delete_consecutive_repeats('AEIOU')
'AEIOU'
>>> _delete_consecutive_repeats('AAACCCTTTGGG')
'ACTG'
"""
return ''.join(char for char, _ in groupby(word))
def russell_index(word):
    """Return the Russell Index (integer output) of a word.

    This follows Robert C. Russell's Index algorithm, as described in
    :cite:`Russell:1917`.

    :param str word: the word to transform
    :returns: the Russell Index value (or float('NaN') if the word encodes
        to an empty string)
    :rtype: int

    >>> russell_index('Christopher')
    3813428
    >>> russell_index('Niall')
    715
    >>> russell_index('Smith')
    3614
    >>> russell_index('Schmidt')
    3614
    """
    _russell_translation = dict(zip((ord(_) for _ in
                                     'ABCDEFGIKLMNOPQRSTUVXYZ'),
                                    '12341231356712383412313'))

    # uppercase, normalize, decompose, then apply Russell's discard rules
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = word.replace('GH', '')  # discard gh (rule 3)
    word = word.rstrip('SZ')  # discard /[sz]$/ (rule 3)

    # translate according to Russell's mapping
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'K', 'L', 'M',
                    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'X', 'Y',
                    'Z'})
    sdx = word.translate(_russell_translation)

    # remove any 1s after the first occurrence
    # (find returns -1 when absent, so `one` is falsy exactly then)
    one = sdx.find('1')+1
    if one:
        sdx = sdx[:one] + ''.join(c for c in sdx[one:] if c != '1')

    # remove repeating characters
    sdx = _delete_consecutive_repeats(sdx)

    # return as an int (NaN when nothing survived encoding)
    return int(sdx) if sdx else float('NaN')
def russell_index_num_to_alpha(num):
    """Convert the Russell Index integer to an alphabetic string.

    This follows Robert C. Russell's Index algorithm, as described in
    :cite:`Russell:1917`.

    :param int num: a Russell Index integer value
    :returns: the Russell Index as an alphabetic string
    :rtype: str

    >>> russell_index_num_to_alpha(3813428)
    'CRACDBR'
    >>> russell_index_num_to_alpha(715)
    'NAL'
    >>> russell_index_num_to_alpha(3614)
    'CMAD'
    """
    _russell_num_translation = dict(zip((ord(_) for _ in '12345678'),
                                        'ABCDLMNR'))
    # Strip anything that is not a valid Russell code digit (e.g. a
    # leading '-' or the characters of 'nan').
    num = ''.join(c for c in text_type(num) if c in {'1', '2', '3', '4', '5',
                                                     '6', '7', '8'})
    if num:
        return num.translate(_russell_num_translation)
    return ''
def russell_index_alpha(word):
    """Return the Russell Index (alphabetic output) for the word.

    This follows Robert C. Russell's Index algorithm, as described in
    :cite:`Russell:1917`.

    :param str word: the word to transform
    :returns: the Russell Index value as an alphabetic string
    :rtype: str

    >>> russell_index_alpha('Christopher')
    'CRACDBR'
    >>> russell_index_alpha('Niall')
    'NAL'
    >>> russell_index_alpha('Smith')
    'CMAD'
    >>> russell_index_alpha('Schmidt')
    'CMAD'
    """
    # Compose the numeric index with the numeric-to-alphabetic conversion.
    if word:
        return russell_index_num_to_alpha(russell_index(word))
    return ''
def soundex(word, max_length=4, var='American', reverse=False, zero_pad=True):
    """Return the Soundex code for a word.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 4)
    :param str var: the variant of the algorithm to employ (defaults to
        'American'):

        - 'American' follows the American Soundex algorithm, as described at
          :cite:`US:2007` and in :cite:`Knuth:1998`; this is also called
          Miracode
        - 'special' follows the rules from the 1880-1910 US Census
          retrospective re-analysis, in which h & w are not treated as
          blocking consonants but as vowels. Cf. :cite:`Repici:2013`.
        - 'Census' follows the rules laid out in GIL 55 :cite:`US:1997` by the
          US Census, including coding prefixed and unprefixed versions of some
          names; for such prefixed names a 2-tuple of codes is returned

    :param bool reverse: reverse the word before computing the selected
        Soundex (defaults to False); This results in "Reverse Soundex", which
        is useful for blocking in cases where the initial elements may be in
        error.
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        max_length string
    :returns: the Soundex value
    :rtype: str (or tuple of two str, for prefixed names with var='Census')

    >>> soundex("Christopher")
    'C623'
    >>> soundex("Niall")
    'N400'
    >>> soundex('Smith')
    'S530'
    >>> soundex('Schmidt')
    'S530'
    >>> soundex('Christopher', max_length=-1)
    'C623160000000000000000000000000000000000000000000000000000000000'
    >>> soundex('Christopher', max_length=-1, zero_pad=False)
    'C62316'
    >>> soundex('Christopher', reverse=True)
    'R132'
    >>> soundex('Ashcroft')
    'A261'
    >>> soundex('Asicroft')
    'A226'
    >>> soundex('Ashcroft', var='special')
    'A226'
    >>> soundex('Asicroft', var='special')
    'A226'
    """
    _soundex_translation = dict(zip((ord(_) for _ in
                                     'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                    '01230129022455012623019202'))

    # Require a max_length of at least 4 and not more than 64
    if max_length != -1:
        max_length = min(max(4, max_length), 64)
    else:
        max_length = 64

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    if var == 'Census':
        # TODO: Should these prefixes be supplemented? (VANDE, DELA, VON)
        if word[:3] in {'VAN', 'CON'} and len(word) > 4:
            return (soundex(word, max_length, 'American', reverse, zero_pad),
                    soundex(word[3:], max_length, 'American', reverse,
                            zero_pad))
        if word[:2] in {'DE', 'DI', 'LA', 'LE'} and len(word) > 3:
            return (soundex(word, max_length, 'American', reverse, zero_pad),
                    soundex(word[2:], max_length, 'American', reverse,
                            zero_pad))
        # Otherwise, proceed as usual (var='American' mode, ostensibly)

    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
                    'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
                    'W', 'X', 'Y', 'Z'})

    # Nothing to convert, return base case
    if not word:
        if zero_pad:
            return '0'*max_length
        return '0'

    # Reverse word if computing Reverse Soundex
    if reverse:
        word = word[::-1]

    # apply the Soundex algorithm
    sdx = word.translate(_soundex_translation)

    if var == 'special':
        sdx = sdx.replace('9', '0')  # special rule for 1880-1910 census
    else:
        sdx = sdx.replace('9', '')  # rule 1
    sdx = _delete_consecutive_repeats(sdx)  # rule 3

    # H and W are coded 9 (dropped above), so keep the original initial
    # letter in front of the full code in that case
    if word[0] in 'HW':
        sdx = word[0] + sdx
    else:
        sdx = word[0] + sdx[1:]
    sdx = sdx.replace('0', '')  # rule 1

    if zero_pad:
        sdx += ('0'*max_length)  # rule 4

    return sdx[:max_length]
def refined_soundex(word, max_length=-1, zero_pad=False,
                    retain_vowels=False):
    """Return the Refined Soundex code for a word.

    This is Soundex, but with more character classes. It was defined at
    :cite:`Boyce:1998`.

    :param word: the word to transform
    :param max_length: the length of the code returned (defaults to unlimited)
    :param zero_pad: pad the end of the return value with 0s to achieve a
        max_length string
    :param retain_vowels: retain vowels (as 0) in the resulting code
    :returns: the Refined Soundex value
    :rtype: str

    >>> refined_soundex('Christopher')
    'C393619'
    >>> refined_soundex('Niall')
    'N87'
    >>> refined_soundex('Smith')
    'S386'
    >>> refined_soundex('Schmidt')
    'S386'
    """
    _ref_soundex_translation = dict(zip((ord(_) for _ in
                                         'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                        '01360240043788015936020505'))

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
                    'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
                    'W', 'X', 'Y', 'Z'})

    # apply the Soundex algorithm: initial letter is kept verbatim,
    # the whole word (including the initial) is coded after it
    sdx = word[:1] + word.translate(_ref_soundex_translation)
    sdx = _delete_consecutive_repeats(sdx)
    if not retain_vowels:
        sdx = sdx.replace('0', '')  # Delete vowels, H, W, Y

    if max_length > 0:
        if zero_pad:
            sdx += ('0' * max_length)
        sdx = sdx[:max_length]

    return sdx
def dm_soundex(word, max_length=6, zero_pad=True):
    """Return the Daitch-Mokotoff Soundex code for a word.

    Based on Daitch-Mokotoff Soundex :cite:`Mokotoff:1997`, this returns
    values of a word as a set. A collection is necessary since there can be
    multiple values for a single word.

    :param word: the word to transform
    :param max_length: the length of the code returned (defaults to 6; must be
        between 6 and 64)
    :param zero_pad: pad the end of the return value with 0s to achieve a
        max_length string
    :returns: the Daitch-Mokotoff Soundex values
    :rtype: set

    >>> sorted(dm_soundex('Christopher'))
    ['494379', '594379']
    >>> dm_soundex('Niall')
    {'680000'}
    >>> dm_soundex('Smith')
    {'463000'}
    >>> dm_soundex('Schmidt')
    {'463000'}

    >>> sorted(dm_soundex('The quick brown fox', max_length=20,
    ... zero_pad=False))
    ['35457976754', '3557976754']
    """
    # Each entry maps a substring to a triple of codes:
    # (word-initial, before a vowel, elsewhere); '_' marks "not coded",
    # and a tuple value means the substring codes ambiguously.
    _dms_table = {'STCH': (2, 4, 4), 'DRZ': (4, 4, 4), 'ZH': (4, 4, 4),
                  'ZHDZH': (2, 4, 4), 'DZH': (4, 4, 4), 'DRS': (4, 4, 4),
                  'DZS': (4, 4, 4), 'SCHTCH': (2, 4, 4), 'SHTSH': (2, 4, 4),
                  'SZCZ': (2, 4, 4), 'TZS': (4, 4, 4), 'SZCS': (2, 4, 4),
                  'STSH': (2, 4, 4), 'SHCH': (2, 4, 4), 'D': (3, 3, 3),
                  'H': (5, 5, '_'), 'TTSCH': (4, 4, 4), 'THS': (4, 4, 4),
                  'L': (8, 8, 8), 'P': (7, 7, 7), 'CHS': (5, 54, 54),
                  'T': (3, 3, 3), 'X': (5, 54, 54), 'OJ': (0, 1, '_'),
                  'OI': (0, 1, '_'), 'SCHTSH': (2, 4, 4), 'OY': (0, 1, '_'),
                  'Y': (1, '_', '_'), 'TSH': (4, 4, 4), 'ZDZ': (2, 4, 4),
                  'TSZ': (4, 4, 4), 'SHT': (2, 43, 43),
                  'SCHTSCH': (2, 4, 4),
                  'TTSZ': (4, 4, 4), 'TTZ': (4, 4, 4), 'SCH': (4, 4, 4),
                  'TTS': (4, 4, 4), 'SZD': (2, 43, 43), 'AI': (0, 1, '_'),
                  'PF': (7, 7, 7), 'TCH': (4, 4, 4), 'PH': (7, 7, 7),
                  'TTCH': (4, 4, 4), 'SZT': (2, 43, 43), 'ZDZH': (2, 4, 4),
                  'EI': (0, 1, '_'), 'G': (5, 5, 5), 'EJ': (0, 1, '_'),
                  'ZD': (2, 43, 43), 'IU': (1, '_', '_'), 'K': (5, 5, 5),
                  'O': (0, '_', '_'), 'SHTCH': (2, 4, 4), 'S': (4, 4, 4),
                  'TRZ': (4, 4, 4), 'SHD': (2, 43, 43), 'DSH': (4, 4, 4),
                  'CSZ': (4, 4, 4), 'EU': (1, 1, '_'), 'TRS': (4, 4, 4),
                  'ZS': (4, 4, 4), 'STRZ': (2, 4, 4), 'UY': (0, 1, '_'),
                  'STRS': (2, 4, 4), 'CZS': (4, 4, 4),
                  'MN': ('6_6', '6_6', '6_6'), 'UI': (0, 1, '_'),
                  'UJ': (0, 1, '_'), 'UE': (0, '_', '_'), 'EY': (0, 1, '_'),
                  'W': (7, 7, 7), 'IA': (1, '_', '_'), 'FB': (7, 7, 7),
                  'STSCH': (2, 4, 4), 'SCHT': (2, 43, 43),
                  'NM': ('6_6', '6_6', '6_6'), 'SCHD': (2, 43, 43),
                  'B': (7, 7, 7), 'DSZ': (4, 4, 4), 'F': (7, 7, 7),
                  'N': (6, 6, 6), 'CZ': (4, 4, 4), 'R': (9, 9, 9),
                  'U': (0, '_', '_'), 'V': (7, 7, 7), 'CS': (4, 4, 4),
                  'Z': (4, 4, 4), 'SZ': (4, 4, 4), 'TSCH': (4, 4, 4),
                  'KH': (5, 5, 5), 'ST': (2, 43, 43), 'KS': (5, 54, 54),
                  'SH': (4, 4, 4), 'SC': (2, 4, 4), 'SD': (2, 43, 43),
                  'DZ': (4, 4, 4), 'ZHD': (2, 43, 43), 'DT': (3, 3, 3),
                  'ZSH': (4, 4, 4), 'DS': (4, 4, 4), 'TZ': (4, 4, 4),
                  'TS': (4, 4, 4), 'TH': (3, 3, 3), 'TC': (4, 4, 4),
                  'A': (0, '_', '_'), 'E': (0, '_', '_'),
                  'I': (0, '_', '_'),
                  'AJ': (0, 1, '_'), 'M': (6, 6, 6), 'Q': (5, 5, 5),
                  'AU': (0, 7, '_'), 'IO': (1, '_', '_'), 'AY': (0, 1, '_'),
                  'IE': (1, '_', '_'), 'ZSCH': (4, 4, 4),
                  'CH': ((5, 4), (5, 4), (5, 4)),
                  'CK': ((5, 45), (5, 45), (5, 45)),
                  'C': ((5, 4), (5, 4), (5, 4)),
                  'J': ((1, 4), ('_', 4), ('_', 4)),
                  'RZ': ((94, 4), (94, 4), (94, 4)),
                  'RS': ((94, 4), (94, 4), (94, 4))}

    # For each initial letter, the candidate substrings in the order they
    # must be tried (longest/most-specific first).
    _dms_order = {'A': ('AI', 'AJ', 'AU', 'AY', 'A'),
                  'B': ('B',),
                  'C': ('CHS', 'CSZ', 'CZS', 'CH', 'CK', 'CS', 'CZ', 'C'),
                  'D': ('DRS', 'DRZ', 'DSH', 'DSZ', 'DZH', 'DZS', 'DS', 'DT',
                        'DZ', 'D'),
                  'E': ('EI', 'EJ', 'EU', 'EY', 'E'),
                  'F': ('FB', 'F'),
                  'G': ('G',),
                  'H': ('H',),
                  'I': ('IA', 'IE', 'IO', 'IU', 'I'),
                  'J': ('J',),
                  'K': ('KH', 'KS', 'K'),
                  'L': ('L',),
                  'M': ('MN', 'M'),
                  'N': ('NM', 'N'),
                  'O': ('OI', 'OJ', 'OY', 'O'),
                  'P': ('PF', 'PH', 'P'),
                  'Q': ('Q',),
                  'R': ('RS', 'RZ', 'R'),
                  'S': ('SCHTSCH', 'SCHTCH', 'SCHTSH', 'SHTCH', 'SHTSH',
                        'STSCH', 'SCHD', 'SCHT', 'SHCH', 'STCH', 'STRS',
                        'STRZ', 'STSH', 'SZCS', 'SZCZ', 'SCH', 'SHD', 'SHT',
                        'SZD', 'SZT', 'SC', 'SD', 'SH', 'ST', 'SZ', 'S'),
                  'T': ('TTSCH', 'TSCH', 'TTCH', 'TTSZ', 'TCH', 'THS', 'TRS',
                        'TRZ', 'TSH', 'TSZ', 'TTS', 'TTZ', 'TZS', 'TC', 'TH',
                        'TS', 'TZ', 'T'),
                  'U': ('UE', 'UI', 'UJ', 'UY', 'U'),
                  'V': ('V',),
                  'W': ('W',),
                  'X': ('X',),
                  'Y': ('Y',),
                  'Z': ('ZHDZH', 'ZDZH', 'ZSCH', 'ZDZ', 'ZHD', 'ZSH', 'ZD',
                        'ZH', 'ZS', 'Z')}

    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}
    dms = ['']  # initialize empty code list

    # Require a max_length of at least 6 and not more than 64
    if max_length != -1:
        max_length = min(max(6, max_length), 64)
    else:
        max_length = 64

    # uppercase, normalize, decompose, and filter non-A-Z
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
                    'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
                    'W', 'X', 'Y', 'Z'})

    # Nothing to convert, return base case
    if not word:
        if zero_pad:
            return {'0'*max_length}
        return {'0'}

    pos = 0
    while pos < len(word):
        # Iterate through _dms_order, which specifies the possible substrings
        # for which codes exist in the Daitch-Mokotoff coding
        for sstr in _dms_order[word[pos]]:  # pragma: no branch
            if word[pos:].startswith(sstr):
                # Having determined a valid substring start, retrieve the code
                dm_val = _dms_table[sstr]

                # Having retrieved the code (triple), determine the correct
                # positional variant (first, pre-vocalic, elsewhere)
                if pos == 0:
                    dm_val = dm_val[0]
                elif (pos+len(sstr) < len(word) and
                      word[pos+len(sstr)] in _vowels):
                    dm_val = dm_val[1]
                else:
                    dm_val = dm_val[2]

                # Build the code strings; an ambiguous (tuple) code forks
                # every partial code built so far
                if isinstance(dm_val, tuple):
                    dms = [_ + text_type(dm_val[0]) for _ in dms] \
                            + [_ + text_type(dm_val[1]) for _ in dms]
                else:
                    dms = [_ + text_type(dm_val) for _ in dms]
                pos += len(sstr)
                break

    # Filter out double letters and _ placeholders
    dms = (''.join(c for c in _delete_consecutive_repeats(_) if c != '_')
           for _ in dms)

    # Trim codes and return set
    if zero_pad:
        dms = ((_ + ('0'*max_length))[:max_length] for _ in dms)
    else:
        dms = (_[:max_length] for _ in dms)
    return set(dms)
def koelner_phonetik(word):
    """Return the Kölner Phonetik (numeric output) code for a word.

    Based on the algorithm defined by :cite:`Postel:1969`.

    While the output code is numeric, it is still a str because 0s can lead
    the code.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as a numeric string
    :rtype: str

    >>> koelner_phonetik('Christopher')
    '478237'
    >>> koelner_phonetik('Niall')
    '65'
    >>> koelner_phonetik('Smith')
    '862'
    >>> koelner_phonetik('Schmidt')
    '862'
    >>> koelner_phonetik('Müller')
    '657'
    >>> koelner_phonetik('Zimmermann')
    '86766'
    """
    def _after(word, pos, letters):
        """Return True if word[pos] follows one of the supplied letters."""
        return pos > 0 and word[pos-1] in letters

    def _before(word, pos, letters):
        """Return True if word[pos] precedes one of the supplied letters."""
        return pos+1 < len(word) and word[pos+1] in letters

    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}

    sdx = ''

    # uppercase, normalize, expand umlauts/ß, and filter non-A-Z out
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
                    'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
                    'W', 'X', 'Y', 'Z'})

    # Nothing to convert, return base case
    if not word:
        return sdx

    # Code each letter, with the context-sensitive rules of the algorithm
    # (H is not coded at all; that is why it has no branch below)
    for i in range(len(word)):
        if word[i] in _vowels:
            sdx += '0'
        elif word[i] == 'B':
            sdx += '1'
        elif word[i] == 'P':
            if _before(word, i, {'H'}):
                sdx += '3'
            else:
                sdx += '1'
        elif word[i] in {'D', 'T'}:
            if _before(word, i, {'C', 'S', 'Z'}):
                sdx += '8'
            else:
                sdx += '2'
        elif word[i] in {'F', 'V', 'W'}:
            sdx += '3'
        elif word[i] in {'G', 'K', 'Q'}:
            sdx += '4'
        elif word[i] == 'C':
            if _after(word, i, {'S', 'Z'}):
                sdx += '8'
            elif i == 0:
                if _before(word, i, {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U',
                                     'X'}):
                    sdx += '4'
                else:
                    sdx += '8'
            elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
                sdx += '4'
            else:
                sdx += '8'
        elif word[i] == 'X':
            if _after(word, i, {'C', 'K', 'Q'}):
                sdx += '8'
            else:
                sdx += '48'
        elif word[i] == 'L':
            sdx += '5'
        elif word[i] in {'M', 'N'}:
            sdx += '6'
        elif word[i] == 'R':
            sdx += '7'
        elif word[i] in {'S', 'Z'}:
            sdx += '8'

    sdx = _delete_consecutive_repeats(sdx)

    # Drop all 0s except in the leading position
    if sdx:
        sdx = sdx[:1] + sdx[1:].replace('0', '')

    return sdx
def koelner_phonetik_num_to_alpha(num):
    """Convert a Kölner Phonetik code from numeric to alphabetic.

    :param str num: a numeric Kölner Phonetik representation (can be a str or
        an int)
    :returns: an alphabetic representation of the same word
    :rtype: str

    >>> koelner_phonetik_num_to_alpha('862')
    'SNT'
    >>> koelner_phonetik_num_to_alpha('657')
    'NLR'
    >>> koelner_phonetik_num_to_alpha('86766')
    'SNRNN'
    """
    _koelner_num_translation = dict(zip((ord(_) for _ in '012345678'),
                                        'APTFKLNRS'))
    # Discard anything that is not a valid Kölner Phonetik digit
    num = ''.join(c for c in text_type(num) if c in {'0', '1', '2', '3', '4',
                                                     '5', '6', '7', '8'})
    return num.translate(_koelner_num_translation)
def koelner_phonetik_alpha(word):
    """Return the Kölner Phonetik (alphabetic output) code for a word.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as an alphabetic string
    :rtype: str

    >>> koelner_phonetik_alpha('Smith')
    'SNT'
    >>> koelner_phonetik_alpha('Schmidt')
    'SNT'
    >>> koelner_phonetik_alpha('Müller')
    'NLR'
    >>> koelner_phonetik_alpha('Zimmermann')
    'SNRNN'
    """
    # Compose the numeric coding with the numeric-to-alphabetic conversion.
    return koelner_phonetik_num_to_alpha(koelner_phonetik(word))
def nysiis(word, max_length=6, modified=False):
    """Return the NYSIIS code for a word.

    The New York State Identification and Intelligence System algorithm is
    defined in :cite:`Taft:1970`.

    The modified version of this algorithm is described in Appendix B of
    :cite:`Lynch:1977`.

    :param str word: the word to transform
    :param int max_length: the maximum length (default 6) of the code to
        return
    :param bool modified: indicates whether to use USDA modified NYSIIS
    :returns: the NYSIIS value
    :rtype: str

    >>> nysiis('Christopher')
    'CRASTA'
    >>> nysiis('Niall')
    'NAL'
    >>> nysiis('Smith')
    'SNAT'
    >>> nysiis('Schmidt')
    'SNAD'

    >>> nysiis('Christopher', max_length=-1)
    'CRASTAFAR'

    >>> nysiis('Christopher', max_length=8, modified=True)
    'CRASTAFA'
    >>> nysiis('Niall', max_length=8, modified=True)
    'NAL'
    >>> nysiis('Smith', max_length=8, modified=True)
    'SNAT'
    >>> nysiis('Schmidt', max_length=8, modified=True)
    'SNAD'
    """
    # Require a max_length of at least 6 (max_length == -1 means unlimited)
    if max_length > -1:
        max_length = max(6, max_length)

    _vowels = {'A', 'E', 'I', 'O', 'U'}

    word = ''.join(c for c in word.upper() if c.isalpha())
    word = word.replace('ß', 'SS')

    # exit early if there are no alphas
    if not word:
        return ''

    original_first_char = word[0]

    # Word-initial transformations
    if word[:3] == 'MAC':
        word = 'MCC'+word[3:]
    elif word[:2] == 'KN':
        word = 'NN'+word[2:]
    elif word[:1] == 'K':
        word = 'C'+word[1:]
    elif word[:2] in {'PH', 'PF'}:
        word = 'FF'+word[2:]
    elif word[:3] == 'SCH':
        word = 'SSS'+word[3:]
    elif modified:
        if word[:2] == 'WR':
            word = 'RR'+word[2:]
        elif word[:2] == 'RH':
            word = 'RR'+word[2:]
        elif word[:2] == 'DG':
            word = 'GG'+word[2:]
        elif word[:1] in _vowels:
            word = 'A'+word[1:]

    # Word-final transformations
    if modified and word[-1:] in {'S', 'Z'}:
        word = word[:-1]

    if word[-2:] == 'EE' or word[-2:] == 'IE' or (modified and
                                                  word[-2:] == 'YE'):
        word = word[:-2]+'Y'
    elif word[-2:] in {'DT', 'RT', 'RD'}:
        word = word[:-2]+'D'
    elif word[-2:] in {'NT', 'ND'}:
        word = word[:-2]+('N' if modified else 'D')
    elif modified:
        if word[-2:] == 'IX':
            word = word[:-2]+'ICK'
        elif word[-2:] == 'EX':
            word = word[:-2]+'ECK'
        elif word[-2:] in {'JR', 'SR'}:
            return 'ERROR'

    key = word[:1]

    # Main loop: rewrite the word in place and accumulate the key,
    # skipping positions already consumed by a multi-character rule
    skip = 0
    for i in range(1, len(word)):
        if i >= len(word):
            continue
        elif skip:
            skip -= 1
            continue
        elif word[i:i+2] == 'EV':
            word = word[:i] + 'AF' + word[i+2:]
            skip = 1
        elif word[i] in _vowels:
            word = word[:i] + 'A' + word[i+1:]
        elif modified and i != len(word)-1 and word[i] == 'Y':
            word = word[:i] + 'A' + word[i+1:]
        elif word[i] == 'Q':
            word = word[:i] + 'G' + word[i+1:]
        elif word[i] == 'Z':
            word = word[:i] + 'S' + word[i+1:]
        elif word[i] == 'M':
            word = word[:i] + 'N' + word[i+1:]
        elif word[i:i+2] == 'KN':
            word = word[:i] + 'N' + word[i+2:]
        elif word[i] == 'K':
            word = word[:i] + 'C' + word[i+1:]
        elif modified and i == len(word)-3 and word[i:i+3] == 'SCH':
            word = word[:i] + 'SSA'
            skip = 2
        elif word[i:i+3] == 'SCH':
            word = word[:i] + 'SSS' + word[i+3:]
            skip = 2
        elif modified and i == len(word)-2 and word[i:i+2] == 'SH':
            word = word[:i] + 'SA'
            skip = 1
        elif word[i:i+2] == 'SH':
            word = word[:i] + 'SS' + word[i+2:]
            skip = 1
        elif word[i:i+2] == 'PH':
            word = word[:i] + 'FF' + word[i+2:]
            skip = 1
        elif modified and word[i:i+3] == 'GHT':
            word = word[:i] + 'TTT' + word[i+3:]
            skip = 2
        elif modified and word[i:i+2] == 'DG':
            word = word[:i] + 'GG' + word[i+2:]
            skip = 1
        elif modified and word[i:i+2] == 'WR':
            word = word[:i] + 'RR' + word[i+2:]
            skip = 1
        elif word[i] == 'H' and (word[i-1] not in _vowels or
                                 word[i+1:i+2] not in _vowels):
            word = word[:i] + word[i-1] + word[i+1:]
        elif word[i] == 'W' and word[i-1] in _vowels:
            word = word[:i] + word[i-1] + word[i+1:]

        # append the (possibly multi-character) result of this position,
        # unless it repeats the last character of the key
        if word[i:i+skip+1] != key[-1:]:
            key += word[i:i+skip+1]

    # collapse runs of identical characters
    key = ''.join(c for c, _ in groupby(key))

    # Key-final transformations
    if key[-1:] == 'S':
        key = key[:-1]
    if key[-2:] == 'AY':
        key = key[:-2] + 'Y'
    if key[-1:] == 'A':
        key = key[:-1]
    if modified and key[:1] == 'A':
        key = original_first_char + key[1:]

    if max_length > 0:
        key = key[:max_length]

    return key
def mra(word):
    """Return the MRA personal numeric identifier (PNI) for a word.

    A description of the Western Airlines Surname Match Rating Algorithm can
    be found on page 18 of :cite:`Moore:1977`.

    :param str word: the word to transform
    :returns: the MRA PNI
    :rtype: str

    >>> mra('Christopher')
    'CHRPHR'
    >>> mra('Niall')
    'NL'
    >>> mra('Smith')
    'SMTH'
    >>> mra('Schmidt')
    'SCHMDT'
    """
    # An empty input encodes to itself
    if not word:
        return word

    vowels = {'A', 'E', 'I', 'O', 'U'}
    upper = word.upper().replace('ß', 'SS')

    # Keep the first letter; drop vowels from the remainder
    stripped = upper[0] + ''.join(ch for ch in upper[1:] if ch not in vowels)

    # Collapse runs of repeated characters
    pni = ''.join(ch for ch, _run in groupby(stripped))

    # Long results keep only the first three and last three characters
    if len(pni) > 6:
        pni = pni[:3] + pni[-3:]
    return pni
def caverphone(word, version=2):
    """Return the Caverphone code for a word.

    A description of version 1 of the algorithm can be found in
    :cite:`Hood:2002`.

    A description of version 2 of the algorithm can be found in
    :cite:`Hood:2004`.

    :param str word: the word to transform
    :param int version: the version of Caverphone to employ for encoding
        (defaults to 2)
    :returns: the Caverphone value
    :rtype: str

    >>> caverphone('Christopher')
    'KRSTFA1111'
    >>> caverphone('Niall')
    'NA11111111'
    >>> caverphone('Smith')
    'SMT1111111'
    >>> caverphone('Schmidt')
    'SKMT111111'

    >>> caverphone('Christopher', 1)
    'KRSTF1'
    >>> caverphone('Niall', 1)
    'N11111'
    >>> caverphone('Smith', 1)
    'SMT111'
    >>> caverphone('Schmidt', 1)
    'SKMT11'
    """
    _vowels = {'a', 'e', 'i', 'o', 'u'}

    word = word.lower()
    word = ''.join(c for c in word if c in
                   {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
                    'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
                    'w', 'x', 'y', 'z'})

    def _squeeze_replace(word, char, new_char):
        """Convert strings of char in word to one instance of new_char."""
        while char * 2 in word:
            word = word.replace(char * 2, char)
        return word.replace(char, new_char)

    # the main replacement algorithm
    if version != 1 and word[-1:] == 'e':
        word = word[:-1]
    if word:
        # word-initial and word-final special cases
        if word[:5] == 'cough':
            word = 'cou2f'+word[5:]
        if word[:5] == 'rough':
            word = 'rou2f'+word[5:]
        if word[:5] == 'tough':
            word = 'tou2f'+word[5:]
        if word[:6] == 'enough':
            word = 'enou2f'+word[6:]
        if version != 1 and word[:6] == 'trough':
            word = 'trou2f'+word[6:]
        if word[:2] == 'gn':
            word = '2n'+word[2:]
        if word[-2:] == 'mb':
            word = word[:-1]+'2'

        # general consonant rewriting ('2' marks deletions)
        word = word.replace('cq', '2q')
        word = word.replace('ci', 'si')
        word = word.replace('ce', 'se')
        word = word.replace('cy', 'sy')
        word = word.replace('tch', '2ch')
        word = word.replace('c', 'k')
        word = word.replace('q', 'k')
        word = word.replace('x', 'k')
        word = word.replace('v', 'f')
        word = word.replace('dg', '2g')
        word = word.replace('tio', 'sio')
        word = word.replace('tia', 'sia')
        word = word.replace('d', 't')
        word = word.replace('ph', 'fh')
        word = word.replace('b', 'p')
        word = word.replace('sh', 's2')
        word = word.replace('z', 's')

        # vowels: initial vowel becomes 'A', others '3'
        if word[0] in _vowels:
            word = 'A'+word[1:]
        word = word.replace('a', '3')
        word = word.replace('e', '3')
        word = word.replace('i', '3')
        word = word.replace('o', '3')
        word = word.replace('u', '3')

        if version != 1:
            word = word.replace('j', 'y')
            if word[:2] == 'y3':
                word = 'Y3'+word[2:]
            if word[:1] == 'y':
                word = 'A'+word[1:]
            word = word.replace('y', '3')
        word = word.replace('3gh3', '3kh3')
        word = word.replace('gh', '22')
        word = word.replace('g', 'k')

        # collapse consonant runs to single uppercase codes
        word = _squeeze_replace(word, 's', 'S')
        word = _squeeze_replace(word, 't', 'T')
        word = _squeeze_replace(word, 'p', 'P')
        word = _squeeze_replace(word, 'k', 'K')
        word = _squeeze_replace(word, 'f', 'F')
        word = _squeeze_replace(word, 'm', 'M')
        word = _squeeze_replace(word, 'n', 'N')

        # w, h, r, l are kept only pre-vocalically (dropped otherwise)
        word = word.replace('w3', 'W3')
        if version == 1:
            word = word.replace('wy', 'Wy')
        word = word.replace('wh3', 'Wh3')
        if version == 1:
            word = word.replace('why', 'Why')
        if version != 1 and word[-1:] == 'w':
            word = word[:-1]+'3'
        word = word.replace('w', '2')

        if word[:1] == 'h':
            word = 'A'+word[1:]
        word = word.replace('h', '2')

        word = word.replace('r3', 'R3')
        if version == 1:
            word = word.replace('ry', 'Ry')
        if version != 1 and word[-1:] == 'r':
            word = word[:-1]+'3'
        word = word.replace('r', '2')

        word = word.replace('l3', 'L3')
        if version == 1:
            word = word.replace('ly', 'Ly')
        if version != 1 and word[-1:] == 'l':
            word = word[:-1]+'3'
        word = word.replace('l', '2')

        if version == 1:
            word = word.replace('j', 'y')
            word = word.replace('y3', 'Y3')
            word = word.replace('y', '2')

        # drop deletion markers; keep a final vowel as 'A' in version 2
        word = word.replace('2', '')
        if version != 1 and word[-1:] == '3':
            word = word[:-1]+'A'
        word = word.replace('3', '')

    # pad with 1s, then extract the necessary length of code
    word += '1'*10
    if version != 1:
        word = word[:10]
    else:
        word = word[:6]

    return word
def alpha_sis(word, max_length=14):
    """Return the IBM Alpha Search Inquiry System code for a word.

    The Alpha Search Inquiry System code is defined in :cite:`IBM:1973`.
    This implementation is based on the description in :cite:`Moore:1977`.

    A collection is necessary since there can be multiple values for a
    single word. But the collection must be ordered since the first value
    is the primary coding.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 14)
    :returns: the Alpha SIS value
    :rtype: tuple

    >>> alpha_sis('Christopher')
    ('06401840000000', '07040184000000', '04018400000000')
    >>> alpha_sis('Niall')
    ('02500000000000',)
    >>> alpha_sis('Smith')
    ('03100000000000',)
    >>> alpha_sis('Schmidt')
    ('06310000000000',)
    """
    # Codes for word-initial substrings, with the order they must be tried
    _alpha_sis_initials = {'GF': '08', 'GM': '03', 'GN': '02', 'KN': '02',
                           'PF': '08', 'PN': '02', 'PS': '00', 'WR': '04',
                           'A': '1', 'E': '1', 'H': '2', 'I': '1', 'J': '3',
                           'O': '1', 'U': '1', 'W': '4', 'Y': '5'}
    _alpha_sis_initials_order = ('GF', 'GM', 'GN', 'KN', 'PF', 'PN', 'PS',
                                 'WR', 'A', 'E', 'H', 'I', 'J', 'O', 'U',
                                 'W', 'Y')
    # Codes for the rest of the word; tuple values fork into alternatives
    _alpha_sis_basic = {'SCH': '6', 'CZ': ('70', '6', '0'),
                        'CH': ('6', '70', '0'), 'CK': ('7', '6'),
                        'DS': ('0', '10'), 'DZ': ('0', '10'),
                        'TS': ('0', '10'), 'TZ': ('0', '10'), 'CI': '0',
                        'CY': '0', 'CE': '0', 'SH': '6', 'DG': '7',
                        'PH': '8',
                        'C': ('7', '6'), 'K': ('7', '6'), 'Z': '0',
                        'S': '0',
                        'D': '1', 'T': '1', 'N': '2', 'M': '3', 'R': '4',
                        'L': '5', 'J': '6', 'G': '7', 'Q': '7', 'X': '7',
                        'F': '8', 'V': '8', 'B': '9', 'P': '9'}
    _alpha_sis_basic_order = ('SCH', 'CZ', 'CH', 'CK', 'DS', 'DZ', 'TS',
                              'TZ', 'CI', 'CY', 'CE', 'SH', 'DG', 'PH', 'C',
                              'K', 'Z', 'S', 'D', 'T', 'N', 'M', 'R', 'L',
                              'J', 'C', 'G', 'K', 'Q', 'X', 'F', 'V', 'B',
                              'P')

    alpha = ['']
    pos = 0

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
                    'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
                    'W', 'X', 'Y', 'Z'})

    # Clamp max_length to [4, 64]
    if max_length != -1:
        max_length = min(max(4, max_length), 64)
    else:
        max_length = 64

    # Do special processing for initial substrings
    for k in _alpha_sis_initials_order:
        if word.startswith(k):
            alpha[0] += _alpha_sis_initials[k]
            pos += len(k)
            break

    # Add a '0' if alpha is still empty
    if not alpha[0]:
        alpha[0] += '0'

    # Whether or not any special initial codes were encoded, iterate
    # through the length of the word in the main encoding loop
    while pos < len(word):
        orig_pos = pos
        for k in _alpha_sis_basic_order:
            if word[pos:].startswith(k):
                if isinstance(_alpha_sis_basic[k], tuple):
                    # ambiguous code: fork every partial code built so far
                    newalpha = []
                    for i in range(len(_alpha_sis_basic[k])):
                        newalpha += [_ + _alpha_sis_basic[k][i]
                                     for _ in alpha]
                    alpha = newalpha
                else:
                    alpha = [_ + _alpha_sis_basic[k] for _ in alpha]
                pos += len(k)
                break
        if pos == orig_pos:
            # uncodable character: placeholder, removed below
            alpha = [_ + '_' for _ in alpha]
            pos += 1

    # Trim doublets and placeholders
    for i in range(len(alpha)):
        pos = 1
        while pos < len(alpha[i]):
            if alpha[i][pos] == alpha[i][pos-1]:
                alpha[i] = alpha[i][:pos]+alpha[i][pos+1:]
            pos += 1
    alpha = (_.replace('_', '') for _ in alpha)

    # Trim codes and return tuple
    alpha = ((_ + ('0'*max_length))[:max_length] for _ in alpha)
    return tuple(alpha)
def fuzzy_soundex(word, max_length=5, zero_pad=True):
    """Return the Fuzzy Soundex code for a word.

    Fuzzy Soundex is an algorithm derived from Soundex, defined in
    :cite:`Holmes:2002`.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 5)
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a max_length string
    :returns: the Fuzzy Soundex value
    :rtype: str

    >>> fuzzy_soundex('Christopher')
    'K6931'
    >>> fuzzy_soundex('Niall')
    'N4000'
    >>> fuzzy_soundex('Smith')
    'S5300'
    >>> fuzzy_soundex('Schmidt')
    'S5300'
    """
    # '-' marks letters that are dropped before collapsing repeats
    _fuzzy_soundex_translation = dict(zip((ord(_) for _ in
                                           'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                          '0193017-07745501769301-7-9'))

    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    # Clamp max_length to [4, 64]
    if max_length != -1:
        max_length = min(max(4, max_length), 64)
    else:
        max_length = 64

    if not word:
        if zero_pad:
            return '0' * max_length
        return '0'

    # word-initial substitutions
    if word[:2] in {'CS', 'CZ', 'TS', 'TZ'}:
        word = 'SS' + word[2:]
    elif word[:2] == 'GN':
        word = 'NN' + word[2:]
    elif word[:2] in {'HR', 'WR'}:
        word = 'RR' + word[2:]
    elif word[:2] == 'HW':
        word = 'WW' + word[2:]
    elif word[:2] in {'KN', 'NG'}:
        word = 'NN' + word[2:]

    # word-final substitutions
    if word[-2:] == 'CH':
        word = word[:-2] + 'KK'
    elif word[-2:] == 'NT':
        word = word[:-2] + 'TT'
    elif word[-2:] == 'RT':
        word = word[:-2] + 'RR'
    elif word[-3:] == 'RDT':
        word = word[:-3] + 'RR'

    # general substring substitutions
    word = word.replace('CA', 'KA')
    word = word.replace('CC', 'KK')
    word = word.replace('CK', 'KK')
    word = word.replace('CE', 'SE')
    word = word.replace('CHL', 'KL')
    word = word.replace('CL', 'KL')
    word = word.replace('CHR', 'KR')
    word = word.replace('CR', 'KR')
    word = word.replace('CI', 'SI')
    word = word.replace('CO', 'KO')
    word = word.replace('CU', 'KU')
    word = word.replace('CY', 'SY')
    word = word.replace('DG', 'GG')
    word = word.replace('GH', 'HH')
    word = word.replace('MAC', 'MK')
    word = word.replace('MC', 'MK')
    word = word.replace('NST', 'NSS')
    word = word.replace('PF', 'FF')
    word = word.replace('PH', 'FF')
    word = word.replace('SCH', 'SSS')
    word = word.replace('TIO', 'SIO')
    word = word.replace('TIA', 'SIO')
    word = word.replace('TCH', 'CHH')

    sdx = word.translate(_fuzzy_soundex_translation)
    sdx = sdx.replace('-', '')

    # remove repeating characters
    sdx = _delete_consecutive_repeats(sdx)

    # keep the initial letter; H, W, Y stand in front of their own codes
    if word[0] in {'H', 'W', 'Y'}:
        sdx = word[0] + sdx
    else:
        sdx = word[0] + sdx[1:]

    sdx = sdx.replace('0', '')

    if zero_pad:
        sdx += ('0'*max_length)

    return sdx[:max_length]
def phonex(word, max_length=4, zero_pad=True):
    """Return the Phonex code for a word.

    Phonex is an algorithm derived from Soundex, defined in :cite:`Lait:1996`.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 4)
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a max_length string
    :returns: the Phonex value
    :rtype: str

    >>> phonex('Christopher')
    'C623'
    >>> phonex('Niall')
    'N400'
    >>> phonex('Schmidt')
    'S253'
    >>> phonex('Smith')
    'S530'
    """
    # Uppercase and decompose accented characters so the base A-Z remain
    name = unicode_normalize('NFKD', text_type(word.upper()))
    name = name.replace('ß', 'SS')

    # Clamp max_length to [4, 64]
    if max_length != -1:
        max_length = min(max(4, max_length), 64)
    else:
        max_length = 64

    name_code = last = ''

    # Deletions effected by replacing with next letter which
    # will be ignored due to duplicate handling of Soundex code.
    # This is faster than 'moving' all subsequent letters.

    # Remove any trailing Ss
    while name[-1:] == 'S':
        name = name[:-1]

    # Phonetic equivalents of first 2 characters
    # Works since duplicate letters are ignored
    if name[:2] == 'KN':
        name = 'N' + name[2:]  # KN.. == N..
    elif name[:2] == 'PH':
        name = 'F' + name[2:]  # PH.. == F.. (H ignored anyway)
    elif name[:2] == 'WR':
        name = 'R' + name[2:]  # WR.. == R..

    if name:
        # Special case, ignore H first letter (subsequent Hs ignored anyway)
        # Works since duplicate letters are ignored
        if name[0] == 'H':
            name = name[1:]

    if name:
        # Phonetic equivalents of first character
        if name[0] in {'A', 'E', 'I', 'O', 'U', 'Y'}:
            name = 'A' + name[1:]
        elif name[0] in {'B', 'P'}:
            name = 'B' + name[1:]
        elif name[0] in {'V', 'F'}:
            name = 'F' + name[1:]
        elif name[0] in {'C', 'K', 'Q'}:
            name = 'C' + name[1:]
        elif name[0] in {'G', 'J'}:
            name = 'G' + name[1:]
        elif name[0] in {'S', 'Z'}:
            name = 'S' + name[1:]

        # The (possibly substituted) first letter is emitted verbatim and
        # seeds `last` with a LETTER, so the first coded digit can never
        # compare equal to it and is always appended.
        name_code = last = name[0]

    # Modified Soundex code
    for i in range(1, len(name)):
        code = '0'  # '0' == "no code" (vowels and unlisted letters)
        if name[i] in {'B', 'F', 'P', 'V'}:
            code = '1'
        elif name[i] in {'C', 'G', 'J', 'K', 'Q', 'S', 'X', 'Z'}:
            code = '2'
        elif name[i] in {'D', 'T'}:
            # D/T before C stays uncoded (e.g. the T of '..TCH')
            if name[i+1:i+2] != 'C':
                code = '3'
        elif name[i] == 'L':
            # L is coded only before a vowel or at the end of the name
            if (name[i+1:i+2] in {'A', 'E', 'I', 'O', 'U', 'Y'} or
                    i+1 == len(name)):
                code = '4'
        elif name[i] in {'M', 'N'}:
            # Overwrite a following D/G with a copy of the M/N so the
            # duplicate check below drops it (i.e. MD/MG/ND/NG -> M/N)
            if name[i+1:i+2] in {'D', 'G'}:
                name = name[:i+1] + name[i] + name[i+2:]
            code = '5'
        elif name[i] == 'R':
            # R is coded only before a vowel or at the end of the name
            if (name[i+1:i+2] in {'A', 'E', 'I', 'O', 'U', 'Y'} or
                    i+1 == len(name)):
                code = '6'

        # NOTE(review): `i != 0` is always true here (the loop starts at 1);
        # it appears to be a leftover guard from a 0-based variant.
        if code != last and code != '0' and i != 0:
            name_code += code

        # `last` tracks the most recently emitted symbol, not the current
        # code, so uncoded letters do not break up a run of duplicates
        last = name_code[-1]

    if zero_pad:
        name_code += '0' * max_length
    if not name_code:
        name_code = '0'
    return name_code[:max_length]
def phonem(word):
    """Return the Phonem code for a word.

    Phonem is defined in :cite:`Wilde:1988`.

    This version is based on the Perl implementation documented at
    :cite:`Wilz:2005`.
    It includes some enhancements presented in the Java port at
    :cite:`dcm4che:2011`.

    Phonem is intended chiefly for German names/words.

    :param str word: the word to transform
    :returns: the Phonem value
    :rtype: str

    >>> phonem('Christopher')
    'CRYSDOVR'
    >>> phonem('Niall')
    'NYAL'
    >>> phonem('Smith')
    'SMYD'
    >>> phonem('Schmidt')
    'CMYD'
    """
    # Multi-character substitutions, applied in this exact order
    _substitutions = (('SC', 'C'), ('SZ', 'C'), ('CZ', 'C'), ('TZ', 'C'),
                      ('TS', 'C'), ('KS', 'X'), ('PF', 'V'), ('QU', 'KW'),
                      ('PH', 'V'), ('UE', 'Y'), ('AE', 'E'), ('OE', 'Ö'),
                      ('EI', 'AY'), ('EY', 'AY'), ('EU', 'OY'), ('AU', 'A§'),
                      ('OU', '§'))
    # Single-character translation applied after the substitutions
    _translation = dict(zip((ord(_) for _ in
                             'ZKGQÇÑßFWPTÁÀÂÃÅÄÆÉÈÊËIJÌÍÎÏÜݧÚÙÛÔÒÓÕØ'),
                            'CCCCCNSVVBDAAAAAEEEEEEYYYYYYYYUUUUOOOOÖ'))
    # Only these characters survive into the final code
    _retained = {'A', 'B', 'C', 'D', 'L', 'M', 'N', 'O', 'R', 'S', 'U', 'V',
                 'W', 'X', 'Y', 'Ö'}

    word = unicode_normalize('NFC', text_type(word.upper()))
    for source, target in _substitutions:
        word = word.replace(source, target)
    word = word.translate(_translation)
    word = _delete_consecutive_repeats(word)
    return ''.join(char for char in word if char in _retained)
def phonix(word, max_length=4, zero_pad=True):
    """Return the Phonix code for a word.

    Phonix is a Soundex-like algorithm defined in :cite:`Gadd:1990`.

    This implementation is based on:
    - :cite:`Pfeifer:2000`
    - :cite:`Christen:2011`
    - :cite:`Kollar:2007`

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 4)
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a max_length string
    :returns: the Phonix value
    :rtype: str

    >>> phonix('Christopher')
    'K683'
    >>> phonix('Niall')
    'N400'
    >>> phonix('Smith')
    'S530'
    >>> phonix('Schmidt')
    'S530'
    """
    def _start_repl(word, src, tar, post=None):
        r"""Replace src with tar at the start of word.

        If post is given, the replacement only fires when the initial src
        is followed by one of the characters in post.
        """
        if post:
            for i in post:
                if word.startswith(src+i):
                    return tar + word[len(src):]
        elif word.startswith(src):
            return tar + word[len(src):]
        return word

    def _end_repl(word, src, tar, pre=None):
        r"""Replace src with tar at the end of word.

        If pre is given, the replacement only fires when the final src is
        preceded by one of the characters in pre.
        """
        if pre:
            for i in pre:
                if word.endswith(i+src):
                    return word[:-len(src)] + tar
        elif word.endswith(src):
            return word[:-len(src)] + tar
        return word

    def _mid_repl(word, src, tar, pre=None, post=None):
        r"""Replace src with tar in the middle of word.

        "Middle" is enforced positionally: when no pre/post context is
        given, the first and last characters are held out of the
        replacement; a given pre (post) context already implies src cannot
        start (end) the word, so only the other end is held out.
        """
        if pre or post:
            if not pre:
                return word[0] + _all_repl(word[1:], src, tar, pre, post)
            elif not post:
                return (_all_repl(word[:-1], src, tar, pre, post) +
                        word[-1])
            return _all_repl(word, src, tar, pre, post)
        return (word[0] +
                _all_repl(word[1:-1], src, tar, pre, post) +
                word[-1])

    def _all_repl(word, src, tar, pre=None, post=None):
        r"""Replace src with tar anywhere in word.

        pre/post restrict the replacement to occurrences preceded/followed
        by one of the given characters; the surrounding context characters
        are preserved.
        """
        if pre or post:
            if post:
                post = post  # NOTE(review): no-op branch; kept as-is
            else:
                post = frozenset(('',))
            if pre:
                pre = pre  # NOTE(review): no-op branch; kept as-is
            else:
                pre = frozenset(('',))
            for i, j in ((i, j) for i in pre for j in post):
                word = word.replace(i+src+j, i+tar+j)
            return word
        else:
            return word.replace(src, tar)

    _vow = {'A', 'E', 'I', 'O', 'U'}
    _con = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q',
            'R', 'S', 'T', 'V', 'W', 'X', 'Y', 'Z'}

    # Ordered rule table: (replacement function, src, tar[, pre][, post]).
    # Rules are applied sequentially, so earlier rules can feed later ones;
    # do not reorder.
    _phonix_substitutions = ((_all_repl, 'DG', 'G'),
                             (_all_repl, 'CO', 'KO'),
                             (_all_repl, 'CA', 'KA'),
                             (_all_repl, 'CU', 'KU'),
                             (_all_repl, 'CY', 'SI'),
                             (_all_repl, 'CI', 'SI'),
                             (_all_repl, 'CE', 'SE'),
                             (_start_repl, 'CL', 'KL', _vow),
                             (_all_repl, 'CK', 'K'),
                             (_end_repl, 'GC', 'K'),
                             (_end_repl, 'JC', 'K'),
                             (_start_repl, 'CHR', 'KR', _vow),
                             (_start_repl, 'CR', 'KR', _vow),
                             (_start_repl, 'WR', 'R'),
                             (_all_repl, 'NC', 'NK'),
                             (_all_repl, 'CT', 'KT'),
                             (_all_repl, 'PH', 'F'),
                             (_all_repl, 'AA', 'AR'),
                             (_all_repl, 'SCH', 'SH'),
                             (_all_repl, 'BTL', 'TL'),
                             (_all_repl, 'GHT', 'T'),
                             (_all_repl, 'AUGH', 'ARF'),
                             (_mid_repl, 'LJ', 'LD', _vow, _vow),
                             (_all_repl, 'LOUGH', 'LOW'),
                             (_start_repl, 'Q', 'KW'),
                             (_start_repl, 'KN', 'N'),
                             (_end_repl, 'GN', 'N'),
                             (_all_repl, 'GHN', 'N'),
                             (_end_repl, 'GNE', 'N'),
                             (_all_repl, 'GHNE', 'NE'),
                             (_end_repl, 'GNES', 'NS'),
                             (_start_repl, 'GN', 'N'),
                             (_mid_repl, 'GN', 'N', None, _con),
                             (_end_repl, 'GN', 'N'),  # NOTE(review): duplicate of an earlier rule; harmless
                             (_start_repl, 'PS', 'S'),
                             (_start_repl, 'PT', 'T'),
                             (_start_repl, 'CZ', 'C'),
                             (_mid_repl, 'WZ', 'Z', _vow),
                             (_mid_repl, 'CZ', 'CH'),
                             (_all_repl, 'LZ', 'LSH'),
                             (_all_repl, 'RZ', 'RSH'),
                             (_mid_repl, 'Z', 'S', None, _vow),
                             (_all_repl, 'ZZ', 'TS'),
                             (_mid_repl, 'Z', 'TS', _con),
                             (_all_repl, 'HROUG', 'REW'),
                             (_all_repl, 'OUGH', 'OF'),
                             (_mid_repl, 'Q', 'KW', _vow, _vow),
                             (_mid_repl, 'J', 'Y', _vow, _vow),
                             (_start_repl, 'YJ', 'Y', _vow),
                             (_start_repl, 'GH', 'G'),
                             (_end_repl, 'GH', 'E', _vow),
                             (_start_repl, 'CY', 'S'),
                             (_all_repl, 'NX', 'NKS'),
                             (_start_repl, 'PF', 'F'),
                             (_end_repl, 'DT', 'T'),
                             (_end_repl, 'TL', 'TIL'),
                             (_end_repl, 'DL', 'DIL'),
                             (_all_repl, 'YTH', 'ITH'),
                             (_start_repl, 'TJ', 'CH', _vow),
                             (_start_repl, 'TSJ', 'CH', _vow),
                             (_start_repl, 'TS', 'T', _vow),
                             (_all_repl, 'TCH', 'CH'),
                             (_mid_repl, 'WSK', 'VSKIE', _vow),
                             (_end_repl, 'WSK', 'VSKIE', _vow),
                             (_start_repl, 'MN', 'N', _vow),
                             (_start_repl, 'PN', 'N', _vow),
                             (_mid_repl, 'STL', 'SL', _vow),
                             (_end_repl, 'STL', 'SL', _vow),
                             (_end_repl, 'TNT', 'ENT'),
                             (_end_repl, 'EAUX', 'OH'),
                             (_all_repl, 'EXCI', 'ECS'),
                             (_all_repl, 'X', 'ECS'),
                             (_end_repl, 'NED', 'ND'),
                             (_all_repl, 'JR', 'DR'),
                             (_end_repl, 'EE', 'EA'),
                             (_all_repl, 'ZS', 'S'),
                             (_mid_repl, 'R', 'AH', _vow, _con),
                             (_end_repl, 'R', 'AH', _vow),
                             (_mid_repl, 'HR', 'AH', _vow, _con),
                             (_end_repl, 'HR', 'AH', _vow),
                             (_end_repl, 'HR', 'AH', _vow),  # NOTE(review): duplicate of the preceding rule; harmless
                             (_end_repl, 'RE', 'AR'),
                             (_end_repl, 'R', 'AH', _vow),  # NOTE(review): duplicate of an earlier rule; harmless
                             (_all_repl, 'LLE', 'LE'),
                             (_end_repl, 'LE', 'ILE', _con),
                             (_end_repl, 'LES', 'ILES', _con),
                             (_end_repl, 'E', ''),
                             (_end_repl, 'ES', 'S'),
                             (_end_repl, 'SS', 'AS', _vow),
                             (_end_repl, 'MB', 'M', _vow),
                             (_all_repl, 'MPTS', 'MPS'),
                             (_all_repl, 'MPS', 'MS'),
                             (_all_repl, 'MPT', 'MT'))

    # Letter-to-digit map applied to everything after the first character
    _phonix_translation = dict(zip((ord(_) for _ in
                                    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                   '01230720022455012683070808'))

    sdx = ''

    # Uppercase, decompose accents, and keep only A-Z
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})
    if word:
        # Apply the full rule table in order
        for trans in _phonix_substitutions:
            word = trans[0](word, *trans[1:])
        # A leading vowel is encoded as the marker 'v' (lowercase, so it
        # cannot collide with any letter or digit in the code)
        if word[0] in {'A', 'E', 'I', 'O', 'U', 'Y'}:
            sdx = 'v' + word[1:].translate(_phonix_translation)
        else:
            sdx = word[0] + word[1:].translate(_phonix_translation)
        sdx = _delete_consecutive_repeats(sdx)
        sdx = sdx.replace('0', '')

    # Clamp max_length to [4, 64]
    if max_length != -1:
        max_length = min(max(4, max_length), 64)
    else:
        max_length = 64

    if zero_pad:
        sdx += '0' * max_length
    if not sdx:
        sdx = '0'
    return sdx[:max_length]
def sfinxbis(word, max_length=-1):
    """Return the SfinxBis code for a word.

    SfinxBis is a Soundex-like algorithm defined in :cite:`Axelsson:2009`.

    This implementation follows the reference implementation:
    :cite:`Sjoo:2009`.

    SfinxBis is intended chiefly for Swedish names.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to
        unlimited)
    :returns: the SfinxBis value
    :rtype: tuple

    >>> sfinxbis('Christopher')
    ('K68376',)
    >>> sfinxbis('Niall')
    ('N4',)
    >>> sfinxbis('Smith')
    ('S53',)
    >>> sfinxbis('Schmidt')
    ('S53',)
    >>> sfinxbis('Johansson')
    ('J585',)
    >>> sfinxbis('Sjöberg')
    ('#162',)
    """
    # Nobiliary particles / name prefixes removed in step 2 below
    adelstitler = (' DE LA ', ' DE LAS ', ' DE LOS ', ' VAN DE ', ' VAN DEN ',
                   ' VAN DER ', ' VON DEM ', ' VON DER ',
                   ' AF ', ' AV ', ' DA ', ' DE ', ' DEL ', ' DEN ', ' DES ',
                   ' DI ', ' DO ', ' DON ', ' DOS ', ' DU ', ' E ', ' IN ',
                   ' LA ', ' LE ', ' MAC ', ' MC ', ' VAN ', ' VON ', ' Y ',
                   ' S:T ')

    _harde_vokaler = {'A', 'O', 'U', 'Å'}  # hard vowels
    _mjuka_vokaler = {'E', 'I', 'Y', 'Ä', 'Ö'}  # soft vowels
    _konsonanter = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P',
                    'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Z'}  # consonants
    _alfabet = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                'Y', 'Z', 'Ä', 'Å', 'Ö'}  # the Swedish alphabet

    # Letter-to-digit code; vowels all map to '9', removed in step 11
    _sfinxbis_translation = dict(zip((ord(_) for _ in
                                      'BCDFGHJKLMNPQRSTVZAOUÅEIYÄÖ'),
                                     '123729224551268378999999999'))

    # Fold foreign letters onto their nearest Swedish equivalents
    _sfinxbis_substitutions = dict(zip((ord(_) for _ in
                                        'WZÀÁÂÃÆÇÈÉÊËÌÍÎÏÑÒÓÔÕØÙÚÛÜÝ'),
                                       'VSAAAAÄCEEEEIIIINOOOOÖUUUYY'))

    def _foersvensker(lokal_ordet):
        """Return the Swedish-ized form of the word."""
        # Common non-Swedish spellings of Swedish sounds
        lokal_ordet = lokal_ordet.replace('STIERN', 'STJÄRN')
        lokal_ordet = lokal_ordet.replace('HIE', 'HJ')
        lokal_ordet = lokal_ordet.replace('SIÖ', 'SJÖ')
        lokal_ordet = lokal_ordet.replace('SCH', 'SH')
        lokal_ordet = lokal_ordet.replace('QU', 'KV')
        lokal_ordet = lokal_ordet.replace('IO', 'JO')
        lokal_ordet = lokal_ordet.replace('PH', 'F')

        # After any vowel, Ü/Y/I act as the consonant J
        for i in _harde_vokaler:
            lokal_ordet = lokal_ordet.replace(i+'Ü', i+'J')
            lokal_ordet = lokal_ordet.replace(i+'Y', i+'J')
            lokal_ordet = lokal_ordet.replace(i+'I', i+'J')
        for i in _mjuka_vokaler:
            lokal_ordet = lokal_ordet.replace(i+'Ü', i+'J')
            lokal_ordet = lokal_ordet.replace(i+'Y', i+'J')
            lokal_ordet = lokal_ordet.replace(i+'I', i+'J')

        # Drop H before any consonant
        if 'H' in lokal_ordet:
            for i in _konsonanter:
                lokal_ordet = lokal_ordet.replace('H'+i, i)

        lokal_ordet = lokal_ordet.translate(_sfinxbis_substitutions)

        # Transliterations with no single-character equivalent
        lokal_ordet = lokal_ordet.replace('Ð', 'ETH')
        lokal_ordet = lokal_ordet.replace('Þ', 'TH')
        lokal_ordet = lokal_ordet.replace('ß', 'SS')

        return lokal_ordet

    def _koda_foersta_ljudet(lokal_ordet):
        """Return the word with the first sound coded.

        '$' marks an initial vowel sound and '#' the Swedish sje/tje
        sounds; otherwise the leading cluster collapses to its base
        consonant. The elif-chain order is significant.
        """
        if (lokal_ordet[0:1] in _mjuka_vokaler or
                lokal_ordet[0:1] in _harde_vokaler):
            lokal_ordet = '$' + lokal_ordet[1:]
        elif lokal_ordet[0:2] in ('DJ', 'GJ', 'HJ', 'LJ'):
            lokal_ordet = 'J' + lokal_ordet[2:]
        elif lokal_ordet[0:1] == 'G' and lokal_ordet[1:2] in _mjuka_vokaler:
            lokal_ordet = 'J' + lokal_ordet[1:]
        elif lokal_ordet[0:1] == 'Q':
            lokal_ordet = 'K' + lokal_ordet[1:]
        elif (lokal_ordet[0:2] == 'CH' and
              lokal_ordet[2:3] in frozenset(_mjuka_vokaler | _harde_vokaler)):
            lokal_ordet = '#' + lokal_ordet[2:]
        elif lokal_ordet[0:1] == 'C' and lokal_ordet[1:2] in _harde_vokaler:
            lokal_ordet = 'K' + lokal_ordet[1:]
        elif lokal_ordet[0:1] == 'C' and lokal_ordet[1:2] in _konsonanter:
            lokal_ordet = 'K' + lokal_ordet[1:]
        elif lokal_ordet[0:1] == 'X':
            lokal_ordet = 'S' + lokal_ordet[1:]
        elif lokal_ordet[0:1] == 'C' and lokal_ordet[1:2] in _mjuka_vokaler:
            lokal_ordet = 'S' + lokal_ordet[1:]
        elif lokal_ordet[0:3] in ('SKJ', 'STJ', 'SCH'):
            lokal_ordet = '#' + lokal_ordet[3:]
        elif lokal_ordet[0:2] in ('SH', 'KJ', 'TJ', 'SJ'):
            lokal_ordet = '#' + lokal_ordet[2:]
        elif lokal_ordet[0:2] == 'SK' and lokal_ordet[2:3] in _mjuka_vokaler:
            lokal_ordet = '#' + lokal_ordet[2:]
        elif lokal_ordet[0:1] == 'K' and lokal_ordet[1:2] in _mjuka_vokaler:
            lokal_ordet = '#' + lokal_ordet[1:]
        return lokal_ordet

    # Step 1: uppercase ("Steg 1, Versaler")
    word = unicode_normalize('NFC', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = word.replace('-', ' ')

    # Step 2: remove nobiliary prefixes ("Steg 2, Ta bort adelsprefix");
    # also strip a prefix at the very start of the name (no leading space)
    for adelstitel in adelstitler:
        while adelstitel in word:
            word = word.replace(adelstitel, ' ')
        if word.startswith(adelstitel[1:]):
            word = word[len(adelstitel)-1:]

    # Split word into tokens
    ordlista = word.split()

    # Step 3: remove doubled letters at the start of the name
    # ("Steg 3, Ta bort dubbelteckning i början på namnet")
    ordlista = [_delete_consecutive_repeats(ordet) for ordet in ordlista]
    if not ordlista:
        # noinspection PyRedundantParentheses
        return ('',)

    # Step 4: Swedish-ize ("Steg 4, Försvenskning")
    ordlista = [_foersvensker(ordet) for ordet in ordlista]

    # Step 5: remove all characters outside A-Ö
    # ("Steg 5, Ta bort alla tecken som inte är A-Ö (65-90,196,197,214)")
    ordlista = [''.join(c for c in ordet if c in _alfabet)
                for ordet in ordlista]

    # Step 6: code the first sound ("Steg 6, Koda första ljudet")
    ordlista = [_koda_foersta_ljudet(ordet) for ordet in ordlista]

    # Step 7: split each name into first sound + rest
    # ("Steg 7, Dela upp namnet i två delar")
    rest = [ordet[1:] for ordet in ordlista]

    # Step 8: phonetic transformations on the rest
    # ("Steg 8, Utför fonetisk transformation i resten")
    rest = [ordet.replace('DT', 'T') for ordet in rest]
    rest = [ordet.replace('X', 'KS') for ordet in rest]

    # Step 9: code the rest as digits ("Steg 9, Koda resten till en sifferkod")
    for vokal in _mjuka_vokaler:
        rest = [ordet.replace('C'+vokal, '8'+vokal) for ordet in rest]
    rest = [ordet.translate(_sfinxbis_translation) for ordet in rest]

    # Step 10: remove adjacent duplicates
    # ("Steg 10, Ta bort intilliggande dubbletter")
    rest = [_delete_consecutive_repeats(ordet) for ordet in rest]

    # Step 11: remove all '9's (vowel codes) ("Steg 11, Ta bort alla \"9\"")
    rest = [ordet.replace('9', '') for ordet in rest]

    # Step 12: reassemble the parts ("Steg 12, Sätt ihop delarna igen")
    ordlista = [''.join(ordet) for ordet in
                zip((_[0:1] for _ in ordlista), rest)]

    # truncate, if max_length is set
    if max_length > 0:
        ordlista = [ordet[:max_length] for ordet in ordlista]

    return tuple(ordlista)
[docs]def phonet(word, mode=1, lang='de'):
"""Return the phonet code for a word.
phonet ("Hannoveraner Phonetik") was developed by Jörg Michael and
documented in :cite:`Michael:1999`.
This is a port of Jesper Zedlitz's code, which is licensed LGPL
:cite:`Zedlitz:2015`.
That is, in turn, based on Michael's C code, which is also licensed LGPL
:cite:`Michael:2007`.
:param str word: the word to transform
:param int mode: the ponet variant to employ (1 or 2)
:param str lang: 'de' (default) for German
'none' for no language
:returns: the phonet value
:rtype: str
>>> phonet('Christopher')
'KRISTOFA'
>>> phonet('Niall')
'NIAL'
>>> phonet('Smith')
'SMIT'
>>> phonet('Schmidt')
'SHMIT'
>>> phonet('Christopher', mode=2)
'KRIZTUFA'
>>> phonet('Niall', mode=2)
'NIAL'
>>> phonet('Smith', mode=2)
'ZNIT'
>>> phonet('Schmidt', mode=2)
'ZNIT'
>>> phonet('Christopher', lang='none')
'CHRISTOPHER'
>>> phonet('Niall', lang='none')
'NIAL'
>>> phonet('Smith', lang='none')
'SMITH'
>>> phonet('Schmidt', lang='none')
'SCHMIDT'
"""
_phonet_rules_no_lang = ( # separator chars
'´', ' ', ' ',
'"', ' ', ' ',
'`$', '', '',
'\'', ' ', ' ',
',', ',', ',',
';', ',', ',',
'-', ' ', ' ',
' ', ' ', ' ',
'.', '.', '.',
':', '.', '.',
# German umlauts
'Ä', 'AE', 'AE',
'Ö', 'OE', 'OE',
'Ü', 'UE', 'UE',
'ß', 'S', 'S',
# international umlauts
'À', 'A', 'A',
'Á', 'A', 'A',
'Â', 'A', 'A',
'Ã', 'A', 'A',
'Å', 'A', 'A',
'Æ', 'AE', 'AE',
'Ç', 'C', 'C',
'Ð', 'DJ', 'DJ',
'È', 'E', 'E',
'É', 'E', 'E',
'Ê', 'E', 'E',
'Ë', 'E', 'E',
'Ì', 'I', 'I',
'Í', 'I', 'I',
'Î', 'I', 'I',
'Ï', 'I', 'I',
'Ñ', 'NH', 'NH',
'Ò', 'O', 'O',
'Ó', 'O', 'O',
'Ô', 'O', 'O',
'Õ', 'O', 'O',
'Œ', 'OE', 'OE',
'Ø', 'OE', 'OE',
'Š', 'SH', 'SH',
'Þ', 'TH', 'TH',
'Ù', 'U', 'U',
'Ú', 'U', 'U',
'Û', 'U', 'U',
'Ý', 'Y', 'Y',
'Ÿ', 'Y', 'Y',
# 'normal' letters (A-Z)
'MC^', 'MAC', 'MAC',
'MC^', 'MAC', 'MAC',
'M´^', 'MAC', 'MAC',
'M\'^', 'MAC', 'MAC',
'O´^', 'O', 'O',
'O\'^', 'O', 'O',
'VAN DEN ^', 'VANDEN', 'VANDEN',
None, None, None)
_phonet_rules_german = ( # separator chars
'´', ' ', ' ',
'"', ' ', ' ',
'`$', '', '',
'\'', ' ', ' ',
',', ' ', ' ',
';', ' ', ' ',
'-', ' ', ' ',
' ', ' ', ' ',
'.', '.', '.',
':', '.', '.',
# German umlauts
'ÄE', 'E', 'E',
'ÄU<', 'EU', 'EU',
'ÄV(AEOU)-<', 'EW', None,
'Ä$', 'Ä', None,
'Ä<', None, 'E',
'Ä', 'E', None,
'ÖE', 'Ö', 'Ö',
'ÖU', 'Ö', 'Ö',
'ÖVER--<', 'ÖW', None,
'ÖV(AOU)-', 'ÖW', None,
'ÜBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
'ÜBER^^', 'ÜBA', 'IBA',
'ÜE', 'Ü', 'I',
'ÜVER--<', 'ÜW', None,
'ÜV(AOU)-', 'ÜW', None,
'Ü', None, 'I',
'ßCH<', None, 'Z',
'ß<', 'S', 'Z',
# international umlauts
'À<', 'A', 'A',
'Á<', 'A', 'A',
'Â<', 'A', 'A',
'Ã<', 'A', 'A',
'Å<', 'A', 'A',
'ÆER-', 'E', 'E',
'ÆU<', 'EU', 'EU',
'ÆV(AEOU)-<', 'EW', None,
'Æ$', 'Ä', None,
'Æ<', None, 'E',
'Æ', 'E', None,
'Ç', 'Z', 'Z',
'ÐÐ-', '', '',
'Ð', 'DI', 'TI',
'È<', 'E', 'E',
'É<', 'E', 'E',
'Ê<', 'E', 'E',
'Ë', 'E', 'E',
'Ì<', 'I', 'I',
'Í<', 'I', 'I',
'Î<', 'I', 'I',
'Ï', 'I', 'I',
'ÑÑ-', '', '',
'Ñ', 'NI', 'NI',
'Ò<', 'O', 'U',
'Ó<', 'O', 'U',
'Ô<', 'O', 'U',
'Õ<', 'O', 'U',
'Œ<', 'Ö', 'Ö',
'Ø(IJY)-<', 'E', 'E',
'Ø<', 'Ö', 'Ö',
'Š', 'SH', 'Z',
'Þ', 'T', 'T',
'Ù<', 'U', 'U',
'Ú<', 'U', 'U',
'Û<', 'U', 'U',
'Ý<', 'I', 'I',
'Ÿ<', 'I', 'I',
# 'normal' letters (A-Z)
'ABELLE$', 'ABL', 'ABL',
'ABELL$', 'ABL', 'ABL',
'ABIENNE$', 'ABIN', 'ABIN',
'ACHME---^', 'ACH', 'AK',
'ACEY$', 'AZI', 'AZI',
'ADV', 'ATW', None,
'AEGL-', 'EK', None,
'AEU<', 'EU', 'EU',
'AE2', 'E', 'E',
'AFTRAUBEN------', 'AFT ', 'AFT ',
'AGL-1', 'AK', None,
'AGNI-^', 'AKN', 'AKN',
'AGNIE-', 'ANI', 'ANI',
'AGN(AEOU)-$', 'ANI', 'ANI',
'AH(AIOÖUÜY)-', 'AH', None,
'AIA2', 'AIA', 'AIA',
'AIE$', 'E', 'E',
'AILL(EOU)-', 'ALI', 'ALI',
'AINE$', 'EN', 'EN',
'AIRE$', 'ER', 'ER',
'AIR-', 'E', 'E',
'AISE$', 'ES', 'EZ',
'AISSANCE$', 'ESANS', 'EZANZ',
'AISSE$', 'ES', 'EZ',
'AIX$', 'EX', 'EX',
'AJ(AÄEÈÉÊIOÖUÜ)--', 'A', 'A',
'AKTIE', 'AXIE', 'AXIE',
'AKTUEL', 'AKTUEL', None,
'ALOI^', 'ALOI', 'ALUI', # Don't merge these rules
'ALOY^', 'ALOI', 'ALUI', # needed by 'check_rules'
'AMATEU(RS)-', 'AMATÖ', 'ANATÖ',
'ANCH(OEI)-', 'ANSH', 'ANZ',
'ANDERGEGANG----', 'ANDA GE', 'ANTA KE',
'ANDERGEHE----', 'ANDA ', 'ANTA ',
'ANDERGESETZ----', 'ANDA GE', 'ANTA KE',
'ANDERGING----', 'ANDA ', 'ANTA ',
'ANDERSETZ(ET)-----', 'ANDA ', 'ANTA ',
'ANDERZUGEHE----', 'ANDA ZU ', 'ANTA ZU ',
'ANDERZUSETZE-----', 'ANDA ZU ', 'ANTA ZU ',
'ANER(BKO)---^^', 'AN', None,
'ANHAND---^$', 'AN H', 'AN ',
'ANH(AÄEIOÖUÜY)--^^', 'AN', None,
'ANIELLE$', 'ANIEL', 'ANIL',
'ANIEL', 'ANIEL', None,
'ANSTELLE----^$', 'AN ST', 'AN ZT',
'ANTI^^', 'ANTI', 'ANTI',
'ANVER^^', 'ANFA', 'ANFA',
'ATIA$', 'ATIA', 'ATIA',
'ATIA(NS)--', 'ATI', 'ATI',
'ATI(AÄOÖUÜ)-', 'AZI', 'AZI',
'AUAU--', '', '',
'AUERE$', 'AUERE', None,
'AUERE(NS)-$', 'AUERE', None,
'AUERE(AIOUY)--', 'AUER', None,
'AUER(AÄIOÖUÜY)-', 'AUER', None,
'AUER<', 'AUA', 'AUA',
'AUF^^', 'AUF', 'AUF',
'AULT$', 'O', 'U',
'AUR(BCDFGKLMNQSTVWZ)-', 'AUA', 'AUA',
'AUR$', 'AUA', 'AUA',
'AUSSE$', 'OS', 'UZ',
'AUS(ST)-^', 'AUS', 'AUS',
'AUS^^', 'AUS', 'AUS',
'AUTOFAHR----', 'AUTO ', 'AUTU ',
'AUTO^^', 'AUTO', 'AUTU',
'AUX(IY)-', 'AUX', 'AUX',
'AUX', 'O', 'U',
'AU', 'AU', 'AU',
'AVER--<', 'AW', None,
'AVIER$', 'AWIE', 'AFIE',
'AV(EÈÉÊI)-^', 'AW', None,
'AV(AOU)-', 'AW', None,
'AYRE$', 'EIRE', 'EIRE',
'AYRE(NS)-$', 'EIRE', 'EIRE',
'AYRE(AIOUY)--', 'EIR', 'EIR',
'AYR(AÄIOÖUÜY)-', 'EIR', 'EIR',
'AYR<', 'EIA', 'EIA',
'AYER--<', 'EI', 'EI',
'AY(AÄEIOÖUÜY)--', 'A', 'A',
'AË', 'E', 'E',
'A(IJY)<', 'EI', 'EI',
'BABY^$', 'BEBI', 'BEBI',
'BAB(IY)^', 'BEBI', 'BEBI',
'BEAU^$', 'BO', None,
'BEA(BCMNRU)-^', 'BEA', 'BEA',
'BEAT(AEIMORU)-^', 'BEAT', 'BEAT',
'BEE$', 'BI', 'BI',
'BEIGE^$', 'BESH', 'BEZ',
'BENOIT--', 'BENO', 'BENU',
'BER(DT)-', 'BER', None,
'BERN(DT)-', 'BERN', None,
'BE(LMNRST)-^', 'BE', 'BE',
'BETTE$', 'BET', 'BET',
'BEVOR^$', 'BEFOR', None,
'BIC$', 'BIZ', 'BIZ',
'BOWL(EI)-', 'BOL', 'BUL',
'BP(AÄEÈÉÊIÌÍÎOÖRUÜY)-', 'B', 'B',
'BRINGEND-----^', 'BRI', 'BRI',
'BRINGEND-----', ' BRI', ' BRI',
'BROW(NS)-', 'BRAU', 'BRAU',
'BUDGET7', 'BÜGE', 'BIKE',
'BUFFET7', 'BÜFE', 'BIFE',
'BYLLE$', 'BILE', 'BILE',
'BYLL$', 'BIL', 'BIL',
'BYPA--^', 'BEI', 'BEI',
'BYTE<', 'BEIT', 'BEIT',
'BY9^', 'BÜ', None,
'B(SßZ)$', 'BS', None,
'CACH(EI)-^', 'KESH', 'KEZ',
'CAE--', 'Z', 'Z',
'CA(IY)$', 'ZEI', 'ZEI',
'CE(EIJUY)--', 'Z', 'Z',
'CENT<', 'ZENT', 'ZENT',
'CERST(EI)----^', 'KE', 'KE',
'CER$', 'ZA', 'ZA',
'CE3', 'ZE', 'ZE',
'CH\'S$', 'X', 'X',
'CH´S$', 'X', 'X',
'CHAO(ST)-', 'KAO', 'KAU',
'CHAMPIO-^', 'SHEMPI', 'ZENBI',
'CHAR(AI)-^', 'KAR', 'KAR',
'CHAU(CDFSVWXZ)-', 'SHO', 'ZU',
'CHÄ(CF)-', 'SHE', 'ZE',
'CHE(CF)-', 'SHE', 'ZE',
'CHEM-^', 'KE', 'KE', # or: 'CHE', 'KE'
'CHEQUE<', 'SHEK', 'ZEK',
'CHI(CFGPVW)-', 'SHI', 'ZI',
'CH(AEUY)-<^', 'SH', 'Z',
'CHK-', '', '',
'CHO(CKPS)-^', 'SHO', 'ZU',
'CHRIS-', 'KRI', None,
'CHRO-', 'KR', None,
'CH(LOR)-<^', 'K', 'K',
'CHST-', 'X', 'X',
'CH(SßXZ)3', 'X', 'X',
'CHTNI-3', 'CHN', 'KN',
'CH^', 'K', 'K', # or: 'CH', 'K'
'CH', 'CH', 'K',
'CIC$', 'ZIZ', 'ZIZ',
'CIENCEFICT----', 'EIENS ', 'EIENZ ',
'CIENCE$', 'EIENS', 'EIENZ',
'CIER$', 'ZIE', 'ZIE',
'CYB-^', 'ZEI', 'ZEI',
'CY9^', 'ZÜ', 'ZI',
'C(IJY)-<3', 'Z', 'Z',
'CLOWN-', 'KLAU', 'KLAU',
'CCH', 'Z', 'Z',
'CCE-', 'X', 'X',
'C(CK)-', '', '',
'CLAUDET---', 'KLO', 'KLU',
'CLAUDINE^$', 'KLODIN', 'KLUTIN',
'COACH', 'KOSH', 'KUZ',
'COLE$', 'KOL', 'KUL',
'COUCH', 'KAUSH', 'KAUZ',
'COW', 'KAU', 'KAU',
'CQUES$', 'K', 'K',
'CQUE', 'K', 'K',
'CRASH--9', 'KRE', 'KRE',
'CREAT-^', 'KREA', 'KREA',
'CST', 'XT', 'XT',
'CS<^', 'Z', 'Z',
'C(SßX)', 'X', 'X',
'CT\'S$', 'X', 'X',
'CT(SßXZ)', 'X', 'X',
'CZ<', 'Z', 'Z',
'C(ÈÉÊÌÍÎÝ)3', 'Z', 'Z',
'C.^', 'C.', 'C.',
'CÄ-', 'Z', 'Z',
'CÜ$', 'ZÜ', 'ZI',
'C\'S$', 'X', 'X',
'C<', 'K', 'K',
'DAHER^$', 'DAHER', None,
'DARAUFFOLGE-----', 'DARAUF ', 'TARAUF ',
'DAVO(NR)-^$', 'DAFO', 'TAFU',
'DD(SZ)--<', '', '',
'DD9', 'D', None,
'DEPOT7', 'DEPO', 'TEBU',
'DESIGN', 'DISEIN', 'TIZEIN',
'DE(LMNRST)-3^', 'DE', 'TE',
'DETTE$', 'DET', 'TET',
'DH$', 'T', None,
'DIC$', 'DIZ', 'TIZ',
'DIDR-^', 'DIT', None,
'DIEDR-^', 'DIT', None,
'DJ(AEIOU)-^', 'I', 'I',
'DMITR-^', 'DIMIT', 'TINIT',
'DRY9^', 'DRÜ', None,
'DT-', '', '',
'DUIS-^', 'DÜ', 'TI',
'DURCH^^', 'DURCH', 'TURK',
'DVA$', 'TWA', None,
'DY9^', 'DÜ', None,
'DYS$', 'DIS', None,
'DS(CH)--<', 'T', 'T',
'DST', 'ZT', 'ZT',
'DZS(CH)--', 'T', 'T',
'D(SßZ)', 'Z', 'Z',
'D(AÄEIOÖRUÜY)-', 'D', None,
'D(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'D', None,
'D\'H^', 'D', 'T',
'D´H^', 'D', 'T',
'D`H^', 'D', 'T',
'D\'S3$', 'Z', 'Z',
'D´S3$', 'Z', 'Z',
'D^', 'D', None,
'D', 'T', 'T',
'EAULT$', 'O', 'U',
'EAUX$', 'O', 'U',
'EAU', 'O', 'U',
'EAV', 'IW', 'IF',
'EAS3$', 'EAS', None,
'EA(AÄEIOÖÜY)-3', 'EA', 'EA',
'EA3$', 'EA', 'EA',
'EA3', 'I', 'I',
'EBENSO^$', 'EBNSO', 'EBNZU',
'EBENSO^^', 'EBNSO ', 'EBNZU ',
'EBEN^^', 'EBN', 'EBN',
'EE9', 'E', 'E',
'EGL-1', 'EK', None,
'EHE(IUY)--1', 'EH', None,
'EHUNG---1', 'E', None,
'EH(AÄIOÖUÜY)-1', 'EH', None,
'EIEI--', '', '',
'EIERE^$', 'EIERE', None,
'EIERE$', 'EIERE', None,
'EIERE(NS)-$', 'EIERE', None,
'EIERE(AIOUY)--', 'EIER', None,
'EIER(AÄIOÖUÜY)-', 'EIER', None,
'EIER<', 'EIA', None,
'EIGL-1', 'EIK', None,
'EIGH$', 'EI', 'EI',
'EIH--', 'E', 'E',
'EILLE$', 'EI', 'EI',
'EIR(BCDFGKLMNQSTVWZ)-', 'EIA', 'EIA',
'EIR$', 'EIA', 'EIA',
'EITRAUBEN------', 'EIT ', 'EIT ',
'EI', 'EI', 'EI',
'EJ$', 'EI', 'EI',
'ELIZ^', 'ELIS', None,
'ELZ^', 'ELS', None,
'EL-^', 'E', 'E',
'ELANG----1', 'E', 'E',
'EL(DKL)--1', 'E', 'E',
'EL(MNT)--1$', 'E', 'E',
'ELYNE$', 'ELINE', 'ELINE',
'ELYN$', 'ELIN', 'ELIN',
'EL(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'EL', 'EL',
'EL-1', 'L', 'L',
'EM-^', None, 'E',
'EM(DFKMPQT)--1', None, 'E',
'EM(AÄEÈÉÊIÌÍÎOÖUÜY)--1', None, 'E',
'EM-1', None, 'N',
'ENGAG-^', 'ANGA', 'ANKA',
'EN-^', 'E', 'E',
'ENTUEL', 'ENTUEL', None,
'EN(CDGKQSTZ)--1', 'E', 'E',
'EN(AÄEÈÉÊIÌÍÎNOÖUÜY)-1', 'EN', 'EN',
'EN-1', '', '',
'ERH(AÄEIOÖUÜ)-^', 'ERH', 'ER',
'ER-^', 'E', 'E',
'ERREGEND-----', ' ER', ' ER',
'ERT1$', 'AT', None,
'ER(DGLKMNRQTZß)-1', 'ER', None,
'ER(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'ER', 'A',
'ER1$', 'A', 'A',
'ER<1', 'A', 'A',
'ETAT7', 'ETA', 'ETA',
'ETI(AÄOÖÜU)-', 'EZI', 'EZI',
'EUERE$', 'EUERE', None,
'EUERE(NS)-$', 'EUERE', None,
'EUERE(AIOUY)--', 'EUER', None,
'EUER(AÄIOÖUÜY)-', 'EUER', None,
'EUER<', 'EUA', None,
'EUEU--', '', '',
'EUILLE$', 'Ö', 'Ö',
'EUR$', 'ÖR', 'ÖR',
'EUX', 'Ö', 'Ö',
'EUSZ$', 'EUS', None,
'EUTZ$', 'EUS', None,
'EUYS$', 'EUS', 'EUZ',
'EUZ$', 'EUS', None,
'EU', 'EU', 'EU',
'EVER--<1', 'EW', None,
'EV(ÄOÖUÜ)-1', 'EW', None,
'EYER<', 'EIA', 'EIA',
'EY<', 'EI', 'EI',
'FACETTE', 'FASET', 'FAZET',
'FANS--^$', 'FE', 'FE',
'FAN-^$', 'FE', 'FE',
'FAULT-', 'FOL', 'FUL',
'FEE(DL)-', 'FI', 'FI',
'FEHLER', 'FELA', 'FELA',
'FE(LMNRST)-3^', 'FE', 'FE',
'FOERDERN---^', 'FÖRD', 'FÖRT',
'FOERDERN---', ' FÖRD', ' FÖRT',
'FOND7', 'FON', 'FUN',
'FRAIN$', 'FRA', 'FRA',
'FRISEU(RS)-', 'FRISÖ', 'FRIZÖ',
'FY9^', 'FÜ', None,
'FÖRDERN---^', 'FÖRD', 'FÖRT',
'FÖRDERN---', ' FÖRD', ' FÖRT',
'GAGS^$', 'GEX', 'KEX',
'GAG^$', 'GEK', 'KEK',
'GD', 'KT', 'KT',
'GEGEN^^', 'GEGN', 'KEKN',
'GEGENGEKOM-----', 'GEGN ', 'KEKN ',
'GEGENGESET-----', 'GEGN ', 'KEKN ',
'GEGENKOMME-----', 'GEGN ', 'KEKN ',
'GEGENZUKOM---', 'GEGN ZU ', 'KEKN ZU ',
'GENDETWAS-----$', 'GENT ', 'KENT ',
'GENRE', 'IORE', 'IURE',
'GE(LMNRST)-3^', 'GE', 'KE',
'GER(DKT)-', 'GER', None,
'GETTE$', 'GET', 'KET',
'GGF.', 'GF.', None,
'GG-', '', '',
'GH', 'G', None,
'GI(AOU)-^', 'I', 'I',
'GION-3', 'KIO', 'KIU',
'G(CK)-', '', '',
'GJ(AEIOU)-^', 'I', 'I',
'GMBH^$', 'GMBH', 'GMBH',
'GNAC$', 'NIAK', 'NIAK',
'GNON$', 'NION', 'NIUN',
'GN$', 'N', 'N',
'GONCAL-^', 'GONZA', 'KUNZA',
'GRY9^', 'GRÜ', None,
'G(SßXZ)-<', 'K', 'K',
'GUCK-', 'KU', 'KU',
'GUISEP-^', 'IUSE', 'IUZE',
'GUI-^', 'G', 'K',
'GUTAUSSEH------^', 'GUT ', 'KUT ',
'GUTGEHEND------^', 'GUT ', 'KUT ',
'GY9^', 'GÜ', None,
'G(AÄEILOÖRUÜY)-', 'G', None,
'G(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'G', None,
'G\'S$', 'X', 'X',
'G´S$', 'X', 'X',
'G^', 'G', None,
'G', 'K', 'K',
'HA(HIUY)--1', 'H', None,
'HANDVOL---^', 'HANT ', 'ANT ',
'HANNOVE-^', 'HANOF', None,
'HAVEN7$', 'HAFN', None,
'HEAD-', 'HE', 'E',
'HELIEGEN------', 'E ', 'E ',
'HESTEHEN------', 'E ', 'E ',
'HE(LMNRST)-3^', 'HE', 'E',
'HE(LMN)-1', 'E', 'E',
'HEUR1$', 'ÖR', 'ÖR',
'HE(HIUY)--1', 'H', None,
'HIH(AÄEIOÖUÜY)-1', 'IH', None,
'HLH(AÄEIOÖUÜY)-1', 'LH', None,
'HMH(AÄEIOÖUÜY)-1', 'MH', None,
'HNH(AÄEIOÖUÜY)-1', 'NH', None,
'HOBBY9^', 'HOBI', None,
'HOCHBEGAB-----^', 'HOCH ', 'UK ',
'HOCHTALEN-----^', 'HOCH ', 'UK ',
'HOCHZUFRI-----^', 'HOCH ', 'UK ',
'HO(HIY)--1', 'H', None,
'HRH(AÄEIOÖUÜY)-1', 'RH', None,
'HUH(AÄEIOÖUÜY)-1', 'UH', None,
'HUIS^^', 'HÜS', 'IZ',
'HUIS$', 'ÜS', 'IZ',
'HUI--1', 'H', None,
'HYGIEN^', 'HÜKIEN', None,
'HY9^', 'HÜ', None,
'HY(BDGMNPST)-', 'Ü', None,
'H.^', None, 'H.',
'HÄU--1', 'H', None,
'H^', 'H', '',
'H', '', '',
'ICHELL---', 'ISH', 'IZ',
'ICHI$', 'ISHI', 'IZI',
'IEC$', 'IZ', 'IZ',
'IEDENSTELLE------', 'IDN ', 'ITN ',
'IEI-3', '', '',
'IELL3', 'IEL', 'IEL',
'IENNE$', 'IN', 'IN',
'IERRE$', 'IER', 'IER',
'IERZULAN---', 'IR ZU ', 'IR ZU ',
'IETTE$', 'IT', 'IT',
'IEU', 'IÖ', 'IÖ',
'IE<4', 'I', 'I',
'IGL-1', 'IK', None,
'IGHT3$', 'EIT', 'EIT',
'IGNI(EO)-', 'INI', 'INI',
'IGN(AEOU)-$', 'INI', 'INI',
'IHER(DGLKRT)--1', 'IHE', None,
'IHE(IUY)--', 'IH', None,
'IH(AIOÖUÜY)-', 'IH', None,
'IJ(AOU)-', 'I', 'I',
'IJ$', 'I', 'I',
'IJ<', 'EI', 'EI',
'IKOLE$', 'IKOL', 'IKUL',
'ILLAN(STZ)--4', 'ILIA', 'ILIA',
'ILLAR(DT)--4', 'ILIA', 'ILIA',
'IMSTAN----^', 'IM ', 'IN ',
'INDELERREGE------', 'INDL ', 'INTL ',
'INFRAGE-----^$', 'IN ', 'IN ',
'INTERN(AOU)-^', 'INTAN', 'INTAN',
'INVER-', 'INWE', 'INFE',
'ITI(AÄIOÖUÜ)-', 'IZI', 'IZI',
'IUSZ$', 'IUS', None,
'IUTZ$', 'IUS', None,
'IUZ$', 'IUS', None,
'IVER--<', 'IW', None,
'IVIER$', 'IWIE', 'IFIE',
'IV(ÄOÖUÜ)-', 'IW', None,
'IV<3', 'IW', None,
'IY2', 'I', None,
'I(ÈÉÊ)<4', 'I', 'I',
'JAVIE---<^', 'ZA', 'ZA',
'JEANS^$', 'JINS', 'INZ',
'JEANNE^$', 'IAN', 'IAN',
'JEAN-^', 'IA', 'IA',
'JER-^', 'IE', 'IE',
'JE(LMNST)-', 'IE', 'IE',
'JI^', 'JI', None,
'JOR(GK)^$', 'IÖRK', 'IÖRK',
'J', 'I', 'I',
'KC(ÄEIJ)-', 'X', 'X',
'KD', 'KT', None,
'KE(LMNRST)-3^', 'KE', 'KE',
'KG(AÄEILOÖRUÜY)-', 'K', None,
'KH<^', 'K', 'K',
'KIC$', 'KIZ', 'KIZ',
'KLE(LMNRST)-3^', 'KLE', 'KLE',
'KOTELE-^', 'KOTL', 'KUTL',
'KREAT-^', 'KREA', 'KREA',
'KRÜS(TZ)--^', 'KRI', None,
'KRYS(TZ)--^', 'KRI', None,
'KRY9^', 'KRÜ', None,
'KSCH---', 'K', 'K',
'KSH--', 'K', 'K',
'K(SßXZ)7', 'X', 'X', # implies 'KST' -> 'XT'
'KT\'S$', 'X', 'X',
'KTI(AIOU)-3', 'XI', 'XI',
'KT(SßXZ)', 'X', 'X',
'KY9^', 'KÜ', None,
'K\'S$', 'X', 'X',
'K´S$', 'X', 'X',
'LANGES$', ' LANGES', ' LANKEZ',
'LANGE$', ' LANGE', ' LANKE',
'LANG$', ' LANK', ' LANK',
'LARVE-', 'LARF', 'LARF',
'LD(SßZ)$', 'LS', 'LZ',
'LD\'S$', 'LS', 'LZ',
'LD´S$', 'LS', 'LZ',
'LEAND-^', 'LEAN', 'LEAN',
'LEERSTEHE-----^', 'LER ', 'LER ',
'LEICHBLEIB-----', 'LEICH ', 'LEIK ',
'LEICHLAUTE-----', 'LEICH ', 'LEIK ',
'LEIDERREGE------', 'LEIT ', 'LEIT ',
'LEIDGEPR----^', 'LEIT ', 'LEIT ',
'LEINSTEHE-----', 'LEIN ', 'LEIN ',
'LEL-', 'LE', 'LE',
'LE(MNRST)-3^', 'LE', 'LE',
'LETTE$', 'LET', 'LET',
'LFGNAG-', 'LFGAN', 'LFKAN',
'LICHERWEIS----', 'LICHA ', 'LIKA ',
'LIC$', 'LIZ', 'LIZ',
'LIVE^$', 'LEIF', 'LEIF',
'LT(SßZ)$', 'LS', 'LZ',
'LT\'S$', 'LS', 'LZ',
'LT´S$', 'LS', 'LZ',
'LUI(GS)--', 'LU', 'LU',
'LV(AIO)-', 'LW', None,
'LY9^', 'LÜ', None,
'LSTS$', 'LS', 'LZ',
'LZ(BDFGKLMNPQRSTVWX)-', 'LS', None,
'L(SßZ)$', 'LS', None,
'MAIR-<', 'MEI', 'NEI',
'MANAG-', 'MENE', 'NENE',
'MANUEL', 'MANUEL', None,
'MASSEU(RS)-', 'MASÖ', 'NAZÖ',
'MATCH', 'MESH', 'NEZ',
'MAURICE', 'MORIS', 'NURIZ',
'MBH^$', 'MBH', 'MBH',
'MB(ßZ)$', 'MS', None,
'MB(SßTZ)-', 'M', 'N',
'MCG9^', 'MAK', 'NAK',
'MC9^', 'MAK', 'NAK',
'MEMOIR-^', 'MEMOA', 'NENUA',
'MERHAVEN$', 'MAHAFN', None,
'ME(LMNRST)-3^', 'ME', 'NE',
'MEN(STZ)--3', 'ME', None,
'MEN$', 'MEN', None,
'MIGUEL-', 'MIGE', 'NIKE',
'MIKE^$', 'MEIK', 'NEIK',
'MITHILFE----^$', 'MIT H', 'NIT ',
'MN$', 'M', None,
'MN', 'N', 'N',
'MPJUTE-', 'MPUT', 'NBUT',
'MP(ßZ)$', 'MS', None,
'MP(SßTZ)-', 'M', 'N',
'MP(BDJLMNPQVW)-', 'MB', 'NB',
'MY9^', 'MÜ', None,
'M(ßZ)$', 'MS', None,
'M´G7^', 'MAK', 'NAK',
'M\'G7^', 'MAK', 'NAK',
'M´^', 'MAK', 'NAK',
'M\'^', 'MAK', 'NAK',
'M', None, 'N',
'NACH^^', 'NACH', 'NAK',
'NADINE', 'NADIN', 'NATIN',
'NAIV--', 'NA', 'NA',
'NAISE$', 'NESE', 'NEZE',
'NAUGENOMM------', 'NAU ', 'NAU ',
'NAUSOGUT$', 'NAUSO GUT', 'NAUZU KUT',
'NCH$', 'NSH', 'NZ',
'NCOISE$', 'SOA', 'ZUA',
'NCOIS$', 'SOA', 'ZUA',
'NDAR$', 'NDA', 'NTA',
'NDERINGEN------', 'NDE ', 'NTE ',
'NDRO(CDKTZ)-', 'NTRO', None,
'ND(BFGJLMNPQVW)-', 'NT', None,
'ND(SßZ)$', 'NS', 'NZ',
'ND\'S$', 'NS', 'NZ',
'ND´S$', 'NS', 'NZ',
'NEBEN^^', 'NEBN', 'NEBN',
'NENGELERN------', 'NEN ', 'NEN ',
'NENLERN(ET)---', 'NEN LE', 'NEN LE',
'NENZULERNE---', 'NEN ZU LE', 'NEN ZU LE',
'NE(LMNRST)-3^', 'NE', 'NE',
'NEN-3', 'NE', 'NE',
'NETTE$', 'NET', 'NET',
'NGU^^', 'NU', 'NU',
'NG(BDFJLMNPQRTVW)-', 'NK', 'NK',
'NH(AUO)-$', 'NI', 'NI',
'NICHTSAHNEN-----', 'NIX ', 'NIX ',
'NICHTSSAGE----', 'NIX ', 'NIX ',
'NICHTS^^', 'NIX', 'NIX',
'NICHT^^', 'NICHT', 'NIKT',
'NINE$', 'NIN', 'NIN',
'NON^^', 'NON', 'NUN',
'NOTLEIDE-----^', 'NOT ', 'NUT ',
'NOT^^', 'NOT', 'NUT',
'NTI(AIOU)-3', 'NZI', 'NZI',
'NTIEL--3', 'NZI', 'NZI',
'NT(SßZ)$', 'NS', 'NZ',
'NT\'S$', 'NS', 'NZ',
'NT´S$', 'NS', 'NZ',
'NYLON', 'NEILON', 'NEILUN',
'NY9^', 'NÜ', None,
'NSTZUNEH---', 'NST ZU ', 'NZT ZU ',
'NSZ-', 'NS', None,
'NSTS$', 'NS', 'NZ',
'NZ(BDFGKLMNPQRSTVWX)-', 'NS', None,
'N(SßZ)$', 'NS', None,
'OBERE-', 'OBER', None,
'OBER^^', 'OBA', 'UBA',
'OEU2', 'Ö', 'Ö',
'OE<2', 'Ö', 'Ö',
'OGL-', 'OK', None,
'OGNIE-', 'ONI', 'UNI',
'OGN(AEOU)-$', 'ONI', 'UNI',
'OH(AIOÖUÜY)-', 'OH', None,
'OIE$', 'Ö', 'Ö',
'OIRE$', 'OA', 'UA',
'OIR$', 'OA', 'UA',
'OIX', 'OA', 'UA',
'OI<3', 'EU', 'EU',
'OKAY^$', 'OKE', 'UKE',
'OLYN$', 'OLIN', 'ULIN',
'OO(DLMZ)-', 'U', None,
'OO$', 'U', None,
'OO-', '', '',
'ORGINAL-----', 'ORI', 'URI',
'OTI(AÄOÖUÜ)-', 'OZI', 'UZI',
'OUI^', 'WI', 'FI',
'OUILLE$', 'ULIE', 'ULIE',
'OU(DT)-^', 'AU', 'AU',
'OUSE$', 'AUS', 'AUZ',
'OUT-', 'AU', 'AU',
'OU', 'U', 'U',
'O(FV)$', 'AU', 'AU', # due to 'OW$' -> 'AU'
'OVER--<', 'OW', None,
'OV(AOU)-', 'OW', None,
'OW$', 'AU', 'AU',
'OWS$', 'OS', 'UZ',
'OJ(AÄEIOÖUÜ)--', 'O', 'U',
'OYER', 'OIA', None,
'OY(AÄEIOÖUÜ)--', 'O', 'U',
'O(JY)<', 'EU', 'EU',
'OZ$', 'OS', None,
'O´^', 'O', 'U',
'O\'^', 'O', 'U',
'O', None, 'U',
'PATIEN--^', 'PAZI', 'PAZI',
'PENSIO-^', 'PANSI', 'PANZI',
'PE(LMNRST)-3^', 'PE', 'PE',
'PFER-^', 'FE', 'FE',
'P(FH)<', 'F', 'F',
'PIC^$', 'PIK', 'PIK',
'PIC$', 'PIZ', 'PIZ',
'PIPELINE', 'PEIBLEIN', 'PEIBLEIN',
'POLYP-', 'POLÜ', None,
'POLY^^', 'POLI', 'PULI',
'PORTRAIT7', 'PORTRE', 'PURTRE',
'POWER7', 'PAUA', 'PAUA',
'PP(FH)--<', 'B', 'B',
'PP-', '', '',
'PRODUZ-^', 'PRODU', 'BRUTU',
'PRODUZI--', ' PRODU', ' BRUTU',
'PRIX^$', 'PRI', 'PRI',
'PS-^^', 'P', None,
'P(SßZ)^', None, 'Z',
'P(SßZ)$', 'BS', None,
'PT-^', '', '',
'PTI(AÄOÖUÜ)-3', 'BZI', 'BZI',
'PY9^', 'PÜ', None,
'P(AÄEIOÖRUÜY)-', 'P', 'P',
'P(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'P', None,
'P.^', None, 'P.',
'P^', 'P', None,
'P', 'B', 'B',
'QI-', 'Z', 'Z',
'QUARANT--', 'KARA', 'KARA',
'QUE(LMNRST)-3', 'KWE', 'KFE',
'QUE$', 'K', 'K',
'QUI(NS)$', 'KI', 'KI',
'QUIZ7', 'KWIS', None,
'Q(UV)7', 'KW', 'KF',
'Q<', 'K', 'K',
'RADFAHR----', 'RAT ', 'RAT ',
'RAEFTEZEHRE-----', 'REFTE ', 'REFTE ',
'RCH', 'RCH', 'RK',
'REA(DU)---3^', 'R', None,
'REBSERZEUG------', 'REBS ', 'REBZ ',
'RECHERCH^', 'RESHASH', 'REZAZ',
'RECYCL--', 'RIZEI', 'RIZEI',
'RE(ALST)-3^', 'RE', None,
'REE$', 'RI', 'RI',
'RER$', 'RA', 'RA',
'RE(MNR)-4', 'RE', 'RE',
'RETTE$', 'RET', 'RET',
'REUZ$', 'REUZ', None,
'REW$', 'RU', 'RU',
'RH<^', 'R', 'R',
'RJA(MN)--', 'RI', 'RI',
'ROWD-^', 'RAU', 'RAU',
'RTEMONNAIE-', 'RTMON', 'RTNUN',
'RTI(AÄOÖUÜ)-3', 'RZI', 'RZI',
'RTIEL--3', 'RZI', 'RZI',
'RV(AEOU)-3', 'RW', None,
'RY(KN)-$', 'RI', 'RI',
'RY9^', 'RÜ', None,
'RÄFTEZEHRE-----', 'REFTE ', 'REFTE ',
'SAISO-^', 'SES', 'ZEZ',
'SAFE^$', 'SEIF', 'ZEIF',
'SAUCE-^', 'SOS', 'ZUZ',
'SCHLAGGEBEN-----<', 'SHLAK ', 'ZLAK ',
'SCHSCH---7', '', '',
'SCHTSCH', 'SH', 'Z',
'SC(HZ)<', 'SH', 'Z',
'SC', 'SK', 'ZK',
'SELBSTST--7^^', 'SELB', 'ZELB',
'SELBST7^^', 'SELBST', 'ZELBZT',
'SERVICE7^', 'SÖRWIS', 'ZÖRFIZ',
'SERVI-^', 'SERW', None,
'SE(LMNRST)-3^', 'SE', 'ZE',
'SETTE$', 'SET', 'ZET',
'SHP-^', 'S', 'Z',
'SHST', 'SHT', 'ZT',
'SHTSH', 'SH', 'Z',
'SHT', 'ST', 'Z',
'SHY9^', 'SHÜ', None,
'SH^^', 'SH', None,
'SH3', 'SH', 'Z',
'SICHERGEGAN-----^', 'SICHA ', 'ZIKA ',
'SICHERGEHE----^', 'SICHA ', 'ZIKA ',
'SICHERGESTEL------^', 'SICHA ', 'ZIKA ',
'SICHERSTELL-----^', 'SICHA ', 'ZIKA ',
'SICHERZU(GS)--^', 'SICHA ZU ', 'ZIKA ZU ',
'SIEGLI-^', 'SIKL', 'ZIKL',
'SIGLI-^', 'SIKL', 'ZIKL',
'SIGHT', 'SEIT', 'ZEIT',
'SIGN', 'SEIN', 'ZEIN',
'SKI(NPZ)-', 'SKI', 'ZKI',
'SKI<^', 'SHI', 'ZI',
'SODASS^$', 'SO DAS', 'ZU TAZ',
'SODAß^$', 'SO DAS', 'ZU TAZ',
'SOGENAN--^', 'SO GEN', 'ZU KEN',
'SOUND-', 'SAUN', 'ZAUN',
'STAATS^^', 'STAZ', 'ZTAZ',
'STADT^^', 'STAT', 'ZTAT',
'STANDE$', ' STANDE', ' ZTANTE',
'START^^', 'START', 'ZTART',
'STAURANT7', 'STORAN', 'ZTURAN',
'STEAK-', 'STE', 'ZTE',
'STEPHEN-^$', 'STEW', None,
'STERN', 'STERN', None,
'STRAF^^', 'STRAF', 'ZTRAF',
'ST\'S$', 'Z', 'Z',
'ST´S$', 'Z', 'Z',
'STST--', '', '',
'STS(ACEÈÉÊHIÌÍÎOUÄÜÖ)--', 'ST', 'ZT',
'ST(SZ)', 'Z', 'Z',
'SPAREN---^', 'SPA', 'ZPA',
'SPAREND----', ' SPA', ' ZPA',
'S(PTW)-^^', 'S', None,
'SP', 'SP', None,
'STYN(AE)-$', 'STIN', 'ZTIN',
'ST', 'ST', 'ZT',
'SUITE<', 'SIUT', 'ZIUT',
'SUKE--$', 'S', 'Z',
'SURF(EI)-', 'SÖRF', 'ZÖRF',
'SV(AEÈÉÊIÌÍÎOU)-<^', 'SW', None,
'SYB(IY)--^', 'SIB', None,
'SYL(KVW)--^', 'SI', None,
'SY9^', 'SÜ', None,
'SZE(NPT)-^', 'ZE', 'ZE',
'SZI(ELN)-^', 'ZI', 'ZI',
'SZCZ<', 'SH', 'Z',
'SZT<', 'ST', 'ZT',
'SZ<3', 'SH', 'Z',
'SÜL(KVW)--^', 'SI', None,
'S', None, 'Z',
'TCH', 'SH', 'Z',
'TD(AÄEIOÖRUÜY)-', 'T', None,
'TD(ÀÁÂÃÅÈÉÊËÌÍÎÏÒÓÔÕØÙÚÛÝŸ)-', 'T', None,
'TEAT-^', 'TEA', 'TEA',
'TERRAI7^', 'TERA', 'TERA',
'TE(LMNRST)-3^', 'TE', 'TE',
'TH<', 'T', 'T',
'TICHT-', 'TIK', 'TIK',
'TICH$', 'TIK', 'TIK',
'TIC$', 'TIZ', 'TIZ',
'TIGGESTELL-------', 'TIK ', 'TIK ',
'TIGSTELL-----', 'TIK ', 'TIK ',
'TOAS-^', 'TO', 'TU',
'TOILET-', 'TOLE', 'TULE',
'TOIN-', 'TOA', 'TUA',
'TRAECHTI-^', 'TRECHT', 'TREKT',
'TRAECHTIG--', ' TRECHT', ' TREKT',
'TRAINI-', 'TREN', 'TREN',
'TRÄCHTI-^', 'TRECHT', 'TREKT',
'TRÄCHTIG--', ' TRECHT', ' TREKT',
'TSCH', 'SH', 'Z',
'TSH', 'SH', 'Z',
'TST', 'ZT', 'ZT',
'T(Sß)', 'Z', 'Z',
'TT(SZ)--<', '', '',
'TT9', 'T', 'T',
'TV^$', 'TV', 'TV',
'TX(AEIOU)-3', 'SH', 'Z',
'TY9^', 'TÜ', None,
'TZ-', '', '',
'T\'S3$', 'Z', 'Z',
'T´S3$', 'Z', 'Z',
'UEBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
'UEBER^^', 'ÜBA', 'IBA',
'UE2', 'Ü', 'I',
'UGL-', 'UK', None,
'UH(AOÖUÜY)-', 'UH', None,
'UIE$', 'Ü', 'I',
'UM^^', 'UM', 'UN',
'UNTERE--3', 'UNTE', 'UNTE',
'UNTER^^', 'UNTA', 'UNTA',
'UNVER^^', 'UNFA', 'UNFA',
'UN^^', 'UN', 'UN',
'UTI(AÄOÖUÜ)-', 'UZI', 'UZI',
'UVE-4', 'UW', None,
'UY2', 'UI', None,
'UZZ', 'AS', 'AZ',
'VACL-^', 'WAZ', 'FAZ',
'VAC$', 'WAZ', 'FAZ',
'VAN DEN ^', 'FANDN', 'FANTN',
'VANES-^', 'WANE', None,
'VATRO-', 'WATR', None,
'VA(DHJNT)--^', 'F', None,
'VEDD-^', 'FE', 'FE',
'VE(BEHIU)--^', 'F', None,
'VEL(BDLMNT)-^', 'FEL', None,
'VENTZ-^', 'FEN', None,
'VEN(NRSZ)-^', 'FEN', None,
'VER(AB)-^$', 'WER', None,
'VERBAL^$', 'WERBAL', None,
'VERBAL(EINS)-^', 'WERBAL', None,
'VERTEBR--', 'WERTE', None,
'VEREIN-----', 'F', None,
'VEREN(AEIOU)-^', 'WEREN', None,
'VERIFI', 'WERIFI', None,
'VERON(AEIOU)-^', 'WERON', None,
'VERSEN^', 'FERSN', 'FAZN',
'VERSIERT--^', 'WERSI', None,
'VERSIO--^', 'WERS', None,
'VERSUS', 'WERSUS', None,
'VERTI(GK)-', 'WERTI', None,
'VER^^', 'FER', 'FA',
'VERSPRECHE-------', ' FER', ' FA',
'VER$', 'WA', None,
'VER', 'FA', 'FA',
'VET(HT)-^', 'FET', 'FET',
'VETTE$', 'WET', 'FET',
'VE^', 'WE', None,
'VIC$', 'WIZ', 'FIZ',
'VIELSAGE----', 'FIL ', 'FIL ',
'VIEL', 'FIL', 'FIL',
'VIEW', 'WIU', 'FIU',
'VILL(AE)-', 'WIL', None,
'VIS(ACEIKUVWZ)-<^', 'WIS', None,
'VI(ELS)--^', 'F', None,
'VILLON--', 'WILI', 'FILI',
'VIZE^^', 'FIZE', 'FIZE',
'VLIE--^', 'FL', None,
'VL(AEIOU)--', 'W', None,
'VOKA-^', 'WOK', None,
'VOL(ATUVW)--^', 'WO', None,
'VOR^^', 'FOR', 'FUR',
'VR(AEIOU)--', 'W', None,
'VV9', 'W', None,
'VY9^', 'WÜ', 'FI',
'V(ÜY)-', 'W', None,
'V(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'W', None,
'V(AEIJLRU)-<', 'W', None,
'V.^', 'V.', None,
'V<', 'F', 'F',
'WEITERENTWI-----^', 'WEITA ', 'FEITA ',
'WEITREICH-----^', 'WEIT ', 'FEIT ',
'WEITVER^', 'WEIT FER', 'FEIT FA',
'WE(LMNRST)-3^', 'WE', 'FE',
'WER(DST)-', 'WER', None,
'WIC$', 'WIZ', 'FIZ',
'WIEDERU--', 'WIDE', 'FITE',
'WIEDER^$', 'WIDA', 'FITA',
'WIEDER^^', 'WIDA ', 'FITA ',
'WIEVIEL', 'WI FIL', 'FI FIL',
'WISUEL', 'WISUEL', None,
'WR-^', 'W', None,
'WY9^', 'WÜ', 'FI',
'W(BDFGJKLMNPQRSTZ)-', 'F', None,
'W$', 'F', None,
'W', None, 'F',
'X<^', 'Z', 'Z',
'XHAVEN$', 'XAFN', None,
'X(CSZ)', 'X', 'X',
'XTS(CH)--', 'XT', 'XT',
'XT(SZ)', 'Z', 'Z',
'YE(LMNRST)-3^', 'IE', 'IE',
'YE-3', 'I', 'I',
'YOR(GK)^$', 'IÖRK', 'IÖRK',
'Y(AOU)-<7', 'I', 'I',
'Y(BKLMNPRSTX)-1', 'Ü', None,
'YVES^$', 'IF', 'IF',
'YVONNE^$', 'IWON', 'IFUN',
'Y.^', 'Y.', None,
'Y', 'I', 'I',
'ZC(AOU)-', 'SK', 'ZK',
'ZE(LMNRST)-3^', 'ZE', 'ZE',
'ZIEJ$', 'ZI', 'ZI',
'ZIGERJA(HR)-3', 'ZIGA IA', 'ZIKA IA',
'ZL(AEIOU)-', 'SL', None,
'ZS(CHT)--', '', '',
'ZS', 'SH', 'Z',
'ZUERST', 'ZUERST', 'ZUERST',
'ZUGRUNDE^$', 'ZU GRUNDE', 'ZU KRUNTE',
'ZUGRUNDE', 'ZU GRUNDE ', 'ZU KRUNTE ',
'ZUGUNSTEN', 'ZU GUNSTN', 'ZU KUNZTN',
'ZUHAUSE-', 'ZU HAUS', 'ZU AUZ',
'ZULASTEN^$', 'ZU LASTN', 'ZU LAZTN',
'ZURUECK^^', 'ZURÜK', 'ZURIK',
'ZURZEIT', 'ZUR ZEIT', 'ZUR ZEIT',
'ZURÜCK^^', 'ZURÜK', 'ZURIK',
'ZUSTANDE', 'ZU STANDE', 'ZU ZTANTE',
'ZUTAGE', 'ZU TAGE', 'ZU TAKE',
'ZUVER^^', 'ZUFA', 'ZUFA',
'ZUVIEL', 'ZU FIL', 'ZU FIL',
'ZUWENIG', 'ZU WENIK', 'ZU FENIK',
'ZY9^', 'ZÜ', None,
'ZYK3$', 'ZIK', None,
'Z(VW)7^', 'SW', None,
None, None, None)
# Lookup tables shared by the nested helpers below.  They are populated by
# _initialize_phonet() and consulted by _phonet().  Counter is used here as
# a sparse mapping whose missing keys read as 0, not for counting.
phonet_hash = Counter()    # first char of a rule pattern -> first rule index
alpha_pos = Counter()      # letter -> position (umlauts etc. 1, 'A'-'Z' 2..27)
phonet_hash_1 = Counter()  # (letter pos, following-letter pos) -> first rule
phonet_hash_2 = Counter()  # (letter pos, following-letter pos) -> last rule
# str.translate() table mapping lower-case letters (including accented
# characters) to their upper-case equivalents.
_phonet_upper_translation = dict(zip((ord(_) for _ in
                                      'abcdefghijklmnopqrstuvwxyzàáâãåäæ' +
                                      'çðèéêëìíîïñòóôõöøœšßþùúûüýÿ'),
                                     'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÅÄÆ' +
                                     'ÇÐÈÉÊËÌÍÎÏÑÒÓÔÕÖØŒŠßÞÙÚÛÜÝŸ'))
def _initialize_phonet(lang):
    """Initialize phonet variables.

    Seeds the module-level hash tables (phonet_hash, alpha_pos,
    phonet_hash_1, phonet_hash_2) from the selected rule set so that
    _phonet() can jump directly to the candidate rules for a given
    (current letter, next letter) pair.
    """
    if lang == 'none':
        _phonet_rules = _phonet_rules_no_lang
    else:
        _phonet_rules = _phonet_rules_german

    phonet_hash[''] = -1

    # German and international umlauts
    for j in {'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë',
              'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø',
              'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß', 'Œ', 'Š', 'Ÿ'}:
        alpha_pos[j] = 1
        phonet_hash[j] = -1

    # "normal" letters ('A'-'Z')
    for i, j in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
        alpha_pos[j] = i + 2
        phonet_hash[j] = -1

    # -1 marks "no rule yet" for every (letter, follower) cell
    for i in range(26):
        for j in range(28):
            phonet_hash_1[i, j] = -1
            phonet_hash_2[i, j] = -1

    # for each phonetic rule (rules are stored as flat triples:
    # pattern, primary replacement, secondary replacement)
    for i in range(len(_phonet_rules)):
        rule = _phonet_rules[i]

        if rule and i % 3 == 0:
            # calculate first hash value
            k = _phonet_rules[i][0]

            if phonet_hash[k] < 0 and (_phonet_rules[i+1] or
                                       _phonet_rules[i+2]):
                phonet_hash[k] = i

            # calculate second hash values
            if k and alpha_pos[k] >= 2:
                k = alpha_pos[k]

                j = k-2
                rule = rule[1:]

                if not rule:
                    rule = ' '
                elif rule[0] == '(':
                    rule = rule[1:]
                else:
                    rule = rule[0]

                # walk the letters that may follow the first letter of
                # the pattern (either a single letter or a '(...)' set)
                while rule and (rule[0] != ')'):
                    k = alpha_pos[rule[0]]

                    if k > 0:
                        # add hash value for this letter
                        if phonet_hash_1[j, k] < 0:
                            phonet_hash_1[j, k] = i
                            phonet_hash_2[j, k] = i

                        if phonet_hash_2[j, k] >= (i-30):
                            phonet_hash_2[j, k] = i
                        else:
                            k = -1
                    else:
                        k = -1

                    if k <= 0:
                        # add hash value for all letters
                        if phonet_hash_1[j, 0] < 0:
                            phonet_hash_1[j, 0] = i

                        phonet_hash_2[j, 0] = i

                    rule = rule[1:]
def _phonet(term, mode, lang):
    """Return the phonet coded form of a term.

    ``mode`` selects which replacement column of each rule triple is
    applied (1 for the primary coding, 2 for the secondary coding), via
    the ``_phonet_rules[pos + mode]`` lookups below.  ``lang`` selects
    the rule set ('none' for the language-independent rules, anything
    else for the German rules).  Requires _initialize_phonet() to have
    populated the hash tables first.
    """
    if lang == 'none':
        _phonet_rules = _phonet_rules_no_lang
    else:
        _phonet_rules = _phonet_rules_german

    char0 = ''
    dest = term

    if not term:
        return ''

    term_length = len(term)

    # convert input string to upper-case
    src = term.translate(_phonet_upper_translation)

    # check "src"
    # i: read position in src; j: write length of dest;
    # zeta: counts consecutive applications of '<' (in-place) rules
    i = 0
    j = 0
    zeta = 0

    while i < len(src):
        char = src[i]
        pos = alpha_pos[char]

        if pos >= 2:
            xpos = pos-2

            # hash on the pair (current letter, following letter)
            if i+1 == len(src):
                pos = alpha_pos['']
            else:
                pos = alpha_pos[src[i+1]]

            start1 = phonet_hash_1[xpos, pos]
            start2 = phonet_hash_1[xpos, 0]
            end1 = phonet_hash_2[xpos, pos]
            end2 = phonet_hash_2[xpos, 0]

            # preserve rule priorities
            if (start2 >= 0) and ((start1 < 0) or (start2 < start1)):
                pos = start1
                start1 = start2
                start2 = pos
                pos = end1
                end1 = end2
                end2 = pos

            if (end1 >= start2) and (start2 >= 0):
                if end2 > end1:
                    end1 = end2

                start2 = -1
                end2 = -1
        else:
            # non A-Z character: fall back to the single-letter hash
            pos = phonet_hash[char]
            start1 = pos
            end1 = 10000
            start2 = -1
            end2 = -1

        pos = start1
        zeta0 = 0

        if pos >= 0:
            # check rules for this char
            while ((_phonet_rules[pos] is None) or
                   (_phonet_rules[pos][0] == char)):
                if pos > end1:
                    if start2 > 0:
                        # exhausted the pair-specific range; switch to
                        # the wildcard-follower range
                        pos = start2
                        start1 = start2
                        start2 = -1
                        end1 = end2
                        end2 = -1
                        continue

                    break

                if (((_phonet_rules[pos] is None) or
                     (_phonet_rules[pos + mode] is None))):
                    # no conversion rule available
                    pos += 3
                    continue

                # check whole string
                matches = 1  # number of matching letters
                priority = 5  # default priority
                rule = _phonet_rules[pos]
                rule = rule[1:]

                # consume literal letters of the pattern
                while (rule and
                       (len(src) > (i + matches)) and
                       (src[i + matches] == rule[0]) and
                       not rule[0].isdigit() and
                       (rule not in '(-<^$')):
                    matches += 1
                    rule = rule[1:]

                if rule and (rule[0] == '('):
                    # check an array of letters
                    if (((len(src) > (i + matches)) and
                         src[i + matches].isalpha() and
                         (src[i + matches] in rule[1:]))):
                        matches += 1

                    while rule and rule[0] != ')':
                        rule = rule[1:]

                    # if rule[0] == ')':
                    rule = rule[1:]

                if rule:
                    priority0 = ord(rule[0])
                else:
                    priority0 = 0

                matches0 = matches

                # '-' marks trailing context: matched but not consumed
                while rule and rule[0] == '-' and matches > 1:
                    matches -= 1
                    rule = rule[1:]

                if rule and rule[0] == '<':
                    rule = rule[1:]

                if rule and rule[0].isdigit():
                    # read priority
                    priority = int(rule[0])
                    rule = rule[1:]

                if rule and rule[0:2] == '^^':
                    rule = rule[1:]

                # anchors: '^' = word start, '$' = word end
                if (not rule or
                        ((rule[0] == '^') and
                         ((i == 0) or not src[i-1].isalpha()) and
                         ((rule[1:2] != '$') or
                          (not (src[i+matches0:i+matches0+1].isalpha()) and
                           (src[i+matches0:i+matches0+1] != '.')))) or
                        ((rule[0] == '$') and (i > 0) and
                         src[i-1].isalpha() and
                         ((not src[i+matches0:i+matches0+1].isalpha()) and
                          (src[i+matches0:i+matches0+1] != '.')))):
                    # look for continuation, if:
                    # matches > 1 und NO '-' in first string */
                    pos0 = -1

                    start3 = 0
                    start4 = 0
                    end3 = 0
                    end4 = 0

                    if (((matches > 1) and
                         src[i+matches:i+matches+1] and
                         (priority0 != ord('-')))):
                        char0 = src[i+matches-1]
                        pos0 = alpha_pos[char0]

                        if pos0 >= 2 and src[i+matches]:
                            xpos = pos0 - 2
                            pos0 = alpha_pos[src[i+matches]]
                            start3 = phonet_hash_1[xpos, pos0]
                            start4 = phonet_hash_1[xpos, 0]
                            end3 = phonet_hash_2[xpos, pos0]
                            end4 = phonet_hash_2[xpos, 0]

                            # preserve rule priorities
                            if (((start4 >= 0) and
                                 ((start3 < 0) or (start4 < start3)))):
                                pos0 = start3
                                start3 = start4
                                start4 = pos0
                                pos0 = end3
                                end3 = end4
                                end4 = pos0

                            if (end3 >= start4) and (start4 >= 0):
                                if end4 > end3:
                                    end3 = end4

                                start4 = -1
                                end4 = -1
                        else:
                            pos0 = phonet_hash[char0]
                            start3 = pos0
                            end3 = 10000
                            start4 = -1
                            end4 = -1

                        pos0 = start3

                    # check continuation rules for src[i+matches]
                    if pos0 >= 0:
                        while ((_phonet_rules[pos0] is None) or
                               (_phonet_rules[pos0][0] == char0)):
                            if pos0 > end3:
                                if start4 > 0:
                                    pos0 = start4
                                    start3 = start4
                                    start4 = -1
                                    end3 = end4
                                    end4 = -1
                                    continue

                                priority0 = -1

                                # important
                                break

                            if (((_phonet_rules[pos0] is None) or
                                 (_phonet_rules[pos0 + mode]
                                  is None))):
                                # no conversion rule available
                                pos0 += 3
                                continue

                            # check whole string
                            matches0 = matches
                            priority0 = 5
                            rule = _phonet_rules[pos0]
                            rule = rule[1:]

                            # NOTE(review): this condition uses
                            # 'not ... or (rule in ...)' where the first
                            # pass above uses 'not ... and (rule not in
                            # ...)' — looks asymmetric; confirm against
                            # the original C phonet source
                            while (rule and
                                   (src[i+matches0:i+matches0+1] ==
                                    rule[0]) and
                                   (not rule[0].isdigit() or
                                    (rule in '(-<^$'))):
                                matches0 += 1
                                rule = rule[1:]

                            if rule and rule[0] == '(':
                                # check an array of letters
                                if ((src[i+matches0:i+matches0+1]
                                     .isalpha() and
                                     (src[i+matches0] in rule[1:]))):
                                    matches0 += 1

                                while rule and rule[0] != ')':
                                    rule = rule[1:]

                                # if rule[0] == ')':
                                rule = rule[1:]

                            while rule and rule[0] == '-':
                                # "matches0" is NOT decremented
                                # because of "if (matches0 == matches)"
                                rule = rule[1:]

                            if rule and rule[0] == '<':
                                rule = rule[1:]

                            if rule and rule[0].isdigit():
                                priority0 = int(rule[0])
                                rule = rule[1:]

                            if (not rule or
                                    # rule == '^' is not possible here
                                    ((rule[0] == '$') and not
                                     src[i+matches0:i+matches0+1]
                                     .isalpha() and
                                     (src[i+matches0:i+matches0+1]
                                      != '.'))):
                                if matches0 == matches:
                                    # this is only a partial string
                                    pos0 += 3
                                    continue

                                if priority0 < priority:
                                    # priority is too low
                                    pos0 += 3
                                    continue

                                # continuation rule found
                                break

                            pos0 += 3

                        # end of "while"
                        if ((priority0 >= priority) and
                                ((_phonet_rules[pos0] is not None) and
                                 (_phonet_rules[pos0][0] == char0))):
                            # a longer/stronger continuation rule wins:
                            # skip the current rule
                            pos += 3
                            continue

                    # replace string
                    if ((_phonet_rules[pos] and
                         ('<' in _phonet_rules[pos][1:]))):
                        priority0 = 1
                    else:
                        priority0 = 0

                    rule = _phonet_rules[pos + mode]

                    if (priority0 == 1) and (zeta == 0):
                        # rule with '<' is applied: rewrite src in place
                        # and re-scan from the same position
                        if ((j > 0) and rule and
                                ((dest[j-1] == char) or
                                 (dest[j-1] == rule[0]))):
                            j -= 1

                        zeta0 = 1
                        zeta += 1
                        matches0 = 0

                        while rule and src[i+matches0]:
                            src = (src[0:i+matches0] + rule[0] +
                                   src[i+matches0+1:])
                            matches0 += 1
                            rule = rule[1:]

                        if matches0 < matches:
                            src = (src[0:i+matches0] +
                                   src[i+matches:])

                        char = src[i]
                    else:
                        # ordinary rule: emit the replacement into dest
                        i = i + matches - 1
                        zeta = 0

                        while len(rule) > 1:
                            if (j == 0) or (dest[j - 1] != rule[0]):
                                dest = (dest[0:j] + rule[0] +
                                        dest[min(len(dest), j+1):])
                                j += 1

                            rule = rule[1:]

                        # new "current char"
                        if not rule:
                            rule = ''
                            char = ''
                        else:
                            char = rule[0]

                        if ((_phonet_rules[pos] and
                             '^^' in _phonet_rules[pos][1:])):
                            # '^^' rule: flush and restart scanning at
                            # the remainder of the word
                            if char:
                                dest = (dest[0:j] + char +
                                        dest[min(len(dest), j + 1):])
                                j += 1

                            src = src[i + 1:]
                            i = 0
                            zeta0 = 1

                    break

                pos += 3

                if pos > end1 and start2 > 0:
                    pos = start2
                    start1 = start2
                    end1 = end2
                    start2 = -1
                    end2 = -1

        if zeta0 == 0:
            if char and ((j == 0) or (dest[j-1] != char)):
                # delete multiple letters only
                dest = dest[0:j] + char + dest[min(j+1, term_length):]
                j += 1

            i += 1
            zeta = 0

    dest = dest[0:j]

    return dest
# Build the rule lookup tables for the requested language, then code the
# NFKC-normalized input; `mode` selects the replacement column (1 or 2)
# of each rule triple.
_initialize_phonet(lang)

word = unicode_normalize('NFKC', text_type(word))
return _phonet(word, mode, lang)
def spfc(word):
    """Return the Standardized Phonetic Frequency Code (SPFC) of a word.

    Standardized Phonetic Frequency Code is roughly Soundex-like.
    This implementation is based on page 19-21 of :cite:`Moore:1977`.

    :param str word: the word to transform; either a string with a space
        or period dividing the first and last names, or a tuple/list of
        (first name, last name)
    :returns: the SPFC value
    :rtype: str

    >>> spfc('Christopher Smith')
    '01160'
    >>> spfc('Christopher Schmidt')
    '01160'
    >>> spfc('Niall Smith')
    '01660'
    >>> spfc('Niall Schmidt')
    '01660'
    >>> spfc('L.Smith')
    '01960'
    >>> spfc('R.Miller')
    '65490'
    >>> spfc(('L', 'Smith'))
    '01960'
    >>> spfc(('R', 'Miller'))
    '65490'
    """
    # Table PF1: codes the first letter of the last name (step 4)
    _pf1 = dict(zip((ord(_) for _ in 'SZCKQVFPUWABLORDHIEMNXGJT'),
                    '0011112222334445556666777'))
    # Table PF2: codes the first letter of the first name and the
    # remaining letters of the last name (steps 6-8)
    _pf2 = dict(zip((ord(_) for _ in
                     'SZCKQFPXABORDHIMNGJTUVWEL'),
                    '0011122233445556677788899'))
    # Table PF3: codes the final letter(s) of the last name (step 5)
    _pf3 = dict(zip((ord(_) for _ in
                     'BCKQVDTFLPGJXMNRSZAEHIOUWY'),
                    '00000112223334456677777777'))

    # digraph simplifications applied in step 1
    _substitutions = (('DK', 'K'), ('DT', 'T'), ('SC', 'S'), ('KN', 'N'),
                      ('MN', 'N'))

    def _raise_word_ex():
        """Raise an AttributeError."""
        raise AttributeError('word attribute must be a string with a space ' +
                             'or period dividing the first and last names ' +
                             'or a tuple/list consisting of the first and ' +
                             'last names')

    if not word:
        return ''

    # split the input into names[0] (first name) and names[1] (last name)
    names = []
    if isinstance(word, (str, text_type)):
        names = word.split('.', 1)
        if len(names) != 2:
            names = word.split(' ', 1)
            if len(names) != 2:
                _raise_word_ex()
    elif hasattr(word, '__iter__'):
        if len(word) != 2:
            _raise_word_ex()
        names = word
    else:
        _raise_word_ex()

    names = [unicode_normalize('NFKD', text_type(_.strip()
                                                 .replace('ß', 'SS')
                                                 .upper()))
             for _ in names]
    code = ''

    def steps_one_to_three(name):
        """Perform the first three steps of SPFC."""
        # filter out non A-Z
        name = ''.join(_ for _ in name if _ in
                       {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
                        'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
                        'W', 'X', 'Y', 'Z'})

        # 1. In the field, convert DK to K, DT to T, SC to S, KN to N,
        # and MN to N
        for subst in _substitutions:
            name = name.replace(subst[0], subst[1])

        # 2. In the name field, replace multiple letters with a single letter
        name = _delete_consecutive_repeats(name)

        # 3. Remove vowels, W, H, and Y, but keep the first letter in the name
        # field.
        if name:
            name = name[0] + ''.join(_ for _ in name[1:] if _ not in
                                     {'A', 'E', 'H', 'I', 'O', 'U', 'W', 'Y'})
        return name

    names = [steps_one_to_three(_) for _ in names]

    # 4. The first digit of the code is obtained using PF1 and the first letter
    # of the name field. Remove this letter after coding.
    if names[1]:
        code += names[1][0].translate(_pf1)
        names[1] = names[1][1:]

    # 5. Using the last letters of the name, use Table PF3 to obtain the
    # second digit of the code. Use as many letters as possible and remove
    # after coding.
    if names[1]:
        if names[1][-3:] == 'STN' or names[1][-3:] == 'PRS':
            code += '8'
            names[1] = names[1][:-3]
        elif names[1][-2:] == 'SN':
            code += '8'
            names[1] = names[1][:-2]
        elif names[1][-3:] == 'STR':
            code += '9'
            names[1] = names[1][:-3]
        elif names[1][-2:] in {'SR', 'TN', 'TD'}:
            code += '9'
            names[1] = names[1][:-2]
        elif names[1][-3:] == 'DRS':
            code += '7'
            names[1] = names[1][:-3]
        elif names[1][-2:] in {'TR', 'MN'}:
            code += '7'
            names[1] = names[1][:-2]
        else:
            code += names[1][-1].translate(_pf3)
            names[1] = names[1][:-1]

    # 6. The third digit is found using Table PF2 and the first character of
    # the first name. Remove after coding.
    if names[0]:
        code += names[0][0].translate(_pf2)
        names[0] = names[0][1:]

    # 7. The fourth digit is found using Table PF2 and the first character of
    # the name field. If no letters remain use zero. After coding remove the
    # letter.
    # 8. The fifth digit is found in the same manner as the fourth using the
    # remaining characters of the name field if any.
    for _ in range(2):
        if names[1]:
            code += names[1][0].translate(_pf2)
            names[1] = names[1][1:]
        else:
            code += '0'

    return code
def statistics_canada(word, max_length=4):
    """Return the Statistics Canada code for a word.

    The original description of this algorithm could not be located, and
    may only have been specified in an unpublished TR. The coding does not
    appear to be in use by Statistics Canada any longer. In its place, this is
    an implementation of the "Census modified Statistics Canada name coding
    procedure".

    The modified version of this algorithm is described in Appendix B of
    :cite:`Moore:1977`.

    :param str word: the word to transform
    :param int max_length: the maximum length (default 4) of the code to return
    :returns: the Statistics Canada name code value
    :rtype: str

    >>> statistics_canada('Christopher')
    'CHRS'
    >>> statistics_canada('Niall')
    'NL'
    >>> statistics_canada('Smith')
    'SMTH'
    >>> statistics_canada('Schmidt')
    'SCHM'
    """
    # Uppercase, normalize, decompose, and keep only A-Z characters.
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(ch for ch in word if 'A' <= ch <= 'Z')

    if not word:
        return ''

    # Strip vowels (and Y) from everything after the first letter.
    tail = word[1:]
    for vowel in 'AEIOUY':
        tail = tail.replace(vowel, '')

    # Re-attach the first letter, collapse repeats, and truncate.
    code = _delete_consecutive_repeats(word[0] + tail)
    code = code.replace(' ', '')
    return code[:max_length]
def lein(word, max_length=4, zero_pad=True):
    """Return the Lein code for a word.

    This is Lein name coding, described in :cite:`Moore:1977`.

    :param str word: the word to transform
    :param int max_length: the maximum length (default 4) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        max_length string
    :returns: the Lein code
    :rtype: str

    >>> lein('Christopher')
    'C351'
    >>> lein('Niall')
    'N300'
    >>> lein('Smith')
    'S210'
    >>> lein('Schmidt')
    'S521'
    """
    # Digit assignments for the consonants retained after Rule 2.
    _digit_map = dict(zip((ord(ch) for ch in 'BCDFGJKLMNPQRSTVXZ'),
                          '451455532245351455'))

    # Uppercase, normalize, decompose, and keep only A-Z characters.
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(ch for ch in word if 'A' <= ch <= 'Z')

    # Rule 1: keep the first letter verbatim.
    code = word[:1]

    # Rule 2: delete vowels plus H, W, Y (and spaces) from the remainder.
    rest = word[1:].translate({32: None, 65: None, 69: None, 72: None,
                               73: None, 79: None, 85: None, 87: None,
                               89: None})

    # Rule 3: collapse runs of repeated letters.
    rest = _delete_consecutive_repeats(rest)

    # Rule 4: translate the survivors to digits; zero-pad if requested.
    code += rest.translate(_digit_map)
    if zero_pad:
        code += '0' * max_length

    return code[:max_length]
def roger_root(word, max_length=5, zero_pad=True):
    """Return the Roger Root code for a word.

    This is Roger Root name coding, described in :cite:`Moore:1977`.

    :param str word: the word to transform
    :param int max_length: the maximum length (default 5) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        max_length string
    :returns: the Roger Root code
    :rtype: str

    >>> roger_root('Christopher')
    '06401'
    >>> roger_root('Niall')
    '02500'
    >>> roger_root('Smith')
    '00310'
    >>> roger_root('Schmidt')
    '06310'
    """
    # Uppercase, normalize, decompose, and keep only A-Z characters.
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(ch for ch in word if 'A' <= ch <= 'Z')

    # '*' is used to prevent combining by _delete_consecutive_repeats()
    _initial = {4: {'TSCH': '06'},
                3: {'TSH': '06', 'SCH': '06'},
                2: {'CE': '0*0', 'CH': '06', 'CI': '0*0', 'CY': '0*0',
                    'DG': '07', 'GF': '08', 'GM': '03', 'GN': '02',
                    'KN': '02', 'PF': '08', 'PH': '08', 'PN': '02',
                    'SH': '06', 'TS': '0*0', 'WR': '04'},
                1: {'A': '1', 'B': '09', 'C': '07', 'D': '01', 'E': '1',
                    'F': '08', 'G': '07', 'H': '2', 'I': '1', 'J': '3',
                    'K': '07', 'L': '05', 'M': '03', 'N': '02', 'O': '1',
                    'P': '09', 'Q': '07', 'R': '04', 'S': '0*0',
                    'T': '01', 'U': '1', 'V': '08', 'W': '4', 'X': '07',
                    'Y': '5', 'Z': '0*0'}}

    _medial = {4: {'TSCH': '6'},
               3: {'TSH': '6', 'SCH': '6'},
               2: {'CE': '0', 'CH': '6', 'CI': '0', 'CY': '0', 'DG': '7',
                   'PH': '8', 'SH': '6', 'TS': '0'},
               1: {'B': '9', 'C': '7', 'D': '1', 'F': '8', 'G': '7',
                   'J': '6', 'K': '7', 'L': '5', 'M': '3', 'N': '2',
                   'P': '9', 'Q': '7', 'R': '4', 'S': '0', 'T': '1',
                   'V': '8', 'X': '7', 'Z': '0',
                   'A': '*', 'E': '*', 'H': '*', 'I': '*', 'O': '*',
                   'U': '*', 'W': '*', 'Y': '*'}}

    code = ''
    idx = 0

    # Code the leading sound with the word-initial table, longest match first.
    for size in (4, 3, 2, 1):
        if word[:size] in _initial[size]:
            code = _initial[size][word[:size]]
            idx = size
            break

    # Code the remainder with the medial table, longest match first.
    while idx < len(word):
        for size in (4, 3, 2, 1):  # pragma: no branch
            chunk = word[idx:idx + size]
            if chunk in _medial[size]:
                code += _medial[size][chunk]
                idx += size
                break

    # Collapse repeats, drop the '*' separators, and pad/truncate.
    code = _delete_consecutive_repeats(code).replace('*', '')

    if zero_pad:
        code += '0' * max_length
    return code[:max_length]
def onca(word, max_length=4, zero_pad=True):
    """Return the Oxford Name Compression Algorithm (ONCA) code for a word.

    This is the Oxford Name Compression Algorithm, based on :cite:`Gill:1997`.

    I can find no complete description of the "anglicised version of the NYSIIS
    method" identified as the first step in this algorithm, so this is likely
    not a precisely correct implementation, in that it employs the standard
    NYSIIS algorithm.

    :param str word: the word to transform
    :param int max_length: the maximum length (default 4) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        max_length string
    :returns: the ONCA code
    :rtype: str

    >>> onca('Christopher')
    'C623'
    >>> onca('Niall')
    'N400'
    >>> onca('Smith')
    'S530'
    >>> onca('Schmidt')
    'S530'
    """
    # In the most extreme case, 3 characters of NYSIIS input can be compressed
    # to one character of output, so give it triple the max_length.
    return soundex(nysiis(word, max_length=max_length*3), max_length,
                   zero_pad=zero_pad)
def eudex(word, max_length=8):
    """Return the eudex phonetic hash of a word.

    This implementation of eudex phonetic hashing is based on the specification
    (not the reference implementation) at :cite:`Ticki:2016`.

    Further details can be found at :cite:`Ticki:2016b`.

    :param str word: the word to transform
    :param int max_length: the length in bits of the code returned (default 8)
    :returns: the eudex hash
    :rtype: int

    >>> eudex('Colin')
    432345564238053650
    >>> eudex('Christopher')
    433648490138894409
    >>> eudex('Niall')
    648518346341351840
    >>> eudex('Smith')
    720575940412906756
    >>> eudex('Schmidt')
    720589151732307997
    """
    # phone codes for characters in non-initial position
    _trailing_phones = {
        'a': 0,  # a
        'b': 0b01001000,  # b
        'c': 0b00001100,  # c
        'd': 0b00011000,  # d
        'e': 0,  # e
        'f': 0b01000100,  # f
        'g': 0b00001000,  # g
        'h': 0b00000100,  # h
        'i': 1,  # i
        'j': 0b00000101,  # j
        'k': 0b00001001,  # k
        'l': 0b10100000,  # l
        'm': 0b00000010,  # m
        'n': 0b00010010,  # n
        'o': 0,  # o
        'p': 0b01001001,  # p
        'q': 0b10101000,  # q
        'r': 0b10100001,  # r
        's': 0b00010100,  # s
        't': 0b00011101,  # t
        'u': 1,  # u
        'v': 0b01000101,  # v
        'w': 0b00000000,  # w
        'x': 0b10000100,  # x
        'y': 1,  # y
        'z': 0b10010100,  # z

        'ß': 0b00010101,  # ß
        'à': 0,  # à
        'á': 0,  # á
        'â': 0,  # â
        'ã': 0,  # ã
        'ä': 0,  # ä[æ]
        'å': 1,  # å[oː]
        'æ': 0,  # æ[æ]
        'ç': 0b10010101,  # ç[t͡ʃ]
        'è': 1,  # è
        'é': 1,  # é
        'ê': 1,  # ê
        'ë': 1,  # ë
        'ì': 1,  # ì
        'í': 1,  # í
        'î': 1,  # î
        'ï': 1,  # ï
        'ð': 0b00010101,  # ð[ð̠](represented as a non-plosive T)
        'ñ': 0b00010111,  # ñ[nj](represented as a combination of n and j)
        'ò': 0,  # ò
        'ó': 0,  # ó
        'ô': 0,  # ô
        'õ': 0,  # õ
        'ö': 1,  # ö[ø]
        '÷': 0b11111111,  # ÷
        'ø': 1,  # ø[ø]
        'ù': 1,  # ù
        'ú': 1,  # ú
        'û': 1,  # û
        'ü': 1,  # ü
        'ý': 1,  # ý
        'þ': 0b00010101,  # þ[ð̠](represented as a non-plosive T)
        'ÿ': 1,  # ÿ
    }

    # phone codes for a word-initial character
    _initial_phones = {
        'a': 0b10000100,  # a*
        'b': 0b00100100,  # b
        'c': 0b00000110,  # c
        'd': 0b00001100,  # d
        'e': 0b11011000,  # e*
        'f': 0b00100010,  # f
        'g': 0b00000100,  # g
        'h': 0b00000010,  # h
        'i': 0b11111000,  # i*
        'j': 0b00000011,  # j
        'k': 0b00000101,  # k
        'l': 0b01010000,  # l
        'm': 0b00000001,  # m
        'n': 0b00001001,  # n
        'o': 0b10010100,  # o*
        'p': 0b00100101,  # p
        'q': 0b01010100,  # q
        'r': 0b01010001,  # r
        's': 0b00001010,  # s
        't': 0b00001110,  # t
        'u': 0b11100000,  # u*
        'v': 0b00100011,  # v
        'w': 0b00000000,  # w
        'x': 0b01000010,  # x
        'y': 0b11100100,  # y*
        'z': 0b01001010,  # z

        'ß': 0b00001011,  # ß
        'à': 0b10000101,  # à
        'á': 0b10000101,  # á
        'â': 0b10000000,  # â
        'ã': 0b10000110,  # ã
        'ä': 0b10100110,  # ä [æ]
        'å': 0b11000010,  # å [oː]
        'æ': 0b10100111,  # æ [æ]
        'ç': 0b01010100,  # ç [t͡ʃ]
        'è': 0b11011001,  # è
        'é': 0b11011001,  # é
        'ê': 0b11011001,  # ê
        'ë': 0b11000110,  # ë [ə] or [œ]
        'ì': 0b11111001,  # ì
        'í': 0b11111001,  # í
        'î': 0b11111001,  # î
        'ï': 0b11111001,  # ï
        'ð': 0b00001011,  # ð [ð̠] (represented as a non-plosive T)
        'ñ': 0b00001011,  # ñ [nj] (represented as a combination of n and j)
        'ò': 0b10010101,  # ò
        'ó': 0b10010101,  # ó
        'ô': 0b10010101,  # ô
        'õ': 0b10010101,  # õ
        'ö': 0b11011100,  # ö [œ] or [ø]
        '÷': 0b11111111,  # ÷
        'ø': 0b11011101,  # ø [œ] or [ø]
        'ù': 0b11100001,  # ù
        'ú': 0b11100001,  # ú
        'û': 0b11100001,  # û
        'ü': 0b11100101,  # ü
        'ý': 0b11100101,  # ý
        'þ': 0b00001011,  # þ [ð̠] (represented as a non-plosive T)
        'ÿ': 0b11100101,  # ÿ
    }

    # Lowercase the input and keep only characters with a known phone code;
    # an empty result is replaced by the '÷' sentinel.
    word = ''.join(ch for ch in word.lower() if ch in _initial_phones)
    if not word:
        word = '÷'

    # Code the first character with the initial table, the rest with the
    # trailing table.
    phones = [_initial_phones[word[0]]]
    phones.extend(_trailing_phones[ch] for ch in word[1:])

    # Drop a phone whenever it equals its predecessor after a one-bit right
    # shift (i.e. the two codes differ at most in the lowest bit).
    condensed = [phones[0]]
    prev = phones[0] >> 1
    for phone in phones[1:]:
        cur = phone >> 1
        if cur != prev:
            condensed.append(phone)
        prev = cur

    # Zero-pad after the first byte, then trim to max_length bytes total.
    padding = [0] * max(0, max_length - len(condensed))
    final = [condensed[0]] + padding + condensed[1:max_length]

    # Pack one byte per phone into a single integer hash.
    hash_value = 0
    for byte in final:
        hash_value = (hash_value << 8) | byte
    return hash_value
def haase_phonetik(word, primary_only=False):
    """Return the Haase Phonetik (numeric output) code for a word.

    Based on the algorithm described at :cite:`Prante:2015`.
    Based on the original :cite:`Haase:2000`.

    While the output code is numeric, it is nevertheless a str. Because the
    algorithm expands certain letter groups into spelling variants (e.g.
    'CH' vs. 'SCH'), the return value is a tuple of one or more codes; the
    first element is the primary code.

    :param str word: the word to transform
    :param bool primary_only: if True, only the primary code is returned
    :returns: the Haase Phonetik value as a numeric string
    :rtype: tuple

    >>> haase_phonetik('Joachim')
    ('9496',)
    >>> haase_phonetik('Christoph')
    ('4798293', '8798293')
    >>> haase_phonetik('Jörg')
    ('974',)
    >>> haase_phonetik('Smith')
    ('8692',)
    >>> haase_phonetik('Schmidt')
    ('8692', '4692')
    """
    def _after(word, i, letters):
        """Return True if word[i] follows one of the supplied letters."""
        if i > 0 and word[i-1] in letters:
            return True
        return False

    def _before(word, i, letters):
        """Return True if word[i] precedes one of the supplied letters."""
        if i+1 < len(word) and word[i+1] in letters:
            return True
        return False

    # J and Y are treated as vowels by this scheme
    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}

    # Uppercase, decompose, expand German special characters, and filter
    # everything but A-Z
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Build a list of per-position alternatives; each element is a tuple of
    # the possible spellings at that position.  The cartesian product of
    # these tuples yields every spelling variant to be encoded.
    variants = []
    if primary_only:
        variants = [word]
    else:
        pos = 0
        # initial CH may also be read as SCH
        if word[:2] == 'CH':
            variants.append(('CH', 'SCH'))
            pos += 2
        len_3_vars = {'OWN': 'AUN', 'WSK': 'RSK', 'SCH': 'CH', 'GLI': 'LI',
                      'AUX': 'O', 'EUX': 'O'}
        while pos < len(word):
            if word[pos:pos+4] == 'ILLE':
                variants.append(('ILLE', 'I'))
                pos += 4
            elif word[pos:pos+3] in len_3_vars:
                variants.append((word[pos:pos+3],
                                 len_3_vars[word[pos:pos+3]]))
                pos += 3
            elif word[pos:pos+2] == 'RB':
                variants.append(('RB', 'RW'))
                pos += 2
            # final EAU
            elif len(word[pos:]) == 3 and word[pos:] == 'EAU':
                variants.append(('EAU', 'O'))
                pos += 3
            # final A or O
            elif len(word[pos:]) == 1 and word[pos:] in {'A', 'O'}:
                if word[pos:] == 'O':
                    variants.append(('O', 'OW'))
                else:
                    variants.append(('A', 'AR'))
                pos += 1
            else:
                variants.append((word[pos],))
                pos += 1

        variants = [''.join(letters) for letters in product(*variants)]

    def _haase_code(word):
        """Convert a single spelling variant to its numeric Haase code."""
        sdx = ''
        for i in range(len(word)):
            if word[i] in _vowels:
                sdx += '9'
            elif word[i] == 'B':
                sdx += '1'
            elif word[i] == 'P':
                if _before(word, i, {'H'}):
                    sdx += '3'
                else:
                    sdx += '1'
            elif word[i] in {'D', 'T'}:
                if _before(word, i, {'C', 'S', 'Z'}):
                    sdx += '8'
                else:
                    sdx += '2'
            elif word[i] in {'F', 'V', 'W'}:
                sdx += '3'
            elif word[i] in {'G', 'K', 'Q'}:
                sdx += '4'
            elif word[i] == 'C':
                # C's code depends on position and its neighbors
                if _after(word, i, {'S', 'Z'}):
                    sdx += '8'
                elif i == 0:
                    if _before(word, i, {'A', 'H', 'K', 'L', 'O', 'Q', 'R',
                                         'U', 'X'}):
                        sdx += '4'
                    else:
                        sdx += '8'
                elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
                    sdx += '4'
                else:
                    sdx += '8'
            elif word[i] == 'X':
                # X is KS (two codes) unless it follows a K-sound
                if _after(word, i, {'C', 'K', 'Q'}):
                    sdx += '8'
                else:
                    sdx += '48'
            elif word[i] == 'L':
                sdx += '5'
            elif word[i] in {'M', 'N'}:
                sdx += '6'
            elif word[i] == 'R':
                sdx += '7'
            elif word[i] in {'S', 'Z'}:
                sdx += '8'

        sdx = _delete_consecutive_repeats(sdx)
        return sdx

    encoded = tuple(_haase_code(word) for word in variants)
    if len(encoded) > 1:
        # deduplicate the codes while preserving their order
        encoded_set = set()
        encoded_single = []
        for code in encoded:
            if code not in encoded_set:
                encoded_set.add(code)
                encoded_single.append(code)
        return tuple(encoded_single)

    return encoded
def reth_schek_phonetik(word):
    """Return Reth-Schek Phonetik code for a word.

    This algorithm is proposed in :cite:`Reth:1977`.

    Since I couldn't secure a copy of that document (maybe I'll look for it
    next time I'm in Germany), this implementation is based on what I could
    glean from the implementations published by German Record Linkage
    Center (www.record-linkage.de):

    - Privacy-preserving Record Linkage (PPRL) (in R) :cite:`Rukasz:2018`
    - Merge ToolBox (in Java) :cite:`Schnell:2004`

    Rules that are unclear:

    - Should 'C' become 'G' or 'Z'? (PPRL has both, 'Z' rule blocked)
    - Should 'CC' become 'G'? (PPRL has blocked 'CK' that may be typo)
    - Should 'TUI' -> 'ZUI' rule exist? (PPRL has rule, but I can't
      think of a German word with '-tui-' in it.)
    - Should we really change 'SCH' -> 'CH' and then 'CH' -> 'SCH'?

    :param str word: the word to transform
    :returns: the Reth-Schek Phonetik code
    :rtype: str

    >>> reth_schek_phonetik('Joachim')
    'JOAGHIM'
    >>> reth_schek_phonetik('Christoph')
    'GHRISDOF'
    >>> reth_schek_phonetik('Jörg')
    'JOERG'
    >>> reth_schek_phonetik('Smith')
    'SMID'
    >>> reth_schek_phonetik('Schmidt')
    'SCHMID'
    """
    # Letter-group substitutions, tried longest-first at every position
    _trigraphs = {'AEH': 'E', 'IEH': 'I', 'OEH': 'OE', 'UEH': 'UE',
                  'SCH': 'CH', 'ZIO': 'TIO', 'TIU': 'TIO', 'ZIU': 'TIO',
                  'CHS': 'X', 'CKS': 'X', 'AEU': 'OI'}
    _digraphs = {'LL': 'L', 'AA': 'A', 'AH': 'A', 'BB': 'B', 'PP': 'B',
                 'BP': 'B', 'PB': 'B', 'DD': 'D', 'DT': 'D', 'TT': 'D',
                 'TH': 'D', 'EE': 'E', 'EH': 'E', 'AE': 'E', 'FF': 'F',
                 'PH': 'F', 'KK': 'K', 'GG': 'G', 'GK': 'G', 'KG': 'G',
                 'CK': 'G', 'CC': 'C', 'IE': 'I', 'IH': 'I', 'MM': 'M',
                 'NN': 'N', 'OO': 'O', 'OH': 'O', 'SZ': 'S', 'UH': 'U',
                 'GS': 'X', 'KS': 'X', 'TZ': 'Z', 'AY': 'AI',
                 'EI': 'AI', 'EY': 'AI', 'EU': 'OI', 'RR': 'R',
                 'SS': 'S', 'KW': 'QU'}
    _monographs = {'P': 'B', 'T': 'D', 'V': 'F', 'W': 'F', 'C': 'G',
                   'K': 'G', 'Y': 'I'}
    tables = ((3, _trigraphs), (2, _digraphs), (1, _monographs))

    word = word.upper()

    # Expand umlauts and eszett before the main scan
    for source, target in (('Ä', 'AE'), ('Ö', 'OE'), ('Ü', 'UE'),
                           ('ß', 'SS')):
        word = word.replace(source, target)

    # Scan left to right, substituting the longest matching group at each
    # position; the cursor advances one character per iteration, even when
    # a multi-character group was just replaced.
    idx = 0
    while idx < len(word):
        for size, table in tables:
            group = word[idx:idx+size]
            if group in table:
                word = word[:idx] + table[group] + word[idx+size:]
                break
        idx += 1

    # Change 'CH' back(?) to 'SCH'
    word = word.replace('CH', 'SCH')

    # Normalize final sequences: -ER -> -R, -EL -> -L, drop trailing H
    if word.endswith('ER'):
        word = word[:-2] + 'R'
    elif word.endswith('EL'):
        word = word[:-2] + 'L'
    elif word.endswith('H'):
        word = word[:-1]

    return word
def fonem(word):
    """Return the FONEM code of a word.

    FONEM is a phonetic algorithm designed for French (particularly surnames in
    Saguenay, Canada), defined in :cite:`Bouchard:1981`.

    Guillaume Plique's Javascript implementation :cite:`Plique:2018` at
    https://github.com/Yomguithereal/talisman/blob/master/src/phonetics/french/fonem.js
    was also consulted for this implementation.

    :param str word: the word to transform
    :returns: the FONEM code
    :rtype: str

    >>> fonem('Marchand')
    'MARCHEN'
    >>> fonem('Beaulieu')
    'BOLIEU'
    >>> fonem('Beaumont')
    'BOMON'
    >>> fonem('Legrand')
    'LEGREN'
    >>> fonem('Pelletier')
    'PELETIER'
    """
    # I don't see a sane way of doing this without regexps :(
    # Each rule is either (compiled regex, replacement) or, for simple
    # literal substitutions, (plain str, replacement).  The '#' character
    # is used as a temporary placeholder to protect the output of C-11,
    # C-16, & C-17 from later rules ("rule bleeding"); C-34 & C-35 undo it.
    rule_table = {
        # Vowels & groups of vowels
        'V-1': (re_compile('E?AU'), 'O'),
        'V-2,5': (re_compile('(E?AU|O)L[TX]$'), 'O'),
        'V-3,4': (re_compile('E?AU[TX]$'), 'O'),
        'V-6': (re_compile('E?AUL?D$'), 'O'),
        'V-7': (re_compile(r'(?<!G)AY$'), 'E'),
        'V-8': (re_compile('EUX$'), 'EU'),
        'V-9': (re_compile('EY(?=$|[BCDFGHJKLMNPQRSTVWXZ])'), 'E'),
        'V-10': ('Y', 'I'),
        'V-11': (re_compile('(?<=[AEIOUY])I(?=[AEIOUY])'), 'Y'),
        'V-12': (re_compile('(?<=[AEIOUY])ILL'), 'Y'),
        'V-13': (re_compile('OU(?=[AEOU]|I(?!LL))'), 'W'),
        'V-14': (re_compile(r'([AEIOUY])(?=\1)'), ''),
        # Nasal vowels
        'V-15': (re_compile('[AE]M(?=[BCDFGHJKLMPQRSTVWXZ])(?!$)'), 'EN'),
        'V-16': (re_compile('OM(?=[BCDFGHJKLMPQRSTVWXZ])'), 'ON'),
        'V-17': (re_compile('AN(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'EN'),
        'V-18': (re_compile('(AI[MN]|EIN)(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'),
                 'IN'),
        'V-19': (re_compile('B(O|U|OU)RNE?$'), 'BURN'),
        'V-20': (re_compile('(^IM|(?<=[BCDFGHJKLMNPQRSTVWXZ])' +
                            'IM(?=[BCDFGHJKLMPQRSTVWXZ]))'), 'IN'),
        # Consonants and groups of consonants
        'C-1': ('BV', 'V'),
        'C-2': (re_compile('(?<=[AEIOUY])C(?=[EIY])'), 'SS'),
        'C-3': (re_compile('(?<=[BDFGHJKLMNPQRSTVWZ])C(?=[EIY])'), 'S'),
        'C-4': (re_compile('^C(?=[EIY])'), 'S'),
        'C-5': (re_compile('^C(?=[OUA])'), 'K'),
        'C-6': (re_compile('(?<=[AEIOUY])C$'), 'K'),
        'C-7': (re_compile('C(?=[BDFGJKLMNPQRSTVWXZ])'), 'K'),
        'C-8': (re_compile('CC(?=[AOU])'), 'K'),
        'C-9': (re_compile('CC(?=[EIY])'), 'X'),
        'C-10': (re_compile('G(?=[EIY])'), 'J'),
        'C-11': (re_compile('GA(?=I?[MN])'), 'G#'),
        'C-12': (re_compile('GE(O|AU)'), 'JO'),
        'C-13': (re_compile('GNI(?=[AEIOUY])'), 'GN'),
        'C-14': (re_compile('(?<![PCS])H'), ''),
        'C-15': ('JEA', 'JA'),
        'C-16': (re_compile('^MAC(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'MA#'),
        'C-17': (re_compile('^MC'), 'MA#'),
        'C-18': ('PH', 'F'),
        'C-19': ('QU', 'K'),
        'C-20': (re_compile('^SC(?=[EIY])'), 'S'),
        'C-21': (re_compile('(?<=.)SC(?=[EIY])'), 'SS'),
        'C-22': (re_compile('(?<=.)SC(?=[AOU])'), 'SK'),
        'C-23': ('SH', 'CH'),
        'C-24': (re_compile('TIA$'), 'SSIA'),
        'C-25': (re_compile('(?<=[AIOUY])W'), ''),
        'C-26': (re_compile('X[CSZ]'), 'X'),
        'C-27': (re_compile('(?<=[AEIOUY])Z|(?<=[BCDFGHJKLMNPQRSTVWXZ])' +
                            'Z(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'S'),
        'C-28': (re_compile(r'([BDFGHJKMNPQRTVWXZ])\1'), r'\1'),
        'C-28a': (re_compile('CC(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'C'),
        'C-28b': (re_compile('((?<=[BCDFGHJKLMNPQRSTVWXZ])|^)SS'), 'S'),
        'C-28bb': (re_compile('SS(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'S'),
        'C-28c': (re_compile('((?<=[^I])|^)LL'), 'L'),
        'C-28d': (re_compile('ILE$'), 'ILLE'),
        'C-29': (re_compile('(ILS|[CS]H|[MN]P|R[CFKLNSX])$|([BCDFGHJKL' +
                            'MNPQRSTVWXZ])[BCDFGHJKLMNPQRSTVWXZ]$'),
                 lambda m: (m.group(1) or '') + (m.group(2) or '')),
        'C-30,32': (re_compile('^(SA?INT?|SEI[NM]|CINQ?|ST)(?!E)-?'), 'ST-'),
        'C-31,33': (re_compile('^(SAINTE|STE)-?'), 'STE-'),
        # Rules to undo rule bleeding prevention in C-11, C-16, C-17
        'C-34': ('G#', 'GA'),
        'C-35': ('MA#', 'MAC')
    }
    # Application order matters; note the C-28* simplification rules are
    # applied both at the start and again near the end of the sequence.
    rule_order = [
        'V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c', 'C-28d',
        'C-12',
        'C-8', 'C-9', 'C-10',
        'C-16', 'C-17', 'C-2', 'C-3', 'C-7',
        'V-2,5', 'V-3,4', 'V-6',
        'V-1', 'C-14',
        'C-31,33', 'C-30,32',
        'C-11', 'V-15', 'V-17', 'V-18',
        'V-7', 'V-8', 'V-9', 'V-10', 'V-11', 'V-12', 'V-13', 'V-16',
        'V-19', 'V-20',
        'C-1', 'C-4', 'C-5', 'C-6', 'C-13', 'C-15',
        'C-18', 'C-19', 'C-20', 'C-21', 'C-22', 'C-23', 'C-24',
        'C-25', 'C-26', 'C-27',
        'C-29',
        'V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c', 'C-28d',
        'C-34', 'C-35'
    ]

    # normalize, upper-case, and filter non-French letters
    # (198 is 'Æ' and 338 is 'Œ'; both are expanded to two letters)
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.translate({198: 'AE', 338: 'OE'})
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z', '-'})

    for rule in rule_order:
        regex, repl = rule_table[rule]
        # plain-string rules use str.replace; compiled patterns use re.sub
        if isinstance(regex, text_type):
            word = word.replace(regex, repl)
        else:
            word = regex.sub(repl, word)

    return word
def parmar_kumbharana(word):
    """Return the Parmar-Kumbharana encoding of a word.

    This is based on the phonetic algorithm proposed in :cite:`Parmar:2014`.

    :param str word: the word to transform
    :returns: the Parmar-Kumbharana encoding
    :rtype: str

    >>> parmar_kumbharana('Gough')
    'GF'
    >>> parmar_kumbharana('pneuma')
    'NM'
    >>> parmar_kumbharana('knight')
    'NT'
    >>> parmar_kumbharana('trice')
    'TRS'
    >>> parmar_kumbharana('judge')
    'JJ'
    """
    # Substitution rules keyed by pattern length, so the longest candidate
    # match at each position is tried first (Rule 5)
    rule_table = {4: {'OUGH': 'F'},
                  3: {'DGE': 'J',
                      'OUL': 'U',
                      'GHT': 'T'},
                  2: {'CE': 'S', 'CI': 'S', 'CY': 'S',
                      'GE': 'J', 'GI': 'J', 'GY': 'J',
                      'WR': 'R',
                      'GN': 'N', 'KN': 'N', 'PN': 'N',
                      'CK': 'K',
                      'SH': 'S'}}
    # str.translate table deleting the vowels A, E, I, O, U, Y
    vowel_trans = {65: '', 69: '', 73: '', 79: '', 85: '', 89: ''}

    word = word.upper()  # Rule 3
    # Rule 4: delete consecutive repeated letters
    word = ''.join(char for char, _ in groupby(word))

    # Rule 5: apply the substitution table, longest match first; after a
    # match, resume scanning immediately after the replacement text
    i = 0
    while i < len(word):
        for match_len in range(4, 1, -1):
            if word[i:i+match_len] in rule_table[match_len]:
                repl = rule_table[match_len][word[i:i+match_len]]
                word = (word[:i] + repl + word[i+match_len:])
                i += len(repl)
                break
        else:
            i += 1

    # Rule 6: keep the first letter, delete all subsequent vowels.
    # word[:1] (rather than word[0]) avoids an IndexError on empty input.
    word = word[:1] + word[1:].translate(vowel_trans)
    return word
def davidson(lname, fname='.', omit_fname=False):
    """Return Davidson's Consonant Code.

    This is based on the name compression system described in
    :cite:`Davidson:1962`.

    :cite:`Dolby:1970` identifies this as having been the name compression
    algorithm used by SABRE.

    :param str lname: Last name (or word) to be encoded
    :param str fname: First name (optional), of which the first character is
        included in the code.
    :param bool omit_fname: Set to True to completely omit the first character
        of the first name
    :returns: Davidson's Consonant Code
    :rtype: str

    >>> davidson('Gough')
    'G   .'
    >>> davidson('pneuma')
    'PNM .'
    >>> davidson('knight')
    'KNGT.'
    >>> davidson('trice')
    'TRC .'
    >>> davidson('judge')
    'JDG .'
    >>> davidson('Smith', 'James')
    'SMT J'
    >>> davidson('Wasserman', 'Tabitha')
    'WSRMT'
    """
    # translate table deleting A, E, I, O, U, H, W, Y (after the first letter)
    _deletions = {65: '', 69: '', 73: '', 79: '', 85: '', 72: '', 87: '',
                  89: ''}

    surname = text_type(lname.upper())
    stripped = surname[:1] + surname[1:].translate(_deletions)
    # collapse runs of identical letters, then fit to exactly four columns
    code = _delete_consecutive_repeats(stripped)[:4].ljust(4)
    if not omit_fname:
        code += fname[:1].upper()
    return code
def sound_d(word, max_length=4):
    """Return the SoundD code.

    SoundD is defined in :cite:`Varol:2012`.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 4)
    :returns: the SoundD code
    :rtype: str

    >>> sound_d('Gough')
    '2000'
    >>> sound_d('pneuma')
    '5500'
    >>> sound_d('knight')
    '5300'
    >>> sound_d('trice')
    '3620'
    >>> sound_d('judge')
    '2200'
    """
    # Soundex-style digit for each letter; '0' marks letters that are dropped
    _translation = {ord(letter): digit for letter, digit in
                    zip('ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                        '01230120022455012623010202')}

    # uppercase, decompose, expand eszett, and keep only A-Z
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if 'A' <= c <= 'Z')

    # initial-cluster adjustments
    if word[:2] in {'KN', 'GN', 'PN', 'AC', 'WR'}:
        word = word[1:]
    elif word[:1] == 'X':
        word = 'S' + word[1:]
    elif word[:2] == 'WH':
        word = 'W' + word[2:]

    # digraph/trigraph substitutions before the letter-by-letter coding
    for pattern, repl in (('DGE', '20'), ('DGI', '20'), ('GH', '0')):
        word = word.replace(pattern, repl)

    word = word.translate(_translation)
    word = _delete_consecutive_repeats(word)
    word = word.replace('0', '')

    # pad or truncate unless an unlimited-length code was requested
    if max_length != -1:
        word = (word + '0' * max_length)[:max_length]

    return word
def pshp_soundex_last(lname, max_length=4, german=False):
    """Calculate the PSHP Soundex/Viewex Coding of a last name.

    This coding is based on :cite:`Hershberg:1976`.

    Reference was also made to the German version of the same:
    :cite:`Hershberg:1979`.

    A separate function, pshp_soundex_first() is used for first names.

    :param str lname: the last name to encode
    :param int max_length: the length of the code returned (defaults to 4)
    :param bool german: set to True if the name is German (different rules
        apply)
    :returns: the PSHP Soundex/Viewex Coding
    :rtype: str

    >>> pshp_soundex_last('Smith')
    'S530'
    >>> pshp_soundex_last('Waters')
    'W350'
    >>> pshp_soundex_last('James')
    'J500'
    >>> pshp_soundex_last('Schmidt')
    'S530'
    >>> pshp_soundex_last('Ashcroft')
    'A225'
    """
    # Uppercase, decompose, expand eszett, and keep only A-Z
    lname = unicode_normalize('NFKD', text_type(lname.upper()))
    lname = lname.replace('ß', 'SS')
    lname = ''.join(c for c in lname if c in
                    {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
                     'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
                     'W', 'X', 'Y', 'Z'})

    # A. Prefix treatment
    if lname[:3] == 'VON' or lname[:3] == 'VAN':
        lname = lname[3:].strip()

    # The rule implemented below says "MC, MAC become 1". I believe it meant to
    # say they become M except in German data (where superscripted 1 indicates
    # "except in German data"). It doesn't make sense for them to become 1
    # (BPFV -> 1) or to apply outside German. Unfortunately, both articles have
    # this error(?).
    if not german:
        if lname[:3] == 'MAC':
            lname = 'M'+lname[3:]
        elif lname[:2] == 'MC':
            lname = 'M'+lname[2:]

    # The non-German-only rule to strip ' is unnecessary due to filtering

    # Recode the initial letter before it is preserved as the code's prefix
    if lname[:1] in {'E', 'I', 'O', 'U'}:
        lname = 'A' + lname[1:]
    elif lname[:2] in {'GE', 'GI', 'GY'}:
        lname = 'J' + lname[1:]
    elif lname[:2] in {'CE', 'CI', 'CY'}:
        lname = 'S' + lname[1:]
    elif lname[:3] == 'CHR':
        lname = 'K' + lname[1:]
    elif lname[:1] == 'C' and lname[:2] != 'CH':
        lname = 'K' + lname[1:]

    if lname[:2] == 'KN':
        lname = 'N' + lname[1:]
    elif lname[:2] == 'PH':
        lname = 'F' + lname[1:]
    elif lname[:3] in {'WIE', 'WEI'}:
        lname = 'V' + lname[1:]

    if german and lname[:1] in {'W', 'M', 'Y', 'Z'}:
        lname = {'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}[lname[0]]+lname[1:]

    # The (possibly recoded) initial letter is kept verbatim
    code = lname[:1]

    # B. Postfix treatment
    if german:  # moved from end of postfix treatment due to blocking
        if lname[-3:] == 'TES':
            lname = lname[:-3]
        elif lname[-2:] == 'TS':
            lname = lname[:-2]
        if lname[-3:] == 'TZE':
            lname = lname[:-3]
        elif lname[-2:] == 'ZE':
            lname = lname[:-2]
        if lname[-1:] == 'Z':
            lname = lname[:-1]
        elif lname[-2:] == 'TE':
            lname = lname[:-2]

    if lname[-1:] == 'R':
        lname = lname[:-1] + 'N'
    elif lname[-2:] in {'SE', 'CE'}:
        lname = lname[:-2]
    if lname[-2:] == 'SS':
        lname = lname[:-2]
    elif lname[-1:] == 'S':
        lname = lname[:-1]

    if not german:
        l5_repl = {'STOWN': 'SAWON', 'MPSON': 'MASON'}
        l4_repl = {'NSEN': 'ASEN', 'MSON': 'ASON', 'STEN': 'SAEN',
                   'STON': 'SAON'}
        if lname[-5:] in l5_repl:
            lname = lname[:-5] + l5_repl[lname[-5:]]
        elif lname[-4:] in l4_repl:
            lname = lname[:-4] + l4_repl[lname[-4:]]
    if lname[-2:] in {'NG', 'ND'}:
        lname = lname[:-1]
    if not german and lname[-3:] in {'GAN', 'GEN'}:
        lname = lname[:-3]+'A'+lname[-2:]

    # C. Infix Treatment
    lname = lname.replace('CK', 'C')
    lname = lname.replace('SCH', 'S')
    lname = lname.replace('DT', 'T')
    lname = lname.replace('ND', 'N')
    lname = lname.replace('NG', 'N')
    lname = lname.replace('LM', 'M')
    lname = lname.replace('MN', 'M')
    lname = lname.replace('WIE', 'VIE')
    lname = lname.replace('WEI', 'VEI')

    # D. Soundexing
    # code for X & Y are unspecified, but presumably are 2 & 0
    _pshp_translation = dict(zip((ord(_) for _ in
                                  'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                 '01230120022455012523010202'))

    lname = lname.translate(_pshp_translation)
    lname = _delete_consecutive_repeats(lname)

    code += lname[1:]
    code = code.replace('0', '')  # rule 1

    if max_length != -1:
        if len(code) < max_length:
            code += '0' * (max_length-len(code))
        else:
            code = code[:max_length]

    return code
def pshp_soundex_first(fname, max_length=4, german=False):
    """Calculate the PSHP Soundex/Viewex Coding of a first name.

    This coding is based on :cite:`Hershberg:1976`.

    Reference was also made to the German version of the same:
    :cite:`Hershberg:1979`.

    A separate function, pshp_soundex_last() is used for last names.

    :param str fname: the first name to encode
    :param int max_length: the length of the code returned (defaults to 4)
    :param bool german: set to True if the name is German (different rules
        apply)
    :returns: the PSHP Soundex/Viewex Coding
    :rtype: str

    >>> pshp_soundex_first('Smith')
    'S530'
    >>> pshp_soundex_first('Waters')
    'W352'
    >>> pshp_soundex_first('James')
    'J700'
    >>> pshp_soundex_first('Schmidt')
    'S500'
    >>> pshp_soundex_first('Ashcroft')
    'A220'
    >>> pshp_soundex_first('John')
    'J500'
    >>> pshp_soundex_first('Colin')
    'K400'
    >>> pshp_soundex_first('Niall')
    'N400'
    >>> pshp_soundex_first('Sally')
    'S400'
    >>> pshp_soundex_first('Jane')
    'J500'
    """
    # Uppercase, decompose, expand eszett, and keep only A-Z
    fname = unicode_normalize('NFKD', text_type(fname.upper()))
    fname = fname.replace('ß', 'SS')
    fname = ''.join(c for c in fname if c in
                    {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
                     'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
                     'W', 'X', 'Y', 'Z'})

    # special rules: two names get fixed codes outright
    if fname == 'JAMES':
        code = 'J7'
    elif fname == 'PAT':
        code = 'P7'

    else:
        # A. Prefix treatment
        if fname[:2] in {'GE', 'GI', 'GY'}:
            fname = 'J' + fname[1:]
        elif fname[:2] in {'CE', 'CI', 'CY'}:
            fname = 'S' + fname[1:]
        elif fname[:3] == 'CHR':
            fname = 'K' + fname[1:]
        elif fname[:1] == 'C' and fname[:2] != 'CH':
            fname = 'K' + fname[1:]

        if fname[:2] == 'KN':
            fname = 'N' + fname[1:]
        elif fname[:2] == 'PH':
            fname = 'F' + fname[1:]
        elif fname[:3] in {'WIE', 'WEI'}:
            fname = 'V' + fname[1:]

        if german and fname[:1] in {'W', 'M', 'Y', 'Z'}:
            fname = ({'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}[fname[0]] +
                     fname[1:])

        code = fname[:1]

        # B. Soundex coding
        # code for Y unspecified, but presumably is 0
        _pshp_translation = dict(zip((ord(_) for _ in
                                      'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                     '01230120022455012523010202'))

        fname = fname.translate(_pshp_translation)
        fname = _delete_consecutive_repeats(fname)

        code += fname[1:]
        # Truncate after the second syllable marker ('0' codes a vowel).
        # NOTE(review): syl2_ptr is an offset relative to syl_ptr + 1, so
        # comparing it directly against syl_ptr looks suspect; preserved
        # as-is pending confirmation against the PSHP articles.
        syl_ptr = code.find('0')
        syl2_ptr = code[syl_ptr + 1:].find('0')
        if syl_ptr != -1 and syl2_ptr != -1 and syl2_ptr - syl_ptr > -1:
            code = code[:syl_ptr + 2]

        code = code.replace('0', '')  # rule 1

    if max_length != -1:
        if len(code) < max_length:
            code += '0' * (max_length-len(code))
        else:
            code = code[:max_length]

    return code
def henry_early(word, max_length=3):
    """Calculate the early version of the Henry code for a word.

    The early version of Henry coding is given in :cite:`Legare:1972`. This is
    different from the later version defined in :cite:`Henry:1976`.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 3)
    :returns: the early Henry code
    :rtype: str

    >>> henry_early('Marchand')
    'MRC'
    >>> henry_early('Beaulieu')
    'BL'
    >>> henry_early('Beaumont')
    'BM'
    >>> henry_early('Legrand')
    'LGR'
    >>> henry_early('Pelletier')
    'PLT'
    """
    _cons = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q',
             'R', 'S', 'T', 'V', 'W', 'X', 'Z'}
    _vows = {'A', 'E', 'I', 'O', 'U', 'Y'}
    # Initial diphthongs that collapse to a single vowel (rule Ib3)
    _diph = {'AI': 'E', 'AY': 'E', 'EI': 'E', 'AU': 'O', 'OI': 'O', 'OU': 'O',
             'EU': 'U'}
    # _unaltered = {'B', 'D', 'F', 'J', 'K', 'L', 'M', 'N', 'R', 'T', 'V'}
    # Consonants with a direct one-to-one recoding (rule IIb)
    _simple = {'W': 'V', 'X': 'S', 'Z': 'S'}

    # Uppercase, decompose, and keep only A-Z
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    if not word:
        return ''

    # Rule Ia seems to be covered entirely in II

    # Rule Ib: normalize an initial vowel based on what follows it
    if word[0] in _vows:
        # Ib1
        if (((word[1:2] in _cons-{'M', 'N'} and word[2:3] in _cons) or
             (word[1:2] in _cons and word[2:3] not in _cons))):
            if word[0] == 'Y':
                word = 'I'+word[1:]
        # Ib2
        elif word[1:2] in {'M', 'N'} and word[2:3] in _cons:
            if word[0] == 'E':
                word = 'A'+word[1:]
            elif word[0] in {'I', 'U', 'Y'}:
                word = 'E'+word[1:]
        # Ib3
        elif word[:2] in _diph:
            word = _diph[word[:2]]+word[2:]
        # Ib4
        elif word[1:2] in _vows and word[0] == 'Y':
            word = 'I' + word[1:]

    code = ''
    skip = 0  # number of upcoming characters already consumed by a rule

    # Rule II: code the word character by character; branches ending in
    # `continue` silently drop the character
    for pos, char in enumerate(word):
        nxch = word[pos+1:pos+2]
        prev = word[pos-1:pos]

        if skip:
            skip -= 1
        elif char in _vows:
            code += char
        # IIc: doubled letters are coded once
        elif char == nxch:
            skip = 1
            code += char
        elif word[pos:pos+2] in {'CQ', 'DT', 'SC'}:
            continue
        # IIb
        elif char in _simple:
            code += _simple[char]
        elif char in {'C', 'G', 'P', 'Q', 'S'}:
            if char == 'C':
                if nxch in {'A', 'O', 'U', 'L', 'R'}:
                    code += 'K'
                elif nxch in {'E', 'I', 'Y'}:
                    code += 'S'
                elif nxch == 'H':
                    if word[pos+2:pos+3] in _vows:
                        code += 'C'
                    else:  # CHR, CHL, etc.
                        code += 'K'
                else:
                    code += 'C'
            elif char == 'G':
                # G before a consonant other than L, R, N is dropped
                if nxch in {'A', 'O', 'U', 'L', 'R'}:
                    code += 'G'
                elif nxch in {'E', 'I', 'Y'}:
                    code += 'J'
                elif nxch == 'N':
                    code += 'N'
            elif char == 'P':
                if nxch != 'H':
                    code += 'P'
                else:
                    code += 'F'
            elif char == 'Q':
                if word[pos+1:pos+3] in {'UE', 'UI', 'UY'}:
                    code += 'G'
                else:  # QUA, QUO, etc.
                    code += 'K'
            else:  # S...
                # SAINT-/STE- style prefixes all code to 'X'
                if word[pos:pos+6] == 'SAINTE':
                    code += 'X'
                    skip = 5
                elif word[pos:pos+5] == 'SAINT':
                    code += 'X'
                    skip = 4
                elif word[pos:pos+3] == 'STE':
                    code += 'X'
                    skip = 2
                elif word[pos:pos+2] == 'ST':
                    code += 'X'
                    skip = 1
                elif nxch in _cons:
                    continue
                else:
                    code += 'S'
        # IId: various silent consonants in context
        elif char == 'H' and prev in _cons:
            continue
        elif char in _cons-{'L', 'R'} and nxch in _cons-{'L', 'R'}:
            continue
        elif char == 'L' and nxch in {'M', 'N'}:
            continue
        elif char in {'M', 'N'} and prev in _vows and nxch in _cons:
            continue
        # IIa: anything else is kept as-is
        else:
            code += char

    # IIe1: simplify certain final sequences
    if code[-4:] in {'AULT', 'EULT', 'OULT'}:
        code = code[:-2]
    # The following are blocked by rules above
    # elif code[-4:-3] in _vows and code[-3:] == 'MPS':
    #     code = code[:-3]
    # elif code[-3:-2] in _vows and code[-2:] in {'MB', 'MP', 'ND',
    #                                             'NS', 'NT'}:
    #     code = code[:-2]
    elif code[-2:-1] == 'R' and code[-1:] in _cons:
        code = code[:-1]
    # IIe2
    elif code[-2:-1] in _vows and code[-1:] in {'D', 'M', 'N', 'S', 'T'}:
        code = code[:-1]
    elif code[-2:] == 'ER':
        code = code[:-1]

    # Drop non-initial vowels
    code = code[:1]+code[1:].translate({65: '', 69: '', 73: '', 79: '', 85: '',
                                        89: ''})

    if max_length != -1:
        code = code[:max_length]

    return code
def norphone(word):
    """Return the Norphone code.

    The reference implementation by Lars Marius Garshol is available in
    :cite:`Garshol:2015`.

    Norphone was designed for Norwegian, but this implementation has been
    extended to support Swedish vowels as well. This function incorporates
    the "not implemented" rules from the above file's rule set.

    :param str word: the word to transform
    :returns: the Norphone code
    :rtype: str

    >>> norphone('Hansen')
    'HNSN'
    >>> norphone('Larsen')
    'LRSN'
    >>> norphone('Aagaard')
    'ÅKRT'
    >>> norphone('Braaten')
    'BRTN'
    >>> norphone('Sandvik')
    'SNVK'
    """
    _vowels = {'A', 'E', 'I', 'O', 'U', 'Y', 'Å', 'Æ', 'Ø', 'Ä', 'Ö'}

    # General substitutions, keyed by group length (longest match wins)
    _subs = {4: {'SKEI': 'X'},
             3: {'SKJ': 'X', 'KEI': 'X'},
             2: {'CH': 'K', 'CK': 'K', 'GJ': 'J', 'GH': 'K', 'HG': 'K',
                 'HJ': 'J', 'HL': 'L', 'HR': 'R', 'KJ': 'X', 'KI': 'X',
                 'LD': 'L', 'ND': 'N', 'PH': 'F', 'TH': 'T', 'SJ': 'X'},
             1: {'W': 'V', 'X': 'KS', 'Z': 'S', 'D': 'T', 'G': 'K'}}

    word = word.upper()

    # Word-initial substitutions: the first matching prefix is coded
    # directly and its letters are excluded from the general scan.
    code = ''
    skip = 0
    for prefix, initial in (('AA', 'Å'), ('GI', 'J'), ('SKY', 'X'),
                            ('EI', 'Æ'), ('KY', 'X'), ('C', 'K'),
                            ('Ä', 'Æ'), ('Ö', 'Ø')):
        if word.startswith(prefix):
            code = initial
            skip = len(prefix)
            break

    if word[-2:] == 'DT':
        word = word[:-2] + 'T'
    # Though the rules indicate this rule applies in all positions, the
    # reference implementation indicates it applies only in final position.
    elif word[-2:-1] in _vowels and word[-1:] == 'D':
        word = word[:-2]

    for pos, char in enumerate(word):
        if skip:
            skip -= 1
            continue
        for length in (4, 3, 2, 1):
            if word[pos:pos+length] in _subs[length]:
                code += _subs[length][word[pos:pos+length]]
                skip = length - 1
                break
        else:
            # vowels are dropped except in initial position
            if not pos or char not in _vowels:
                code += char

    code = _delete_consecutive_repeats(code)

    return code
def dolby(word, max_length=-1, keep_vowels=False, vowel_char='*'):
    r"""Return the Dolby Code of a name.

    This follows "A Spelling Equivalent Abbreviation Algorithm For Personal
    Names" from :cite:`Dolby:1970` and :cite:`Cunningham:1969`.

    :param word: the word to encode
    :param max_length: maximum length of the returned Dolby code -- this also
        activates the fixed-length code mode if it is greater than 0
    :param keep_vowels: if True, retains all vowel markers
    :param vowel_char: the vowel marker character (default to \*)
    :returns: the Dolby Code
    :rtype: str

    >>> dolby('Hansen')
    'H*NSN'
    >>> dolby('Larsen')
    'L*RSN'
    >>> dolby('Aagaard')
    '*GR'
    >>> dolby('Braaten')
    'BR*DN'
    >>> dolby('Sandvik')
    'S*NVK'

    >>> dolby('Hansen', max_length=6)
    'H*NS*N'
    >>> dolby('Larsen', max_length=6)
    'L*RS*N'
    >>> dolby('Aagaard', max_length=6)
    '*G*R  '
    >>> dolby('Braaten', max_length=6)
    'BR*D*N'
    >>> dolby('Sandvik', max_length=6)
    'S*NF*K'

    >>> dolby('Smith')
    'SM*D'
    >>> dolby('Waters')
    'W*DRS'
    >>> dolby('James')
    'J*MS'
    >>> dolby('Schmidt')
    'SM*D'
    >>> dolby('Ashcroft')
    '*SKRFD'
    >>> dolby('Smith', max_length=6)
    'SM*D  '
    >>> dolby('Waters', max_length=6)
    'W*D*RS'
    >>> dolby('James', max_length=6)
    'J*M*S '
    >>> dolby('Schmidt', max_length=6)
    'SM*D  '
    >>> dolby('Ashcroft', max_length=6)
    '*SKRFD'
    """
    _vowels = {'A', 'E', 'I', 'O', 'U', 'Y'}

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Rule 1 (FL2): normalize Mc/Mac prefixes to MK
    if word[:3] in {'MCG', 'MAG', 'MAC'}:
        word = 'MK'+word[3:]
    elif word[:2] == 'MC':
        word = 'MK'+word[2:]

    # Rule 2 (FL3): scanning right to left, drop the second letter of these
    # two-consonant clusters
    pos = len(word)-2
    while pos > -1:
        if word[pos:pos+2] in {'DT', 'LD', 'ND', 'NT', 'RC', 'RD', 'RT', 'SC',
                               'SK', 'ST'}:
            word = word[:pos+1]+word[pos+2:]
            pos += 1
        pos -= 1

    # Rule 3 (FL4)
    # Although the rule indicates "after the first letter", the test cases make
    # it clear that these apply to the first letter also.
    word = word.replace('X', 'KS')
    word = word.replace('CE', 'SE')
    word = word.replace('CI', 'SI')
    word = word.replace('CY', 'SI')

    # not in the rule set, but they seem to have intended it
    word = word.replace('TCH', 'CH')

    # CH not preceded by a vowel becomes SH (coded here as S+H)
    pos = word.find('CH', 1)
    while pos != -1:
        if word[pos-1:pos] not in _vowels:
            word = word[:pos]+'S'+word[pos+1:]
        pos = word.find('CH', pos+1)

    word = word.replace('C', 'K')
    word = word.replace('Z', 'S')

    word = word.replace('WR', 'R')
    word = word.replace('DG', 'G')
    word = word.replace('QU', 'K')
    word = word.replace('T', 'D')
    word = word.replace('PH', 'F')

    # Rule 4 (FL5): drop the letter before K unless it is a vowel, L, N, or R
    # Although the rule indicates "after the first letter", the test cases make
    # it clear that these apply to the first letter also.
    pos = word.find('K', 0)
    while pos != -1:
        if pos > 1 and word[pos-1:pos] not in _vowels | {'L', 'N', 'R'}:
            word = word[:pos-1]+word[pos:]
            pos -= 1
        pos = word.find('K', pos+1)

    # Rule FL6: in fixed-length mode, drop a final E
    if max_length > 0 and word[-1:] == 'E':
        word = word[:-1]

    # Rule 5 (FL7)
    word = _delete_consecutive_repeats(word)

    # Rule 6 (FL8): simplify PF and final GH
    if word[:2] == 'PF':
        word = word[1:]
    if word[-2:] == 'PF':
        word = word[:-1]
    elif word[-2:] == 'GH':
        if word[-3:-2] in _vowels:
            word = word[:-2]+'F'
        else:
            word = word[:-2]+'G'
    word = word.replace('GH', '')

    # Rule FL9: in fixed-length mode, V and F merge
    if max_length > 0:
        word = word.replace('V', 'F')

    # Rules 7-9 (FL10-FL12): mark the first vowel run (first two in
    # fixed-length mode, or all if keep_vowels), and drop non-initial W & H
    first = 1 + (1 if max_length > 0 else 0)
    code = ''
    for pos, char in enumerate(word):
        if char in _vowels:
            if first or keep_vowels:
                code += vowel_char
                first -= 1
        elif pos > 0 and char in {'W', 'H'}:
            continue
        else:
            code += char

    if max_length > 0:
        # Rule FL13: drop a final S if over length
        if len(code) > max_length and code[-1:] == 'S':
            code = code[:-1]
        if keep_vowels:
            code = code[:max_length]
        else:
            # Rule FL14
            code = code[:max_length + 2]
            # Rule FL15: repeatedly remove vowel markers from the right
            # until the code fits
            while len(code) > max_length:
                vowels = len(code) - max_length
                excess = vowels - 1
                word = code
                code = ''
                for char in word:
                    if char == vowel_char:
                        if vowels:
                            code += char
                            vowels -= 1
                    else:
                        code += char
                code = code[:max_length + excess]

        # Rule FL16: pad with spaces to the fixed length
        code += ' ' * (max_length - len(code))

    return code
def phonetic_spanish(word, max_length=-1):
    """Return the PhoneticSpanish coding of word.

    This follows the coding described in :cite:`Amon:2012` and
    :cite:`delPilarAngeles:2015`.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to
        unlimited)
    :returns: the PhoneticSpanish code
    :rtype: str

    >>> phonetic_spanish('Perez')
    '094'
    >>> phonetic_spanish('Martinez')
    '69364'
    >>> phonetic_spanish('Gutierrez')
    '8394'
    >>> phonetic_spanish('Santiago')
    '4638'
    >>> phonetic_spanish('Nicolás')
    '6454'
    """
    _es_soundex_translation = dict(zip((ord(_) for _ in
                                        'BCDFGHJKLMNPQRSTVXYZ'),
                                       '14328287566079431454'))

    # uppercase, normalize, and decompose, filter to A-Z minus vowels & W
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = ''.join(c for c in word if c in
                   {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N',
                    'P', 'Q', 'R', 'S', 'T', 'V', 'X', 'Y', 'Z'})

    # merge repeated Ls & Rs (both are single phonemes in Spanish)
    word = word.replace('LL', 'L')
    # BUG FIX: this line was previously `word.replace('R', 'R')`, a no-op
    # that left 'RR' coded as two consonants, contrary to the comment above
    # and the parallel 'LL' rule.
    word = word.replace('RR', 'R')

    # apply the Soundex algorithm
    sdx = word.translate(_es_soundex_translation)

    # pad (never truncate beyond max_length) when a fixed length is requested
    if max_length > 0:
        sdx = (sdx+('0'*max_length))[:max_length]

    return sdx
def soundex_br(word, max_length=4, zero_pad=True):
    """Return the SoundexBR encoding of a word.

    This is based on :cite:`Marcelino:2015`.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 4)
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        max_length string
    :returns: the SoundexBR code
    :rtype: str

    >>> soundex_br('Oliveira')
    'O416'
    >>> soundex_br('Almeida')
    'A453'
    >>> soundex_br('Barbosa')
    'B612'
    >>> soundex_br('Araújo')
    'A620'
    >>> soundex_br('Gonçalves')
    'G524'
    >>> soundex_br('Goncalves')
    'G524'
    """
    # Soundex digit for each letter; '0' marks letters that are dropped
    _translation = {ord(letter): digit for letter, digit in
                    zip('ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                        '01230120022455012623010202')}

    # uppercase, decompose, and keep only A-Z
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = ''.join(c for c in word if 'A' <= c <= 'Z')

    # Recode the initial letter for Brazilian-Portuguese spelling variants;
    # it is kept verbatim as the code's prefix.
    lead, nxt = word[:1], word[1:2]
    if word[:2] == 'WA':
        first = 'V'
    elif lead == 'K' and nxt in {'A', 'O', 'U'}:
        first = 'C'
    elif lead == 'C' and nxt in {'I', 'E'}:
        first = 'S'
    elif lead == 'G' and nxt in {'E', 'I'}:
        first = 'J'
    elif lead == 'Y':
        first = 'I'
    elif lead == 'H':
        # initial H is silent: code starts from the following letter
        first = nxt
        word = word[1:]
    else:
        first = lead

    sdx = first + word[1:].translate(_translation)
    sdx = _delete_consecutive_repeats(sdx)
    sdx = sdx.replace('0', '')

    if zero_pad:
        sdx += '0' * max_length

    return sdx[:max_length]
def nrl(word):
    """Return the Naval Research Laboratory phonetic encoding of a word.

    This is defined by :cite:`Elovitz:1976`.

    :param str word: the word to transform
    :returns: the NRL phonetic encoding
    :rtype: str

    >>> nrl('the')
    'DHAX'
    >>> nrl('round')
    'rAWnd'
    >>> nrl('quick')
    'kwIHk'
    >>> nrl('eaten')
    'IYtEHn'
    >>> nrl('Smith')
    'smIHTH'
    >>> nrl('Larsen')
    'lAArsEHn'
    """
    def _to_regex(pattern, left_match=True):
        # Convert an NRL context pattern to a Python regular expression.
        # NRL metacharacters (per the replacements table below):
        #   '#' one or more vowels        ':' zero or more consonants
        #   '^' exactly one consonant     '.' a voiced consonant (BDVGJLMNTWZ)
        #   '%' a common suffix (ER/E/ES/ED/ING/ELY)
        #   '+' a front vowel (E/I/Y)     ' ' a word boundary
        # left_match selects whether the pattern is a left context (must
        # match the end of the text before the cursor) or a right context
        # (must match the start of the text after the matched string).
        new_pattern = ''
        replacements = {'#': '[AEIOU]+',
                        ':': '[BCDFGHJKLMNPQRSTVWXYZ]*',
                        '^': '[BCDFGHJKLMNPQRSTVWXYZ]',
                        '.': '[BDVGJLMNTWZ]',
                        '%': '(ER|E|ES|ED|ING|ELY)',
                        '+': '[EIY]',
                        ' ': '^'}
        for char in pattern:
            new_pattern += (replacements[char] if char in replacements
                            else char)
        if left_match:
            # Left contexts are anchored at the end of the left-hand text.
            # NOTE(review): the membership test inspects the raw NRL
            # pattern, where '^' is the consonant marker (a ' ' boundary
            # has already become regex '^') — confirm this anchoring is
            # the intended distinction.
            new_pattern += '$'
            if '^' not in pattern:
                new_pattern = '^.*' + new_pattern
        else:
            # Right contexts are anchored at the start.  By this point the
            # only '^' characters left in new_pattern come from ' ' word
            # boundaries (the consonant-class expansion contains none), so
            # they are re-anchored as '$' (end of the right-hand text).
            new_pattern = '^' + new_pattern.replace('^', '$')
            if '$' not in new_pattern:
                new_pattern += '.*$'
        return new_pattern
    # Rewrite rules indexed by the letter being examined.  Each rule is a
    # 4-tuple: (left context, match string, right context, phoneme output).
    # Rules within a letter's tuple are tried in order; the first whose
    # match string and both contexts succeed is applied.
    rules = {' ': (('', ' ', '', ' '),
                   ('', '-', '', ''),
                   ('.', '\'S', '', 'z'),
                   ('#:.E', '\'S', '', 'z'),
                   ('#', '\'S', '', 'z'),
                   ('', '\'', '', ''),
                   ('', ',', '', ' '),
                   ('', '.', '', ' '),
                   ('', '?', '', ' '),
                   ('', '!', '', ' ')),
             'A': (('', 'A', ' ', 'AX'),
                   (' ', 'ARE', ' ', 'AAr'),
                   (' ', 'AR', 'O', 'AXr'),
                   ('', 'AR', '#', 'EHr'),
                   ('^', 'AS', '#', 'EYs'),
                   ('', 'A', 'WA', 'AX'),
                   ('', 'AW', '', 'AO'),
                   (' :', 'ANY', '', 'EHnIY'),
                   ('', 'A', '^+#', 'EY'),
                   ('#:', 'ALLY', '', 'AXlIY'),
                   (' ', 'AL', '#', 'AXl'),
                   ('', 'AGAIN', '', 'AXgEHn'),
                   ('#:', 'AG', 'E', 'IHj'),
                   ('', 'A', '^+:#', 'AE'),
                   (' :', 'A', '^+ ', 'EY'),
                   ('', 'A', '^%', 'EY'),
                   (' ', 'ARR', '', 'AXr'),
                   ('', 'ARR', '', 'AEr'),
                   (' :', 'AR', ' ', 'AAr'),
                   ('', 'AR', ' ', 'ER'),
                   ('', 'AR', '', 'AAr'),
                   ('', 'AIR', '', 'EHr'),
                   ('', 'AI', '', 'EY'),
                   ('', 'AY', '', 'EY'),
                   ('', 'AU', '', 'AO'),
                   ('#:', 'AL', ' ', 'AXl'),
                   ('#:', 'ALS', ' ', 'AXlz'),
                   ('', 'ALK', '', 'AOk'),
                   ('', 'AL', '^', 'AOl'),
                   (' :', 'ABLE', '', 'EYbAXl'),
                   ('', 'ABLE', '', 'AXbAXl'),
                   ('', 'ANG', '+', 'EYnj'),
                   ('', 'A', '', 'AE')),
             'B': ((' ', 'BE', '^#', 'bIH'),
                   ('', 'BEING', '', 'bIYIHNG'),
                   (' ', 'BOTH', ' ', 'bOWTH'),
                   (' ', 'BUS', '#', 'bIHz'),
                   ('', 'BUIL', '', 'bIHl'),
                   ('', 'B', '', 'b')),
             'C': ((' ', 'CH', '^', 'k'),
                   ('^E', 'CH', '', 'k'),
                   ('', 'CH', '', 'CH'),
                   (' S', 'CI', '#', 'sAY'),
                   ('', 'CI', 'A', 'SH'),
                   ('', 'CI', 'O', 'SH'),
                   ('', 'CI', 'EN', 'SH'),
                   ('', 'C', '+', 's'),
                   ('', 'CK', '', 'k'),
                   ('', 'COM', '%', 'kAHm'),
                   ('', 'C', '', 'k')),
             'D': (('#:', 'DED', ' ', 'dIHd'),
                   ('.E', 'D', ' ', 'd'),
                   ('#:^E', 'D', ' ', 't'),
                   (' ', 'DE', '^#', 'dIH'),
                   (' ', 'DO', ' ', 'dUW'),
                   (' ', 'DOES', '', 'dAHz'),
                   (' ', 'DOING', '', 'dUWIHNG'),
                   (' ', 'DOW', '', 'dAW'),
                   ('', 'DU', 'A', 'jUW'),
                   ('', 'D', '', 'd')),
             'E': (('#:', 'E', ' ', ''),
                   ('\':^', 'E', ' ', ''),
                   (' :', 'E', ' ', 'IY'),
                   ('#', 'ED', ' ', 'd'),
                   ('#:', 'E', 'D ', ''),
                   ('', 'EV', 'ER', 'EHv'),
                   ('', 'E', '^%', 'IY'),
                   ('', 'ERI', '#', 'IYrIY'),
                   ('', 'ERI', '', 'EHrIH'),
                   ('#:', 'ER', '#', 'ER'),
                   ('', 'ER', '#', 'EHr'),
                   ('', 'ER', '', 'ER'),
                   (' ', 'EVEN', '', 'IYvEHn'),
                   ('#:', 'E', 'W', ''),
                   ('T', 'EW', '', 'UW'),
                   ('S', 'EW', '', 'UW'),
                   ('R', 'EW', '', 'UW'),
                   ('D', 'EW', '', 'UW'),
                   ('L', 'EW', '', 'UW'),
                   ('Z', 'EW', '', 'UW'),
                   ('N', 'EW', '', 'UW'),
                   ('J', 'EW', '', 'UW'),
                   ('TH', 'EW', '', 'UW'),
                   ('CH', 'EW', '', 'UW'),
                   ('SH', 'EW', '', 'UW'),
                   ('', 'EW', '', 'yUW'),
                   ('', 'E', 'O', 'IY'),
                   ('#:S', 'ES', ' ', 'IHz'),
                   ('#:C', 'ES', ' ', 'IHz'),
                   ('#:G', 'ES', ' ', 'IHz'),
                   ('#:Z', 'ES', ' ', 'IHz'),
                   ('#:X', 'ES', ' ', 'IHz'),
                   ('#:J', 'ES', ' ', 'IHz'),
                   ('#:CH', 'ES', ' ', 'IHz'),
                   ('#:SH', 'ES', ' ', 'IHz'),
                   ('#:', 'E', 'S ', ''),
                   ('#:', 'ELY', ' ', 'lIY'),
                   ('#:', 'EMENT', '', 'mEHnt'),
                   ('', 'EFUL', '', 'fUHl'),
                   ('', 'EE', '', 'IY'),
                   ('', 'EARN', '', 'ERn'),
                   (' ', 'EAR', '^', 'ER'),
                   ('', 'EAD', '', 'EHd'),
                   ('#:', 'EA', ' ', 'IYAX'),
                   ('', 'EA', 'SU', 'EH'),
                   ('', 'EA', '', 'IY'),
                   ('', 'EIGH', '', 'EY'),
                   ('', 'EI', '', 'IY'),
                   (' ', 'EYE', '', 'AY'),
                   ('', 'EY', '', 'IY'),
                   ('', 'EU', '', 'yUW'),
                   ('', 'E', '', 'EH')),
             'F': (('', 'FUL', '', 'fUHl'),
                   ('', 'F', '', 'f')),
             'G': (('', 'GIV', '', 'gIHv'),
                   (' ', 'G', 'I^', 'g'),
                   ('', 'GE', 'T', 'gEH'),
                   ('SU', 'GGES', '', 'gjEHs'),
                   ('', 'GG', '', 'g'),
                   (' B#', 'G', '', 'g'),
                   ('', 'G', '+', 'j'),
                   ('', 'GREAT', '', 'grEYt'),
                   ('#', 'GH', '', ''),
                   ('', 'G', '', 'g')),
             'H': ((' ', 'HAV', '', 'hAEv'),
                   (' ', 'HERE', '', 'hIYr'),
                   (' ', 'HOUR', '', 'AWER'),
                   ('', 'HOW', '', 'hAW'),
                   ('', 'H', '#', 'h'),
                   ('', 'H', '', '')),
             'I': ((' ', 'IN', '', 'IHn'),
                   (' ', 'I', ' ', 'AY'),
                   ('', 'IN', 'D', 'AYn'),
                   ('', 'IER', '', 'IYER'),
                   ('#:R', 'IED', '', 'IYd'),
                   ('', 'IED', ' ', 'AYd'),
                   ('', 'IEN', '', 'IYEHn'),
                   ('', 'IE', 'T', 'AYEH'),
                   (' :', 'I', '%', 'AY'),
                   ('', 'I', '%', 'IY'),
                   ('', 'IE', '', 'IY'),
                   ('', 'I', '^+:#', 'IH'),
                   ('', 'IR', '#', 'AYr'),
                   ('', 'IZ', '%', 'AYz'),
                   ('', 'IS', '%', 'AYz'),
                   ('', 'I', 'D%', 'AY'),
                   ('+^', 'I', '^+', 'IH'),
                   ('', 'I', 'T%', 'AY'),
                   ('#:^', 'I', '^+', 'IH'),
                   ('', 'I', '^+', 'AY'),
                   ('', 'IR', '', 'ER'),
                   ('', 'IGH', '', 'AY'),
                   ('', 'ILD', '', 'AYld'),
                   ('', 'IGN', ' ', 'AYn'),
                   ('', 'IGN', '^', 'AYn'),
                   ('', 'IGN', '%', 'AYn'),
                   ('', 'IQUE', '', 'IYk'),
                   ('', 'I', '', 'IH')),
             'J': (('', 'J', '', 'j'),),
             'K': ((' ', 'K', 'N', ''),
                   ('', 'K', '', 'k')),
             'L': (('', 'LO', 'C#', 'lOW'),
                   ('L', 'L', '', ''),
                   ('#:^', 'L', '%', 'AXl'),
                   ('', 'LEAD', '', 'lIYd'),
                   ('', 'L', '', 'l')),
             'M': (('', 'MOV', '', 'mUWv'),
                   ('', 'M', '', 'm')),
             'N': (('E', 'NG', '+', 'nj'),
                   ('', 'NG', 'R', 'NGg'),
                   ('', 'NG', '#', 'NGg'),
                   ('', 'NGL', '%', 'NGgAXl'),
                   ('', 'NG', '', 'NG'),
                   ('', 'NK', '', 'NGk'),
                   (' ', 'NOW', ' ', 'nAW'),
                   ('', 'N', '', 'n')),
             'O': (('', 'OF', ' ', 'AXv'),
                   ('', 'OROUGH', '', 'EROW'),
                   ('#:', 'OR', ' ', 'ER'),
                   ('#:', 'ORS', ' ', 'ERz'),
                   ('', 'OR', '', 'AOr'),
                   (' ', 'ONE', '', 'wAHn'),
                   ('', 'OW', '', 'OW'),
                   (' ', 'OVER', '', 'OWvER'),
                   ('', 'OV', '', 'AHv'),
                   ('', 'O', '^%', 'OW'),
                   ('', 'O', '^EN', 'OW'),
                   ('', 'O', '^I#', 'OW'),
                   ('', 'OL', 'D', 'OWl'),
                   ('', 'OUGHT', '', 'AOt'),
                   ('', 'OUGH', '', 'AHf'),
                   (' ', 'OU', '', 'AW'),
                   ('H', 'OU', 'S#', 'AW'),
                   ('', 'OUS', '', 'AXs'),
                   ('', 'OUR', '', 'AOr'),
                   ('', 'OULD', '', 'UHd'),
                   ('^', 'OU', '^L', 'AH'),
                   ('', 'OUP', '', 'UWp'),
                   ('', 'OU', '', 'AW'),
                   ('', 'OY', '', 'OY'),
                   ('', 'OING', '', 'OWIHNG'),
                   ('', 'OI', '', 'OY'),
                   ('', 'OOR', '', 'AOr'),
                   ('', 'OOK', '', 'UHk'),
                   ('', 'OOD', '', 'UHd'),
                   ('', 'OO', '', 'UW'),
                   ('', 'O', 'E', 'OW'),
                   ('', 'O', ' ', 'OW'),
                   ('', 'OA', '', 'OW'),
                   (' ', 'ONLY', '', 'OWnlIY'),
                   (' ', 'ONCE', '', 'wAHns'),
                   ('', 'ON\'T', '', 'OWnt'),
                   ('C', 'O', 'N', 'AA'),
                   ('', 'O', 'NG', 'AO'),
                   (' :^', 'O', 'N', 'AH'),
                   ('I', 'ON', '', 'AXn'),
                   ('#:', 'ON', ' ', 'AXn'),
                   ('#^', 'ON', '', 'AXn'),
                   ('', 'O', 'ST ', 'OW'),
                   ('', 'OF', '^', 'AOf'),
                   ('', 'OTHER', '', 'AHDHER'),
                   ('', 'OSS', ' ', 'AOs'),
                   ('#:^', 'OM', '', 'AHm'),
                   ('', 'O', '', 'AA')),
             'P': (('', 'PH', '', 'f'),
                   ('', 'PEOP', '', 'pIYp'),
                   ('', 'POW', '', 'pAW'),
                   ('', 'PUT', ' ', 'pUHt'),
                   ('', 'P', '', 'p')),
             'Q': (('', 'QUAR', '', 'kwAOr'),
                   ('', 'QU', '', 'kw'),
                   ('', 'Q', '', 'k')),
             'R': ((' ', 'RE', '^#', 'rIY'),
                   ('', 'R', '', 'r')),
             'S': (('', 'SH', '', 'SH'),
                   ('#', 'SION', '', 'ZHAXn'),
                   ('', 'SOME', '', 'sAHm'),
                   ('#', 'SUR', '#', 'ZHER'),
                   ('', 'SUR', '#', 'SHER'),
                   ('#', 'SU', '#', 'ZHUW'),
                   ('#', 'SSU', '#', 'SHUW'),
                   ('#', 'SED', ' ', 'zd'),
                   ('#', 'S', '#', 'z'),
                   ('', 'SAID', '', 'sEHd'),
                   ('^', 'SION', '', 'SHAXn'),
                   ('', 'S', 'S', ''),
                   ('.', 'S', ' ', 'z'),
                   ('#:.E', 'S', ' ', 'z'),
                   ('#:^##', 'S', ' ', 'z'),
                   ('#:^#', 'S', ' ', 's'),
                   ('U', 'S', ' ', 's'),
                   (' :#', 'S', ' ', 'z'),
                   (' ', 'SCH', '', 'sk'),
                   ('', 'S', 'C+', ''),
                   ('#', 'SM', '', 'zm'),
                   ('#', 'SN', '\'', 'zAXn'),
                   ('', 'S', '', 's')),
             'T': ((' ', 'THE', ' ', 'DHAX'),
                   ('', 'TO', ' ', 'tUW'),
                   ('', 'THAT', ' ', 'DHAEt'),
                   (' ', 'THIS', ' ', 'DHIHs'),
                   (' ', 'THEY', '', 'DHEY'),
                   (' ', 'THERE', '', 'DHEHr'),
                   ('', 'THER', '', 'DHER'),
                   ('', 'THEIR', '', 'DHEHr'),
                   (' ', 'THAN', ' ', 'DHAEn'),
                   (' ', 'THEM', ' ', 'DHEHm'),
                   ('', 'THESE', ' ', 'DHIYz'),
                   (' ', 'THEN', '', 'DHEHn'),
                   ('', 'THROUGH', '', 'THrUW'),
                   ('', 'THOSE', '', 'DHOWz'),
                   ('', 'THOUGH', ' ', 'DHOW'),
                   (' ', 'THUS', '', 'DHAHs'),
                   ('', 'TH', '', 'TH'),
                   ('#:', 'TED', ' ', 'tIHd'),
                   ('S', 'TI', '#N', 'CH'),
                   ('', 'TI', 'O', 'SH'),
                   ('', 'TI', 'A', 'SH'),
                   ('', 'TIEN', '', 'SHAXn'),
                   ('', 'TUR', '#', 'CHER'),
                   ('', 'TU', 'A', 'CHUW'),
                   (' ', 'TWO', '', 'tUW'),
                   ('', 'T', '', 't')),
             'U': ((' ', 'UN', 'I', 'yUWn'),
                   (' ', 'UN', '', 'AHn'),
                   (' ', 'UPON', '', 'AXpAOn'),
                   ('T', 'UR', '#', 'UHr'),
                   ('S', 'UR', '#', 'UHr'),
                   ('R', 'UR', '#', 'UHr'),
                   ('D', 'UR', '#', 'UHr'),
                   ('L', 'UR', '#', 'UHr'),
                   ('Z', 'UR', '#', 'UHr'),
                   ('N', 'UR', '#', 'UHr'),
                   ('J', 'UR', '#', 'UHr'),
                   ('TH', 'UR', '#', 'UHr'),
                   ('CH', 'UR', '#', 'UHr'),
                   ('SH', 'UR', '#', 'UHr'),
                   ('', 'UR', '#', 'yUHr'),
                   ('', 'UR', '', 'ER'),
                   ('', 'U', '^ ', 'AH'),
                   ('', 'U', '^^', 'AH'),
                   ('', 'UY', '', 'AY'),
                   (' G', 'U', '#', ''),
                   ('G', 'U', '%', ''),
                   ('G', 'U', '#', 'w'),
                   ('#N', 'U', '', 'yUW'),
                   ('T', 'U', '', 'UW'),
                   ('S', 'U', '', 'UW'),
                   ('R', 'U', '', 'UW'),
                   ('D', 'U', '', 'UW'),
                   ('L', 'U', '', 'UW'),
                   ('Z', 'U', '', 'UW'),
                   ('N', 'U', '', 'UW'),
                   ('J', 'U', '', 'UW'),
                   ('TH', 'U', '', 'UW'),
                   ('CH', 'U', '', 'UW'),
                   ('SH', 'U', '', 'UW'),
                   ('', 'U', '', 'yUW')),
             'V': (('', 'VIEW', '', 'vyUW'),
                   ('', 'V', '', 'v')),
             'W': ((' ', 'WERE', '', 'wER'),
                   ('', 'WA', 'S', 'wAA'),
                   ('', 'WA', 'T', 'wAA'),
                   ('', 'WHERE', '', 'WHEHr'),
                   ('', 'WHAT', '', 'WHAAt'),
                   ('', 'WHOL', '', 'hOWl'),
                   ('', 'WHO', '', 'hUW'),
                   ('', 'WH', '', 'WH'),
                   ('', 'WAR', '', 'wAOr'),
                   ('', 'WOR', '^', 'wER'),
                   ('', 'WR', '', 'r'),
                   ('', 'W', '', 'w')),
             'X': (('', 'X', '', 'ks'),),
             'Y': (('', 'YOUNG', '', 'yAHNG'),
                   (' ', 'YOU', '', 'yUW'),
                   (' ', 'YES', '', 'yEHs'),
                   (' ', 'Y', '', 'y'),
                   ('#:^', 'Y', ' ', 'IY'),
                   ('#:^', 'Y', 'I', 'IY'),
                   (' :', 'Y', ' ', 'AY'),
                   (' :', 'Y', '#', 'AY'),
                   (' :', 'Y', '^+:#', 'IH'),
                   (' :', 'Y', '^#', 'AY'),
                   ('', 'Y', '', 'IH')),
             'Z': (('', 'Z', '', 'z'),)}
    word = word.upper()
    pron = ''
    pos = 0
    # Scan left to right; each applied rule consumes len(match) characters.
    while pos < len(word):
        # Text before/after the cursor, used as left/right context.
        left_orig = word[:pos]
        right_orig = word[pos:]
        # Rules are indexed by the current character; anything that is not
        # a rule key (digits, punctuation) falls back to the ' ' rule set.
        first = word[pos] if word[pos] in rules else ' '
        for rule in rules[first]:
            left, match, right, out = rule
            if right_orig.startswith(match):
                # Only compile the context patterns actually needed.
                if left:
                    l_pattern = _to_regex(left, left_match=True)
                if right:
                    r_pattern = _to_regex(right, left_match=False)
                # Apply the first rule whose left and right contexts both
                # match (an empty context always matches).
                if ((not left or re_match(l_pattern, left_orig)) and
                        (not right or
                         re_match(r_pattern, right_orig[len(match):]))):
                    pron += out
                    pos += len(match)
                    break
        else:
            # No rule fired (for-else): copy the character through unchanged.
            pron += word[pos]
            pos += 1
    return pron
def bmpm(word, language_arg=0, name_mode='gen', match_mode='approx',
         concat=False, filter_langs=False):
    """Return the Beider-Morse Phonetic Matching algorithm code for a word.

    The Beider-Morse Phonetic Matching algorithm is described in
    :cite:`Beider:2008`.
    The reference implementation is licensed under GPLv3.

    :param str word: the word to transform
    :param str language_arg: the language of the term; supported values
        include:
        - 'any'
        - 'arabic'
        - 'cyrillic'
        - 'czech'
        - 'dutch'
        - 'english'
        - 'french'
        - 'german'
        - 'greek'
        - 'greeklatin'
        - 'hebrew'
        - 'hungarian'
        - 'italian'
        - 'latvian'
        - 'polish'
        - 'portuguese'
        - 'romanian'
        - 'russian'
        - 'spanish'
        - 'turkish'
    :param str name_mode: the name mode of the algorithm:
        - 'gen' -- general (default)
        - 'ash' -- Ashkenazi
        - 'sep' -- Sephardic
    :param str match_mode: matching mode: 'approx' or 'exact'
    :param bool concat: concatenation mode
    :param bool filter_langs: filter out incompatible languages
    :returns: the BMPM value(s)
    :rtype: tuple

    >>> bmpm('Christopher')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir xristopi xritopir xritopi xristofi xritofir xritofi tzristopir
    tzristofir zristopir zristopi zritopir zritopi zristofir zristofi zritofir
    zritofi'
    >>> bmpm('Niall')
    'nial niol'
    >>> bmpm('Smith')
    'zmit'
    >>> bmpm('Schmidt')
    'zmit stzmit'
    >>> bmpm('Christopher', language_arg='German')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir'
    >>> bmpm('Christopher', language_arg='English')
    'tzristofir tzrQstofir tzristafir tzrQstafir xristofir xrQstofir xristafir
    xrQstafir'
    >>> bmpm('Christopher', language_arg='German', name_mode='ash')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir'
    >>> bmpm('Christopher', language_arg='German', match_mode='exact')
    'xriStopher xriStofer xristopher xristofer'
    """
    # Thin public wrapper: all of the work happens in the private _bmpm
    # implementation; arguments are forwarded positionally, unchanged.
    args = (word, language_arg, name_mode, match_mode, concat, filter_langs)
    return _bmpm(*args)
if __name__ == '__main__':
    # When run as a script, execute this module's docstring examples as
    # doctests (each algorithm's docstring doubles as a test suite).
    import doctest
    doctest.testmod()