# Source code for abydos.phonetic

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic.

The phonetic module implements phonetic algorithms including:

    - Robert C. Russell's Index
    - American Soundex
    - Refined Soundex
    - Daitch-Mokotoff Soundex
    - Kölner Phonetik
    - NYSIIS
    - Match Rating Algorithm
    - Metaphone
    - Double Metaphone
    - Caverphone
    - Alpha Search Inquiry System
    - Fuzzy Soundex
    - Phonex
    - Phonem
    - Phonix
    - SfinxBis
    - phonet
    - Standardized Phonetic Frequency Code
    - Statistics Canada
    - Lein
    - Roger Root
    - Oxford Name Compression Algorithm (ONCA)
    - Eudex phonetic hash
    - Haase Phonetik
    - Reth-Schek Phonetik
    - FONEM
    - Parmar-Kumbharana
    - Davidson's Consonant Code
    - SoundD
    - PSHP Soundex/Viewex Coding
    - an early version of Henry Code
    - Norphone
    - Dolby Code
    - Phonetic Spanish
    - Spanish Metaphone
    - MetaSoundex
    - SoundexBR
    - NRL English-to-phoneme
    - Beider-Morse Phonetic Matching
"""

from __future__ import division, unicode_literals

from collections import Counter
from itertools import groupby, product
from re import compile as re_compile
from re import match as re_match
from unicodedata import normalize as unicode_normalize

from six import text_type
from six.moves import range

from ._bm import _bmpm

# Public API: the names exported by ``from abydos.phonetic import *``.
__all__ = ['alpha_sis', 'bmpm', 'caverphone', 'davidson', 'dm_soundex',
           'dolby', 'double_metaphone', 'eudex', 'fonem', 'fuzzy_soundex',
           'haase_phonetik', 'henry_early', 'koelner_phonetik',
           'koelner_phonetik_alpha', 'koelner_phonetik_num_to_alpha', 'lein',
           'metaphone', 'metasoundex', 'mra', 'norphone', 'nrl', 'nysiis',
           'onca', 'parmar_kumbharana', 'phonem', 'phonet', 'phonetic_spanish',
           'phonex', 'phonix', 'pshp_soundex_first', 'pshp_soundex_last',
           'refined_soundex', 'reth_schek_phonetik', 'roger_root',
           'russell_index', 'russell_index_alpha',
           'russell_index_num_to_alpha', 'sfinxbis', 'sound_d', 'soundex',
           'soundex_br', 'spanish_metaphone', 'spfc', 'statistics_canada']


def _delete_consecutive_repeats(word):
    """Delete consecutive repeated characters in a word.

    :param str word: the word to transform
    :returns: word with consecutive repeating characters collapsed to
        a single instance
    :rtype: str

    >>> _delete_consecutive_repeats('REDDEE')
    'REDE'
    >>> _delete_consecutive_repeats('AEIOU')
    'AEIOU'
    >>> _delete_consecutive_repeats('AAACCCTTTGGG')
    'ACTG'
    """
    return ''.join(char for char, _ in groupby(word))


def russell_index(word):
    """Return the Russell Index (integer output) of a word.

    This follows Robert C. Russell's Index algorithm, as described in
    :cite:`Russell:1917`.

    :param str word: the word to transform
    :returns: the Russell Index value; if the word contains no letter that
        the index can code, ``float('NaN')`` is returned instead
    :rtype: int (or float, for the NaN case)

    >>> russell_index('Christopher')
    3813428
    >>> russell_index('Niall')
    715
    >>> russell_index('Smith')
    3614
    >>> russell_index('Schmidt')
    3614
    """
    # letters without an entry here (H, J, W) have no code and are dropped
    _russell_translation = dict(zip((ord(_) for _ in
                                     'ABCDEFGIKLMNOPQRSTUVXYZ'),
                                    '12341231356712383412313'))

    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = word.replace('GH', '')  # discard gh (rule 3)
    word = word.rstrip('SZ')  # discard /[sz]$/ (rule 3)

    # keep only the letters the table can code, then translate them
    word = ''.join(c for c in word if ord(c) in _russell_translation)
    sdx = word.translate(_russell_translation)

    # remove any 1s after the first occurrence
    head, first_one, tail = sdx.partition('1')
    sdx = head + first_one + tail.replace('1', '')

    # remove repeating characters
    sdx = _delete_consecutive_repeats(sdx)

    # return as an int
    return int(sdx) if sdx else float('NaN')


def russell_index_num_to_alpha(num):
    """Convert the Russell Index integer to an alphabetic string.

    This follows Robert C. Russell's Index algorithm, as described in
    :cite:`Russell:1917`.

    :param int num: a Russell Index integer value
    :returns: the Russell Index as an alphabetic string; characters of the
        input's text form other than the digits 1-8 are ignored, so an input
        without any such digit yields an empty string
    :rtype: str

    >>> russell_index_num_to_alpha(3813428)
    'CRACDBR'
    >>> russell_index_num_to_alpha(715)
    'NAL'
    >>> russell_index_num_to_alpha(3614)
    'CMAD'
    """
    _russell_num_translation = dict(zip((ord(_) for _ in '12345678'),
                                        'ABCDLMNR'))
    # keep only the digits that have a letter assigned in the table
    digits = ''.join(c for c in text_type(num)
                     if ord(c) in _russell_num_translation)
    return digits.translate(_russell_num_translation)


def russell_index_alpha(word):
    """Return the Russell Index (alphabetic output) for the word.

    This follows Robert C. Russell's Index algorithm, as described in
    :cite:`Russell:1917`.

    :param str word: the word to transform
    :returns: the Russell Index value as an alphabetic string (an empty
        string for an empty or otherwise falsy word)
    :rtype: str

    >>> russell_index_alpha('Christopher')
    'CRACDBR'
    >>> russell_index_alpha('Niall')
    'NAL'
    >>> russell_index_alpha('Smith')
    'CMAD'
    >>> russell_index_alpha('Schmidt')
    'CMAD'
    """
    # falsy input is answered directly, without calling russell_index
    if not word:
        return ''
    return russell_index_num_to_alpha(russell_index(word))


def soundex(word, max_length=4, var='American', reverse=False, zero_pad=True):
    """Return the Soundex code for a word.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 4);
        values are clamped to the range 4-64 and -1 selects 64
    :param str var: the variant of the algorithm to employ (defaults to
        'American'):

        - 'American' follows the American Soundex algorithm, as described at
          :cite:`US:2007` and in :cite:`Knuth:1998`; this is also called
          Miracode
        - 'special' follows the rules from the 1880-1910 US Census
          retrospective re-analysis, in which h & w are not treated as blocking
          consonants but as vowels. Cf. :cite:`Repici:2013`.
        - 'Census' follows the rules laid out in GIL 55 :cite:`US:1997` by the
          US Census, including coding prefixed and unprefixed versions of some
          names

    :param bool reverse: reverse the word before computing the selected Soundex
        (defaults to False); This results in "Reverse Soundex", which is useful
        for blocking in cases where the initial elements may be in error.
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        max_length string
    :returns: the Soundex value; with ``var='Census'`` and a word that starts
        with VAN or CON (and is longer than 4 characters) or with DE, DI, LA
        or LE (and is longer than 3 characters), a 2-tuple holding the
        American Soundex codes of the full word and of the word without that
        prefix
    :rtype: str or tuple

    >>> soundex("Christopher")
    'C623'
    >>> soundex("Niall")
    'N400'
    >>> soundex('Smith')
    'S530'
    >>> soundex('Schmidt')
    'S530'

    >>> soundex('Christopher', max_length=-1)
    'C623160000000000000000000000000000000000000000000000000000000000'
    >>> soundex('Christopher', max_length=-1, zero_pad=False)
    'C62316'

    >>> soundex('Christopher', reverse=True)
    'R132'

    >>> soundex('Ashcroft')
    'A261'
    >>> soundex('Asicroft')
    'A226'
    >>> soundex('Ashcroft', var='special')
    'A226'
    >>> soundex('Asicroft', var='special')
    'A226'
    """
    # 0 marks the vowels (and Y); 9 marks H and W, which are handled per
    # variant below
    _soundex_translation = dict(zip((ord(_) for _ in
                                     'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                    '01230129022455012623019202'))

    # Require a max_length of at least 4 and not more than 64
    if max_length != -1:
        max_length = min(max(4, max_length), 64)
    else:
        max_length = 64

    # uppercase, normalize, and decompose
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    if var == 'Census':
        # TODO: Should these prefixes be supplemented? (VANDE, DELA, VON)
        if word[:3] in {'VAN', 'CON'} and len(word) > 4:
            return (soundex(word, max_length, 'American', reverse, zero_pad),
                    soundex(word[3:], max_length, 'American', reverse,
                            zero_pad))
        if word[:2] in {'DE', 'DI', 'LA', 'LE'} and len(word) > 3:
            return (soundex(word, max_length, 'American', reverse, zero_pad),
                    soundex(word[2:], max_length, 'American', reverse,
                            zero_pad))
        # Otherwise, proceed as usual (var='American' mode, ostensibly)

    # filter non-A-Z out (the translation table has exactly the keys A-Z)
    word = ''.join(c for c in word if ord(c) in _soundex_translation)

    # Nothing to convert, return base case
    if not word:
        if zero_pad:
            return '0'*max_length
        return '0'

    # Reverse word if computing Reverse Soundex
    if reverse:
        word = word[::-1]

    # apply the Soundex algorithm
    sdx = word.translate(_soundex_translation)

    if var == 'special':
        sdx = sdx.replace('9', '0')  # special rule for 1880-1910 census
    else:
        sdx = sdx.replace('9', '')  # rule 1
    sdx = _delete_consecutive_repeats(sdx)  # rule 3

    # The code starts with the first letter itself.  For an initial H or W
    # the letter is prepended to the whole digit string; otherwise it takes
    # the place of the first digit.
    if word[0] in 'HW':
        sdx = word[0] + sdx
    else:
        sdx = word[0] + sdx[1:]
    sdx = sdx.replace('0', '')  # rule 1

    if zero_pad:
        sdx += ('0'*max_length)  # rule 4

    return sdx[:max_length]


def refined_soundex(word, max_length=-1, zero_pad=False,
                    retain_vowels=False):
    """Return the Refined Soundex code for a word.

    This is Soundex, but with more character classes. It was defined at
    :cite:`Boyce:1998`.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to
        unlimited); values of 0 or less leave the code untruncated
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        max_length string (only applied when max_length is positive)
    :param bool retain_vowels: retain vowels (as 0) in the resulting code
    :returns: the Refined Soundex value
    :rtype: str

    >>> refined_soundex('Christopher')
    'C393619'
    >>> refined_soundex('Niall')
    'N87'
    >>> refined_soundex('Smith')
    'S386'
    >>> refined_soundex('Schmidt')
    'S386'
    """
    _ref_soundex_translation = dict(zip((ord(_) for _ in
                                         'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                        '01360240043788015936020505'))

    # uppercase, normalize, decompose, and filter non-A-Z out
    # (the translation table has exactly the keys A-Z)
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if ord(c) in _ref_soundex_translation)

    # apply the Soundex algorithm: the first letter is kept verbatim and is
    # followed by the codes of all letters, the first one included
    sdx = word[:1] + word.translate(_ref_soundex_translation)
    sdx = _delete_consecutive_repeats(sdx)
    if not retain_vowels:
        sdx = sdx.replace('0', '')  # Delete vowels, H, W, Y

    if max_length > 0:
        if zero_pad:
            sdx += ('0' * max_length)
        sdx = sdx[:max_length]

    return sdx


def dm_soundex(word, max_length=6, zero_pad=True):
    """Return the Daitch-Mokotoff Soundex code for a word.

    Based on Daitch-Mokotoff Soundex :cite:`Mokotoff:1997`, this returns values
    of a word as a set. A collection is necessary since there can be multiple
    values for a single word.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 6;
        values are clamped to the range 6-64 and -1 selects 64)
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        max_length string
    :returns: the Daitch-Mokotoff Soundex values
    :rtype: set of str

    >>> sorted(dm_soundex('Christopher'))
    ['494379', '594379']
    >>> dm_soundex('Niall')
    {'680000'}
    >>> dm_soundex('Smith')
    {'463000'}
    >>> dm_soundex('Schmidt')
    {'463000'}

    >>> sorted(dm_soundex('The quick brown fox', max_length=20,
    ... zero_pad=False))
    ['35457976754', '3557976754']
    """
    # Each entry is a triple of codes: (at the start of the word, before a
    # vowel, elsewhere).  '_' is a placeholder that is removed at the end, and
    # a code that is itself a tuple yields two alternative codings.
    _dms_table = {'STCH': (2, 4, 4), 'DRZ': (4, 4, 4), 'ZH': (4, 4, 4),
                  'ZHDZH': (2, 4, 4), 'DZH': (4, 4, 4), 'DRS': (4, 4, 4),
                  'DZS': (4, 4, 4), 'SCHTCH': (2, 4, 4), 'SHTSH': (2, 4, 4),
                  'SZCZ': (2, 4, 4), 'TZS': (4, 4, 4), 'SZCS': (2, 4, 4),
                  'STSH': (2, 4, 4), 'SHCH': (2, 4, 4), 'D': (3, 3, 3),
                  'H': (5, 5, '_'), 'TTSCH': (4, 4, 4), 'THS': (4, 4, 4),
                  'L': (8, 8, 8), 'P': (7, 7, 7), 'CHS': (5, 54, 54),
                  'T': (3, 3, 3), 'X': (5, 54, 54), 'OJ': (0, 1, '_'),
                  'OI': (0, 1, '_'), 'SCHTSH': (2, 4, 4), 'OY': (0, 1, '_'),
                  'Y': (1, '_', '_'), 'TSH': (4, 4, 4), 'ZDZ': (2, 4, 4),
                  'TSZ': (4, 4, 4), 'SHT': (2, 43, 43), 'SCHTSCH': (2, 4, 4),
                  'TTSZ': (4, 4, 4), 'TTZ': (4, 4, 4), 'SCH': (4, 4, 4),
                  'TTS': (4, 4, 4), 'SZD': (2, 43, 43), 'AI': (0, 1, '_'),
                  'PF': (7, 7, 7), 'TCH': (4, 4, 4), 'PH': (7, 7, 7),
                  'TTCH': (4, 4, 4), 'SZT': (2, 43, 43), 'ZDZH': (2, 4, 4),
                  'EI': (0, 1, '_'), 'G': (5, 5, 5), 'EJ': (0, 1, '_'),
                  'ZD': (2, 43, 43), 'IU': (1, '_', '_'), 'K': (5, 5, 5),
                  'O': (0, '_', '_'), 'SHTCH': (2, 4, 4), 'S': (4, 4, 4),
                  'TRZ': (4, 4, 4), 'SHD': (2, 43, 43), 'DSH': (4, 4, 4),
                  'CSZ': (4, 4, 4), 'EU': (1, 1, '_'), 'TRS': (4, 4, 4),
                  'ZS': (4, 4, 4), 'STRZ': (2, 4, 4), 'UY': (0, 1, '_'),
                  'STRS': (2, 4, 4), 'CZS': (4, 4, 4),
                  'MN': ('6_6', '6_6', '6_6'), 'UI': (0, 1, '_'),
                  'UJ': (0, 1, '_'), 'UE': (0, '_', '_'), 'EY': (0, 1, '_'),
                  'W': (7, 7, 7), 'IA': (1, '_', '_'), 'FB': (7, 7, 7),
                  'STSCH': (2, 4, 4), 'SCHT': (2, 43, 43),
                  'NM': ('6_6', '6_6', '6_6'), 'SCHD': (2, 43, 43),
                  'B': (7, 7, 7), 'DSZ': (4, 4, 4), 'F': (7, 7, 7),
                  'N': (6, 6, 6), 'CZ': (4, 4, 4), 'R': (9, 9, 9),
                  'U': (0, '_', '_'), 'V': (7, 7, 7), 'CS': (4, 4, 4),
                  'Z': (4, 4, 4), 'SZ': (4, 4, 4), 'TSCH': (4, 4, 4),
                  'KH': (5, 5, 5), 'ST': (2, 43, 43), 'KS': (5, 54, 54),
                  'SH': (4, 4, 4), 'SC': (2, 4, 4), 'SD': (2, 43, 43),
                  'DZ': (4, 4, 4), 'ZHD': (2, 43, 43), 'DT': (3, 3, 3),
                  'ZSH': (4, 4, 4), 'DS': (4, 4, 4), 'TZ': (4, 4, 4),
                  'TS': (4, 4, 4), 'TH': (3, 3, 3), 'TC': (4, 4, 4),
                  'A': (0, '_', '_'), 'E': (0, '_', '_'), 'I': (0, '_', '_'),
                  'AJ': (0, 1, '_'), 'M': (6, 6, 6), 'Q': (5, 5, 5),
                  'AU': (0, 7, '_'), 'IO': (1, '_', '_'), 'AY': (0, 1, '_'),
                  'IE': (1, '_', '_'), 'ZSCH': (4, 4, 4),
                  'CH': ((5, 4), (5, 4), (5, 4)),
                  'CK': ((5, 45), (5, 45), (5, 45)),
                  'C': ((5, 4), (5, 4), (5, 4)),
                  'J': ((1, 4), ('_', 4), ('_', 4)),
                  'RZ': ((94, 4), (94, 4), (94, 4)),
                  'RS': ((94, 4), (94, 4), (94, 4))}

    # For every initial letter, the codable substrings in the order in which
    # they are tried; each list ends with the single letter itself, so a match
    # is always found.
    _dms_order = {'A': ('AI', 'AJ', 'AU', 'AY', 'A'),
                  'B': ('B',),
                  'C': ('CHS', 'CSZ', 'CZS', 'CH', 'CK', 'CS', 'CZ', 'C'),
                  'D': ('DRS', 'DRZ', 'DSH', 'DSZ', 'DZH', 'DZS', 'DS', 'DT',
                        'DZ', 'D'),
                  'E': ('EI', 'EJ', 'EU', 'EY', 'E'),
                  'F': ('FB', 'F'),
                  'G': ('G',),
                  'H': ('H',),
                  'I': ('IA', 'IE', 'IO', 'IU', 'I'),
                  'J': ('J',),
                  'K': ('KH', 'KS', 'K'),
                  'L': ('L',),
                  'M': ('MN', 'M'),
                  'N': ('NM', 'N'),
                  'O': ('OI', 'OJ', 'OY', 'O'),
                  'P': ('PF', 'PH', 'P'),
                  'Q': ('Q',),
                  'R': ('RS', 'RZ', 'R'),
                  'S': ('SCHTSCH', 'SCHTCH', 'SCHTSH', 'SHTCH', 'SHTSH',
                        'STSCH', 'SCHD', 'SCHT', 'SHCH', 'STCH', 'STRS',
                        'STRZ', 'STSH', 'SZCS', 'SZCZ', 'SCH', 'SHD', 'SHT',
                        'SZD', 'SZT', 'SC', 'SD', 'SH', 'ST', 'SZ', 'S'),
                  'T': ('TTSCH', 'TSCH', 'TTCH', 'TTSZ', 'TCH', 'THS', 'TRS',
                        'TRZ', 'TSH', 'TSZ', 'TTS', 'TTZ', 'TZS', 'TC', 'TH',
                        'TS', 'TZ', 'T'),
                  'U': ('UE', 'UI', 'UJ', 'UY', 'U'),
                  'V': ('V',),
                  'W': ('W',),
                  'X': ('X',),
                  'Y': ('Y',),
                  'Z': ('ZHDZH', 'ZDZH', 'ZSCH', 'ZDZ', 'ZHD', 'ZSH', 'ZD',
                        'ZH', 'ZS', 'Z')}

    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}
    dms = ['']  # initialize empty code list

    # Require a max_length of at least 6 and not more than 64
    if max_length != -1:
        max_length = min(max(6, max_length), 64)
    else:
        max_length = 64

    # uppercase, normalize, decompose, and filter non-A-Z
    # (_dms_order has exactly the keys A-Z, so the lookup below cannot fail)
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in _dms_order)

    # Nothing to convert, return base case
    if not word:
        if zero_pad:
            return {'0'*max_length}
        return {'0'}

    pos = 0
    while pos < len(word):
        # Iterate through _dms_order, which specifies the possible substrings
        # for which codes exist in the Daitch-Mokotoff coding
        for sstr in _dms_order[word[pos]]:  # pragma: no branch
            # test in place rather than slicing off the rest of the word
            if word.startswith(sstr, pos):
                # Having determined a valid substring start, retrieve the code
                dm_val = _dms_table[sstr]
                after = pos + len(sstr)

                # Having retrieved the code (triple), determine the correct
                # positional variant (first, pre-vocalic, elsewhere)
                if pos == 0:
                    dm_val = dm_val[0]
                elif after < len(word) and word[after] in _vowels:
                    dm_val = dm_val[1]
                else:
                    dm_val = dm_val[2]

                # Build the code strings; a tuple means two alternative codes,
                # which doubles the number of candidate codes
                if isinstance(dm_val, tuple):
                    dms = ([_ + text_type(dm_val[0]) for _ in dms] +
                           [_ + text_type(dm_val[1]) for _ in dms])
                else:
                    dms = [_ + text_type(dm_val) for _ in dms]
                pos = after
                break

    # Filter out double letters and _ placeholders
    dms = (''.join(c for c in _delete_consecutive_repeats(_) if c != '_')
           for _ in dms)

    # Trim codes and return set
    if zero_pad:
        dms = ((_ + ('0'*max_length))[:max_length] for _ in dms)
    else:
        dms = (_[:max_length] for _ in dms)
    return set(dms)


def koelner_phonetik(word):
    """Return the Kölner Phonetik (numeric output) code for a word.

    Based on the algorithm defined by :cite:`Postel:1969`.

    While the output code is numeric, it is still a str because 0s can lead
    the code.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as a numeric string (an empty string
        if the word contains no codable letter)
    :rtype: str

    >>> koelner_phonetik('Christopher')
    '478237'
    >>> koelner_phonetik('Niall')
    '65'
    >>> koelner_phonetik('Smith')
    '862'
    >>> koelner_phonetik('Schmidt')
    '862'
    >>> koelner_phonetik('Müller')
    '657'
    >>> koelner_phonetik('Zimmermann')
    '86766'
    """
    def _after(word, pos, letters):
        """Return True if word[pos] follows one of the supplied letters."""
        return pos > 0 and word[pos-1] in letters

    def _before(word, pos, letters):
        """Return True if word[pos] precedes one of the supplied letters."""
        return pos+1 < len(word) and word[pos+1] in letters

    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}

    word = text_type(word.upper())
    word = word.replace('ß', 'SS')

    # Expand the umlauts before NFKD normalization: NFKD splits them into a
    # base letter plus a combining mark, after which these replacements could
    # no longer match.
    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')
    word = unicode_normalize('NFKD', word)
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Nothing to convert, return base case
    if not word:
        return ''

    codes = []
    for i, char in enumerate(word):
        if char in _vowels:
            codes.append('0')
        elif char == 'B':
            codes.append('1')
        elif char == 'P':
            codes.append('3' if _before(word, i, {'H'}) else '1')
        elif char in {'D', 'T'}:
            codes.append('8' if _before(word, i, {'C', 'S', 'Z'}) else '2')
        elif char in {'F', 'V', 'W'}:
            codes.append('3')
        elif char in {'G', 'K', 'Q'}:
            codes.append('4')
        elif char == 'C':
            if _after(word, i, {'S', 'Z'}):
                codes.append('8')
            elif i == 0:
                # word-initial C: L and R also count as hardening followers
                if _before(word, i, {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U',
                                     'X'}):
                    codes.append('4')
                else:
                    codes.append('8')
            elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
                codes.append('4')
            else:
                codes.append('8')
        elif char == 'X':
            codes.append('8' if _after(word, i, {'C', 'K', 'Q'}) else '48')
        elif char == 'L':
            codes.append('5')
        elif char in {'M', 'N'}:
            codes.append('6')
        elif char == 'R':
            codes.append('7')
        elif char in {'S', 'Z'}:
            codes.append('8')
        # H has no branch above and therefore contributes no code

    sdx = _delete_consecutive_repeats(''.join(codes))

    # a 0 is kept only as the very first digit of the code
    return sdx[:1] + sdx[1:].replace('0', '')


def koelner_phonetik_num_to_alpha(num):
    """Convert a Kölner Phonetik code from numeric to alphabetic.

    :param str num: a numeric Kölner Phonetik representation (can be a str or
        an int)
    :returns: an alphabetic representation of the same word; characters of
        the input other than the digits 0-8 are ignored
    :rtype: str

    >>> koelner_phonetik_num_to_alpha('862')
    'SNT'
    >>> koelner_phonetik_num_to_alpha('657')
    'NLR'
    >>> koelner_phonetik_num_to_alpha('86766')
    'SNRNN'
    """
    _koelner_num_translation = dict(zip((ord(_) for _ in '012345678'),
                                        'APTFKLNRS'))
    # keep only the digits that have a letter assigned in the table
    digits = ''.join(c for c in text_type(num)
                     if ord(c) in _koelner_num_translation)
    return digits.translate(_koelner_num_translation)


def koelner_phonetik_alpha(word):
    """Return the Kölner Phonetik (alphabetic output) code for a word.

    The numeric code is computed by koelner_phonetik and then converted to
    letters by koelner_phonetik_num_to_alpha.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as an alphabetic string
    :rtype: str

    >>> koelner_phonetik_alpha('Smith')
    'SNT'
    >>> koelner_phonetik_alpha('Schmidt')
    'SNT'
    >>> koelner_phonetik_alpha('Müller')
    'NLR'
    >>> koelner_phonetik_alpha('Zimmermann')
    'SNRNN'
    """
    return koelner_phonetik_num_to_alpha(koelner_phonetik(word))


def nysiis(word, max_length=6, modified=False):
    """Return the NYSIIS code for a word.

    The New York State Identification and Intelligence System algorithm is
    defined in :cite:`Taft:1970`.

    The modified version of this algorithm is described in Appendix B of
    :cite:`Lynch:1977`.

    :param str word: the word to transform
    :param int max_length: the maximum length (default 6) of the code to
        return; values from 0 to 5 are raised to 6 and negative values
        disable truncation
    :param bool modified: indicates whether to use USDA modified NYSIIS
    :returns: the NYSIIS value; an empty string if the word contains no
        alphabetic character, or the literal ``'ERROR'`` when, in modified
        mode, the word ends in JR or SR once the earlier prefix and suffix
        rules have been applied
    :rtype: str

    >>> nysiis('Christopher')
    'CRASTA'
    >>> nysiis('Niall')
    'NAL'
    >>> nysiis('Smith')
    'SNAT'
    >>> nysiis('Schmidt')
    'SNAD'

    >>> nysiis('Christopher', max_length=-1)
    'CRASTAFAR'

    >>> nysiis('Christopher', max_length=8, modified=True)
    'CRASTAFA'
    >>> nysiis('Niall', max_length=8, modified=True)
    'NAL'
    >>> nysiis('Smith', max_length=8, modified=True)
    'SNAT'
    >>> nysiis('Schmidt', max_length=8, modified=True)
    'SNAD'
    """
    # Require a max_length of at least 6
    if max_length > -1:
        max_length = max(6, max_length)

    _vowels = {'A', 'E', 'I', 'O', 'U'}

    word = ''.join(c for c in word.upper() if c.isalpha())
    word = word.replace('ß', 'SS')

    # exit early if there are no alphas
    if not word:
        return ''

    original_first_char = word[0]

    # translate the start of the word
    if word[:3] == 'MAC':
        word = 'MCC'+word[3:]
    elif word[:2] == 'KN':
        word = 'NN'+word[2:]
    elif word[:1] == 'K':
        word = 'C'+word[1:]
    elif word[:2] in {'PH', 'PF'}:
        word = 'FF'+word[2:]
    elif word[:3] == 'SCH':
        word = 'SSS'+word[3:]
    elif modified:
        if word[:2] in {'WR', 'RH'}:
            word = 'RR'+word[2:]
        elif word[:2] == 'DG':
            word = 'GG'+word[2:]
        elif word[:1] in _vowels:
            word = 'A'+word[1:]

    if modified and word[-1:] in {'S', 'Z'}:
        word = word[:-1]

    # translate the end of the word
    if word[-2:] in {'EE', 'IE'} or (modified and word[-2:] == 'YE'):
        word = word[:-2]+'Y'
    elif word[-2:] in {'DT', 'RT', 'RD'}:
        word = word[:-2]+'D'
    elif word[-2:] in {'NT', 'ND'}:
        word = word[:-2]+('N' if modified else 'D')
    elif modified:
        if word[-2:] == 'IX':
            word = word[:-2]+'ICK'
        elif word[-2:] == 'EX':
            word = word[:-2]+'ECK'
        elif word[-2:] in {'JR', 'SR'}:
            return 'ERROR'

    key = word[:1]

    # The word is rewritten in place while it is scanned.  ``skip`` counts
    # the following positions that were already handled as part of a
    # multi-letter rule.
    skip = 0
    for i in range(1, len(word)):
        if i >= len(word):
            # the KN rule shortens the word, so late indices may be past it
            continue
        elif skip:
            skip -= 1
            continue
        elif word[i:i+2] == 'EV':
            word = word[:i] + 'AF' + word[i+2:]
            skip = 1
        elif word[i] in _vowels:
            word = word[:i] + 'A' + word[i+1:]
        elif modified and i != len(word)-1 and word[i] == 'Y':
            word = word[:i] + 'A' + word[i+1:]
        elif word[i] == 'Q':
            word = word[:i] + 'G' + word[i+1:]
        elif word[i] == 'Z':
            word = word[:i] + 'S' + word[i+1:]
        elif word[i] == 'M':
            word = word[:i] + 'N' + word[i+1:]
        elif word[i:i+2] == 'KN':
            word = word[:i] + 'N' + word[i+2:]
        elif word[i] == 'K':
            word = word[:i] + 'C' + word[i+1:]
        elif modified and i == len(word)-3 and word[i:i+3] == 'SCH':
            word = word[:i] + 'SSA'
            skip = 2
        elif word[i:i+3] == 'SCH':
            word = word[:i] + 'SSS' + word[i+3:]
            skip = 2
        elif modified and i == len(word)-2 and word[i:i+2] == 'SH':
            word = word[:i] + 'SA'
            skip = 1
        elif word[i:i+2] == 'SH':
            word = word[:i] + 'SS' + word[i+2:]
            skip = 1
        elif word[i:i+2] == 'PH':
            word = word[:i] + 'FF' + word[i+2:]
            skip = 1
        elif modified and word[i:i+3] == 'GHT':
            word = word[:i] + 'TTT' + word[i+3:]
            skip = 2
        elif modified and word[i:i+2] == 'DG':
            word = word[:i] + 'GG' + word[i+2:]
            skip = 1
        elif modified and word[i:i+2] == 'WR':
            word = word[:i] + 'RR' + word[i+2:]
            skip = 1
        elif word[i] == 'H' and (word[i-1] not in _vowels or
                                 word[i+1:i+2] not in _vowels):
            # H not enclosed by vowels takes the value of the previous letter
            word = word[:i] + word[i-1] + word[i+1:]
        elif word[i] == 'W' and word[i-1] in _vowels:
            word = word[:i] + word[i-1] + word[i+1:]

        # append the (possibly multi-letter) result unless it merely repeats
        # the last character of the key
        if word[i:i+skip+1] != key[-1:]:
            key += word[i:i+skip+1]

    key = _delete_consecutive_repeats(key)

    # clean up the end of the key
    if key[-1:] == 'S':
        key = key[:-1]
    if key[-2:] == 'AY':
        key = key[:-2] + 'Y'
    if key[-1:] == 'A':
        key = key[:-1]
    if modified and key[:1] == 'A':
        key = original_first_char + key[1:]

    if max_length > 0:
        key = key[:max_length]

    return key


def mra(word):
    """Return the MRA personal numeric identifier (PNI) for a word.

    A description of the Western Airlines Surname Match Rating Algorithm can
    be found on page 18 of :cite:`Moore:1977`.

    :param str word: the word to transform
    :returns: the MRA PNI (a falsy word is returned unchanged)
    :rtype: str

    >>> mra('Christopher')
    'CHRPHR'
    >>> mra('Niall')
    'NL'
    >>> mra('Smith')
    'SMTH'
    >>> mra('Schmidt')
    'SCHMDT'
    """
    if not word:
        return word
    word = word.upper()
    word = word.replace('ß', 'SS')
    # drop every vowel except one in the first position
    word = word[0]+''.join(c for c in word[1:] if
                           c not in {'A', 'E', 'I', 'O', 'U'})
    word = _delete_consecutive_repeats(word)
    # codes longer than six characters keep their first and last three
    if len(word) > 6:
        word = word[:3]+word[-3:]
    return word


def metaphone(word, max_length=-1):
    """Return the Metaphone code for a word.

    Based on Lawrence Philips' Pick BASIC code from 1990 :cite:`Philips:1990`,
    as described in :cite:`Philips:1990b`.
    This incorporates some corrections to the above code, particularly
    some of those suggested by Michael Kuhn in :cite:`Kuhn:1995`.

    :param str word: the word to transform
    :param int max_length: the maximum length of the returned Metaphone code
        (defaults to 64, but in Philips' original implementation this was 4);
        -1 selects the default and any other value below 4 is raised to 4
    :returns: the Metaphone value, never longer than the effective max_length
    :rtype: str

    >>> metaphone('Christopher')
    'KRSTFR'
    >>> metaphone('Niall')
    'NL'
    >>> metaphone('Smith')
    'SM0'
    >>> metaphone('Schmidt')
    'SKMTT'
    """
    _vowels = {'A', 'E', 'I', 'O', 'U'}
    _frontv = {'E', 'I', 'Y'}
    # As in variable sound--those modified by adding an "h"
    _varson = {'C', 'G', 'P', 'S', 'T'}

    # Require a max_length of at least 4
    if max_length != -1:
        max_length = max(4, max_length)
    else:
        max_length = 64

    # Delete non-alphanumeric characters and make all caps
    ename = ''.join(c for c in word.upper() if c.isalnum())
    ename = ename.replace('ß', 'SS')

    if not ename:
        return ''

    # Initial-letter exceptions: drop the first letter of PN-, AE-, KN-, GN-
    # & WR-, turn initial X into S, and reduce initial WH to W
    if ename[0:2] in {'PN', 'AE', 'KN', 'GN', 'WR'}:
        ename = ename[1:]
    elif ename[0] == 'X':
        ename = 'S' + ename[1:]
    elif ename[0:2] == 'WH':
        ename = 'W' + ename[2:]

    # Convert to metaphone
    elen = len(ename)-1  # index of the last character
    metaph = ''
    for i, char in enumerate(ename):
        if len(metaph) >= max_length:
            break
        # Skip the second letter of a doubled pair, except for G & T, which
        # deal with doubling in their own rules below
        if char not in {'G', 'T'} and i > 0 and ename[i-1] == char:
            continue

        # Vowels are only coded when they begin the word
        if char in _vowels and i == 0:
            metaph = char

        elif char == 'B':
            # B is dropped only as the last letter directly after M
            if i != elen or ename[i-1] != 'M':
                metaph += char

        elif char == 'C':
            # C emits nothing in SCE, SCI, & SCY
            if not (i > 0 and ename[i-1] == 'S' and ename[i+1:i+2] in _frontv):
                if ename[i+1:i+3] == 'IA':
                    metaph += 'X'
                elif ename[i+1:i+2] in _frontv:
                    metaph += 'S'
                elif i > 0 and ename[i-1:i+2] == 'SCH':
                    metaph += 'K'
                elif ename[i+1:i+2] == 'H':
                    if i == 0 and i+1 < elen and ename[i+2:i+3] not in _vowels:
                        metaph += 'K'
                    else:
                        metaph += 'X'
                else:
                    metaph += 'K'

        elif char == 'D':
            if ename[i+1:i+2] == 'G' and ename[i+2:i+3] in _frontv:
                metaph += 'J'
            else:
                metaph += 'T'

        elif char == 'G':
            if ename[i+1:i+2] == 'H' and not (i+1 == elen or
                                              ename[i+2:i+3] not in _vowels):
                # GH before a vowel
                continue
            elif i > 0 and ((i+1 == elen and ename[i+1] == 'N') or
                            (i+3 == elen and ename[i+1:i+4] == 'NED')):
                # word-final -GN & -GNED
                continue
            elif (i-1 > 0 and i+1 <= elen and ename[i-1] == 'D' and
                  ename[i+1] in _frontv):
                # the G of -DGE-, -DGI-, & -DGY- was already coded with the D
                continue
            elif ename[i+1:i+2] == 'G':
                # first G of a doubled pair
                continue
            elif ename[i+1:i+2] in _frontv:
                if i == 0 or ename[i-1] != 'G':
                    metaph += 'J'
                else:
                    metaph += 'K'
            else:
                metaph += 'K'

        elif char == 'H':
            if ((i > 0 and ename[i-1] in _vowels and
                 ename[i+1:i+2] not in _vowels)):
                # after a vowel and not before one
                continue
            elif i > 0 and ename[i-1] in _varson:
                # part of a CH, GH, PH, SH, or TH digraph
                continue
            else:
                metaph += 'H'

        elif char in {'F', 'J', 'L', 'M', 'N', 'R'}:
            metaph += char

        elif char == 'K':
            # the K of CK is skipped; the C has already produced a K
            if i > 0 and ename[i-1] == 'C':
                continue
            else:
                metaph += 'K'

        elif char == 'P':
            if ename[i+1:i+2] == 'H':
                metaph += 'F'
            else:
                metaph += 'P'

        elif char == 'Q':
            metaph += 'K'

        elif char == 'S':
            if ((i > 0 and i+2 <= elen and ename[i+1] == 'I' and
                 ename[i+2] in {'O', 'A'})):
                metaph += 'X'
            elif ename[i+1:i+2] == 'H':
                metaph += 'X'
            else:
                metaph += 'S'

        elif char == 'T':
            if ((i > 0 and i+2 <= elen and ename[i+1] == 'I' and
                 ename[i+2] in {'A', 'O'})):
                metaph += 'X'
            elif ename[i+1:i+2] == 'H':
                metaph += '0'
            elif ename[i+1:i+3] != 'CH':
                # T emits nothing in TCH or as the second T of a pair
                if ename[i-1:i] != 'T':
                    metaph += 'T'

        elif char == 'V':
            metaph += 'F'

        elif char in {'W', 'Y'}:
            # W & Y are only kept when a vowel follows
            if ename[i+1:i+2] in _vowels:
                metaph += char

        elif char == 'X':
            metaph += 'KS'

        elif char == 'Z':
            metaph += 'S'

    # The X rule appends two characters at once, so the code may overshoot
    # max_length by one; truncate so that the documented limit always holds
    return metaph[:max_length]


def double_metaphone(word, max_length=-1):
    """Return the Double Metaphone code for a word.

    Based on Lawrence Philips' (Visual) C++ code from 1999
    :cite:`Philips:2000`.

    The word is upper-cased (and 'ß' expanded to 'SS') before it is measured,
    so that end-of-word rules and the main loop see the true length of the
    normalized word.

    :param str word: the word to transform
    :param int max_length: the maximum length of the returned Double Metaphone
        codes (defaults to 64, but in Philips' original implementation this
        was 4); -1 selects the default and any other value below 4 is raised
        to 4
    :returns: the Double Metaphone value(s) as a (primary, secondary) pair;
        the secondary code is '' when it is identical to the primary code
    :rtype: tuple

    >>> double_metaphone('Christopher')
    ('KRSTFR', '')
    >>> double_metaphone('Niall')
    ('NL', '')
    >>> double_metaphone('Smith')
    ('SM0', 'XMT')
    >>> double_metaphone('Schmidt')
    ('XMT', 'SMT')
    """
    # Require a max_length of at least 4
    if max_length != -1:
        max_length = max(4, max_length)
    else:
        max_length = 64

    primary = ''
    secondary = ''

    # NOTE: the helpers below read word, primary, & secondary from the
    # enclosing scope at call time, so they always see the current values.

    def _slavo_germanic():
        """Return True if the word appears to be Slavic or Germanic."""
        return 'W' in word or 'K' in word or 'CZ' in word

    def _metaph_add(pri, sec=''):
        """Return a new metaphone tuple with the supplied elements.

        If sec is empty, pri is appended to both codes. If sec is a single
        space, nothing is appended to the secondary code.
        """
        newpri = primary
        newsec = secondary
        if pri:
            newpri += pri
        if sec:
            if sec != ' ':
                newsec += sec
        else:
            newsec += pri
        return newpri, newsec

    def _is_vowel(pos):
        """Return True if the character at word[pos] is a vowel."""
        return pos >= 0 and word[pos] in {'A', 'E', 'I', 'O', 'U', 'Y'}

    def _get_at(pos):
        """Return the character at word[pos]."""
        return word[pos]

    def _string_at(pos, slen, substrings):
        """Return True if word[pos:pos+slen] is in substrings."""
        if pos < 0:
            return False
        return word[pos:pos+slen] in substrings

    # Normalize first: upper-casing and the 'ß' expansion can both lengthen
    # the word, so the length must be taken from the normalized form
    word = word.upper()
    word = word.replace('ß', 'SS')

    current = 0
    length = len(word)
    if length < 1:
        return '', ''
    last = length - 1

    # Pad the original string so that we can index beyond the edge of the world
    word += '     '

    # Skip these when at start of word
    if word[0:2] in {'GN', 'KN', 'PN', 'WR', 'PS'}:
        current += 1

    # Initial 'X' is pronounced 'Z' e.g. 'Xavier'
    if _get_at(0) == 'X':
        primary, secondary = _metaph_add('S')  # 'Z' maps to 'S'
        current += 1

    # Main loop
    while current < length:
        if _get_at(current) in {'A', 'E', 'I', 'O', 'U', 'Y'}:
            if current == 0:
                # All init vowels now map to 'A'
                primary, secondary = _metaph_add('A')
            current += 1
            continue

        elif _get_at(current) == 'B':
            # "-mb", e.g., "dumb", already skipped over...
            primary, secondary = _metaph_add('P')
            if _get_at(current + 1) == 'B':
                current += 2
            else:
                current += 1
            continue

        elif _get_at(current) == 'Ç':
            primary, secondary = _metaph_add('S')
            current += 1
            continue

        elif _get_at(current) == 'C':
            # Various Germanic
            if (current > 1 and not _is_vowel(current - 2) and
                    _string_at((current - 1), 3, {'ACH'}) and
                    ((_get_at(current + 2) != 'I') and
                     ((_get_at(current + 2) != 'E') or
                      _string_at((current - 2), 6,
                                 {'BACHER', 'MACHER'})))):
                primary, secondary = _metaph_add('K')
                current += 2
                continue

            # Special case 'caesar'
            elif current == 0 and _string_at(current, 6, {'CAESAR'}):
                primary, secondary = _metaph_add('S')
                current += 2
                continue

            # Italian 'chianti'
            elif _string_at(current, 4, {'CHIA'}):
                primary, secondary = _metaph_add('K')
                current += 2
                continue

            elif _string_at(current, 2, {'CH'}):
                # Find 'Michael'
                if current > 0 and _string_at(current, 4, {'CHAE'}):
                    primary, secondary = _metaph_add('K', 'X')
                    current += 2
                    continue

                # Greek roots e.g. 'chemistry', 'chorus'
                elif (current == 0 and
                      (_string_at((current + 1), 5,
                                  {'HARAC', 'HARIS'}) or
                       _string_at((current + 1), 3,
                                  {'HOR', 'HYM', 'HIA', 'HEM'})) and
                      not _string_at(0, 5, {'CHORE'})):
                    primary, secondary = _metaph_add('K')
                    current += 2
                    continue

                # Germanic, Greek, or otherwise 'ch' for 'kh' sound
                elif ((_string_at(0, 4, {'VAN ', 'VON '}) or
                       _string_at(0, 3, {'SCH'})) or
                      # 'architect but not 'arch', 'orchestra', 'orchid'
                      _string_at((current - 2), 6,
                                 {'ORCHES', 'ARCHIT', 'ORCHID'}) or
                      _string_at((current + 2), 1, {'T', 'S'}) or
                      ((_string_at((current - 1), 1,
                                   {'A', 'O', 'U', 'E'}) or
                        (current == 0)) and
                       # e.g., 'wachtler', 'wechsler', but not 'tichner'
                       _string_at((current + 2), 1,
                                  {'L', 'R', 'N', 'M', 'B', 'H', 'F', 'V', 'W',
                                   ' '}))):
                    primary, secondary = _metaph_add('K')

                else:
                    if current > 0:
                        if _string_at(0, 2, {'MC'}):
                            # e.g., "McHugh"
                            primary, secondary = _metaph_add('K')
                        else:
                            primary, secondary = _metaph_add('X', 'K')
                    else:
                        primary, secondary = _metaph_add('X')

                # Shared by the last two 'CH' branches above
                current += 2
                continue

            # e.g, 'czerny'
            elif (_string_at(current, 2, {'CZ'}) and
                  not _string_at((current - 2), 4, {'WICZ'})):
                primary, secondary = _metaph_add('S', 'X')
                current += 2
                continue

            # e.g., 'focaccia'
            elif _string_at((current + 1), 3, {'CIA'}):
                primary, secondary = _metaph_add('X')
                current += 3

            # double 'C', but not if e.g. 'McClellan'
            elif (_string_at(current, 2, {'CC'}) and
                  not ((current == 1) and (_get_at(0) == 'M'))):
                # 'bellocchio' but not 'bacchus'
                if ((_string_at((current + 2), 1,
                                {'I', 'E', 'H'}) and
                     not _string_at((current + 2), 2, {'HU'}))):
                    # 'accident', 'accede' 'succeed'
                    if ((((current == 1) and _get_at(current - 1) == 'A') or
                         _string_at((current - 1), 5,
                                    {'UCCEE', 'UCCES'}))):
                        primary, secondary = _metaph_add('KS')
                    # 'bacci', 'bertucci', other italian
                    else:
                        primary, secondary = _metaph_add('X')
                    current += 3
                    continue
                else:  # Pierce's rule
                    primary, secondary = _metaph_add('K')
                    current += 2
                    continue

            elif _string_at(current, 2, {'CK', 'CG', 'CQ'}):
                primary, secondary = _metaph_add('K')
                current += 2
                continue

            elif _string_at(current, 2, {'CI', 'CE', 'CY'}):
                # Italian vs. English
                if _string_at(current, 3, {'CIO', 'CIE', 'CIA'}):
                    primary, secondary = _metaph_add('S', 'X')
                else:
                    primary, secondary = _metaph_add('S')
                current += 2
                continue

            # else
            else:
                primary, secondary = _metaph_add('K')

                # name sent in 'mac caffrey', 'mac gregor
                if _string_at((current + 1), 2, {' C', ' Q', ' G'}):
                    current += 3
                elif (_string_at((current + 1), 1,
                                 {'C', 'K', 'Q'}) and
                      not _string_at((current + 1), 2, {'CE', 'CI'})):
                    current += 2
                else:
                    current += 1
                continue

        elif _get_at(current) == 'D':
            if _string_at(current, 2, {'DG'}):
                if _string_at((current + 2), 1, {'I', 'E', 'Y'}):
                    # e.g. 'edge'
                    primary, secondary = _metaph_add('J')
                    current += 3
                    continue
                else:
                    # e.g. 'edgar'
                    primary, secondary = _metaph_add('TK')
                    current += 2
                    continue

            elif _string_at(current, 2, {'DT', 'DD'}):
                primary, secondary = _metaph_add('T')
                current += 2
                continue

            # else
            else:
                primary, secondary = _metaph_add('T')
                current += 1
                continue

        elif _get_at(current) == 'F':
            if _get_at(current + 1) == 'F':
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('F')
            continue

        elif _get_at(current) == 'G':
            if _get_at(current + 1) == 'H':
                if (current > 0) and not _is_vowel(current - 1):
                    primary, secondary = _metaph_add('K')
                    current += 2
                    continue

                # 'ghislane', ghiradelli
                elif current == 0:
                    if _get_at(current + 2) == 'I':
                        primary, secondary = _metaph_add('J')
                    else:
                        primary, secondary = _metaph_add('K')
                    current += 2
                    continue

                # Parker's rule (with some further refinements) - e.g., 'hugh'
                elif (((current > 1) and
                       _string_at((current - 2), 1, {'B', 'H', 'D'})) or
                      # e.g., 'bough'
                      ((current > 2) and
                       _string_at((current - 3), 1, {'B', 'H', 'D'})) or
                      # e.g., 'broughton'
                      ((current > 3) and
                       _string_at((current - 4), 1, {'B', 'H'}))):
                    current += 2
                    continue
                else:
                    # e.g. 'laugh', 'McLaughlin', 'cough',
                    #      'gough', 'rough', 'tough'
                    if ((current > 2) and
                            (_get_at(current - 1) == 'U') and
                            (_string_at((current - 3), 1,
                                        {'C', 'G', 'L', 'R', 'T'}))):
                        primary, secondary = _metaph_add('F')
                    elif (current > 0) and _get_at(current - 1) != 'I':
                        primary, secondary = _metaph_add('K')
                    current += 2
                    continue

            elif _get_at(current + 1) == 'N':
                if (current == 1) and _is_vowel(0) and not _slavo_germanic():
                    primary, secondary = _metaph_add('KN', 'N')
                # not e.g. 'cagney'
                elif (not _string_at((current + 2), 2, {'EY'}) and
                      (_get_at(current + 1) != 'Y') and
                      not _slavo_germanic()):
                    primary, secondary = _metaph_add('N', 'KN')
                else:
                    primary, secondary = _metaph_add('KN')
                current += 2
                continue

            # 'tagliaro'
            elif (_string_at((current + 1), 2, {'LI'}) and
                  not _slavo_germanic()):
                primary, secondary = _metaph_add('KL', 'L')
                current += 2
                continue

            # -ges-, -gep-, -gel-, -gie- at beginning
            elif ((current == 0) and
                  ((_get_at(current + 1) == 'Y') or
                   _string_at((current + 1), 2, {'ES', 'EP', 'EB', 'EL', 'EY',
                                                 'IB', 'IL', 'IN', 'IE', 'EI',
                                                 'ER'}))):
                primary, secondary = _metaph_add('K', 'J')
                current += 2
                continue

            #  -ger-,  -gy-
            elif ((_string_at((current + 1), 2, {'ER'}) or
                   (_get_at(current + 1) == 'Y')) and not
                  _string_at(0, 6, {'DANGER', 'RANGER', 'MANGER'}) and not
                  _string_at((current - 1), 1, {'E', 'I'}) and not
                  _string_at((current - 1), 3, {'RGY', 'OGY'})):
                primary, secondary = _metaph_add('K', 'J')
                current += 2
                continue

            #  italian e.g, 'biaggi'
            elif (_string_at((current + 1), 1, {'E', 'I', 'Y'}) or
                  _string_at((current - 1), 4, {'AGGI', 'OGGI'})):
                # obvious germanic
                if (((_string_at(0, 4, {'VAN ', 'VON '}) or
                      _string_at(0, 3, {'SCH'})) or
                     _string_at((current + 1), 2, {'ET'}))):
                    primary, secondary = _metaph_add('K')
                elif _string_at((current + 1), 4, {'IER '}):
                    primary, secondary = _metaph_add('J')
                else:
                    primary, secondary = _metaph_add('J', 'K')
                current += 2
                continue

            else:
                if _get_at(current + 1) == 'G':
                    current += 2
                else:
                    current += 1
                primary, secondary = _metaph_add('K')
                continue

        elif _get_at(current) == 'H':
            # only keep if first & before vowel or btw. 2 vowels
            if ((((current == 0) or _is_vowel(current - 1)) and
                 _is_vowel(current + 1))):
                primary, secondary = _metaph_add('H')
                current += 2
            else:  # also takes care of 'HH'
                current += 1
            continue

        elif _get_at(current) == 'J':
            # obvious spanish, 'jose', 'san jacinto'
            if _string_at(current, 4, {'JOSE'}) or _string_at(0, 4, {'SAN '}):
                if ((((current == 0) and (_get_at(current + 4) == ' ')) or
                     _string_at(0, 4, {'SAN '}))):
                    primary, secondary = _metaph_add('H')
                else:
                    primary, secondary = _metaph_add('J', 'H')
                current += 1
                continue

            elif (current == 0) and not _string_at(current, 4, {'JOSE'}):
                # Yankelovich/Jankelowicz
                primary, secondary = _metaph_add('J', 'A')
            # Spanish pron. of e.g. 'bajador'
            elif (_is_vowel(current - 1) and
                  not _slavo_germanic() and
                  ((_get_at(current + 1) == 'A') or
                   (_get_at(current + 1) == 'O'))):
                primary, secondary = _metaph_add('J', 'H')
            elif current == last:
                primary, secondary = _metaph_add('J', ' ')
            elif (not _string_at((current + 1), 1,
                                 {'L', 'T', 'K', 'S', 'N', 'M', 'B', 'Z'}) and
                  not _string_at((current - 1), 1, {'S', 'K', 'L'})):
                primary, secondary = _metaph_add('J')

            if _get_at(current + 1) == 'J':  # it could happen!
                current += 2
            else:
                current += 1
            continue

        elif _get_at(current) == 'K':
            if _get_at(current + 1) == 'K':
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('K')
            continue

        elif _get_at(current) == 'L':
            if _get_at(current + 1) == 'L':
                # Spanish e.g. 'cabrillo', 'gallegos'
                if (((current == (length - 3)) and
                     _string_at((current - 1), 4, {'ILLO', 'ILLA', 'ALLE'})) or
                        ((_string_at((last - 1), 2, {'AS', 'OS'}) or
                          _string_at(last, 1, {'A', 'O'})) and
                         _string_at((current - 1), 4, {'ALLE'}))):
                    primary, secondary = _metaph_add('L', ' ')
                    current += 2
                    continue
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('L')
            continue

        elif _get_at(current) == 'M':
            if (((_string_at((current - 1), 3, {'UMB'}) and
                  (((current + 1) == last) or
                   _string_at((current + 2), 2, {'ER'}))) or
                 # 'dumb', 'thumb'
                 (_get_at(current + 1) == 'M'))):
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('M')
            continue

        elif _get_at(current) == 'N':
            if _get_at(current + 1) == 'N':
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('N')
            continue

        elif _get_at(current) == 'Ñ':
            current += 1
            primary, secondary = _metaph_add('N')
            continue

        elif _get_at(current) == 'P':
            if _get_at(current + 1) == 'H':
                primary, secondary = _metaph_add('F')
                current += 2
                continue

            # also account for "campbell", "raspberry"
            elif _string_at((current + 1), 1, {'P', 'B'}):
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('P')
            continue

        elif _get_at(current) == 'Q':
            if _get_at(current + 1) == 'Q':
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('K')
            continue

        elif _get_at(current) == 'R':
            # french e.g. 'rogier', but exclude 'hochmeier'
            if (((current == last) and
                 not _slavo_germanic() and
                 _string_at((current - 2), 2, {'IE'}) and
                 not _string_at((current - 4), 2, {'ME', 'MA'}))):
                primary, secondary = _metaph_add('', 'R')
            else:
                primary, secondary = _metaph_add('R')

            if _get_at(current + 1) == 'R':
                current += 2
            else:
                current += 1
            continue

        elif _get_at(current) == 'S':
            # special cases 'island', 'isle', 'carlisle', 'carlysle'
            if _string_at((current - 1), 3, {'ISL', 'YSL'}):
                current += 1
                continue

            # special case 'sugar-'
            elif (current == 0) and _string_at(current, 5, {'SUGAR'}):
                primary, secondary = _metaph_add('X', 'S')
                current += 1
                continue

            elif _string_at(current, 2, {'SH'}):
                # Germanic
                if _string_at((current + 1), 4,
                              {'HEIM', 'HOEK', 'HOLM', 'HOLZ'}):
                    primary, secondary = _metaph_add('S')
                else:
                    primary, secondary = _metaph_add('X')
                current += 2
                continue

            # Italian & Armenian
            elif (_string_at(current, 3, {'SIO', 'SIA'}) or
                  _string_at(current, 4, {'SIAN'})):
                if not _slavo_germanic():
                    primary, secondary = _metaph_add('S', 'X')
                else:
                    primary, secondary = _metaph_add('S')
                current += 3
                continue

            # German & anglicisations, e.g. 'smith' match 'schmidt',
            #                               'snider' match 'schneider'
            # also, -sz- in Slavic language although in Hungarian it is
            #       pronounced 's'
            elif (((current == 0) and
                   _string_at((current + 1), 1, {'M', 'N', 'L', 'W'})) or
                  _string_at((current + 1), 1, {'Z'})):
                primary, secondary = _metaph_add('S', 'X')
                if _string_at((current + 1), 1, {'Z'}):
                    current += 2
                else:
                    current += 1
                continue

            elif _string_at(current, 2, {'SC'}):
                # Schlesinger's rule
                if _get_at(current + 2) == 'H':
                    # dutch origin, e.g. 'school', 'schooner'
                    if _string_at((current + 3), 2,
                                  {'OO', 'ER', 'EN', 'UY', 'ED', 'EM'}):
                        # 'schermerhorn', 'schenker'
                        if _string_at((current + 3), 2, {'ER', 'EN'}):
                            primary, secondary = _metaph_add('X', 'SK')
                        else:
                            primary, secondary = _metaph_add('SK')
                        current += 3
                        continue
                    else:
                        if (((current == 0) and not _is_vowel(3) and
                             (_get_at(3) != 'W'))):
                            primary, secondary = _metaph_add('X', 'S')
                        else:
                            primary, secondary = _metaph_add('X')
                        current += 3
                        continue

                elif _string_at((current + 2), 1, {'I', 'E', 'Y'}):
                    primary, secondary = _metaph_add('S')
                    current += 3
                    continue

                # else
                else:
                    primary, secondary = _metaph_add('SK')
                    current += 3
                    continue

            else:
                # french e.g. 'resnais', 'artois'
                if (current == last) and _string_at((current - 2), 2,
                                                    {'AI', 'OI'}):
                    primary, secondary = _metaph_add('', 'S')
                else:
                    primary, secondary = _metaph_add('S')

                if _string_at((current + 1), 1, {'S', 'Z'}):
                    current += 2
                else:
                    current += 1
                continue

        elif _get_at(current) == 'T':
            if _string_at(current, 4, {'TION'}):
                primary, secondary = _metaph_add('X')
                current += 3
                continue

            elif _string_at(current, 3, {'TIA', 'TCH'}):
                primary, secondary = _metaph_add('X')
                current += 3
                continue

            elif (_string_at(current, 2, {'TH'}) or
                  _string_at(current, 3, {'TTH'})):
                # special case 'thomas', 'thames' or germanic
                if ((_string_at((current + 2), 2, {'OM', 'AM'}) or
                     _string_at(0, 4, {'VAN ', 'VON '}) or
                     _string_at(0, 3, {'SCH'}))):
                    primary, secondary = _metaph_add('T')
                else:
                    primary, secondary = _metaph_add('0', 'T')
                current += 2
                continue

            elif _string_at((current + 1), 1, {'T', 'D'}):
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('T')
            continue

        elif _get_at(current) == 'V':
            if _get_at(current + 1) == 'V':
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('F')
            continue

        elif _get_at(current) == 'W':
            # can also be in middle of word
            if _string_at(current, 2, {'WR'}):
                primary, secondary = _metaph_add('R')
                current += 2
                continue
            elif ((current == 0) and
                  (_is_vowel(current + 1) or _string_at(current, 2, {'WH'}))):
                # Wasserman should match Vasserman
                if _is_vowel(current + 1):
                    primary, secondary = _metaph_add('A', 'F')
                else:
                    # need Uomo to match Womo
                    primary, secondary = _metaph_add('A')

            # Arnow should match Arnoff
            if ((((current == last) and _is_vowel(current - 1)) or
                 _string_at((current - 1), 5,
                            {'EWSKI', 'EWSKY', 'OWSKI', 'OWSKY'}) or
                 _string_at(0, 3, {'SCH'}))):
                primary, secondary = _metaph_add('', 'F')
                current += 1
                continue
            # Polish e.g. 'filipowicz'
            elif _string_at(current, 4, {'WICZ', 'WITZ'}):
                primary, secondary = _metaph_add('TS', 'FX')
                current += 4
                continue
            # else skip it
            else:
                current += 1
                continue

        elif _get_at(current) == 'X':
            # French e.g. breaux
            if (not ((current == last) and
                     (_string_at((current - 3), 3, {'IAU', 'EAU'}) or
                      _string_at((current - 2), 2, {'AU', 'OU'})))):
                primary, secondary = _metaph_add('KS')

            if _string_at((current + 1), 1, {'C', 'X'}):
                current += 2
            else:
                current += 1
            continue

        elif _get_at(current) == 'Z':
            # Chinese Pinyin e.g. 'zhao'
            if _get_at(current + 1) == 'H':
                primary, secondary = _metaph_add('J')
                current += 2
                continue
            elif (_string_at((current + 1), 2, {'ZO', 'ZI', 'ZA'}) or
                  (_slavo_germanic() and ((current > 0) and
                                          _get_at(current - 1) != 'T'))):
                primary, secondary = _metaph_add('S', 'TS')
            else:
                primary, secondary = _metaph_add('S')

            if _get_at(current + 1) == 'Z':
                current += 2
            else:
                current += 1
            continue

        else:
            # Any other character (digits, punctuation, spaces) is skipped
            current += 1

    if max_length > 0:
        primary = primary[:max_length]
        secondary = secondary[:max_length]
    # An identical secondary code carries no information
    if primary == secondary:
        secondary = ''

    return primary, secondary


def caverphone(word, version=2):
    """Return the Caverphone code for a word.

    Version 1 of the algorithm is described in :cite:`Hood:2002` and
    version 2 in :cite:`Hood:2004`.

    :param str word: the word to transform
    :param int version: the Caverphone version to apply; 1 selects version 1,
        any other value selects version 2 (defaults to 2)
    :returns: the Caverphone value, right-padded with '1' to 6 characters
        for version 1 and to 10 characters for version 2
    :rtype: str

    >>> caverphone('Christopher')
    'KRSTFA1111'
    >>> caverphone('Niall')
    'NA11111111'
    >>> caverphone('Smith')
    'SMT1111111'
    >>> caverphone('Schmidt')
    'SKMT111111'

    >>> caverphone('Christopher', 1)
    'KRSTF1'
    >>> caverphone('Niall', 1)
    'N11111'
    >>> caverphone('Smith', 1)
    'SMT111'
    >>> caverphone('Schmidt', 1)
    'SKMT11'
    """
    is_v1 = version == 1
    letters = frozenset('abcdefghijklmnopqrstuvwxyz')
    vowels = frozenset('aeiou')

    def _apply(text, rules):
        """Apply each (old, new) replacement to text, in the given order."""
        for old, new in rules:
            text = text.replace(old, new)
        return text

    def _collapse(text, char, mark):
        """Squeeze every run of char to a single char, then emit it as mark."""
        pair = char + char
        while pair in text:
            text = text.replace(pair, char)
        return text.replace(char, mark)

    # Only the 26 lowercase ASCII letters take part in the encoding
    code = ''.join(ch for ch in word.lower() if ch in letters)

    # Version 2 drops a final 'e' before anything else happens
    if not is_v1 and code.endswith('e'):
        code = code[:-1]

    if code:
        # '-ough' words pronounced with /f/: mark the silent 'gh' as '2f'
        ough_prefixes = ('cough', 'rough', 'tough', 'enough')
        if not is_v1:
            ough_prefixes += ('trough',)
        for prefix in ough_prefixes:
            if code.startswith(prefix):
                code = prefix[:-2] + '2f' + code[len(prefix):]

        if code.startswith('gn'):
            code = '2n' + code[2:]
        if code.endswith('mb'):
            code = code[:-1] + '2'

        # Consonant normalisation; '2' is a placeholder removed at the end
        code = _apply(code, (
            ('cq', '2q'), ('ci', 'si'), ('ce', 'se'), ('cy', 'sy'),
            ('tch', '2ch'), ('c', 'k'), ('q', 'k'), ('x', 'k'), ('v', 'f'),
            ('dg', '2g'), ('tio', 'sio'), ('tia', 'sia'), ('d', 't'),
            ('ph', 'fh'), ('b', 'p'), ('sh', 's2'), ('z', 's'),
        ))

        # An initial vowel is kept as 'A'; every other vowel becomes '3'
        if code[0] in vowels:
            code = 'A' + code[1:]
        code = _apply(code, (('a', '3'), ('e', '3'), ('i', '3'),
                             ('o', '3'), ('u', '3')))

        # Version 2 resolves j/y here; version 1 does so near the end
        if not is_v1:
            code = code.replace('j', 'y')
            if code.startswith('y3'):
                code = 'Y3' + code[2:]
            if code.startswith('y'):
                code = 'A' + code[1:]
            code = code.replace('y', '3')

        code = _apply(code, (('3gh3', '3kh3'), ('gh', '22'), ('g', 'k')))

        # Collapse runs of these consonants and mark them as final (upper)
        for char in 'stpkfmn':
            code = _collapse(code, char, char.upper())

        # 'w' survives only before a vowel (or 'y' in version 1)
        code = code.replace('w3', 'W3')
        if is_v1:
            code = code.replace('wy', 'Wy')
        code = code.replace('wh3', 'Wh3')
        if is_v1:
            code = code.replace('why', 'Why')
        elif code.endswith('w'):
            code = code[:-1] + '3'
        code = code.replace('w', '2')

        if code.startswith('h'):
            code = 'A' + code[1:]
        code = code.replace('h', '2')

        # 'r' and 'l' follow one shared pattern, in that order
        for liquid in 'rl':
            kept = liquid.upper()
            code = code.replace(liquid + '3', kept + '3')
            if is_v1:
                code = code.replace(liquid + 'y', kept + 'y')
            elif code.endswith(liquid):
                code = code[:-1] + '3'
            code = code.replace(liquid, '2')

        if is_v1:
            code = _apply(code, (('j', 'y'), ('y3', 'Y3'), ('y', '2')))

        # Strip the placeholders; version 2 keeps a final vowel as 'A'
        code = code.replace('2', '')
        if not is_v1 and code.endswith('3'):
            code = code[:-1] + 'A'
        code = code.replace('3', '')

    # Pad with 1s, then cut to the length the selected version prescribes
    width = 6 if is_v1 else 10
    return (code + '1' * 10)[:width]


def alpha_sis(word, max_length=14):
    """Return the IBM Alpha Search Inquiry System code for a word.

    The Alpha Search Inquiry System code is defined in :cite:`IBM:1973`.
    This implementation is based on the description in :cite:`Moore:1977`.

    A collection is necessary since there can be multiple values for a
    single word. But the collection must be ordered since the first value
    is the primary coding.

    Runs of identical adjacent code digits are collapsed to a single digit,
    whatever their length. Letters without a code act as separators between
    runs and are dropped afterwards.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 14);
        values are clamped to the range [4, 64] and -1 selects 64
    :returns: the Alpha SIS value
    :rtype: tuple

    >>> alpha_sis('Christopher')
    ('06401840000000', '07040184000000', '04018400000000')
    >>> alpha_sis('Niall')
    ('02500000000000',)
    >>> alpha_sis('Smith')
    ('03100000000000',)
    >>> alpha_sis('Schmidt')
    ('06310000000000',)
    """
    _alpha_sis_initials = {'GF': '08', 'GM': '03', 'GN': '02', 'KN': '02',
                           'PF': '08', 'PN': '02', 'PS': '00', 'WR': '04',
                           'A': '1', 'E': '1', 'H': '2', 'I': '1', 'J': '3',
                           'O': '1', 'U': '1', 'W': '4', 'Y': '5'}
    _alpha_sis_initials_order = ('GF', 'GM', 'GN', 'KN', 'PF', 'PN', 'PS',
                                 'WR', 'A', 'E', 'H', 'I', 'J', 'O', 'U', 'W',
                                 'Y')
    # A tuple value lists alternative codings; the first is the primary one
    _alpha_sis_basic = {'SCH': '6', 'CZ': ('70', '6', '0'),
                        'CH': ('6', '70', '0'), 'CK': ('7', '6'),
                        'DS': ('0', '10'), 'DZ': ('0', '10'),
                        'TS': ('0', '10'), 'TZ': ('0', '10'), 'CI': '0',
                        'CY': '0', 'CE': '0', 'SH': '6', 'DG': '7', 'PH': '8',
                        'C': ('7', '6'), 'K': ('7', '6'), 'Z': '0', 'S': '0',
                        'D': '1', 'T': '1', 'N': '2', 'M': '3', 'R': '4',
                        'L': '5', 'J': '6', 'G': '7', 'Q': '7', 'X': '7',
                        'F': '8', 'V': '8', 'B': '9', 'P': '9'}
    # Longest keys first, so multi-letter groups win over single letters
    _alpha_sis_basic_order = ('SCH', 'CZ', 'CH', 'CK', 'DS', 'DZ', 'TS', 'TZ',
                              'CI', 'CY', 'CE', 'SH', 'DG', 'PH', 'C', 'K',
                              'Z', 'S', 'D', 'T', 'N', 'M', 'R', 'L', 'J',
                              'G', 'Q', 'X', 'F', 'V', 'B', 'P')
    _uc_letters = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    def _collapse_runs(code):
        """Reduce every run of identical adjacent characters to one."""
        collapsed = code[:1]
        for char in code[1:]:
            if char != collapsed[-1]:
                collapsed += char
        return collapsed

    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in _uc_letters)

    # Clamp max_length to [4, 64]
    if max_length != -1:
        max_length = min(max(4, max_length), 64)
    else:
        max_length = 64

    alpha = ['']
    pos = 0

    # Do special processing for initial substrings
    for k in _alpha_sis_initials_order:
        if word.startswith(k):
            alpha[0] += _alpha_sis_initials[k]
            pos += len(k)
            break

    # Add a '0' if alpha is still empty
    if not alpha[0]:
        alpha[0] += '0'

    # Whether or not any special initial codes were encoded, iterate
    # through the length of the word in the main encoding loop
    while pos < len(word):
        for k in _alpha_sis_basic_order:
            if word.startswith(k, pos):
                value = _alpha_sis_basic[k]
                if isinstance(value, tuple):
                    # Fan out: every existing code gets every alternative,
                    # grouped by alternative so primary codings stay first
                    alpha = [code + variant
                             for variant in value for code in alpha]
                else:
                    alpha = [code + value for code in alpha]
                pos += len(k)
                break
        else:
            # Uncoded letter: a placeholder keeps the neighbouring digits
            # from being treated as adjacent
            alpha = [code + '_' for code in alpha]
            pos += 1

    # Collapse repeated digits, then drop the placeholders. The previous
    # index-based loop skipped a position after each deletion, so runs of
    # three or more identical digits were only shortened by one.
    alpha = (_collapse_runs(code).replace('_', '') for code in alpha)

    # Trim codes and return tuple
    return tuple((code + '0' * max_length)[:max_length] for code in alpha)


def fuzzy_soundex(word, max_length=5, zero_pad=True):
    """Return the Fuzzy Soundex code for a word.

    Fuzzy Soundex is an algorithm derived from Soundex, defined in
    :cite:`Holmes:2002`.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 5);
        values are clamped to the range [4, 64] and -1 selects 64
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a max_length string
    :returns: the Fuzzy Soundex value
    :rtype: str

    >>> fuzzy_soundex('Christopher')
    'K6931'
    >>> fuzzy_soundex('Niall')
    'N4000'
    >>> fuzzy_soundex('Smith')
    'S5300'
    """
    # Letter -> digit table; '-' marks letters (H, W, Y) that are dropped
    digit_for = dict(zip((ord(ch) for ch in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                         '0193017-07745501769301-7-9'))
    # Rewrites of the first two letters
    leading = {'CS': 'SS', 'CZ': 'SS', 'TS': 'SS', 'TZ': 'SS',
               'GN': 'NN',
               'HR': 'RR', 'WR': 'RR',
               'HW': 'WW',
               'KN': 'NN', 'NG': 'NN'}
    # Rewrites of the last two letters
    trailing = {'CH': 'KK', 'NT': 'TT', 'RT': 'RR'}
    # Rewrites applied anywhere in the word, in exactly this order
    anywhere = (('CA', 'KA'), ('CC', 'KK'), ('CK', 'KK'), ('CE', 'SE'),
                ('CHL', 'KL'), ('CL', 'KL'), ('CHR', 'KR'), ('CR', 'KR'),
                ('CI', 'SI'), ('CO', 'KO'), ('CU', 'KU'), ('CY', 'SY'),
                ('DG', 'GG'), ('GH', 'HH'), ('MAC', 'MK'), ('MC', 'MK'),
                ('NST', 'NSS'), ('PF', 'FF'), ('PH', 'FF'), ('SCH', 'SSS'),
                ('TIO', 'SIO'), ('TIA', 'SIO'), ('TCH', 'CHH'))

    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    # Clamp max_length to [4, 64]; -1 means "as long as allowed"
    max_length = 64 if max_length == -1 else min(max(4, max_length), 64)

    if not word:
        return '0' * max_length if zero_pad else '0'

    head = word[:2]
    if head in leading:
        word = leading[head] + word[2:]

    tail = word[-2:]
    if tail in trailing:
        word = word[:-2] + trailing[tail]
    elif word.endswith('RDT'):
        word = word[:-3] + 'RR'

    for old, new in anywhere:
        word = word.replace(old, new)

    # Encode, drop the ignored letters, then squeeze repeated digits
    sdx = word.translate(digit_for).replace('-', '')
    sdx = _delete_consecutive_repeats(sdx)

    # The code always starts with the (rewritten) first letter. H, W and Y
    # produced no digit above, so they are prepended instead of substituted.
    initial = word[0]
    if initial in {'H', 'W', 'Y'}:
        sdx = initial + sdx
    else:
        sdx = initial + sdx[1:]

    sdx = sdx.replace('0', '')

    if zero_pad:
        sdx += '0' * max_length

    return sdx[:max_length]


def phonex(word, max_length=4, zero_pad=True):
    """Return the Phonex code for a word.

    Phonex is an algorithm derived from Soundex, defined in :cite:`Lait:1996`.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 4);
        values are clamped to the range [4, 64] and -1 selects 64
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a max_length string
    :returns: the Phonex value
    :rtype: str

    >>> phonex('Christopher')
    'C623'
    >>> phonex('Niall')
    'N400'
    >>> phonex('Schmidt')
    'S253'
    >>> phonex('Smith')
    'S530'
    """
    vocalic = frozenset('AEIOUY')
    labials = frozenset('BFPV')
    gutturals = frozenset('CGJKQSXZ')
    dentals = frozenset('DT')
    nasals = frozenset('MN')

    # Phonetic equivalents of the first two letters
    leading_pairs = {'KN': 'N', 'PH': 'F', 'WR': 'R'}
    # Phonetic equivalents of the first letter: member -> representative
    leading_letter = {}
    for representative, members in (('A', 'AEIOUY'), ('B', 'BP'),
                                    ('F', 'VF'), ('C', 'CKQ'),
                                    ('G', 'GJ'), ('S', 'SZ')):
        for member in members:
            leading_letter[member] = representative

    name = unicode_normalize('NFKD', text_type(word.upper()))
    name = name.replace('ß', 'SS')

    # Clamp max_length to [4, 64]; -1 means "as long as allowed"
    max_length = 64 if max_length == -1 else min(max(4, max_length), 64)

    # Trailing Ss are not significant
    name = name.rstrip('S')

    if name[:2] in leading_pairs:
        name = leading_pairs[name[:2]] + name[2:]

    # An initial H is ignored (later Hs never receive a code anyway)
    if name.startswith('H'):
        name = name[1:]

    name_code = ''
    if name:
        name = leading_letter.get(name[0], name[0]) + name[1:]
        name_code = name[0]

    # Modified Soundex coding of everything after the first letter
    for i in range(1, len(name)):
        char = name[i]
        following = name[i+1:i+2]
        at_end = i + 1 == len(name)

        code = '0'
        if char in labials:
            code = '1'
        elif char in gutturals:
            code = '2'
        elif char in dentals:
            # D/T directly before C is silent
            if following != 'C':
                code = '3'
        elif char == 'L':
            # L only counts before a vowel or at the end of the name
            if following in vocalic or at_end:
                code = '4'
        elif char in nasals:
            # A D/G after M/N is absorbed: overwrite it with the nasal so
            # that the duplicate check below swallows it
            if following in ('D', 'G'):
                name = name[:i+1] + char + name[i+2:]
            code = '5'
        elif char == 'R':
            # R only counts before a vowel or at the end of the name
            if following in vocalic or at_end:
                code = '6'

        # Skip uncoded letters and repeats of the most recent code symbol
        if code != '0' and code != name_code[-1]:
            name_code += code

    if zero_pad:
        name_code += '0' * max_length
    if not name_code:
        name_code = '0'
    return name_code[:max_length]


def phonem(word):
    """Return the Phonem code for a word.

    Phonem is defined in :cite:`Wilde:1988`.

    This version is based on the Perl implementation documented at
    :cite:`Wilz:2005`.
    It includes some enhancements presented in the Java port at
    :cite:`dcm4che:2011`.

    Phonem is intended chiefly for German names/words.

    :param str word: the word to transform
    :returns: the Phonem value
    :rtype: str

    >>> phonem('Christopher')
    'CRYSDOVR'
    >>> phonem('Niall')
    'NYAL'
    >>> phonem('Smith')
    'SMYD'
    >>> phonem('Schmidt')
    'CMYD'
    """
    # Letter-group rewrites, applied in exactly this order
    group_rules = (('SC', 'C'), ('SZ', 'C'), ('CZ', 'C'),
                   ('TZ', 'C'), ('TS', 'C'), ('KS', 'X'),
                   ('PF', 'V'), ('QU', 'KW'), ('PH', 'V'),
                   ('UE', 'Y'), ('AE', 'E'), ('OE', 'Ö'),
                   ('EI', 'AY'), ('EY', 'AY'), ('EU', 'OY'),
                   ('AU', 'A§'), ('OU', '§'))
    # Single-character folding: each source character maps to the target
    # character at the same position
    fold_from = 'ZKGQÇÑßFWPTÁÀÂÃÅÄÆÉÈÊËIJÌÍÎÏÜÝ§ÚÙÛÔÒÓÕØ'
    fold_to = 'CCCCCNSVVBDAAAAAEEEEEEYYYYYYYYUUUUOOOOÖ'
    fold = {ord(src): tar for src, tar in zip(fold_from, fold_to)}
    # Only these symbols may appear in the final code
    code_alphabet = frozenset('ABCDLMNORSUVWXYÖ')

    text = unicode_normalize('NFC', text_type(word.upper()))
    for old, new in group_rules:
        text = text.replace(old, new)
    text = _delete_consecutive_repeats(text.translate(fold))

    return ''.join(char for char in text if char in code_alphabet)


def phonix(word, max_length=4, zero_pad=True):
    """Return the Phonix code for a word.

    Phonix is a Soundex-like algorithm defined in :cite:`Gadd:1990`.

    This implementation is based on:
    - :cite:`Pfeifer:2000`
    - :cite:`Christen:2011`
    - :cite:`Kollar:2007`

    The word is first rewritten by an ordered list of substitution rules,
    then the first letter is kept (any initial vowel or Y becomes 'v') and
    the remaining letters are translated to digits.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 4);
        values are clamped to the range [4, 64] and -1 selects 64
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a max_length string
    :returns: the Phonix value
    :rtype: str

    >>> phonix('Christopher')
    'K683'
    >>> phonix('Niall')
    'N400'
    >>> phonix('Smith')
    'S530'
    >>> phonix('Schmidt')
    'S530'
    """
    def _start_repl(word, src, tar, post=None):
        r"""Replace src with tar at the start of word.

        If post is given, src must be followed by one of its members.
        """
        if post:
            for i in post:
                if word.startswith(src+i):
                    return tar + word[len(src):]
        elif word.startswith(src):
            return tar + word[len(src):]
        return word

    def _end_repl(word, src, tar, pre=None):
        r"""Replace src with tar at the end of word.

        If pre is given, src must be preceded by one of its members.
        """
        if pre:
            for i in pre:
                if word.endswith(i+src):
                    return word[:-len(src)] + tar
        elif word.endswith(src):
            return word[:-len(src)] + tar
        return word

    def _all_repl(word, src, tar, pre=None, post=None):
        r"""Replace src with tar anywhere in word.

        If pre and/or post are given, src must be preceded by a member of
        pre and/or followed by a member of post; the context is kept.
        """
        if pre or post:
            # A missing side imposes no constraint: match the empty string
            pre = pre or frozenset(('',))
            post = post or frozenset(('',))
            for i, j in ((i, j) for i in pre for j in post):
                word = word.replace(i+src+j, i+tar+j)
            return word
        return word.replace(src, tar)

    def _mid_repl(word, src, tar, pre=None, post=None):
        r"""Replace src with tar in the middle of word.

        "Middle" excludes a match at the very start and/or end of the word,
        unless a required context (pre/post) already rules that out.
        """
        if pre and post:
            return _all_repl(word, src, tar, pre, post)
        if post:
            # Only a following context: protect the first character
            return word[:1] + _all_repl(word[1:], src, tar, pre, post)
        if pre:
            # Only a preceding context: protect the last character
            return _all_repl(word[:-1], src, tar, pre, post) + word[-1:]
        if len(word) < 2:
            # No interior to rewrite. Without this guard a one-letter word
            # was returned as word[0] + '' + word[-1], i.e. doubled, which
            # corrupted the codes of single-letter inputs.
            return word
        return word[0] + _all_repl(word[1:-1], src, tar) + word[-1]

    _vow = {'A', 'E', 'I', 'O', 'U'}
    _con = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q',
            'R', 'S', 'T', 'V', 'W', 'X', 'Y', 'Z'}

    # Each rule is (function, src, tar[, context sets]); order is significant
    _phonix_substitutions = ((_all_repl, 'DG', 'G'),
                             (_all_repl, 'CO', 'KO'),
                             (_all_repl, 'CA', 'KA'),
                             (_all_repl, 'CU', 'KU'),
                             (_all_repl, 'CY', 'SI'),
                             (_all_repl, 'CI', 'SI'),
                             (_all_repl, 'CE', 'SE'),
                             (_start_repl, 'CL', 'KL', _vow),
                             (_all_repl, 'CK', 'K'),
                             (_end_repl, 'GC', 'K'),
                             (_end_repl, 'JC', 'K'),
                             (_start_repl, 'CHR', 'KR', _vow),
                             (_start_repl, 'CR', 'KR', _vow),
                             (_start_repl, 'WR', 'R'),
                             (_all_repl, 'NC', 'NK'),
                             (_all_repl, 'CT', 'KT'),
                             (_all_repl, 'PH', 'F'),
                             (_all_repl, 'AA', 'AR'),
                             (_all_repl, 'SCH', 'SH'),
                             (_all_repl, 'BTL', 'TL'),
                             (_all_repl, 'GHT', 'T'),
                             (_all_repl, 'AUGH', 'ARF'),
                             (_mid_repl, 'LJ', 'LD', _vow, _vow),
                             (_all_repl, 'LOUGH', 'LOW'),
                             (_start_repl, 'Q', 'KW'),
                             (_start_repl, 'KN', 'N'),
                             (_end_repl, 'GN', 'N'),
                             (_all_repl, 'GHN', 'N'),
                             (_end_repl, 'GNE', 'N'),
                             (_all_repl, 'GHNE', 'NE'),
                             (_end_repl, 'GNES', 'NS'),
                             (_start_repl, 'GN', 'N'),
                             (_mid_repl, 'GN', 'N', None, _con),
                             (_end_repl, 'GN', 'N'),
                             (_start_repl, 'PS', 'S'),
                             (_start_repl, 'PT', 'T'),
                             (_start_repl, 'CZ', 'C'),
                             (_mid_repl, 'WZ', 'Z', _vow),
                             (_mid_repl, 'CZ', 'CH'),
                             (_all_repl, 'LZ', 'LSH'),
                             (_all_repl, 'RZ', 'RSH'),
                             (_mid_repl, 'Z', 'S', None, _vow),
                             (_all_repl, 'ZZ', 'TS'),
                             (_mid_repl, 'Z', 'TS', _con),
                             (_all_repl, 'HROUG', 'REW'),
                             (_all_repl, 'OUGH', 'OF'),
                             (_mid_repl, 'Q', 'KW', _vow, _vow),
                             (_mid_repl, 'J', 'Y', _vow, _vow),
                             (_start_repl, 'YJ', 'Y', _vow),
                             (_start_repl, 'GH', 'G'),
                             (_end_repl, 'GH', 'E', _vow),
                             (_start_repl, 'CY', 'S'),
                             (_all_repl, 'NX', 'NKS'),
                             (_start_repl, 'PF', 'F'),
                             (_end_repl, 'DT', 'T'),
                             (_end_repl, 'TL', 'TIL'),
                             (_end_repl, 'DL', 'DIL'),
                             (_all_repl, 'YTH', 'ITH'),
                             (_start_repl, 'TJ', 'CH', _vow),
                             (_start_repl, 'TSJ', 'CH', _vow),
                             (_start_repl, 'TS', 'T', _vow),
                             (_all_repl, 'TCH', 'CH'),
                             (_mid_repl, 'WSK', 'VSKIE', _vow),
                             (_end_repl, 'WSK', 'VSKIE', _vow),
                             (_start_repl, 'MN', 'N', _vow),
                             (_start_repl, 'PN', 'N', _vow),
                             (_mid_repl, 'STL', 'SL', _vow),
                             (_end_repl, 'STL', 'SL', _vow),
                             (_end_repl, 'TNT', 'ENT'),
                             (_end_repl, 'EAUX', 'OH'),
                             (_all_repl, 'EXCI', 'ECS'),
                             (_all_repl, 'X', 'ECS'),
                             (_end_repl, 'NED', 'ND'),
                             (_all_repl, 'JR', 'DR'),
                             (_end_repl, 'EE', 'EA'),
                             (_all_repl, 'ZS', 'S'),
                             (_mid_repl, 'R', 'AH', _vow, _con),
                             (_end_repl, 'R', 'AH', _vow),
                             (_mid_repl, 'HR', 'AH', _vow, _con),
                             (_end_repl, 'HR', 'AH', _vow),
                             (_end_repl, 'HR', 'AH', _vow),
                             (_end_repl, 'RE', 'AR'),
                             (_end_repl, 'R', 'AH', _vow),
                             (_all_repl, 'LLE', 'LE'),
                             (_end_repl, 'LE', 'ILE', _con),
                             (_end_repl, 'LES', 'ILES', _con),
                             (_end_repl, 'E', ''),
                             (_end_repl, 'ES', 'S'),
                             (_end_repl, 'SS', 'AS', _vow),
                             (_end_repl, 'MB', 'M', _vow),
                             (_all_repl, 'MPTS', 'MPS'),
                             (_all_repl, 'MPS', 'MS'),
                             (_all_repl, 'MPT', 'MT'))

    _phonix_translation = dict(zip((ord(_) for _ in
                                    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                   '01230720022455012683070808'))

    sdx = ''

    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})
    if word:
        for trans in _phonix_substitutions:
            rewritten = trans[0](word, *trans[1:])
            # Skip a rule that would erase the whole word (dropping the
            # final E of the one-letter word 'E'); the first letter is
            # needed below
            if rewritten:
                word = rewritten
        if word[0] in {'A', 'E', 'I', 'O', 'U', 'Y'}:
            # All initial vowels share one marker
            sdx = 'v' + word[1:].translate(_phonix_translation)
        else:
            sdx = word[0] + word[1:].translate(_phonix_translation)
        sdx = _delete_consecutive_repeats(sdx)
        sdx = sdx.replace('0', '')

    # Clamp max_length to [4, 64]
    if max_length != -1:
        max_length = min(max(4, max_length), 64)
    else:
        max_length = 64

    if zero_pad:
        sdx += '0' * max_length
    if not sdx:
        sdx = '0'
    return sdx[:max_length]


def sfinxbis(word, max_length=-1):
    """Return the SfinxBis code for a word.

    SfinxBis is a Soundex-like algorithm defined in :cite:`Axelsson:2009`.

    This implementation follows the reference implementation:
    :cite:`Sjoo:2009`.

    SfinxBis is intended chiefly for Swedish names.

    The input is split on whitespace and hyphens after noble prefixes
    (e.g. "von", "de la") are removed; one code is returned per remaining
    name part.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to
        unlimited); values of 0 or less disable truncation
    :returns: the SfinxBis value
    :rtype: tuple

    >>> sfinxbis('Christopher')
    ('K68376',)
    >>> sfinxbis('Niall')
    ('N4',)
    >>> sfinxbis('Smith')
    ('S53',)
    >>> sfinxbis('Schmidt')
    ('S53',)

    >>> sfinxbis('Johansson')
    ('J585',)
    >>> sfinxbis('Sjöberg')
    ('#162',)
    """
    # Noble prefixes/particles, longest first so they win over their parts
    adelstitler = (' DE LA ', ' DE LAS ', ' DE LOS ', ' VAN DE ', ' VAN DEN ',
                   ' VAN DER ', ' VON DEM ', ' VON DER ',
                   ' AF ', ' AV ', ' DA ', ' DE ', ' DEL ', ' DEN ', ' DES ',
                   ' DI ', ' DO ', ' DON ', ' DOS ', ' DU ', ' E ', ' IN ',
                   ' LA ', ' LE ', ' MAC ', ' MC ', ' VAN ', ' VON ', ' Y ',
                   ' S:T ')

    # These are ordered tuples rather than sets on purpose: the vowel
    # rewrites in _foersvensker are order-sensitive (I and Y are both a
    # context and a target), and set iteration order for strings varies
    # between interpreter runs, which made some codes non-reproducible.
    _harde_vokaler = ('A', 'O', 'U', 'Å')
    _mjuka_vokaler = ('E', 'I', 'Y', 'Ä', 'Ö')
    _vokaler = _harde_vokaler + _mjuka_vokaler
    _konsonanter = ('B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P',
                    'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Z')
    _alfabet = frozenset(('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
                          'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
                          'U', 'V', 'W', 'X', 'Y', 'Z', 'Ä', 'Å', 'Ö'))

    # Letter -> digit; 9 marks vowels and H, which are removed at the end
    _sfinxbis_translation = dict(zip((ord(_) for _ in
                                      'BCDFGHJKLMNPQRSTVZAOUÅEIYÄÖ'),
                                     '123729224551268378999999999'))

    # Foreign letters -> closest Swedish letter
    _sfinxbis_substitutions = dict(zip((ord(_) for _ in
                                        'WZÀÁÂÃÆÇÈÉÊËÌÍÎÏÑÒÓÔÕØÙÚÛÜÝ'),
                                       'VSAAAAÄCEEEEIIIINOOOOÖUUUYY'))

    def _foersvensker(lokal_ordet):
        """Return the Swedish-ized form of the word."""
        for old, new in (('STIERN', 'STJÄRN'), ('HIE', 'HJ'), ('SIÖ', 'SJÖ'),
                         ('SCH', 'SH'), ('QU', 'KV'), ('IO', 'JO'),
                         ('PH', 'F')):
            lokal_ordet = lokal_ordet.replace(old, new)

        # Ü, Y or I after a vowel is a glide: rewrite it as J. Hard vowels
        # are handled before soft ones, each group in its declared order.
        for vokal in _vokaler:
            for glide in ('Ü', 'Y', 'I'):
                lokal_ordet = lokal_ordet.replace(vokal+glide, vokal+'J')

        # H before a consonant is silent
        if 'H' in lokal_ordet:
            for konsonant in _konsonanter:
                lokal_ordet = lokal_ordet.replace('H'+konsonant, konsonant)

        lokal_ordet = lokal_ordet.translate(_sfinxbis_substitutions)

        lokal_ordet = lokal_ordet.replace('Ð', 'ETH')
        lokal_ordet = lokal_ordet.replace('Þ', 'TH')
        lokal_ordet = lokal_ordet.replace('ß', 'SS')

        return lokal_ordet

    def _koda_foersta_ljudet(lokal_ordet):
        """Return the word with the first sound coded.

        '$' stands for any initial vowel and '#' for an initial
        sje-/tje-sound; other initial sounds are reduced to one letter.
        """
        if lokal_ordet[0:1] in _vokaler:
            lokal_ordet = '$' + lokal_ordet[1:]
        elif lokal_ordet[0:2] in ('DJ', 'GJ', 'HJ', 'LJ'):
            lokal_ordet = 'J' + lokal_ordet[2:]
        elif lokal_ordet[0:1] == 'G' and lokal_ordet[1:2] in _mjuka_vokaler:
            lokal_ordet = 'J' + lokal_ordet[1:]
        elif lokal_ordet[0:1] == 'Q':
            lokal_ordet = 'K' + lokal_ordet[1:]
        elif lokal_ordet[0:2] == 'CH' and lokal_ordet[2:3] in _vokaler:
            lokal_ordet = '#' + lokal_ordet[2:]
        elif lokal_ordet[0:1] == 'C' and lokal_ordet[1:2] in _harde_vokaler:
            lokal_ordet = 'K' + lokal_ordet[1:]
        elif lokal_ordet[0:1] == 'C' and lokal_ordet[1:2] in _konsonanter:
            lokal_ordet = 'K' + lokal_ordet[1:]
        elif lokal_ordet[0:1] == 'X':
            lokal_ordet = 'S' + lokal_ordet[1:]
        elif lokal_ordet[0:1] == 'C' and lokal_ordet[1:2] in _mjuka_vokaler:
            lokal_ordet = 'S' + lokal_ordet[1:]
        elif lokal_ordet[0:3] in ('SKJ', 'STJ', 'SCH'):
            lokal_ordet = '#' + lokal_ordet[3:]
        elif lokal_ordet[0:2] in ('SH', 'KJ', 'TJ', 'SJ'):
            lokal_ordet = '#' + lokal_ordet[2:]
        elif lokal_ordet[0:2] == 'SK' and lokal_ordet[2:3] in _mjuka_vokaler:
            lokal_ordet = '#' + lokal_ordet[2:]
        elif lokal_ordet[0:1] == 'K' and lokal_ordet[1:2] in _mjuka_vokaler:
            lokal_ordet = '#' + lokal_ordet[1:]
        return lokal_ordet

    # Step 1: uppercase
    word = unicode_normalize('NFC', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = word.replace('-', ' ')

    # Step 2: remove noble prefixes, both inside the name and at its start
    for adelstitel in adelstitler:
        while adelstitel in word:
            word = word.replace(adelstitel, ' ')
        if word.startswith(adelstitel[1:]):
            word = word[len(adelstitel)-1:]

    # Split word into tokens
    ordlista = word.split()

    # Step 3: collapse repeated letters in every name part
    ordlista = [_delete_consecutive_repeats(ordet) for ordet in ordlista]
    if not ordlista:
        # noinspection PyRedundantParentheses
        return ('',)

    # Step 4: Swedish-ize the spelling
    ordlista = [_foersvensker(ordet) for ordet in ordlista]

    # Step 5: remove every character that is not A-Z, Ä, Å or Ö
    ordlista = [''.join(c for c in ordet if c in _alfabet)
                for ordet in ordlista]

    # Step 6: code the first sound
    ordlista = [_koda_foersta_ljudet(ordet) for ordet in ordlista]

    # Step 7: split each name into its first symbol and the remainder
    rest = [ordet[1:] for ordet in ordlista]

    # Step 8: phonetic transformation of the remainder
    rest = [ordet.replace('DT', 'T') for ordet in rest]
    rest = [ordet.replace('X', 'KS') for ordet in rest]

    # Step 9: code the remainder as digits (C before a soft vowel is an S)
    for vokal in _mjuka_vokaler:
        rest = [ordet.replace('C'+vokal, '8'+vokal) for ordet in rest]
    rest = [ordet.translate(_sfinxbis_translation) for ordet in rest]

    # Step 10: remove adjacent duplicates
    rest = [_delete_consecutive_repeats(ordet) for ordet in rest]

    # Step 11: remove every "9"
    rest = [ordet.replace('9', '') for ordet in rest]

    # Step 12: join first symbol and coded remainder again
    ordlista = [''.join(ordet) for ordet in
                zip((_[0:1] for _ in ordlista), rest)]

    # truncate, if max_length is set
    if max_length > 0:
        ordlista = [ordet[:max_length] for ordet in ordlista]

    return tuple(ordlista)


[docs]def phonet(word, mode=1, lang='de'):
    """Return the phonet code for a word.

    phonet ("Hannoveraner Phonetik") was developed by Jörg Michael and
    documented in :cite:`Michael:1999`.

    This is a port of Jesper Zedlitz's code, which is licensed LGPL
    :cite:`Zedlitz:2015`.

    That is, in turn, based on Michael's C code, which is also licensed LGPL
    :cite:`Michael:2007`.

    :param str word: the word to transform
    :param int mode: the phonet variant to employ (1 or 2)
    :param str lang: 'de' (default) for German
            'none' for no language
    :returns: the phonet value
    :rtype: str

    >>> phonet('Christopher')
    'KRISTOFA'
    >>> phonet('Niall')
    'NIAL'
    >>> phonet('Smith')
    'SMIT'
    >>> phonet('Schmidt')
    'SHMIT'

    >>> phonet('Christopher', mode=2)
    'KRIZTUFA'
    >>> phonet('Niall', mode=2)
    'NIAL'
    >>> phonet('Smith', mode=2)
    'ZNIT'
    >>> phonet('Schmidt', mode=2)
    'ZNIT'

    >>> phonet('Christopher', lang='none')
    'CHRISTOPHER'
    >>> phonet('Niall', lang='none')
    'NIAL'
    >>> phonet('Smith', lang='none')
    'SMITH'
    >>> phonet('Schmidt', lang='none')
    'SCHMIDT'
    """
    _phonet_rules_no_lang = (  # separator chars
        '´', ' ', ' ',
        '"', ' ', ' ',
        '`$', '', '',
        '\'', ' ', ' ',
        ',', ',', ',',
        ';', ',', ',',
        '-', ' ', ' ',
        ' ', ' ', ' ',
        '.', '.', '.',
        ':', '.', '.',
        # German umlauts
        'Ä', 'AE', 'AE',
        'Ö', 'OE', 'OE',
        'Ü', 'UE', 'UE',
        'ß', 'S', 'S',
        # international umlauts
        'À', 'A', 'A',
        'Á', 'A', 'A',
        'Â', 'A', 'A',
        'Ã', 'A', 'A',
        'Å', 'A', 'A',
        'Æ', 'AE', 'AE',
        'Ç', 'C', 'C',
        'Ð', 'DJ', 'DJ',
        'È', 'E', 'E',
        'É', 'E', 'E',
        'Ê', 'E', 'E',
        'Ë', 'E', 'E',
        'Ì', 'I', 'I',
        'Í', 'I', 'I',
        'Î', 'I', 'I',
        'Ï', 'I', 'I',
        'Ñ', 'NH', 'NH',
        'Ò', 'O', 'O',
        'Ó', 'O', 'O',
        'Ô', 'O', 'O',
        'Õ', 'O', 'O',
        'Œ', 'OE', 'OE',
        'Ø', 'OE', 'OE',
        'Š', 'SH', 'SH',
        'Þ', 'TH', 'TH',
        'Ù', 'U', 'U',
        'Ú', 'U', 'U',
        'Û', 'U', 'U',
        'Ý', 'Y', 'Y',
        'Ÿ', 'Y', 'Y',
        # 'normal' letters (A-Z)
        'MC^', 'MAC', 'MAC',
        'MC^', 'MAC', 'MAC',
        'M´^', 'MAC', 'MAC',
        'M\'^', 'MAC', 'MAC',
        'O´^', 'O', 'O',
        'O\'^', 'O', 'O',
        'VAN DEN ^', 'VANDEN', 'VANDEN',
        None, None, None)

    _phonet_rules_german = (  # separator chars
        '´', ' ', ' ',
        '"', ' ', ' ',
        '`$', '', '',
        '\'', ' ', ' ',
        ',', ' ', ' ',
        ';', ' ', ' ',
        '-', ' ', ' ',
        ' ', ' ', ' ',
        '.', '.', '.',
        ':', '.', '.',
        # German umlauts
        'ÄE', 'E', 'E',
        'ÄU<', 'EU', 'EU',
        'ÄV(AEOU)-<', 'EW', None,
        'Ä$', 'Ä', None,
        'Ä<', None, 'E',
        'Ä', 'E', None,
        'ÖE', 'Ö', 'Ö',
        'ÖU', 'Ö', 'Ö',
        'ÖVER--<', 'ÖW', None,
        'ÖV(AOU)-', 'ÖW', None,
        'ÜBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
        'ÜBER^^', 'ÜBA', 'IBA',
        'ÜE', 'Ü', 'I',
        'ÜVER--<', 'ÜW', None,
        'ÜV(AOU)-', 'ÜW', None,
        'Ü', None, 'I',
        'ßCH<', None, 'Z',
        'ß<', 'S', 'Z',
        # international umlauts
        'À<', 'A', 'A',
        'Á<', 'A', 'A',
        'Â<', 'A', 'A',
        'Ã<', 'A', 'A',
        'Å<', 'A', 'A',
        'ÆER-', 'E', 'E',
        'ÆU<', 'EU', 'EU',
        'ÆV(AEOU)-<', 'EW', None,
        'Æ$', 'Ä', None,
        'Æ<', None, 'E',
        'Æ', 'E', None,
        'Ç', 'Z', 'Z',
        'ÐÐ-', '', '',
        'Ð', 'DI', 'TI',
        'È<', 'E', 'E',
        'É<', 'E', 'E',
        'Ê<', 'E', 'E',
        'Ë', 'E', 'E',
        'Ì<', 'I', 'I',
        'Í<', 'I', 'I',
        'Î<', 'I', 'I',
        'Ï', 'I', 'I',
        'ÑÑ-', '', '',
        'Ñ', 'NI', 'NI',
        'Ò<', 'O', 'U',
        'Ó<', 'O', 'U',
        'Ô<', 'O', 'U',
        'Õ<', 'O', 'U',
        'Œ<', 'Ö', 'Ö',
        'Ø(IJY)-<', 'E', 'E',
        'Ø<', 'Ö', 'Ö',
        'Š', 'SH', 'Z',
        'Þ', 'T', 'T',
        'Ù<', 'U', 'U',
        'Ú<', 'U', 'U',
        'Û<', 'U', 'U',
        'Ý<', 'I', 'I',
        'Ÿ<', 'I', 'I',
        # 'normal' letters (A-Z)
        'ABELLE$', 'ABL', 'ABL',
        'ABELL$', 'ABL', 'ABL',
        'ABIENNE$', 'ABIN', 'ABIN',
        'ACHME---^', 'ACH', 'AK',
        'ACEY$', 'AZI', 'AZI',
        'ADV', 'ATW', None,
        'AEGL-', 'EK', None,
        'AEU<', 'EU', 'EU',
        'AE2', 'E', 'E',
        'AFTRAUBEN------', 'AFT ', 'AFT ',
        'AGL-1', 'AK', None,
        'AGNI-^', 'AKN', 'AKN',
        'AGNIE-', 'ANI', 'ANI',
        'AGN(AEOU)-$', 'ANI', 'ANI',
        'AH(AIOÖUÜY)-', 'AH', None,
        'AIA2', 'AIA', 'AIA',
        'AIE$', 'E', 'E',
        'AILL(EOU)-', 'ALI', 'ALI',
        'AINE$', 'EN', 'EN',
        'AIRE$', 'ER', 'ER',
        'AIR-', 'E', 'E',
        'AISE$', 'ES', 'EZ',
        'AISSANCE$', 'ESANS', 'EZANZ',
        'AISSE$', 'ES', 'EZ',
        'AIX$', 'EX', 'EX',
        'AJ(AÄEÈÉÊIOÖUÜ)--', 'A', 'A',
        'AKTIE', 'AXIE', 'AXIE',
        'AKTUEL', 'AKTUEL', None,
        'ALOI^', 'ALOI', 'ALUI',  # Don't merge these rules
        'ALOY^', 'ALOI', 'ALUI',  # needed by 'check_rules'
        'AMATEU(RS)-', 'AMATÖ', 'ANATÖ',
        'ANCH(OEI)-', 'ANSH', 'ANZ',
        'ANDERGEGANG----', 'ANDA GE', 'ANTA KE',
        'ANDERGEHE----', 'ANDA ', 'ANTA ',
        'ANDERGESETZ----', 'ANDA GE', 'ANTA KE',
        'ANDERGING----', 'ANDA ', 'ANTA ',
        'ANDERSETZ(ET)-----', 'ANDA ', 'ANTA ',
        'ANDERZUGEHE----', 'ANDA ZU ', 'ANTA ZU ',
        'ANDERZUSETZE-----', 'ANDA ZU ', 'ANTA ZU ',
        'ANER(BKO)---^^', 'AN', None,
        'ANHAND---^$', 'AN H', 'AN ',
        'ANH(AÄEIOÖUÜY)--^^', 'AN', None,
        'ANIELLE$', 'ANIEL', 'ANIL',
        'ANIEL', 'ANIEL', None,
        'ANSTELLE----^$', 'AN ST', 'AN ZT',
        'ANTI^^', 'ANTI', 'ANTI',
        'ANVER^^', 'ANFA', 'ANFA',
        'ATIA$', 'ATIA', 'ATIA',
        'ATIA(NS)--', 'ATI', 'ATI',
        'ATI(AÄOÖUÜ)-', 'AZI', 'AZI',
        'AUAU--', '', '',
        'AUERE$', 'AUERE', None,
        'AUERE(NS)-$', 'AUERE', None,
        'AUERE(AIOUY)--', 'AUER', None,
        'AUER(AÄIOÖUÜY)-', 'AUER', None,
        'AUER<', 'AUA', 'AUA',
        'AUF^^', 'AUF', 'AUF',
        'AULT$', 'O', 'U',
        'AUR(BCDFGKLMNQSTVWZ)-', 'AUA', 'AUA',
        'AUR$', 'AUA', 'AUA',
        'AUSSE$', 'OS', 'UZ',
        'AUS(ST)-^', 'AUS', 'AUS',
        'AUS^^', 'AUS', 'AUS',
        'AUTOFAHR----', 'AUTO ', 'AUTU ',
        'AUTO^^', 'AUTO', 'AUTU',
        'AUX(IY)-', 'AUX', 'AUX',
        'AUX', 'O', 'U',
        'AU', 'AU', 'AU',
        'AVER--<', 'AW', None,
        'AVIER$', 'AWIE', 'AFIE',
        'AV(EÈÉÊI)-^', 'AW', None,
        'AV(AOU)-', 'AW', None,
        'AYRE$', 'EIRE', 'EIRE',
        'AYRE(NS)-$', 'EIRE', 'EIRE',
        'AYRE(AIOUY)--', 'EIR', 'EIR',
        'AYR(AÄIOÖUÜY)-', 'EIR', 'EIR',
        'AYR<', 'EIA', 'EIA',
        'AYER--<', 'EI', 'EI',
        'AY(AÄEIOÖUÜY)--', 'A', 'A',
        'AË', 'E', 'E',
        'A(IJY)<', 'EI', 'EI',
        'BABY^$', 'BEBI', 'BEBI',
        'BAB(IY)^', 'BEBI', 'BEBI',
        'BEAU^$', 'BO', None,
        'BEA(BCMNRU)-^', 'BEA', 'BEA',
        'BEAT(AEIMORU)-^', 'BEAT', 'BEAT',
        'BEE$', 'BI', 'BI',
        'BEIGE^$', 'BESH', 'BEZ',
        'BENOIT--', 'BENO', 'BENU',
        'BER(DT)-', 'BER', None,
        'BERN(DT)-', 'BERN', None,
        'BE(LMNRST)-^', 'BE', 'BE',
        'BETTE$', 'BET', 'BET',
        'BEVOR^$', 'BEFOR', None,
        'BIC$', 'BIZ', 'BIZ',
        'BOWL(EI)-', 'BOL', 'BUL',
        'BP(AÄEÈÉÊIÌÍÎOÖRUÜY)-', 'B', 'B',
        'BRINGEND-----^', 'BRI', 'BRI',
        'BRINGEND-----', ' BRI', ' BRI',
        'BROW(NS)-', 'BRAU', 'BRAU',
        'BUDGET7', 'BÜGE', 'BIKE',
        'BUFFET7', 'BÜFE', 'BIFE',
        'BYLLE$', 'BILE', 'BILE',
        'BYLL$', 'BIL', 'BIL',
        'BYPA--^', 'BEI', 'BEI',
        'BYTE<', 'BEIT', 'BEIT',
        'BY9^', 'BÜ', None,
        'B(SßZ)$', 'BS', None,
        'CACH(EI)-^', 'KESH', 'KEZ',
        'CAE--', 'Z', 'Z',
        'CA(IY)$', 'ZEI', 'ZEI',
        'CE(EIJUY)--', 'Z', 'Z',
        'CENT<', 'ZENT', 'ZENT',
        'CERST(EI)----^', 'KE', 'KE',
        'CER$', 'ZA', 'ZA',
        'CE3', 'ZE', 'ZE',
        'CH\'S$', 'X', 'X',
        'CH´S$', 'X', 'X',
        'CHAO(ST)-', 'KAO', 'KAU',
        'CHAMPIO-^', 'SHEMPI', 'ZENBI',
        'CHAR(AI)-^', 'KAR', 'KAR',
        'CHAU(CDFSVWXZ)-', 'SHO', 'ZU',
        'CHÄ(CF)-', 'SHE', 'ZE',
        'CHE(CF)-', 'SHE', 'ZE',
        'CHEM-^', 'KE', 'KE',  # or: 'CHE', 'KE'
        'CHEQUE<', 'SHEK', 'ZEK',
        'CHI(CFGPVW)-', 'SHI', 'ZI',
        'CH(AEUY)-<^', 'SH', 'Z',
        'CHK-', '', '',
        'CHO(CKPS)-^', 'SHO', 'ZU',
        'CHRIS-', 'KRI', None,
        'CHRO-', 'KR', None,
        'CH(LOR)-<^', 'K', 'K',
        'CHST-', 'X', 'X',
        'CH(SßXZ)3', 'X', 'X',
        'CHTNI-3', 'CHN', 'KN',
        'CH^', 'K', 'K',  # or: 'CH', 'K'
        'CH', 'CH', 'K',
        'CIC$', 'ZIZ', 'ZIZ',
        'CIENCEFICT----', 'EIENS ', 'EIENZ ',
        'CIENCE$', 'EIENS', 'EIENZ',
        'CIER$', 'ZIE', 'ZIE',
        'CYB-^', 'ZEI', 'ZEI',
        'CY9^', 'ZÜ', 'ZI',
        'C(IJY)-<3', 'Z', 'Z',
        'CLOWN-', 'KLAU', 'KLAU',
        'CCH', 'Z', 'Z',
        'CCE-', 'X', 'X',
        'C(CK)-', '', '',
        'CLAUDET---', 'KLO', 'KLU',
        'CLAUDINE^$', 'KLODIN', 'KLUTIN',
        'COACH', 'KOSH', 'KUZ',
        'COLE$', 'KOL', 'KUL',
        'COUCH', 'KAUSH', 'KAUZ',
        'COW', 'KAU', 'KAU',
        'CQUES$', 'K', 'K',
        'CQUE', 'K', 'K',
        'CRASH--9', 'KRE', 'KRE',
        'CREAT-^', 'KREA', 'KREA',
        'CST', 'XT', 'XT',
        'CS<^', 'Z', 'Z',
        'C(SßX)', 'X', 'X',
        'CT\'S$', 'X', 'X',
        'CT(SßXZ)', 'X', 'X',
        'CZ<', 'Z', 'Z',
        'C(ÈÉÊÌÍÎÝ)3', 'Z', 'Z',
        'C.^', 'C.', 'C.',
        'CÄ-', 'Z', 'Z',
        'CÜ$', 'ZÜ', 'ZI',
        'C\'S$', 'X', 'X',
        'C<', 'K', 'K',
        'DAHER^$', 'DAHER', None,
        'DARAUFFOLGE-----', 'DARAUF ', 'TARAUF ',
        'DAVO(NR)-^$', 'DAFO', 'TAFU',
        'DD(SZ)--<', '', '',
        'DD9', 'D', None,
        'DEPOT7', 'DEPO', 'TEBU',
        'DESIGN', 'DISEIN', 'TIZEIN',
        'DE(LMNRST)-3^', 'DE', 'TE',
        'DETTE$', 'DET', 'TET',
        'DH$', 'T', None,
        'DIC$', 'DIZ', 'TIZ',
        'DIDR-^', 'DIT', None,
        'DIEDR-^', 'DIT', None,
        'DJ(AEIOU)-^', 'I', 'I',
        'DMITR-^', 'DIMIT', 'TINIT',
        'DRY9^', 'DRÜ', None,
        'DT-', '', '',
        'DUIS-^', 'DÜ', 'TI',
        'DURCH^^', 'DURCH', 'TURK',
        'DVA$', 'TWA', None,
        'DY9^', 'DÜ', None,
        'DYS$', 'DIS', None,
        'DS(CH)--<', 'T', 'T',
        'DST', 'ZT', 'ZT',
        'DZS(CH)--', 'T', 'T',
        'D(SßZ)', 'Z', 'Z',
        'D(AÄEIOÖRUÜY)-', 'D', None,
        'D(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'D', None,
        'D\'H^', 'D', 'T',
        'D´H^', 'D', 'T',
        'D`H^', 'D', 'T',
        'D\'S3$', 'Z', 'Z',
        'D´S3$', 'Z', 'Z',
        'D^', 'D', None,
        'D', 'T', 'T',
        'EAULT$', 'O', 'U',
        'EAUX$', 'O', 'U',
        'EAU', 'O', 'U',
        'EAV', 'IW', 'IF',
        'EAS3$', 'EAS', None,
        'EA(AÄEIOÖÜY)-3', 'EA', 'EA',
        'EA3$', 'EA', 'EA',
        'EA3', 'I', 'I',
        'EBENSO^$', 'EBNSO', 'EBNZU',
        'EBENSO^^', 'EBNSO ', 'EBNZU ',
        'EBEN^^', 'EBN', 'EBN',
        'EE9', 'E', 'E',
        'EGL-1', 'EK', None,
        'EHE(IUY)--1', 'EH', None,
        'EHUNG---1', 'E', None,
        'EH(AÄIOÖUÜY)-1', 'EH', None,
        'EIEI--', '', '',
        'EIERE^$', 'EIERE', None,
        'EIERE$', 'EIERE', None,
        'EIERE(NS)-$', 'EIERE', None,
        'EIERE(AIOUY)--', 'EIER', None,
        'EIER(AÄIOÖUÜY)-', 'EIER', None,
        'EIER<', 'EIA', None,
        'EIGL-1', 'EIK', None,
        'EIGH$', 'EI', 'EI',
        'EIH--', 'E', 'E',
        'EILLE$', 'EI', 'EI',
        'EIR(BCDFGKLMNQSTVWZ)-', 'EIA', 'EIA',
        'EIR$', 'EIA', 'EIA',
        'EITRAUBEN------', 'EIT ', 'EIT ',
        'EI', 'EI', 'EI',
        'EJ$', 'EI', 'EI',
        'ELIZ^', 'ELIS', None,
        'ELZ^', 'ELS', None,
        'EL-^', 'E', 'E',
        'ELANG----1', 'E', 'E',
        'EL(DKL)--1', 'E', 'E',
        'EL(MNT)--1$', 'E', 'E',
        'ELYNE$', 'ELINE', 'ELINE',
        'ELYN$', 'ELIN', 'ELIN',
        'EL(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'EL', 'EL',
        'EL-1', 'L', 'L',
        'EM-^', None, 'E',
        'EM(DFKMPQT)--1', None, 'E',
        'EM(AÄEÈÉÊIÌÍÎOÖUÜY)--1', None, 'E',
        'EM-1', None, 'N',
        'ENGAG-^', 'ANGA', 'ANKA',
        'EN-^', 'E', 'E',
        'ENTUEL', 'ENTUEL', None,
        'EN(CDGKQSTZ)--1', 'E', 'E',
        'EN(AÄEÈÉÊIÌÍÎNOÖUÜY)-1', 'EN', 'EN',
        'EN-1', '', '',
        'ERH(AÄEIOÖUÜ)-^', 'ERH', 'ER',
        'ER-^', 'E', 'E',
        'ERREGEND-----', ' ER', ' ER',
        'ERT1$', 'AT', None,
        'ER(DGLKMNRQTZß)-1', 'ER', None,
        'ER(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'ER', 'A',
        'ER1$', 'A', 'A',
        'ER<1', 'A', 'A',
        'ETAT7', 'ETA', 'ETA',
        'ETI(AÄOÖÜU)-', 'EZI', 'EZI',
        'EUERE$', 'EUERE', None,
        'EUERE(NS)-$', 'EUERE', None,
        'EUERE(AIOUY)--', 'EUER', None,
        'EUER(AÄIOÖUÜY)-', 'EUER', None,
        'EUER<', 'EUA', None,
        'EUEU--', '', '',
        'EUILLE$', 'Ö', 'Ö',
        'EUR$', 'ÖR', 'ÖR',
        'EUX', 'Ö', 'Ö',
        'EUSZ$', 'EUS', None,
        'EUTZ$', 'EUS', None,
        'EUYS$', 'EUS', 'EUZ',
        'EUZ$', 'EUS', None,
        'EU', 'EU', 'EU',
        'EVER--<1', 'EW', None,
        'EV(ÄOÖUÜ)-1', 'EW', None,
        'EYER<', 'EIA', 'EIA',
        'EY<', 'EI', 'EI',
        'FACETTE', 'FASET', 'FAZET',
        'FANS--^$', 'FE', 'FE',
        'FAN-^$', 'FE', 'FE',
        'FAULT-', 'FOL', 'FUL',
        'FEE(DL)-', 'FI', 'FI',
        'FEHLER', 'FELA', 'FELA',
        'FE(LMNRST)-3^', 'FE', 'FE',
        'FOERDERN---^', 'FÖRD', 'FÖRT',
        'FOERDERN---', ' FÖRD', ' FÖRT',
        'FOND7', 'FON', 'FUN',
        'FRAIN$', 'FRA', 'FRA',
        'FRISEU(RS)-', 'FRISÖ', 'FRIZÖ',
        'FY9^', 'FÜ', None,
        'FÖRDERN---^', 'FÖRD', 'FÖRT',
        'FÖRDERN---', ' FÖRD', ' FÖRT',
        'GAGS^$', 'GEX', 'KEX',
        'GAG^$', 'GEK', 'KEK',
        'GD', 'KT', 'KT',
        'GEGEN^^', 'GEGN', 'KEKN',
        'GEGENGEKOM-----', 'GEGN ', 'KEKN ',
        'GEGENGESET-----', 'GEGN ', 'KEKN ',
        'GEGENKOMME-----', 'GEGN ', 'KEKN ',
        'GEGENZUKOM---', 'GEGN ZU ', 'KEKN ZU ',
        'GENDETWAS-----$', 'GENT ', 'KENT ',
        'GENRE', 'IORE', 'IURE',
        'GE(LMNRST)-3^', 'GE', 'KE',
        'GER(DKT)-', 'GER', None,
        'GETTE$', 'GET', 'KET',
        'GGF.', 'GF.', None,
        'GG-', '', '',
        'GH', 'G', None,
        'GI(AOU)-^', 'I', 'I',
        'GION-3', 'KIO', 'KIU',
        'G(CK)-', '', '',
        'GJ(AEIOU)-^', 'I', 'I',
        'GMBH^$', 'GMBH', 'GMBH',
        'GNAC$', 'NIAK', 'NIAK',
        'GNON$', 'NION', 'NIUN',
        'GN$', 'N', 'N',
        'GONCAL-^', 'GONZA', 'KUNZA',
        'GRY9^', 'GRÜ', None,
        'G(SßXZ)-<', 'K', 'K',
        'GUCK-', 'KU', 'KU',
        'GUISEP-^', 'IUSE', 'IUZE',
        'GUI-^', 'G', 'K',
        'GUTAUSSEH------^', 'GUT ', 'KUT ',
        'GUTGEHEND------^', 'GUT ', 'KUT ',
        'GY9^', 'GÜ', None,
        'G(AÄEILOÖRUÜY)-', 'G', None,
        'G(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'G', None,
        'G\'S$', 'X', 'X',
        'G´S$', 'X', 'X',
        'G^', 'G', None,
        'G', 'K', 'K',
        'HA(HIUY)--1', 'H', None,
        'HANDVOL---^', 'HANT ', 'ANT ',
        'HANNOVE-^', 'HANOF', None,
        'HAVEN7$', 'HAFN', None,
        'HEAD-', 'HE', 'E',
        'HELIEGEN------', 'E ', 'E ',
        'HESTEHEN------', 'E ', 'E ',
        'HE(LMNRST)-3^', 'HE', 'E',
        'HE(LMN)-1', 'E', 'E',
        'HEUR1$', 'ÖR', 'ÖR',
        'HE(HIUY)--1', 'H', None,
        'HIH(AÄEIOÖUÜY)-1', 'IH', None,
        'HLH(AÄEIOÖUÜY)-1', 'LH', None,
        'HMH(AÄEIOÖUÜY)-1', 'MH', None,
        'HNH(AÄEIOÖUÜY)-1', 'NH', None,
        'HOBBY9^', 'HOBI', None,
        'HOCHBEGAB-----^', 'HOCH ', 'UK ',
        'HOCHTALEN-----^', 'HOCH ', 'UK ',
        'HOCHZUFRI-----^', 'HOCH ', 'UK ',
        'HO(HIY)--1', 'H', None,
        'HRH(AÄEIOÖUÜY)-1', 'RH', None,
        'HUH(AÄEIOÖUÜY)-1', 'UH', None,
        'HUIS^^', 'HÜS', 'IZ',
        'HUIS$', 'ÜS', 'IZ',
        'HUI--1', 'H', None,
        'HYGIEN^', 'HÜKIEN', None,
        'HY9^', 'HÜ', None,
        'HY(BDGMNPST)-', 'Ü', None,
        'H.^', None, 'H.',
        'HÄU--1', 'H', None,
        'H^', 'H', '',
        'H', '', '',
        'ICHELL---', 'ISH', 'IZ',
        'ICHI$', 'ISHI', 'IZI',
        'IEC$', 'IZ', 'IZ',
        'IEDENSTELLE------', 'IDN ', 'ITN ',
        'IEI-3', '', '',
        'IELL3', 'IEL', 'IEL',
        'IENNE$', 'IN', 'IN',
        'IERRE$', 'IER', 'IER',
        'IERZULAN---', 'IR ZU ', 'IR ZU ',
        'IETTE$', 'IT', 'IT',
        'IEU', 'IÖ', 'IÖ',
        'IE<4', 'I', 'I',
        'IGL-1', 'IK', None,
        'IGHT3$', 'EIT', 'EIT',
        'IGNI(EO)-', 'INI', 'INI',
        'IGN(AEOU)-$', 'INI', 'INI',
        'IHER(DGLKRT)--1', 'IHE', None,
        'IHE(IUY)--', 'IH', None,
        'IH(AIOÖUÜY)-', 'IH', None,
        'IJ(AOU)-', 'I', 'I',
        'IJ$', 'I', 'I',
        'IJ<', 'EI', 'EI',
        'IKOLE$', 'IKOL', 'IKUL',
        'ILLAN(STZ)--4', 'ILIA', 'ILIA',
        'ILLAR(DT)--4', 'ILIA', 'ILIA',
        'IMSTAN----^', 'IM ', 'IN ',
        'INDELERREGE------', 'INDL ', 'INTL ',
        'INFRAGE-----^$', 'IN ', 'IN ',
        'INTERN(AOU)-^', 'INTAN', 'INTAN',
        'INVER-', 'INWE', 'INFE',
        'ITI(AÄIOÖUÜ)-', 'IZI', 'IZI',
        'IUSZ$', 'IUS', None,
        'IUTZ$', 'IUS', None,
        'IUZ$', 'IUS', None,
        'IVER--<', 'IW', None,
        'IVIER$', 'IWIE', 'IFIE',
        'IV(ÄOÖUÜ)-', 'IW', None,
        'IV<3', 'IW', None,
        'IY2', 'I', None,
        'I(ÈÉÊ)<4', 'I', 'I',
        'JAVIE---<^', 'ZA', 'ZA',
        'JEANS^$', 'JINS', 'INZ',
        'JEANNE^$', 'IAN', 'IAN',
        'JEAN-^', 'IA', 'IA',
        'JER-^', 'IE', 'IE',
        'JE(LMNST)-', 'IE', 'IE',
        'JI^', 'JI', None,
        'JOR(GK)^$', 'IÖRK', 'IÖRK',
        'J', 'I', 'I',
        'KC(ÄEIJ)-', 'X', 'X',
        'KD', 'KT', None,
        'KE(LMNRST)-3^', 'KE', 'KE',
        'KG(AÄEILOÖRUÜY)-', 'K', None,
        'KH<^', 'K', 'K',
        'KIC$', 'KIZ', 'KIZ',
        'KLE(LMNRST)-3^', 'KLE', 'KLE',
        'KOTELE-^', 'KOTL', 'KUTL',
        'KREAT-^', 'KREA', 'KREA',
        'KRÜS(TZ)--^', 'KRI', None,
        'KRYS(TZ)--^', 'KRI', None,
        'KRY9^', 'KRÜ', None,
        'KSCH---', 'K', 'K',
        'KSH--', 'K', 'K',
        'K(SßXZ)7', 'X', 'X',  # implies 'KST' -> 'XT'
        'KT\'S$', 'X', 'X',
        'KTI(AIOU)-3', 'XI', 'XI',
        'KT(SßXZ)', 'X', 'X',
        'KY9^', 'KÜ', None,
        'K\'S$', 'X', 'X',
        'K´S$', 'X', 'X',
        'LANGES$', ' LANGES', ' LANKEZ',
        'LANGE$', ' LANGE', ' LANKE',
        'LANG$', ' LANK', ' LANK',
        'LARVE-', 'LARF', 'LARF',
        'LD(SßZ)$', 'LS', 'LZ',
        'LD\'S$', 'LS', 'LZ',
        'LD´S$', 'LS', 'LZ',
        'LEAND-^', 'LEAN', 'LEAN',
        'LEERSTEHE-----^', 'LER ', 'LER ',
        'LEICHBLEIB-----', 'LEICH ', 'LEIK ',
        'LEICHLAUTE-----', 'LEICH ', 'LEIK ',
        'LEIDERREGE------', 'LEIT ', 'LEIT ',
        'LEIDGEPR----^', 'LEIT ', 'LEIT ',
        'LEINSTEHE-----', 'LEIN ', 'LEIN ',
        'LEL-', 'LE', 'LE',
        'LE(MNRST)-3^', 'LE', 'LE',
        'LETTE$', 'LET', 'LET',
        'LFGNAG-', 'LFGAN', 'LFKAN',
        'LICHERWEIS----', 'LICHA ', 'LIKA ',
        'LIC$', 'LIZ', 'LIZ',
        'LIVE^$', 'LEIF', 'LEIF',
        'LT(SßZ)$', 'LS', 'LZ',
        'LT\'S$', 'LS', 'LZ',
        'LT´S$', 'LS', 'LZ',
        'LUI(GS)--', 'LU', 'LU',
        'LV(AIO)-', 'LW', None,
        'LY9^', 'LÜ', None,
        'LSTS$', 'LS', 'LZ',
        'LZ(BDFGKLMNPQRSTVWX)-', 'LS', None,
        'L(SßZ)$', 'LS', None,
        'MAIR-<', 'MEI', 'NEI',
        'MANAG-', 'MENE', 'NENE',
        'MANUEL', 'MANUEL', None,
        'MASSEU(RS)-', 'MASÖ', 'NAZÖ',
        'MATCH', 'MESH', 'NEZ',
        'MAURICE', 'MORIS', 'NURIZ',
        'MBH^$', 'MBH', 'MBH',
        'MB(ßZ)$', 'MS', None,
        'MB(SßTZ)-', 'M', 'N',
        'MCG9^', 'MAK', 'NAK',
        'MC9^', 'MAK', 'NAK',
        'MEMOIR-^', 'MEMOA', 'NENUA',
        'MERHAVEN$', 'MAHAFN', None,
        'ME(LMNRST)-3^', 'ME', 'NE',
        'MEN(STZ)--3', 'ME', None,
        'MEN$', 'MEN', None,
        'MIGUEL-', 'MIGE', 'NIKE',
        'MIKE^$', 'MEIK', 'NEIK',
        'MITHILFE----^$', 'MIT H', 'NIT ',
        'MN$', 'M', None,
        'MN', 'N', 'N',
        'MPJUTE-', 'MPUT', 'NBUT',
        'MP(ßZ)$', 'MS', None,
        'MP(SßTZ)-', 'M', 'N',
        'MP(BDJLMNPQVW)-', 'MB', 'NB',
        'MY9^', 'MÜ', None,
        'M(ßZ)$', 'MS', None,
        'M´G7^', 'MAK', 'NAK',
        'M\'G7^', 'MAK', 'NAK',
        'M´^', 'MAK', 'NAK',
        'M\'^', 'MAK', 'NAK',
        'M', None, 'N',
        'NACH^^', 'NACH', 'NAK',
        'NADINE', 'NADIN', 'NATIN',
        'NAIV--', 'NA', 'NA',
        'NAISE$', 'NESE', 'NEZE',
        'NAUGENOMM------', 'NAU ', 'NAU ',
        'NAUSOGUT$', 'NAUSO GUT', 'NAUZU KUT',
        'NCH$', 'NSH', 'NZ',
        'NCOISE$', 'SOA', 'ZUA',
        'NCOIS$', 'SOA', 'ZUA',
        'NDAR$', 'NDA', 'NTA',
        'NDERINGEN------', 'NDE ', 'NTE ',
        'NDRO(CDKTZ)-', 'NTRO', None,
        'ND(BFGJLMNPQVW)-', 'NT', None,
        'ND(SßZ)$', 'NS', 'NZ',
        'ND\'S$', 'NS', 'NZ',
        'ND´S$', 'NS', 'NZ',
        'NEBEN^^', 'NEBN', 'NEBN',
        'NENGELERN------', 'NEN ', 'NEN ',
        'NENLERN(ET)---', 'NEN LE', 'NEN LE',
        'NENZULERNE---', 'NEN ZU LE', 'NEN ZU LE',
        'NE(LMNRST)-3^', 'NE', 'NE',
        'NEN-3', 'NE', 'NE',
        'NETTE$', 'NET', 'NET',
        'NGU^^', 'NU', 'NU',
        'NG(BDFJLMNPQRTVW)-', 'NK', 'NK',
        'NH(AUO)-$', 'NI', 'NI',
        'NICHTSAHNEN-----', 'NIX ', 'NIX ',
        'NICHTSSAGE----', 'NIX ', 'NIX ',
        'NICHTS^^', 'NIX', 'NIX',
        'NICHT^^', 'NICHT', 'NIKT',
        'NINE$', 'NIN', 'NIN',
        'NON^^', 'NON', 'NUN',
        'NOTLEIDE-----^', 'NOT ', 'NUT ',
        'NOT^^', 'NOT', 'NUT',
        'NTI(AIOU)-3', 'NZI', 'NZI',
        'NTIEL--3', 'NZI', 'NZI',
        'NT(SßZ)$', 'NS', 'NZ',
        'NT\'S$', 'NS', 'NZ',
        'NT´S$', 'NS', 'NZ',
        'NYLON', 'NEILON', 'NEILUN',
        'NY9^', 'NÜ', None,
        'NSTZUNEH---', 'NST ZU ', 'NZT ZU ',
        'NSZ-', 'NS', None,
        'NSTS$', 'NS', 'NZ',
        'NZ(BDFGKLMNPQRSTVWX)-', 'NS', None,
        'N(SßZ)$', 'NS', None,
        'OBERE-', 'OBER', None,
        'OBER^^', 'OBA', 'UBA',
        'OEU2', 'Ö', 'Ö',
        'OE<2', 'Ö', 'Ö',
        'OGL-', 'OK', None,
        'OGNIE-', 'ONI', 'UNI',
        'OGN(AEOU)-$', 'ONI', 'UNI',
        'OH(AIOÖUÜY)-', 'OH', None,
        'OIE$', 'Ö', 'Ö',
        'OIRE$', 'OA', 'UA',
        'OIR$', 'OA', 'UA',
        'OIX', 'OA', 'UA',
        'OI<3', 'EU', 'EU',
        'OKAY^$', 'OKE', 'UKE',
        'OLYN$', 'OLIN', 'ULIN',
        'OO(DLMZ)-', 'U', None,
        'OO$', 'U', None,
        'OO-', '', '',
        'ORGINAL-----', 'ORI', 'URI',
        'OTI(AÄOÖUÜ)-', 'OZI', 'UZI',
        'OUI^', 'WI', 'FI',
        'OUILLE$', 'ULIE', 'ULIE',
        'OU(DT)-^', 'AU', 'AU',
        'OUSE$', 'AUS', 'AUZ',
        'OUT-', 'AU', 'AU',
        'OU', 'U', 'U',
        'O(FV)$', 'AU', 'AU',  # due to 'OW$' -> 'AU'
        'OVER--<', 'OW', None,
        'OV(AOU)-', 'OW', None,
        'OW$', 'AU', 'AU',
        'OWS$', 'OS', 'UZ',
        'OJ(AÄEIOÖUÜ)--', 'O', 'U',
        'OYER', 'OIA', None,
        'OY(AÄEIOÖUÜ)--', 'O', 'U',
        'O(JY)<', 'EU', 'EU',
        'OZ$', 'OS', None,
        'O´^', 'O', 'U',
        'O\'^', 'O', 'U',
        'O', None, 'U',
        'PATIEN--^', 'PAZI', 'PAZI',
        'PENSIO-^', 'PANSI', 'PANZI',
        'PE(LMNRST)-3^', 'PE', 'PE',
        'PFER-^', 'FE', 'FE',
        'P(FH)<', 'F', 'F',
        'PIC^$', 'PIK', 'PIK',
        'PIC$', 'PIZ', 'PIZ',
        'PIPELINE', 'PEIBLEIN', 'PEIBLEIN',
        'POLYP-', 'POLÜ', None,
        'POLY^^', 'POLI', 'PULI',
        'PORTRAIT7', 'PORTRE', 'PURTRE',
        'POWER7', 'PAUA', 'PAUA',
        'PP(FH)--<', 'B', 'B',
        'PP-', '', '',
        'PRODUZ-^', 'PRODU', 'BRUTU',
        'PRODUZI--', ' PRODU', ' BRUTU',
        'PRIX^$', 'PRI', 'PRI',
        'PS-^^', 'P', None,
        'P(SßZ)^', None, 'Z',
        'P(SßZ)$', 'BS', None,
        'PT-^', '', '',
        'PTI(AÄOÖUÜ)-3', 'BZI', 'BZI',
        'PY9^', 'PÜ', None,
        'P(AÄEIOÖRUÜY)-', 'P', 'P',
        'P(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'P', None,
        'P.^', None, 'P.',
        'P^', 'P', None,
        'P', 'B', 'B',
        'QI-', 'Z', 'Z',
        'QUARANT--', 'KARA', 'KARA',
        'QUE(LMNRST)-3', 'KWE', 'KFE',
        'QUE$', 'K', 'K',
        'QUI(NS)$', 'KI', 'KI',
        'QUIZ7', 'KWIS', None,
        'Q(UV)7', 'KW', 'KF',
        'Q<', 'K', 'K',
        'RADFAHR----', 'RAT ', 'RAT ',
        'RAEFTEZEHRE-----', 'REFTE ', 'REFTE ',
        'RCH', 'RCH', 'RK',
        'REA(DU)---3^', 'R', None,
        'REBSERZEUG------', 'REBS ', 'REBZ ',
        'RECHERCH^', 'RESHASH', 'REZAZ',
        'RECYCL--', 'RIZEI', 'RIZEI',
        'RE(ALST)-3^', 'RE', None,
        'REE$', 'RI', 'RI',
        'RER$', 'RA', 'RA',
        'RE(MNR)-4', 'RE', 'RE',
        'RETTE$', 'RET', 'RET',
        'REUZ$', 'REUZ', None,
        'REW$', 'RU', 'RU',
        'RH<^', 'R', 'R',
        'RJA(MN)--', 'RI', 'RI',
        'ROWD-^', 'RAU', 'RAU',
        'RTEMONNAIE-', 'RTMON', 'RTNUN',
        'RTI(AÄOÖUÜ)-3', 'RZI', 'RZI',
        'RTIEL--3', 'RZI', 'RZI',
        'RV(AEOU)-3', 'RW', None,
        'RY(KN)-$', 'RI', 'RI',
        'RY9^', 'RÜ', None,
        'RÄFTEZEHRE-----', 'REFTE ', 'REFTE ',
        'SAISO-^', 'SES', 'ZEZ',
        'SAFE^$', 'SEIF', 'ZEIF',
        'SAUCE-^', 'SOS', 'ZUZ',
        'SCHLAGGEBEN-----<', 'SHLAK ', 'ZLAK ',
        'SCHSCH---7', '', '',
        'SCHTSCH', 'SH', 'Z',
        'SC(HZ)<', 'SH', 'Z',
        'SC', 'SK', 'ZK',
        'SELBSTST--7^^', 'SELB', 'ZELB',
        'SELBST7^^', 'SELBST', 'ZELBZT',
        'SERVICE7^', 'SÖRWIS', 'ZÖRFIZ',
        'SERVI-^', 'SERW', None,
        'SE(LMNRST)-3^', 'SE', 'ZE',
        'SETTE$', 'SET', 'ZET',
        'SHP-^', 'S', 'Z',
        'SHST', 'SHT', 'ZT',
        'SHTSH', 'SH', 'Z',
        'SHT', 'ST', 'Z',
        'SHY9^', 'SHÜ', None,
        'SH^^', 'SH', None,
        'SH3', 'SH', 'Z',
        'SICHERGEGAN-----^', 'SICHA ', 'ZIKA ',
        'SICHERGEHE----^', 'SICHA ', 'ZIKA ',
        'SICHERGESTEL------^', 'SICHA ', 'ZIKA ',
        'SICHERSTELL-----^', 'SICHA ', 'ZIKA ',
        'SICHERZU(GS)--^', 'SICHA ZU ', 'ZIKA ZU ',
        'SIEGLI-^', 'SIKL', 'ZIKL',
        'SIGLI-^', 'SIKL', 'ZIKL',
        'SIGHT', 'SEIT', 'ZEIT',
        'SIGN', 'SEIN', 'ZEIN',
        'SKI(NPZ)-', 'SKI', 'ZKI',
        'SKI<^', 'SHI', 'ZI',
        'SODASS^$', 'SO DAS', 'ZU TAZ',
        'SODAß^$', 'SO DAS', 'ZU TAZ',
        'SOGENAN--^', 'SO GEN', 'ZU KEN',
        'SOUND-', 'SAUN', 'ZAUN',
        'STAATS^^', 'STAZ', 'ZTAZ',
        'STADT^^', 'STAT', 'ZTAT',
        'STANDE$', ' STANDE', ' ZTANTE',
        'START^^', 'START', 'ZTART',
        'STAURANT7', 'STORAN', 'ZTURAN',
        'STEAK-', 'STE', 'ZTE',
        'STEPHEN-^$', 'STEW', None,
        'STERN', 'STERN', None,
        'STRAF^^', 'STRAF', 'ZTRAF',
        'ST\'S$', 'Z', 'Z',
        'ST´S$', 'Z', 'Z',
        'STST--', '', '',
        'STS(ACEÈÉÊHIÌÍÎOUÄÜÖ)--', 'ST', 'ZT',
        'ST(SZ)', 'Z', 'Z',
        'SPAREN---^', 'SPA', 'ZPA',
        'SPAREND----', ' SPA', ' ZPA',
        'S(PTW)-^^', 'S', None,
        'SP', 'SP', None,
        'STYN(AE)-$', 'STIN', 'ZTIN',
        'ST', 'ST', 'ZT',
        'SUITE<', 'SIUT', 'ZIUT',
        'SUKE--$', 'S', 'Z',
        'SURF(EI)-', 'SÖRF', 'ZÖRF',
        'SV(AEÈÉÊIÌÍÎOU)-<^', 'SW', None,
        'SYB(IY)--^', 'SIB', None,
        'SYL(KVW)--^', 'SI', None,
        'SY9^', 'SÜ', None,
        'SZE(NPT)-^', 'ZE', 'ZE',
        'SZI(ELN)-^', 'ZI', 'ZI',
        'SZCZ<', 'SH', 'Z',
        'SZT<', 'ST', 'ZT',
        'SZ<3', 'SH', 'Z',
        'SÜL(KVW)--^', 'SI', None,
        'S', None, 'Z',
        'TCH', 'SH', 'Z',
        'TD(AÄEIOÖRUÜY)-', 'T', None,
        'TD(ÀÁÂÃÅÈÉÊËÌÍÎÏÒÓÔÕØÙÚÛÝŸ)-', 'T', None,
        'TEAT-^', 'TEA', 'TEA',
        'TERRAI7^', 'TERA', 'TERA',
        'TE(LMNRST)-3^', 'TE', 'TE',
        'TH<', 'T', 'T',
        'TICHT-', 'TIK', 'TIK',
        'TICH$', 'TIK', 'TIK',
        'TIC$', 'TIZ', 'TIZ',
        'TIGGESTELL-------', 'TIK ', 'TIK ',
        'TIGSTELL-----', 'TIK ', 'TIK ',
        'TOAS-^', 'TO', 'TU',
        'TOILET-', 'TOLE', 'TULE',
        'TOIN-', 'TOA', 'TUA',
        'TRAECHTI-^', 'TRECHT', 'TREKT',
        'TRAECHTIG--', ' TRECHT', ' TREKT',
        'TRAINI-', 'TREN', 'TREN',
        'TRÄCHTI-^', 'TRECHT', 'TREKT',
        'TRÄCHTIG--', ' TRECHT', ' TREKT',
        'TSCH', 'SH', 'Z',
        'TSH', 'SH', 'Z',
        'TST', 'ZT', 'ZT',
        'T(Sß)', 'Z', 'Z',
        'TT(SZ)--<', '', '',
        'TT9', 'T', 'T',
        'TV^$', 'TV', 'TV',
        'TX(AEIOU)-3', 'SH', 'Z',
        'TY9^', 'TÜ', None,
        'TZ-', '', '',
        'T\'S3$', 'Z', 'Z',
        'T´S3$', 'Z', 'Z',
        'UEBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
        'UEBER^^', 'ÜBA', 'IBA',
        'UE2', 'Ü', 'I',
        'UGL-', 'UK', None,
        'UH(AOÖUÜY)-', 'UH', None,
        'UIE$', 'Ü', 'I',
        'UM^^', 'UM', 'UN',
        'UNTERE--3', 'UNTE', 'UNTE',
        'UNTER^^', 'UNTA', 'UNTA',
        'UNVER^^', 'UNFA', 'UNFA',
        'UN^^', 'UN', 'UN',
        'UTI(AÄOÖUÜ)-', 'UZI', 'UZI',
        'UVE-4', 'UW', None,
        'UY2', 'UI', None,
        'UZZ', 'AS', 'AZ',
        'VACL-^', 'WAZ', 'FAZ',
        'VAC$', 'WAZ', 'FAZ',
        'VAN DEN ^', 'FANDN', 'FANTN',
        'VANES-^', 'WANE', None,
        'VATRO-', 'WATR', None,
        'VA(DHJNT)--^', 'F', None,
        'VEDD-^', 'FE', 'FE',
        'VE(BEHIU)--^', 'F', None,
        'VEL(BDLMNT)-^', 'FEL', None,
        'VENTZ-^', 'FEN', None,
        'VEN(NRSZ)-^', 'FEN', None,
        'VER(AB)-^$', 'WER', None,
        'VERBAL^$', 'WERBAL', None,
        'VERBAL(EINS)-^', 'WERBAL', None,
        'VERTEBR--', 'WERTE', None,
        'VEREIN-----', 'F', None,
        'VEREN(AEIOU)-^', 'WEREN', None,
        'VERIFI', 'WERIFI', None,
        'VERON(AEIOU)-^', 'WERON', None,
        'VERSEN^', 'FERSN', 'FAZN',
        'VERSIERT--^', 'WERSI', None,
        'VERSIO--^', 'WERS', None,
        'VERSUS', 'WERSUS', None,
        'VERTI(GK)-', 'WERTI', None,
        'VER^^', 'FER', 'FA',
        'VERSPRECHE-------', ' FER', ' FA',
        'VER$', 'WA', None,
        'VER', 'FA', 'FA',
        'VET(HT)-^', 'FET', 'FET',
        'VETTE$', 'WET', 'FET',
        'VE^', 'WE', None,
        'VIC$', 'WIZ', 'FIZ',
        'VIELSAGE----', 'FIL ', 'FIL ',
        'VIEL', 'FIL', 'FIL',
        'VIEW', 'WIU', 'FIU',
        'VILL(AE)-', 'WIL', None,
        'VIS(ACEIKUVWZ)-<^', 'WIS', None,
        'VI(ELS)--^', 'F', None,
        'VILLON--', 'WILI', 'FILI',
        'VIZE^^', 'FIZE', 'FIZE',
        'VLIE--^', 'FL', None,
        'VL(AEIOU)--', 'W', None,
        'VOKA-^', 'WOK', None,
        'VOL(ATUVW)--^', 'WO', None,
        'VOR^^', 'FOR', 'FUR',
        'VR(AEIOU)--', 'W', None,
        'VV9', 'W', None,
        'VY9^', 'WÜ', 'FI',
        'V(ÜY)-', 'W', None,
        'V(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'W', None,
        'V(AEIJLRU)-<', 'W', None,
        'V.^', 'V.', None,
        'V<', 'F', 'F',
        'WEITERENTWI-----^', 'WEITA ', 'FEITA ',
        'WEITREICH-----^', 'WEIT ', 'FEIT ',
        'WEITVER^', 'WEIT FER', 'FEIT FA',
        'WE(LMNRST)-3^', 'WE', 'FE',
        'WER(DST)-', 'WER', None,
        'WIC$', 'WIZ', 'FIZ',
        'WIEDERU--', 'WIDE', 'FITE',
        'WIEDER^$', 'WIDA', 'FITA',
        'WIEDER^^', 'WIDA ', 'FITA ',
        'WIEVIEL', 'WI FIL', 'FI FIL',
        'WISUEL', 'WISUEL', None,
        'WR-^', 'W', None,
        'WY9^', 'WÜ', 'FI',
        'W(BDFGJKLMNPQRSTZ)-', 'F', None,
        'W$', 'F', None,
        'W', None, 'F',
        'X<^', 'Z', 'Z',
        'XHAVEN$', 'XAFN', None,
        'X(CSZ)', 'X', 'X',
        'XTS(CH)--', 'XT', 'XT',
        'XT(SZ)', 'Z', 'Z',
        'YE(LMNRST)-3^', 'IE', 'IE',
        'YE-3', 'I', 'I',
        'YOR(GK)^$', 'IÖRK', 'IÖRK',
        'Y(AOU)-<7', 'I', 'I',
        'Y(BKLMNPRSTX)-1', 'Ü', None,
        'YVES^$', 'IF', 'IF',
        'YVONNE^$', 'IWON', 'IFUN',
        'Y.^', 'Y.', None,
        'Y', 'I', 'I',
        'ZC(AOU)-', 'SK', 'ZK',
        'ZE(LMNRST)-3^', 'ZE', 'ZE',
        'ZIEJ$', 'ZI', 'ZI',
        'ZIGERJA(HR)-3', 'ZIGA IA', 'ZIKA IA',
        'ZL(AEIOU)-', 'SL', None,
        'ZS(CHT)--', '', '',
        'ZS', 'SH', 'Z',
        'ZUERST', 'ZUERST', 'ZUERST',
        'ZUGRUNDE^$', 'ZU GRUNDE', 'ZU KRUNTE',
        'ZUGRUNDE', 'ZU GRUNDE ', 'ZU KRUNTE ',
        'ZUGUNSTEN', 'ZU GUNSTN', 'ZU KUNZTN',
        'ZUHAUSE-', 'ZU HAUS', 'ZU AUZ',
        'ZULASTEN^$', 'ZU LASTN', 'ZU LAZTN',
        'ZURUECK^^', 'ZURÜK', 'ZURIK',
        'ZURZEIT', 'ZUR ZEIT', 'ZUR ZEIT',
        'ZURÜCK^^', 'ZURÜK', 'ZURIK',
        'ZUSTANDE', 'ZU STANDE', 'ZU ZTANTE',
        'ZUTAGE', 'ZU TAGE', 'ZU TAKE',
        'ZUVER^^', 'ZUFA', 'ZUFA',
        'ZUVIEL', 'ZU FIL', 'ZU FIL',
        'ZUWENIG', 'ZU WENIK', 'ZU FENIK',
        'ZY9^', 'ZÜ', None,
        'ZYK3$', 'ZIK', None,
        'Z(VW)7^', 'SW', None,
        None, None, None)

    phonet_hash = Counter()
    alpha_pos = Counter()

    phonet_hash_1 = Counter()
    phonet_hash_2 = Counter()

    _phonet_upper_translation = dict(zip((ord(_) for _ in
                                          'abcdefghijklmnopqrstuvwxyzàáâãåäæ' +
                                          'çðèéêëìíîïñòóôõöøœšßþùúûüýÿ'),
                                         'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÅÄÆ' +
                                         'ÇÐÈÉÊËÌÍÎÏÑÒÓÔÕÖØŒŠßÞÙÚÛÜÝŸ'))

    def _initialize_phonet(lang):
        """Initialize phonet variables.

        Fills the lookup tables that ``_phonet`` consults to find the first
        and last candidate rule for a character (and for a pair of
        characters) without scanning the whole rule table:

        - ``alpha_pos`` maps a character to a class index: 1 for the listed
          accented/special capitals, 2..27 for 'A'..'Z'. Because it is a
          ``Counter``, any other character reads as 0.
        - ``phonet_hash`` maps a character to the index of the first rule
          whose pattern starts with it (-1 if there is none).
        - ``phonet_hash_1[j, k]`` / ``phonet_hash_2[j, k]`` hold the first and
          last rule index for patterns starting with letter ``j`` (0..25)
          whose second character has class ``k``; ``k == 0`` is the bucket
          for "any following character".

        The rule table is a flat sequence of triples (pattern, replacement,
        replacement), so only indices divisible by 3 are patterns.

        :param str lang: ``'none'`` selects ``_phonet_rules_no_lang``; any
            other value selects ``_phonet_rules_german``
        """
        if lang == 'none':
            _phonet_rules = _phonet_rules_no_lang
        else:
            _phonet_rules = _phonet_rules_german

        # the empty string has no rules
        phonet_hash[''] = -1

        # German and international umlauts: all share class index 1
        for j in {'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë',
                  'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø',
                  'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß', 'Œ', 'Š', 'Ÿ'}:
            alpha_pos[j] = 1
            phonet_hash[j] = -1

        # "normal" letters ('A'-'Z'): class indices 2..27
        for i, j in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
            alpha_pos[j] = i + 2
            phonet_hash[j] = -1

        # mark every (first letter, second-character class) slot as unset
        for i in range(26):
            for j in range(28):
                phonet_hash_1[i, j] = -1
                phonet_hash_2[i, j] = -1

        # for each phonetic rule
        for i in range(len(_phonet_rules)):
            rule = _phonet_rules[i]

            # only non-empty pattern entries (every third element) are hashed
            if rule and i % 3 == 0:
                # calculate first hash value
                k = _phonet_rules[i][0]

                # remember the first rule for this starting character, but
                # only if the rule has at least one replacement string
                if phonet_hash[k] < 0 and (_phonet_rules[i+1] or
                                           _phonet_rules[i+2]):
                    phonet_hash[k] = i

                # calculate second hash values (patterns starting 'A'-'Z')
                if k and alpha_pos[k] >= 2:
                    k = alpha_pos[k]

                    # j is the 0-based index of the pattern's first letter
                    j = k-2
                    rule = rule[1:]

                    # reduce the pattern to the set of possible second
                    # characters: ' ' if none, the contents of a leading
                    # '(...)' group, or the single next character
                    if not rule:
                        rule = ' '
                    elif rule[0] == '(':
                        rule = rule[1:]
                    else:
                        rule = rule[0]

                    while rule and (rule[0] != ')'):
                        k = alpha_pos[rule[0]]

                        if k > 0:
                            # add hash value for this letter
                            if phonet_hash_1[j, k] < 0:
                                phonet_hash_1[j, k] = i
                                phonet_hash_2[j, k] = i

                            # extend the range only if this rule lies within
                            # 30 table entries of the previous end; otherwise
                            # fall through to the "all letters" bucket
                            if phonet_hash_2[j, k] >= (i-30):
                                phonet_hash_2[j, k] = i
                            else:
                                k = -1

                        if k <= 0:
                            # add hash value for all letters
                            if phonet_hash_1[j, 0] < 0:
                                phonet_hash_1[j, 0] = i

                            phonet_hash_2[j, 0] = i

                        rule = rule[1:]

    def _phonet(term, mode, lang):
        """Return the phonet coded form of a term.

        The term is upper-cased and scanned left to right. At each position
        the rule table is searched (using the hash tables filled by
        ``_initialize_phonet``) for the first applicable pattern; its
        replacement is either written to the output or, for rules flagged
        with '<', substituted back into the input and re-scanned.

        Pattern suffix characters as interpreted below:

        - ``(...)``: one of the listed letters must follow
        - ``-``: each one removes a trailing matched letter from the part
          that is actually replaced
        - ``<``: the replacement is written back into the source string
        - a digit: the rule's priority (default 5)
        - ``^``: the match must start a word; ``^^`` additionally discards
          the already-consumed source after the replacement
        - ``$``: the match must end a word

        :param str term: the term to encode
        :param int mode: offset added to a pattern's index to pick its
            replacement string (presumably 1 or 2, the two replacement
            columns of the rule table -- confirm against the caller)
        :param str lang: ``'none'`` selects ``_phonet_rules_no_lang``; any
            other value selects ``_phonet_rules_german``
        :returns: the encoded term
        :rtype: str
        """
        if lang == 'none':
            _phonet_rules = _phonet_rules_no_lang
        else:
            _phonet_rules = _phonet_rules_german

        char0 = ''
        # the output is built by overwriting a copy of the input in place
        dest = term

        if not term:
            return ''

        term_length = len(term)

        # convert input string to upper-case
        src = term.translate(_phonet_upper_translation)

        # check "src"
        i = 0  # read position in src
        j = 0  # write position in dest
        zeta = 0  # non-zero right after an in-place ('<') rewrite at i

        while i < len(src):
            char = src[i]

            pos = alpha_pos[char]

            if pos >= 2:
                # 'A'-'Z': use the two-character hash tables
                xpos = pos-2

                if i+1 == len(src):
                    pos = alpha_pos['']
                else:
                    pos = alpha_pos[src[i+1]]

                # range 1: rules keyed on the following character's class;
                # range 2: rules valid for any following character
                start1 = phonet_hash_1[xpos, pos]
                start2 = phonet_hash_1[xpos, 0]
                end1 = phonet_hash_2[xpos, pos]
                end2 = phonet_hash_2[xpos, 0]

                # preserve rule priorities: search the range that starts
                # earlier in the table first (swap the two ranges if needed)
                if (start2 >= 0) and ((start1 < 0) or (start2 < start1)):
                    pos = start1
                    start1 = start2
                    start2 = pos
                    pos = end1
                    end1 = end2
                    end2 = pos

                # overlapping ranges are merged into a single range
                if (end1 >= start2) and (start2 >= 0):
                    if end2 > end1:
                        end1 = end2

                    start2 = -1
                    end2 = -1
            else:
                # other characters: single-character hash, open-ended range
                pos = phonet_hash[char]
                start1 = pos
                end1 = 10000
                start2 = -1
                end2 = -1

            pos = start1
            # set to 1 when a rule has already handled output/advancing
            zeta0 = 0

            if pos >= 0:
                # check rules for this char
                while ((_phonet_rules[pos] is None) or
                       (_phonet_rules[pos][0] == char)):
                    if pos > end1:
                        # first range exhausted: switch to the second range
                        # if there is one, otherwise stop searching
                        if start2 > 0:
                            pos = start2
                            start1 = start2
                            start2 = -1
                            end1 = end2
                            end2 = -1
                            continue

                        break

                    if (((_phonet_rules[pos] is None) or
                         (_phonet_rules[pos + mode] is None))):
                        # no conversion rule available
                        pos += 3
                        continue

                    # check whole string
                    matches = 1  # number of matching letters
                    priority = 5  # default priority
                    rule = _phonet_rules[pos]
                    rule = rule[1:]

                    # consume literal pattern characters that match src
                    # NOTE(review): `rule not in '(-<^$'` tests the whole
                    # remaining pattern as a substring, not just rule[0] --
                    # confirm this is intended
                    while (rule and
                           (len(src) > (i + matches)) and
                           (src[i + matches] == rule[0]) and
                           not rule[0].isdigit() and
                           (rule not in '(-<^$')):
                        matches += 1
                        rule = rule[1:]

                    if rule and (rule[0] == '('):
                        # check an array of letters
                        if (((len(src) > (i + matches)) and
                             src[i + matches].isalpha() and
                             (src[i + matches] in rule[1:]))):
                            matches += 1

                            # skip past the closing ')'
                            while rule and rule[0] != ')':
                                rule = rule[1:]

                            # if rule[0] == ')':
                            rule = rule[1:]

                    # remember the character following the letters/group;
                    # compared against ord('-') further down
                    if rule:
                        priority0 = ord(rule[0])
                    else:
                        priority0 = 0

                    # matches0 keeps the full match length; matches is
                    # reduced by one for every '-' in the pattern
                    matches0 = matches

                    while rule and rule[0] == '-' and matches > 1:
                        matches -= 1
                        rule = rule[1:]

                    if rule and rule[0] == '<':
                        rule = rule[1:]

                    if rule and rule[0].isdigit():
                        # read priority
                        priority = int(rule[0])
                        rule = rule[1:]

                    # '^^' is treated like '^' for the position test
                    if rule and rule[0:2] == '^^':
                        rule = rule[1:]

                    # the rule applies if the pattern is fully consumed, or
                    # if the remaining '^' / '^$' / '$' anchors hold:
                    # '^' needs a non-letter (or nothing) before the match,
                    # '$' needs neither a letter nor '.' after it
                    if (not rule or
                            ((rule[0] == '^') and
                             ((i == 0) or not src[i-1].isalpha()) and
                             ((rule[1:2] != '$') or
                              (not (src[i+matches0:i+matches0+1].isalpha()) and
                               (src[i+matches0:i+matches0+1] != '.')))) or
                            ((rule[0] == '$') and (i > 0) and
                             src[i-1].isalpha() and
                             ((not src[i+matches0:i+matches0+1].isalpha()) and
                              (src[i+matches0:i+matches0+1] != '.')))):
                        # look for a continuation rule (one starting at the
                        # last matched letter) if matches > 1 and there is
                        # no '-' in the first pattern
                        pos0 = -1

                        start3 = 0
                        start4 = 0
                        end3 = 0
                        end4 = 0

                        if (((matches > 1) and
                             src[i+matches:i+matches+1] and
                             (priority0 != ord('-')))):
                            char0 = src[i+matches-1]
                            pos0 = alpha_pos[char0]

                            # same range lookup as above, but for the last
                            # matched letter and the character after it
                            if pos0 >= 2 and src[i+matches]:
                                xpos = pos0 - 2
                                pos0 = alpha_pos[src[i+matches]]
                                start3 = phonet_hash_1[xpos, pos0]
                                start4 = phonet_hash_1[xpos, 0]
                                end3 = phonet_hash_2[xpos, pos0]
                                end4 = phonet_hash_2[xpos, 0]

                                # preserve rule priorities
                                if (((start4 >= 0) and
                                     ((start3 < 0) or (start4 < start3)))):
                                    pos0 = start3
                                    start3 = start4
                                    start4 = pos0
                                    pos0 = end3
                                    end3 = end4
                                    end4 = pos0

                                if (end3 >= start4) and (start4 >= 0):
                                    if end4 > end3:
                                        end3 = end4

                                    start4 = -1
                                    end4 = -1
                            else:
                                pos0 = phonet_hash[char0]
                                start3 = pos0
                                end3 = 10000
                                start4 = -1
                                end4 = -1

                            pos0 = start3

                        # check continuation rules for src[i+matches]
                        if pos0 >= 0:
                            while ((_phonet_rules[pos0] is None) or
                                   (_phonet_rules[pos0][0] == char0)):
                                if pos0 > end3:
                                    if start4 > 0:
                                        pos0 = start4
                                        start3 = start4
                                        start4 = -1
                                        end3 = end4
                                        end4 = -1
                                        continue

                                    # no continuation found: force the
                                    # priority test after the loop to fail
                                    priority0 = -1

                                    # important
                                    break

                                if (((_phonet_rules[pos0] is None) or
                                     (_phonet_rules[pos0 + mode]
                                      is None))):
                                    # no conversion rule available
                                    pos0 += 3
                                    continue

                                # check whole string
                                matches0 = matches
                                priority0 = 5
                                rule = _phonet_rules[pos0]
                                rule = rule[1:]

                                # NOTE(review): this condition differs from
                                # the corresponding loop for the primary
                                # rule (`or (rule in ...)` here versus
                                # `and (rule not in ...)` there) -- confirm
                                # which form is intended
                                while (rule and
                                       (src[i+matches0:i+matches0+1] ==
                                        rule[0]) and
                                       (not rule[0].isdigit() or
                                        (rule in '(-<^$'))):
                                    matches0 += 1
                                    rule = rule[1:]

                                if rule and rule[0] == '(':
                                    # check an array of letters
                                    if ((src[i+matches0:i+matches0+1]
                                         .isalpha() and
                                         (src[i+matches0] in rule[1:]))):
                                        matches0 += 1

                                        while rule and rule[0] != ')':
                                            rule = rule[1:]

                                        # if rule[0] == ')':
                                        rule = rule[1:]

                                while rule and rule[0] == '-':
                                    # "matches0" is NOT decremented
                                    # because of  "if (matches0 == matches)"
                                    rule = rule[1:]

                                if rule and rule[0] == '<':
                                    rule = rule[1:]

                                if rule and rule[0].isdigit():
                                    priority0 = int(rule[0])
                                    rule = rule[1:]

                                if (not rule or
                                        # rule == '^' is not possible here
                                        ((rule[0] == '$') and not
                                         src[i+matches0:i+matches0+1]
                                         .isalpha() and
                                         (src[i+matches0:i+matches0+1]
                                          != '.'))):
                                    if matches0 == matches:
                                        # this is only a partial string
                                        pos0 += 3
                                        continue

                                    if priority0 < priority:
                                        # priority is too low
                                        pos0 += 3
                                        continue

                                    # continuation rule found
                                    break

                                pos0 += 3

                            # end of "while": if a continuation rule with
                            # sufficient priority was found, skip the
                            # current rule and try the next one
                            if ((priority0 >= priority) and
                                    ((_phonet_rules[pos0] is not None) and
                                     (_phonet_rules[pos0][0] == char0))):

                                pos += 3
                                continue

                        # replace string
                        # priority0 is reused here as a flag: 1 if the
                        # pattern contains '<', else 0
                        if ((_phonet_rules[pos] and
                             ('<' in _phonet_rules[pos][1:]))):
                            priority0 = 1
                        else:
                            priority0 = 0

                        rule = _phonet_rules[pos + mode]

                        if (priority0 == 1) and (zeta == 0):
                            # rule with '<' is applied: the replacement is
                            # written into src and position i is re-scanned
                            if ((j > 0) and rule and
                                    ((dest[j-1] == char) or
                                     (dest[j-1] == rule[0]))):
                                j -= 1

                            zeta0 = 1
                            zeta += 1
                            matches0 = 0

                            # overwrite src character by character
                            # NOTE(review): src[i+matches0] is indexed
                            # without a bounds check -- confirm it cannot
                            # run past the end of src
                            while rule and src[i+matches0]:
                                src = (src[0:i+matches0] + rule[0] +
                                       src[i+matches0+1:])
                                matches0 += 1
                                rule = rule[1:]

                            # drop matched characters the replacement did
                            # not cover
                            if matches0 < matches:
                                src = (src[0:i+matches0] +
                                       src[i+matches:])

                            char = src[i]
                        else:
                            # ordinary rule: skip the matched input and
                            # write the replacement to dest
                            i = i + matches - 1
                            zeta = 0

                            # write all but the last replacement character,
                            # never repeating the previous output character
                            while len(rule) > 1:
                                if (j == 0) or (dest[j - 1] != rule[0]):
                                    dest = (dest[0:j] + rule[0] +
                                            dest[min(len(dest), j+1):])
                                    j += 1

                                rule = rule[1:]

                            # new "current char": the last replacement
                            # character is written by the common code at
                            # the bottom of the outer loop
                            if not rule:
                                rule = ''
                                char = ''
                            else:
                                char = rule[0]

                            # '^^' rules: emit the last character now and
                            # restart scanning on the remaining input
                            if ((_phonet_rules[pos] and
                                 '^^' in _phonet_rules[pos][1:])):
                                if char:
                                    dest = (dest[0:j] + char +
                                            dest[min(len(dest), j + 1):])
                                    j += 1

                                src = src[i + 1:]
                                i = 0
                                zeta0 = 1

                        break

                    pos += 3

                    # first range exhausted: continue in the second range
                    if pos > end1 and start2 > 0:
                        pos = start2
                        start1 = start2
                        end1 = end2
                        start2 = -1
                        end2 = -1

            if zeta0 == 0:
                # no rule took over output handling: copy the current char
                # (original or last replacement character) and advance
                if char and ((j == 0) or (dest[j-1] != char)):
                    # delete multiple letters only
                    # NOTE(review): uses term_length here where the other
                    # writes use len(dest) -- confirm these always agree
                    dest = dest[0:j] + char + dest[min(j+1, term_length):]
                    j += 1

                i += 1
                zeta = 0

        # everything beyond the write position is leftover input
        dest = dest[0:j]

        return dest

    _initialize_phonet(lang)

    word = unicode_normalize('NFKC', text_type(word))
    return _phonet(word, mode, lang)


def spfc(word):
    """Return the Standardized Phonetic Frequency Code (SPFC) of a word.

    Standardized Phonetic Frequency Code is roughly Soundex-like.
    This implementation is based on page 19-21 of :cite:`Moore:1977`.

    The input names a person by first name and last name ("name field").
    It may be a single string in which the two are divided by a period or,
    failing that, a space, or a two-element tuple/list ``(first, last)``.

    :param str word: the word to transform
    :returns: the SPFC value
    :rtype: str
    :raises AttributeError: if word cannot be split into exactly two names

    >>> spfc('Christopher Smith')
    '01160'
    >>> spfc('Christopher Schmidt')
    '01160'
    >>> spfc('Niall Smith')
    '01660'
    >>> spfc('Niall Schmidt')
    '01660'

    >>> spfc('L.Smith')
    '01960'
    >>> spfc('R.Miller')
    '65490'

    >>> spfc(('L', 'Smith'))
    '01960'
    >>> spfc(('R', 'Miller'))
    '65490'
    """
    if not word:
        return ''

    bad_word_message = ('word attribute must be a string with a space '
                        'or period dividing the first and last names '
                        'or a tuple/list consisting of the first and '
                        'last names')

    # Split the input into (first name, last name)
    if isinstance(word, (str, text_type)):
        parts = word.split('.', 1)
        if len(parts) != 2:
            parts = word.split(' ', 1)
        if len(parts) != 2:
            raise AttributeError(bad_word_message)
    elif hasattr(word, '__iter__') and len(word) == 2:
        parts = word
    else:
        raise AttributeError(bad_word_message)

    # Coding tables PF1, PF2, and PF3 (letter ordinal -> digit)
    pf1 = {ord(letter): digit for letter, digit in
           zip('SZCKQVFPUWABLORDHIEMNXGJT', '0011112222334445556666777')}
    pf2 = {ord(letter): digit for letter, digit in
           zip('SZCKQFPXABORDHIMNGJTUVWEL', '0011122233445556677788899')}
    pf3 = {ord(letter): digit for letter, digit in
           zip('BCKQVDTFLPGJXMNRSZAEHIOUWY', '00000112223334456677777777')}

    # Name endings that are coded as a unit in step 5, in the order in
    # which they have to be tried
    ending_codes = (('STN', '8'), ('PRS', '8'), ('SN', '8'),
                    ('STR', '9'), ('SR', '9'), ('TN', '9'), ('TD', '9'),
                    ('DRS', '7'), ('TR', '7'), ('MN', '7'))

    uc_letters = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    dropped_letters = frozenset('AEHIOUWY')

    def _reduce(name):
        """Normalize a name and perform the first three steps of SPFC."""
        name = unicode_normalize('NFKD',
                                 text_type(name.strip()
                                           .replace('ß', 'SS')
                                           .upper()))

        # keep A-Z only
        name = ''.join(char for char in name if char in uc_letters)

        # 1. Convert DK to K, DT to T, SC to S, KN to N, and MN to N
        for old, new in (('DK', 'K'), ('DT', 'T'), ('SC', 'S'),
                         ('KN', 'N'), ('MN', 'N')):
            name = name.replace(old, new)

        # 2. Replace multiple letters with a single letter
        name = _delete_consecutive_repeats(name)

        # 3. Remove vowels, W, H, and Y, except for the first letter
        if name:
            name = name[0] + ''.join(char for char in name[1:]
                                     if char not in dropped_letters)
        return name

    names = [_reduce(part) for part in parts]
    given = names[0]
    family = names[1]
    digits = []

    # 4. First digit: PF1 applied to the first letter of the name field,
    # which is then removed.
    if family:
        digits.append(family[0].translate(pf1))
        family = family[1:]

    # 5. Second digit: the longest codable ending of the name field, or
    # failing that PF3 applied to its last letter; the coded letters are
    # removed.
    if family:
        for ending, digit in ending_codes:
            if family.endswith(ending):
                digits.append(digit)
                family = family[:-len(ending)]
                break
        else:
            digits.append(family[-1].translate(pf3))
            family = family[:-1]

    # 6. Third digit: PF2 applied to the first letter of the first name.
    if given:
        digits.append(given[0].translate(pf2))

    # 7. & 8. Fourth and fifth digits: PF2 applied to the next two letters
    # of the name field, with zero standing in for any missing letter.
    digits.append(family[:2].translate(pf2).ljust(2, '0'))

    return ''.join(digits)


def statistics_canada(word, max_length=4):
    """Return the Statistics Canada code for a word.

    The original description of this algorithm could not be located, and
    may only have been specified in an unpublished TR. The coding does not
    appear to be in use by Statistics Canada any longer. In its place, this is
    an implementation of the "Census modified Statistics Canada name coding
    procedure".

    The modified version of this algorithm is described in Appendix B of
    :cite:`Moore:1977`.

    :param str word: the word to transform
    :param int max_length: the maximum length (default 4) of the code to return
    :returns: the Statistics Canada name code value
    :rtype: str

    >>> statistics_canada('Christopher')
    'CHRS'
    >>> statistics_canada('Niall')
    'NL'
    >>> statistics_canada('Smith')
    'SMTH'
    >>> statistics_canada('Schmidt')
    'SCHM'
    """
    uc_letters = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    vowels = frozenset('AEIOUY')

    # uppercase, decompose, expand eszett, and keep only A-Z
    normalized = unicode_normalize('NFKD', text_type(word.upper()))
    normalized = normalized.replace('ß', 'SS')
    letters = [char for char in normalized if char in uc_letters]
    if not letters:
        return ''

    # the first letter is always kept; vowels and Y are dropped after it
    skeleton = letters[0] + ''.join(char for char in letters[1:]
                                    if char not in vowels)

    # collapse runs of the same letter, then truncate
    code = _delete_consecutive_repeats(skeleton).replace(' ', '')
    return code[:max_length]


def lein(word, max_length=4, zero_pad=True):
    """Return the Lein code for a word.

    This is Lein name coding, described in :cite:`Moore:1977`.

    :param str word: the word to transform
    :param int max_length: the maximum length (default 4) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        max_length string
    :returns: the Lein code
    :rtype: str

    >>> lein('Christopher')
    'C351'
    >>> lein('Niall')
    'N300'
    >>> lein('Smith')
    'S210'
    >>> lein('Schmidt')
    'S521'
    """
    uc_letters = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    skipped = frozenset(' AEHIOUWY')
    consonant_digits = dict(zip('BCDFGJKLMNPQRSTVXZ', '451455532245351455'))

    # uppercase, decompose, expand eszett, and keep only A-Z
    letters = unicode_normalize('NFKD', text_type(word.upper()))
    letters = letters.replace('ß', 'SS')
    letters = ''.join(char for char in letters if char in uc_letters)

    # Rule 1: the first letter is kept as is
    initial = letters[:1]

    # Rule 2: drop vowels, H, W, and Y from the rest of the word
    rest = ''.join(char for char in letters[1:] if char not in skipped)

    # Rule 3: collapse runs of the same letter
    rest = _delete_consecutive_repeats(rest)

    # Rule 4: code the remaining letters as digits (and pad if requested)
    code = initial + ''.join(consonant_digits.get(char, char)
                             for char in rest)
    padded = code + '0' * max_length if zero_pad else code

    return padded[:max_length]


def roger_root(word, max_length=5, zero_pad=True):
    """Return the Roger Root code for a word.

    This is Roger Root name coding, described in :cite:`Moore:1977`.

    :param str word: the word to transform
    :param int max_length: the maximum length (default 5) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        max_length string
    :returns: the Roger Root code
    :rtype: str

    >>> roger_root('Christopher')
    '06401'
    >>> roger_root('Niall')
    '02500'
    >>> roger_root('Smith')
    '00310'
    >>> roger_root('Schmidt')
    '06310'
    """
    # Codes for the start of the word, longest patterns first.
    # '*' is used to prevent combining by _delete_consecutive_repeats()
    initial_codes = (
        (4, {'TSCH': '06'}),
        (3, {'TSH': '06', 'SCH': '06'}),
        (2, {'CE': '0*0', 'CH': '06', 'CI': '0*0', 'CY': '0*0',
             'DG': '07', 'GF': '08', 'GM': '03', 'GN': '02',
             'KN': '02', 'PF': '08', 'PH': '08', 'PN': '02',
             'SH': '06', 'TS': '0*0', 'WR': '04'}),
        (1, {'A': '1', 'B': '09', 'C': '07', 'D': '01', 'E': '1',
             'F': '08', 'G': '07', 'H': '2', 'I': '1', 'J': '3',
             'K': '07', 'L': '05', 'M': '03', 'N': '02', 'O': '1',
             'P': '09', 'Q': '07', 'R': '04', 'S': '0*0',
             'T': '01', 'U': '1', 'V': '08', 'W': '4', 'X': '07',
             'Y': '5', 'Z': '0*0'}),
    )

    # Codes for the remainder of the word, longest patterns first
    medial_codes = (
        (4, {'TSCH': '6'}),
        (3, {'TSH': '6', 'SCH': '6'}),
        (2, {'CE': '0', 'CH': '6', 'CI': '0', 'CY': '0', 'DG': '7',
             'PH': '8', 'SH': '6', 'TS': '0'}),
        (1, {'B': '9', 'C': '7', 'D': '1', 'F': '8', 'G': '7',
             'J': '6', 'K': '7', 'L': '5', 'M': '3', 'N': '2',
             'P': '9', 'Q': '7', 'R': '4', 'S': '0', 'T': '1',
             'V': '8', 'X': '7', 'Z': '0',
             'A': '*', 'E': '*', 'H': '*', 'I': '*', 'O': '*',
             'U': '*', 'W': '*', 'Y': '*'}),
    )

    uc_letters = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    # uppercase, decompose, expand eszett, and keep only A-Z
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(char for char in word if char in uc_letters)

    pieces = []
    pos = 0

    # Code the longest matching prefix with the initial table
    for size, table in initial_codes:
        prefix = word[:size]
        if prefix in table:
            pieces.append(table[prefix])
            pos = size
            break

    # Code the rest of the word, always taking the longest match
    word_length = len(word)
    while pos < word_length:
        for size, table in medial_codes:  # pragma: no branch
            chunk = word[pos:pos+size]
            if chunk in table:
                pieces.append(table[chunk])
                pos += size
                break

    # Collapse repeated digits, then discard the '*' separators
    code = _delete_consecutive_repeats(''.join(pieces)).replace('*', '')

    if zero_pad:
        code += '0'*max_length

    return code[:max_length]


def onca(word, max_length=4, zero_pad=True):
    """Return the Oxford Name Compression Algorithm (ONCA) code for a word.

    This is the Oxford Name Compression Algorithm, based on :cite:`Gill:1997`.

    I can find no complete description of the "anglicised version of the NYSIIS
    method" identified as the first step in this algorithm, so this is likely
    not a precisely correct implementation, in that it employs the standard
    NYSIIS algorithm.

    The word is first encoded with :func:`nysiis` and the result is then
    encoded with :func:`soundex`.

    :param str word: the word to transform
    :param int max_length: the maximum length (default 4) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        max_length string
    :returns: the ONCA code
    :rtype: str

    >>> onca('Christopher')
    'C623'
    >>> onca('Niall')
    'N400'
    >>> onca('Smith')
    'S530'
    >>> onca('Schmidt')
    'S530'
    """
    # In the most extreme case, 3 characters of NYSIIS input can be compressed
    # to one character of output, so give it triple the max_length.
    nysiis_code = nysiis(word, max_length=max_length*3)

    return soundex(nysiis_code, max_length, zero_pad=zero_pad)


def eudex(word, max_length=8):
    """Return the eudex phonetic hash of a word.

    This implementation of eudex phonetic hashing is based on the specification
    (not the reference implementation) at :cite:`Ticki:2016`.

    Further details can be found at :cite:`Ticki:2016b`.

    Each retained character contributes one 8-bit value to the hash. The value
    of the first character occupies the most significant byte; the values of
    the following characters are right-aligned, with zero bytes filling the gap
    in between.

    :param str word: the word to transform
    :param int max_length: the length in bytes (8 bits per coded character) of
        the hash returned (default 8)
    :returns: the eudex hash
    :rtype: int

    >>> eudex('Colin')
    432345564238053650
    >>> eudex('Christopher')
    433648490138894409
    >>> eudex('Niall')
    648518346341351840
    >>> eudex('Smith')
    720575940412906756
    >>> eudex('Schmidt')
    720589151732307997
    """
    # Values used for every character except the first one
    _trailing_phones = {
        'a': 0,  # a
        'b': 0b01001000,  # b
        'c': 0b00001100,  # c
        'd': 0b00011000,  # d
        'e': 0,  # e
        'f': 0b01000100,  # f
        'g': 0b00001000,  # g
        'h': 0b00000100,  # h
        'i': 1,  # i
        'j': 0b00000101,  # j
        'k': 0b00001001,  # k
        'l': 0b10100000,  # l
        'm': 0b00000010,  # m
        'n': 0b00010010,  # n
        'o': 0,  # o
        'p': 0b01001001,  # p
        'q': 0b10101000,  # q
        'r': 0b10100001,  # r
        's': 0b00010100,  # s
        't': 0b00011101,  # t
        'u': 1,  # u
        'v': 0b01000101,  # v
        'w': 0b00000000,  # w
        'x': 0b10000100,  # x
        'y': 1,  # y
        'z': 0b10010100,  # z

        'ß': 0b00010101,  # ß
        'à': 0,  # à
        'á': 0,  # á
        'â': 0,  # â
        'ã': 0,  # ã
        'ä': 0,  # ä[æ]
        'å': 1,  # å[oː]
        'æ': 0,  # æ[æ]
        'ç': 0b10010101,  # ç[t͡ʃ]
        'è': 1,  # è
        'é': 1,  # é
        'ê': 1,  # ê
        'ë': 1,  # ë
        'ì': 1,  # ì
        'í': 1,  # í
        'î': 1,  # î
        'ï': 1,  # ï
        'ð': 0b00010101,  # ð[ð̠](represented as a non-plosive T)
        'ñ': 0b00010111,  # ñ[nj](represented as a combination of n and j)
        'ò': 0,  # ò
        'ó': 0,  # ó
        'ô': 0,  # ô
        'õ': 0,  # õ
        'ö': 1,  # ö[ø]
        '÷': 0b11111111,  # ÷
        'ø': 1,  # ø[ø]
        'ù': 1,  # ù
        'ú': 1,  # ú
        'û': 1,  # û
        'ü': 1,  # ü
        'ý': 1,  # ý
        'þ': 0b00010101,  # þ[ð̠](represented as a non-plosive T)
        'ÿ': 1,  # ÿ
    }

    # Values used for the first character only
    _initial_phones = {
        'a': 0b10000100,  # a*
        'b': 0b00100100,  # b
        'c': 0b00000110,  # c
        'd': 0b00001100,  # d
        'e': 0b11011000,  # e*
        'f': 0b00100010,  # f
        'g': 0b00000100,  # g
        'h': 0b00000010,  # h
        'i': 0b11111000,  # i*
        'j': 0b00000011,  # j
        'k': 0b00000101,  # k
        'l': 0b01010000,  # l
        'm': 0b00000001,  # m
        'n': 0b00001001,  # n
        'o': 0b10010100,  # o*
        'p': 0b00100101,  # p
        'q': 0b01010100,  # q
        'r': 0b01010001,  # r
        's': 0b00001010,  # s
        't': 0b00001110,  # t
        'u': 0b11100000,  # u*
        'v': 0b00100011,  # v
        'w': 0b00000000,  # w
        'x': 0b01000010,  # x
        'y': 0b11100100,  # y*
        'z': 0b01001010,  # z

        'ß': 0b00001011,  # ß
        'à': 0b10000101,  # à
        'á': 0b10000101,  # á
        'â': 0b10000000,  # â
        'ã': 0b10000110,  # ã
        'ä': 0b10100110,  # ä [æ]
        'å': 0b11000010,  # å [oː]
        'æ': 0b10100111,  # æ [æ]
        'ç': 0b01010100,  # ç [t͡ʃ]
        'è': 0b11011001,  # è
        'é': 0b11011001,  # é
        'ê': 0b11011001,  # ê
        'ë': 0b11000110,  # ë [ə] or [œ]
        'ì': 0b11111001,  # ì
        'í': 0b11111001,  # í
        'î': 0b11111001,  # î
        'ï': 0b11111001,  # ï
        'ð': 0b00001011,  # ð [ð̠] (represented as a non-plosive T)
        'ñ': 0b00001011,  # ñ [nj] (represented as a combination of n and j)
        'ò': 0b10010101,  # ò
        'ó': 0b10010101,  # ó
        'ô': 0b10010101,  # ô
        'õ': 0b10010101,  # õ
        'ö': 0b11011100,  # ö [œ] or [ø]
        '÷': 0b11111111,  # ÷
        'ø': 0b11011101,  # ø [œ] or [ø]
        'ù': 0b11100001,  # ù
        'ú': 0b11100001,  # ú
        'û': 0b11100001,  # û
        'ü': 0b11100101,  # ü
        'ý': 0b11100101,  # ý
        'þ': 0b00001011,  # þ [ð̠] (represented as a non-plosive T)
        'ÿ': 0b11100101,  # ÿ
    }
    # Lowercase input & filter unknown characters
    word = ''.join(char for char in word.lower() if char in _initial_phones)

    # A word with no codable characters is hashed as the placeholder '÷'
    if not word:
        word = '÷'

    # Perform initial eudex coding of each character
    values = [_initial_phones[word[0]]]
    values += [_trailing_phones[char] for char in word[1:]]

    # Skip a value when it equals its immediate predecessor in everything but
    # the lowest bit (i.e. the two are equal after a right-shift by one)
    condensed_values = values[:1] + [
        cur for prev, cur in zip(values, values[1:]) if cur >> 1 != prev >> 1]

    # Add padding after first character & trim beyond max_length
    values = ([condensed_values[0]] +
              [0]*max(0, max_length - len(condensed_values)) +
              condensed_values[1:max_length])

    # Combine individual character values into eudex hash, one byte per value
    hash_value = 0
    for val in values:
        hash_value = (hash_value << 8) | val

    return hash_value


def haase_phonetik(word, primary_only=False):
    """Return the Haase Phonetik (numeric output) code for a word.

    Based on the algorithm described at :cite:`Prante:2015`.

    Based on the original :cite:`Haase:2000`.

    While the output code is numeric, it is nevertheless a str.

    Unless primary_only is set, spelling variants of the word are generated
    first and each variant is encoded; duplicate codes are dropped while the
    order of first occurrence is kept.

    :param str word: the word to transform
    :param bool primary_only: if True, only the primary code is returned
    :returns: the Haase Phonetik value(s) as numeric strings
    :rtype: tuple

    >>> haase_phonetik('Joachim')
    ('9496',)
    >>> haase_phonetik('Christoph')
    ('4798293', '8798293')
    >>> haase_phonetik('Jörg')
    ('974',)
    >>> haase_phonetik('Smith')
    ('8692',)
    >>> haase_phonetik('Schmidt')
    ('8692', '4692')
    """
    def _after(word, i, letters):
        """Return True if word[i] follows one of the supplied letters."""
        return i > 0 and word[i-1] in letters

    def _before(word, i, letters):
        """Return True if word[i] precedes one of the supplied letters."""
        return i+1 < len(word) and word[i+1] in letters

    # J is deliberately treated like a vowel by this coding
    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}

    # Compose first, so that umlauts supplied as base letter + combining
    # diaeresis are recognised too, and expand umlauts/eszett BEFORE the NFKD
    # decomposition. Once decomposed, the precomposed characters no longer
    # occur in the string and these replacements could never match.
    word = unicode_normalize('NFC', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')

    # Decompose whatever accented letters remain and keep only A-Z
    word = unicode_normalize('NFKD', word)
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    variants = []
    if primary_only:
        variants = [word]
    else:
        # Split the word into segments; each segment is a tuple holding the
        # original spelling followed by any alternative spelling.
        pos = 0
        if word[:2] == 'CH':
            variants.append(('CH', 'SCH'))
            pos += 2
        len_3_vars = {'OWN': 'AUN', 'WSK': 'RSK', 'SCH': 'CH', 'GLI': 'LI',
                      'AUX': 'O', 'EUX': 'O'}
        while pos < len(word):
            if word[pos:pos+4] == 'ILLE':
                variants.append(('ILLE', 'I'))
                pos += 4
            elif word[pos:pos+3] in len_3_vars:
                variants.append((word[pos:pos+3], len_3_vars[word[pos:pos+3]]))
                pos += 3
            elif word[pos:pos+2] == 'RB':
                variants.append(('RB', 'RW'))
                pos += 2
            elif len(word[pos:]) == 3 and word[pos:] == 'EAU':
                # only at the very end of the word
                variants.append(('EAU', 'O'))
                pos += 3
            elif len(word[pos:]) == 1 and word[pos:] in {'A', 'O'}:
                # only for the final letter of the word
                if word[pos:] == 'O':
                    variants.append(('O', 'OW'))
                else:
                    variants.append(('A', 'AR'))
                pos += 1
            else:
                variants.append((word[pos],))
                pos += 1

        # Every combination of per-segment spellings is one variant
        variants = [''.join(letters) for letters in product(*variants)]

    def _haase_code(word):
        """Return the numeric code of a single spelling variant."""
        sdx = ''
        for i, char in enumerate(word):
            if char in _vowels:
                sdx += '9'
            elif char == 'B':
                sdx += '1'
            elif char == 'P':
                if _before(word, i, {'H'}):
                    sdx += '3'
                else:
                    sdx += '1'
            elif char in {'D', 'T'}:
                if _before(word, i, {'C', 'S', 'Z'}):
                    sdx += '8'
                else:
                    sdx += '2'
            elif char in {'F', 'V', 'W'}:
                sdx += '3'
            elif char in {'G', 'K', 'Q'}:
                sdx += '4'
            elif char == 'C':
                if _after(word, i, {'S', 'Z'}):
                    sdx += '8'
                elif i == 0:
                    # word-initial C accepts two more following letters (L, R)
                    if _before(word, i, {'A', 'H', 'K', 'L', 'O', 'Q', 'R',
                                         'U', 'X'}):
                        sdx += '4'
                    else:
                        sdx += '8'
                elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
                    sdx += '4'
                else:
                    sdx += '8'
            elif char == 'X':
                if _after(word, i, {'C', 'K', 'Q'}):
                    sdx += '8'
                else:
                    sdx += '48'
            elif char == 'L':
                sdx += '5'
            elif char in {'M', 'N'}:
                sdx += '6'
            elif char == 'R':
                sdx += '7'
            elif char in {'S', 'Z'}:
                sdx += '8'
            # any other letter (e.g. H) contributes nothing to the code

        sdx = _delete_consecutive_repeats(sdx)

        return sdx

    encoded = tuple(_haase_code(variant) for variant in variants)
    if len(encoded) > 1:
        # Drop duplicate codes, keeping the order of first occurrence
        encoded_set = set()
        encoded_single = []
        for code in encoded:
            if code not in encoded_set:
                encoded_set.add(code)
                encoded_single.append(code)
        return tuple(encoded_single)

    return encoded


def reth_schek_phonetik(word):
    """Return Reth-Schek Phonetik code for a word.

    This algorithm is proposed in :cite:`Reth:1977`.

    Since I couldn't secure a copy of that document (maybe I'll look for it
    next time I'm in Germany), this implementation is based on what I could
    glean from the implementations published by German Record Linkage
    Center (www.record-linkage.de):

    - Privacy-preserving Record Linkage (PPRL) (in R) :cite:`Rukasz:2018`
    - Merge ToolBox (in Java) :cite:`Schnell:2004`

    Rules that are unclear:

    - Should 'C' become 'G' or 'Z'? (PPRL has both, 'Z' rule blocked)
    - Should 'CC' become 'G'? (PPRL has blocked 'CK' that may be typo)
    - Should 'TUI' -> 'ZUI' rule exist? (PPRL has rule, but I can't
      think of a German word with '-tui-' in it.)
    - Should we really change 'SCH' -> 'CH' and then 'CH' -> 'SCH'?

    :param str word: the word to transform
    :returns: the Reth-Schek Phonetik code
    :rtype: str

    >>> reth_schek_phonetik('Joachim')
    'JOAGHIM'
    >>> reth_schek_phonetik('Christoph')
    'GHRISDOF'
    >>> reth_schek_phonetik('Jörg')
    'JOERG'
    >>> reth_schek_phonetik('Smith')
    'SMID'
    >>> reth_schek_phonetik('Schmidt')
    'SCHMID'
    """
    # Replacement rules, keyed by the length of the sequence they match
    replacements = {3: {'AEH': 'E', 'IEH': 'I', 'OEH': 'OE', 'UEH': 'UE',
                        'SCH': 'CH', 'ZIO': 'TIO', 'TIU': 'TIO', 'ZIU': 'TIO',
                        'CHS': 'X', 'CKS': 'X', 'AEU': 'OI'},
                    2: {'LL': 'L', 'AA': 'A', 'AH': 'A', 'BB': 'B', 'PP': 'B',
                        'BP': 'B', 'PB': 'B', 'DD': 'D', 'DT': 'D', 'TT': 'D',
                        'TH': 'D', 'EE': 'E', 'EH': 'E', 'AE': 'E', 'FF': 'F',
                        'PH': 'F', 'KK': 'K', 'GG': 'G', 'GK': 'G', 'KG': 'G',
                        'CK': 'G', 'CC': 'C', 'IE': 'I', 'IH': 'I', 'MM': 'M',
                        'NN': 'N', 'OO': 'O', 'OH': 'O', 'SZ': 'S', 'UH': 'U',
                        'GS': 'X', 'KS': 'X', 'TZ': 'Z', 'AY': 'AI',
                        'EI': 'AI', 'EY': 'AI', 'EU': 'OI', 'RR': 'R',
                        'SS': 'S', 'KW': 'QU'},
                    1: {'P': 'B', 'T': 'D', 'V': 'F', 'W': 'F', 'C': 'G',
                        'K': 'G', 'Y': 'I'}}

    # Uppercase
    word = word.upper()

    # Replace umlauts/eszett
    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')
    word = word.replace('ß', 'SS')

    # Main loop, using above replacements table: at each position the longest
    # matching sequence (3, then 2, then 1 characters) is rewritten in place.
    pos = 0
    while pos < len(word):
        for num in range(3, 0, -1):
            chunk = word[pos:pos+num]
            if chunk in replacements[num]:
                word = word[:pos] + replacements[num][chunk] + word[pos+num:]
                break
        # Advance by exactly one position whether or not a rule fired, so the
        # tail of a multi-character replacement is scanned again.
        pos += 1

    # Change 'CH' back(?) to 'SCH'
    word = word.replace('CH', 'SCH')

    # Replace final sequences
    if word[-2:] == 'ER':
        word = word[:-2]+'R'
    elif word[-2:] == 'EL':
        word = word[:-2]+'L'
    elif word[-1:] == 'H':
        word = word[:-1]

    return word


def fonem(word):
    """Return the FONEM code of a word.

    FONEM is a phonetic algorithm designed for French (particularly surnames in
    Saguenay, Canada), defined in :cite:`Bouchard:1981`.

    Guillaume Plique's Javascript implementation :cite:`Plique:2018` at
    https://github.com/Yomguithereal/talisman/blob/master/src/phonetics/french/fonem.js
    was also consulted for this implementation.

    The word is upper-cased and reduced to the letters A-Z and '-', after
    which the rewrite rules of rule_table are applied one after another in the
    order given by rule_order.

    :param str word: the word to transform
    :returns: the FONEM code
    :rtype: str

    >>> fonem('Marchand')
    'MARCHEN'
    >>> fonem('Beaulieu')
    'BOLIEU'
    >>> fonem('Beaumont')
    'BOMON'
    >>> fonem('Legrand')
    'LEGREN'
    >>> fonem('Pelletier')
    'PELETIER'
    """
    # I don't see a sane way of doing this without regexps :(
    # Each entry maps a rule name to (pattern, replacement). The pattern is
    # either a plain string (literal replacement) or a compiled regex; the
    # replacement is a string or, for C-29, a callable taking the match.
    rule_table = {
        # Vowels & groups of vowels
        'V-1':     (re_compile('E?AU'), 'O'),
        'V-2,5':   (re_compile('(E?AU|O)L[TX]$'), 'O'),
        'V-3,4':   (re_compile('E?AU[TX]$'), 'O'),
        'V-6':     (re_compile('E?AUL?D$'), 'O'),
        'V-7':     (re_compile(r'(?<!G)AY$'), 'E'),
        'V-8':     (re_compile('EUX$'), 'EU'),
        'V-9':     (re_compile('EY(?=$|[BCDFGHJKLMNPQRSTVWXZ])'), 'E'),
        'V-10':    ('Y', 'I'),
        'V-11':    (re_compile('(?<=[AEIOUY])I(?=[AEIOUY])'), 'Y'),
        'V-12':    (re_compile('(?<=[AEIOUY])ILL'), 'Y'),
        'V-13':    (re_compile('OU(?=[AEOU]|I(?!LL))'), 'W'),
        'V-14':    (re_compile(r'([AEIOUY])(?=\1)'), ''),
        # Nasal vowels
        'V-15':    (re_compile('[AE]M(?=[BCDFGHJKLMPQRSTVWXZ])(?!$)'), 'EN'),
        'V-16':    (re_compile('OM(?=[BCDFGHJKLMPQRSTVWXZ])'), 'ON'),
        'V-17':    (re_compile('AN(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'EN'),
        'V-18':    (re_compile('(AI[MN]|EIN)(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'),
                    'IN'),
        'V-19':    (re_compile('B(O|U|OU)RNE?$'), 'BURN'),
        'V-20':    (re_compile('(^IM|(?<=[BCDFGHJKLMNPQRSTVWXZ])' +
                               'IM(?=[BCDFGHJKLMPQRSTVWXZ]))'), 'IN'),
        # Consonants and groups of consonants
        'C-1':     ('BV', 'V'),
        'C-2':     (re_compile('(?<=[AEIOUY])C(?=[EIY])'), 'SS'),
        'C-3':     (re_compile('(?<=[BDFGHJKLMNPQRSTVWZ])C(?=[EIY])'), 'S'),
        'C-4':     (re_compile('^C(?=[EIY])'), 'S'),
        'C-5':     (re_compile('^C(?=[OUA])'), 'K'),
        'C-6':     (re_compile('(?<=[AEIOUY])C$'), 'K'),
        'C-7':     (re_compile('C(?=[BDFGJKLMNPQRSTVWXZ])'), 'K'),
        'C-8':     (re_compile('CC(?=[AOU])'), 'K'),
        'C-9':     (re_compile('CC(?=[EIY])'), 'X'),
        'C-10':    (re_compile('G(?=[EIY])'), 'J'),
        'C-11':    (re_compile('GA(?=I?[MN])'), 'G#'),
        'C-12':    (re_compile('GE(O|AU)'), 'JO'),
        'C-13':    (re_compile('GNI(?=[AEIOUY])'), 'GN'),
        'C-14':    (re_compile('(?<![PCS])H'), ''),
        'C-15':    ('JEA', 'JA'),
        'C-16':    (re_compile('^MAC(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'MA#'),
        'C-17':    (re_compile('^MC'), 'MA#'),
        'C-18':    ('PH', 'F'),
        'C-19':    ('QU', 'K'),
        'C-20':    (re_compile('^SC(?=[EIY])'), 'S'),
        'C-21':    (re_compile('(?<=.)SC(?=[EIY])'), 'SS'),
        'C-22':    (re_compile('(?<=.)SC(?=[AOU])'), 'SK'),
        'C-23':    ('SH', 'CH'),
        'C-24':    (re_compile('TIA$'), 'SSIA'),
        'C-25':    (re_compile('(?<=[AIOUY])W'), ''),
        'C-26':    (re_compile('X[CSZ]'), 'X'),
        'C-27':    (re_compile('(?<=[AEIOUY])Z|(?<=[BCDFGHJKLMNPQRSTVWXZ])' +
                               'Z(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'S'),
        'C-28':    (re_compile(r'([BDFGHJKMNPQRTVWXZ])\1'), r'\1'),
        'C-28a':   (re_compile('CC(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'C'),
        'C-28b':   (re_compile('((?<=[BCDFGHJKLMNPQRSTVWXZ])|^)SS'), 'S'),
        'C-28bb':  (re_compile('SS(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'S'),
        'C-28c':   (re_compile('((?<=[^I])|^)LL'), 'L'),
        'C-28d':   (re_compile('ILE$'), 'ILLE'),
        # C-29 keeps whichever capture group matched and drops the rest of
        # the matched final sequence
        'C-29':    (re_compile('(ILS|[CS]H|[MN]P|R[CFKLNSX])$|([BCDFGHJKL' +
                               'MNPQRSTVWXZ])[BCDFGHJKLMNPQRSTVWXZ]$'),
                    lambda m: (m.group(1) or '') + (m.group(2) or '')),
        'C-30,32': (re_compile('^(SA?INT?|SEI[NM]|CINQ?|ST)(?!E)-?'), 'ST-'),
        'C-31,33': (re_compile('^(SAINTE|STE)-?'), 'STE-'),
        # Rules to undo rule bleeding prevention in C-11, C-16, C-17
        # ('#' is a temporary marker that shields text from later rules)
        'C-34':    ('G#', 'GA'),
        'C-35':    ('MA#', 'MAC')
    }
    # Order of application; the de-duplication rules (V-14, C-28*) run both
    # first and again near the end, and the '#' markers are resolved last.
    rule_order = [
        'V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c', 'C-28d',
        'C-12',
        'C-8', 'C-9', 'C-10',
        'C-16', 'C-17', 'C-2', 'C-3', 'C-7',
        'V-2,5', 'V-3,4', 'V-6',
        'V-1', 'C-14',
        'C-31,33', 'C-30,32',
        'C-11', 'V-15', 'V-17', 'V-18',
        'V-7', 'V-8', 'V-9', 'V-10', 'V-11', 'V-12', 'V-13', 'V-16',
        'V-19', 'V-20',
        'C-1', 'C-4', 'C-5', 'C-6', 'C-13', 'C-15',
        'C-18', 'C-19', 'C-20', 'C-21', 'C-22', 'C-23', 'C-24',
        'C-25', 'C-26', 'C-27',
        'C-29',
        'V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c', 'C-28d',
        'C-34', 'C-35'
    ]

    # normalize, upper-case, and filter non-French letters
    word = unicode_normalize('NFKD', text_type(word.upper()))
    # code points 198 and 338 are the ligatures Æ and Œ
    word = word.translate({198: 'AE', 338: 'OE'})
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z', '-'})

    for rule in rule_order:
        regex, repl = rule_table[rule]
        if isinstance(regex, text_type):
            # plain-string rule: literal replacement
            word = word.replace(regex, repl)
        else:
            word = regex.sub(repl, word)

    return word


def parmar_kumbharana(word):
    """Return the Parmar-Kumbharana encoding of a word.

    This is based on the phonetic algorithm proposed in :cite:`Parmar:2014`.

    An empty word yields an empty code.

    :param str word: the word to transform
    :returns: the Parmar-Kumbharana encoding
    :rtype: str

    >>> parmar_kumbharana('Gough')
    'GF'
    >>> parmar_kumbharana('pneuma')
    'NM'
    >>> parmar_kumbharana('knight')
    'NT'
    >>> parmar_kumbharana('trice')
    'TRS'
    >>> parmar_kumbharana('judge')
    'JJ'
    """
    # Substitution rules, keyed by the length of the sequence they match
    rule_table = {4: {'OUGH': 'F'},
                  3: {'DGE': 'J',
                      'OUL': 'U',
                      'GHT': 'T'},
                  2: {'CE': 'S', 'CI': 'S', 'CY': 'S',
                      'GE': 'J', 'GI': 'J', 'GY': 'J',
                      'WR': 'R',
                      'GN': 'N', 'KN': 'N', 'PN': 'N',
                      'CK': 'K',
                      'SH': 'S'}}
    # Translation table deleting A, E, I, O, U and Y (keyed by code point)
    vowel_trans = {65: '', 69: '', 73: '', 79: '', 85: '', 89: ''}

    word = word.upper()  # Rule 3
    word = _delete_consecutive_repeats(word)  # Rule 4

    # Rule 5: at each position apply the longest matching substitution
    i = 0
    while i < len(word):
        for match_len in range(4, 1, -1):
            if word[i:i+match_len] in rule_table[match_len]:
                repl = rule_table[match_len][word[i:i+match_len]]
                word = (word[:i] + repl + word[i+match_len:])
                # continue after the replacement so it is not rewritten again
                i += len(repl)
                break
        else:
            i += 1

    # Rule 6: drop vowels except in first position. Slicing (rather than
    # indexing) keeps an empty word from raising IndexError.
    word = word[:1]+word[1:].translate(vowel_trans)
    return word


def davidson(lname, fname='.', omit_fname=False):
    """Return Davidson's Consonant Code.

    This is based on the name compression system described in
    :cite:`Davidson:1962`.

    :cite:`Dolby:1970` identifies this as having been the name compression
    algorithm used by SABRE.

    The code consists of four characters derived from the last name (padded
    with spaces if shorter), followed by the first character of the first
    name unless omit_fname is set.

    :param str lname: Last name (or word) to be encoded
    :param str fname: First name (optional), of which the first character is
        included in the code.
    :param bool omit_fname: Set to True to completely omit the first character
        of the first name
    :returns: Davidson's Consonant Code
    :rtype: str

    >>> davidson('Gough')
    'G   .'
    >>> davidson('pneuma')
    'PNM .'
    >>> davidson('knight')
    'KNGT.'
    >>> davidson('trice')
    'TRC .'
    >>> davidson('judge')
    'JDG .'
    >>> davidson('Smith', 'James')
    'SMT J'
    >>> davidson('Wasserman', 'Tabitha')
    'WSRMT'
    """
    # Translation table deleting A, E, I, O, U, H, W and Y (by code point)
    trans = {65: '', 69: '', 73: '', 79: '', 85: '', 72: '', 87: '', 89: ''}

    lname = text_type(lname.upper())
    # The first letter is always kept; the deletions apply to the rest only
    code = _delete_consecutive_repeats(lname[:1] + lname[1:].translate(trans))
    # Truncate to 4 characters, or pad with spaces up to 4
    code = code[:4] + (4-len(code))*' '

    if not omit_fname:
        code += fname[:1].upper()

    return code


def sound_d(word, max_length=4):
    """Return the SoundD code.

    SoundD is defined in :cite:`Varol:2012`.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 4);
        if -1, the code is neither truncated nor zero-padded
    :returns: the SoundD code
    :rtype: str

    >>> sound_d('Gough')
    '2000'
    >>> sound_d('pneuma')
    '5500'
    >>> sound_d('knight')
    '5300'
    >>> sound_d('trice')
    '3620'
    >>> sound_d('judge')
    '2200'
    """
    # Maps the code point of each letter A-Z to its digit
    _ref_soundd_translation = dict(zip((ord(_) for _ in
                                        'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                       '01230120022455012623010202'))

    # Upper-case, decompose accented letters and keep only A-Z
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Word-initial special cases
    if word[:2] in {'KN', 'GN', 'PN', 'AC', 'WR'}:
        word = word[1:]
    elif word[:1] == 'X':
        word = 'S'+word[1:]
    elif word[:2] == 'WH':
        word = 'W'+word[2:]

    # Sequences that are coded as a unit; the digits inserted here pass
    # through the letter translation below unchanged
    word = word.replace('DGE', '20').replace('DGI', '20').replace('GH', '0')

    word = word.translate(_ref_soundd_translation)
    word = _delete_consecutive_repeats(word)
    word = word.replace('0', '')

    if max_length != -1:
        if len(word) < max_length:
            word += '0' * (max_length-len(word))
        else:
            word = word[:max_length]

    return word


def pshp_soundex_last(lname, max_length=4, german=False):
    """Calculate the PSHP Soundex/Viewex Coding of a last name.

    This coding is based on :cite:`Hershberg:1976`.

    Reference was also made to the German version of the same:
    :cite:`Hershberg:1979`.

    A separate function, pshp_soundex_first() is used for first names.

    :param str lname: the last name to encode
    :param int max_length: the length of the code returned (defaults to 4);
        if -1, the code is neither truncated nor zero-padded
    :param bool german: set to True if the name is German (different rules
        apply)
    :returns: the PSHP Soundex/Viewex Coding
    :rtype: str

    >>> pshp_soundex_last('Smith')
    'S530'
    >>> pshp_soundex_last('Waters')
    'W350'
    >>> pshp_soundex_last('James')
    'J500'
    >>> pshp_soundex_last('Schmidt')
    'S530'
    >>> pshp_soundex_last('Ashcroft')
    'A225'
    """
    # Upper-case, decompose accented letters and keep only A-Z
    lname = unicode_normalize('NFKD', text_type(lname.upper()))
    lname = lname.replace('ß', 'SS')
    lname = ''.join(c for c in lname if c in
                    {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
                     'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
                     'W', 'X', 'Y', 'Z'})

    # A. Prefix treatment
    if lname[:3] in {'VON', 'VAN'}:
        lname = lname[3:].strip()

    # The rule implemented below says "MC, MAC become 1". I believe it meant to
    # say they become M except in German data (where superscripted 1 indicates
    # "except in German data"). It doesn't make sense for them to become 1
    # (BPFV -> 1) or to apply outside German. Unfortunately, both articles have
    # this error(?).
    if not german:
        if lname[:3] == 'MAC':
            lname = 'M'+lname[3:]
        elif lname[:2] == 'MC':
            lname = 'M'+lname[2:]

    # The non-German-only rule to strip ' is unnecessary due to filtering

    # Only the first letter is rewritten by the following prefix rules
    if lname[:1] in {'E', 'I', 'O', 'U'}:
        lname = 'A' + lname[1:]
    elif lname[:2] in {'GE', 'GI', 'GY'}:
        lname = 'J' + lname[1:]
    elif lname[:2] in {'CE', 'CI', 'CY'}:
        lname = 'S' + lname[1:]
    elif lname[:3] == 'CHR':
        lname = 'K' + lname[1:]
    elif lname[:1] == 'C' and lname[:2] != 'CH':
        lname = 'K' + lname[1:]

    if lname[:2] == 'KN':
        lname = 'N' + lname[1:]
    elif lname[:2] == 'PH':
        lname = 'F' + lname[1:]
    elif lname[:3] in {'WIE', 'WEI'}:
        lname = 'V' + lname[1:]

    if german and lname[:1] in {'W', 'M', 'Y', 'Z'}:
        lname = {'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}[lname[0]]+lname[1:]

    # The (possibly rewritten) first letter starts the code as a letter
    code = lname[:1]

    # B. Postfix treatment
    if german:  # moved from end of postfix treatment due to blocking
        if lname[-3:] == 'TES':
            lname = lname[:-3]
        elif lname[-2:] == 'TS':
            lname = lname[:-2]
        if lname[-3:] == 'TZE':
            lname = lname[:-3]
        elif lname[-2:] == 'ZE':
            lname = lname[:-2]
        if lname[-1:] == 'Z':
            lname = lname[:-1]
        elif lname[-2:] == 'TE':
            lname = lname[:-2]

    if lname[-1:] == 'R':
        lname = lname[:-1] + 'N'
    elif lname[-2:] in {'SE', 'CE'}:
        lname = lname[:-2]
    if lname[-2:] == 'SS':
        lname = lname[:-2]
    elif lname[-1:] == 'S':
        lname = lname[:-1]

    if not german:
        l5_repl = {'STOWN': 'SAWON', 'MPSON': 'MASON'}
        l4_repl = {'NSEN': 'ASEN', 'MSON': 'ASON', 'STEN': 'SAEN',
                   'STON': 'SAON'}
        if lname[-5:] in l5_repl:
            lname = lname[:-5] + l5_repl[lname[-5:]]
        elif lname[-4:] in l4_repl:
            lname = lname[:-4] + l4_repl[lname[-4:]]

    if lname[-2:] in {'NG', 'ND'}:
        lname = lname[:-1]
    if not german and lname[-3:] in {'GAN', 'GEN'}:
        lname = lname[:-3]+'A'+lname[-2:]

    # C. Infix Treatment
    lname = lname.replace('CK', 'C')
    lname = lname.replace('SCH', 'S')
    lname = lname.replace('DT', 'T')
    lname = lname.replace('ND', 'N')
    lname = lname.replace('NG', 'N')
    lname = lname.replace('LM', 'M')
    lname = lname.replace('MN', 'M')
    lname = lname.replace('WIE', 'VIE')
    lname = lname.replace('WEI', 'VEI')

    # D. Soundexing
    # code for X & Y are unspecified, but presumably are 2 & 0
    _pshp_translation = dict(zip((ord(_) for _ in
                                  'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                 '01230120022455012523010202'))

    lname = lname.translate(_pshp_translation)
    lname = _delete_consecutive_repeats(lname)

    # The digit of the first letter is dropped; the letter itself is in code
    code += lname[1:]
    code = code.replace('0', '')  # rule 1

    if max_length != -1:
        if len(code) < max_length:
            code += '0' * (max_length-len(code))
        else:
            code = code[:max_length]

    return code


def pshp_soundex_first(fname, max_length=4, german=False):
    """Calculate the PSHP Soundex/Viewex Coding of a first name.

    This coding is based on :cite:`Hershberg:1976`.

    Reference was also made to the German version of the same:
    :cite:`Hershberg:1979`.

    A separate function, pshp_soundex_last() is used for last names.

    :param str fname: the first name to encode
    :param int max_length: the length of the code returned (defaults to 4);
        if -1, the code is neither truncated nor zero-padded
    :param bool german: set to True if the name is German (different rules
        apply)
    :returns: the PSHP Soundex/Viewex Coding
    :rtype: str

    >>> pshp_soundex_first('Smith')
    'S530'
    >>> pshp_soundex_first('Waters')
    'W352'
    >>> pshp_soundex_first('James')
    'J700'
    >>> pshp_soundex_first('Schmidt')
    'S500'
    >>> pshp_soundex_first('Ashcroft')
    'A220'
    >>> pshp_soundex_first('John')
    'J500'
    >>> pshp_soundex_first('Colin')
    'K400'
    >>> pshp_soundex_first('Niall')
    'N400'
    >>> pshp_soundex_first('Sally')
    'S400'
    >>> pshp_soundex_first('Jane')
    'J500'
    """
    # Upper-case, decompose accented letters and keep only A-Z
    fname = unicode_normalize('NFKD', text_type(fname.upper()))
    fname = fname.replace('ß', 'SS')
    fname = ''.join(c for c in fname if c in
                    {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
                     'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
                     'W', 'X', 'Y', 'Z'})

    # special rules: these two names receive fixed codes
    if fname == 'JAMES':
        code = 'J7'
    elif fname == 'PAT':
        code = 'P7'

    else:
        # A. Prefix treatment (only the first letter is rewritten)
        if fname[:2] in {'GE', 'GI', 'GY'}:
            fname = 'J' + fname[1:]
        elif fname[:2] in {'CE', 'CI', 'CY'}:
            fname = 'S' + fname[1:]
        elif fname[:3] == 'CHR':
            fname = 'K' + fname[1:]
        elif fname[:1] == 'C' and fname[:2] != 'CH':
            fname = 'K' + fname[1:]

        if fname[:2] == 'KN':
            fname = 'N' + fname[1:]
        elif fname[:2] == 'PH':
            fname = 'F' + fname[1:]
        elif fname[:3] in {'WIE', 'WEI'}:
            fname = 'V' + fname[1:]

        if german and fname[:1] in {'W', 'M', 'Y', 'Z'}:
            fname = ({'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}[fname[0]] +
                     fname[1:])

        # The (possibly rewritten) first letter starts the code as a letter
        code = fname[:1]

        # B. Soundex coding
        # code for Y unspecified, but presumably is 0
        _pshp_translation = dict(zip((ord(_) for _ in
                                      'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                     '01230120022455012523010202'))

        fname = fname.translate(_pshp_translation)
        fname = _delete_consecutive_repeats(fname)

        code += fname[1:]
        # Position of the first '0' in code, and of the next '0' after it
        # (the latter counted from the character following the first '0').
        # When both exist and the comparison holds, code is cut off one
        # character after the first '0'.
        # NOTE(review): syl2_ptr is relative to the slice while syl_ptr is
        # absolute, so the subtraction mixes two reference points -- confirm
        # against the cited papers before changing.
        syl_ptr = code.find('0')
        syl2_ptr = code[syl_ptr + 1:].find('0')
        if syl_ptr != -1 and syl2_ptr != -1 and syl2_ptr - syl_ptr > -1:
            code = code[:syl_ptr + 2]

        code = code.replace('0', '')  # rule 1

    if max_length != -1:
        if len(code) < max_length:
            code += '0' * (max_length-len(code))
        else:
            code = code[:max_length]

    return code


def henry_early(word, max_length=3):
    """Compute the early Henry code of a name.

    This is the early form of Henry coding given in :cite:`Legare:1972`; it
    differs from the later form defined in :cite:`Henry:1976`.

    The word is uppercased, NFKD-decomposed and reduced to the letters A-Z.
    A leading vowel is then adjusted (rule Ib), each letter is recoded
    according to its neighbours (rule II), the end of the code is trimmed
    (rule IIe) and every vowel after the first character is removed.

    :param str word: the word to transform
    :param int max_length: the maximum length of the returned code (defaults
        to 3); -1 leaves the code untruncated
    :returns: the early Henry code, or an empty string when the word has no
        letters A-Z
    :rtype: str

    >>> henry_early('Marchand')
    'MRC'
    >>> henry_early('Beaulieu')
    'BL'
    >>> henry_early('Beaumont')
    'BM'
    >>> henry_early('Legrand')
    'LGR'
    >>> henry_early('Pelletier')
    'PLT'
    """
    consonants = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P',
                  'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Z'}
    vowels = {'A', 'E', 'I', 'O', 'U', 'Y'}
    diphthongs = {'AI': 'E', 'AY': 'E', 'EI': 'E', 'AU': 'O', 'OI': 'O',
                  'OU': 'O', 'EU': 'U'}
    # Letters B, D, F, J, K, L, M, N, R, T and V are never recoded.
    simple_map = {'W': 'V', 'X': 'S', 'Z': 'S'}
    nasals = {'M', 'N'}
    non_liquids = consonants - {'L', 'R'}

    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = ''.join(letter for letter in word if 'A' <= letter <= 'Z')

    if not word:
        return ''

    # Rule Ia appears to be fully covered by rule II.

    # Rule Ib: adjust a word-initial vowel.
    lead, second, third = word[0], word[1:2], word[2:3]
    if lead in vowels:
        if ((second in consonants - nasals and third in consonants) or
                (second in consonants and third not in consonants)):
            # Ib1
            if lead == 'Y':
                word = 'I' + word[1:]
        elif second in nasals and third in consonants:
            # Ib2
            if lead == 'E':
                word = 'A' + word[1:]
            elif lead in {'I', 'U', 'Y'}:
                word = 'E' + word[1:]
        elif word[:2] in diphthongs:
            # Ib3
            word = diphthongs[word[:2]] + word[2:]
        elif second in vowels and lead == 'Y':
            # Ib4
            word = 'I' + word[1:]

    # Rule II: walk the word, emitting zero or one symbol per step and
    # advancing by more than one letter when a multi-letter unit is consumed.
    pieces = []
    idx = 0
    while idx < len(word):
        char = word[idx]
        nxt = word[idx+1:idx+2]
        before = word[idx-1:idx]
        emit = ''
        step = 1

        if char in vowels:
            emit = char
        elif char == nxt:
            # IIc: a doubled letter is written once
            emit = char
            step = 2
        elif word[idx:idx+2] in {'CQ', 'DT', 'SC'}:
            # the first letter of these pairs is silent
            pass
        elif char in simple_map:
            # IIb
            emit = simple_map[char]
        elif char == 'C':
            if nxt in {'A', 'O', 'U', 'L', 'R'}:
                emit = 'K'
            elif nxt in {'E', 'I', 'Y'}:
                emit = 'S'
            elif nxt == 'H':
                # CH + vowel stays C; CHR, CHL, etc. become K
                emit = 'C' if word[idx+2:idx+3] in vowels else 'K'
            else:
                emit = 'C'
        elif char == 'G':
            # G before anything else (or word-final) emits nothing
            if nxt in {'A', 'O', 'U', 'L', 'R'}:
                emit = 'G'
            elif nxt in {'E', 'I', 'Y'}:
                emit = 'J'
            elif nxt == 'N':
                emit = 'N'
        elif char == 'P':
            emit = 'F' if nxt == 'H' else 'P'
        elif char == 'Q':
            # QUE, QUI, QUY become G; QUA, QUO, etc. become K
            emit = 'G' if word[idx+1:idx+3] in {'UE', 'UI', 'UY'} else 'K'
        elif char == 'S':
            if word[idx:idx+6] == 'SAINTE':
                emit = 'X'
                step = 6
            elif word[idx:idx+5] == 'SAINT':
                emit = 'X'
                step = 5
            elif word[idx:idx+3] == 'STE':
                emit = 'X'
                step = 3
            elif word[idx:idx+2] == 'ST':
                emit = 'X'
                step = 2
            elif nxt not in consonants:
                emit = 'S'
        elif not ((char == 'H' and before in consonants) or
                  (char in non_liquids and nxt in non_liquids) or
                  (char == 'L' and nxt in nasals) or
                  (char in nasals and before in vowels and
                   nxt in consonants)):
            # IIa: everything not silenced by IId is copied unchanged
            emit = char

        pieces.append(emit)
        idx += step

    code = ''.join(pieces)

    # Rule IIe: trim the end of the code. The MPS/MB/MP/ND/NS/NT endings of
    # the rule set can never reach this point because rule II already
    # removed them.
    penult, last = code[-2:-1], code[-1:]
    if code[-4:] in {'AULT', 'EULT', 'OULT'}:
        # IIe1
        code = code[:-2]
    elif ((penult == 'R' and last in consonants) or
          # IIe2
          (penult in vowels and last in {'D', 'M', 'N', 'S', 'T'}) or
          code[-2:] == 'ER'):
        code = code[:-1]

    # Drop non-initial vowels
    code = code[:1] + ''.join(ch for ch in code[1:] if ch not in vowels)

    return code if max_length == -1 else code[:max_length]


def norphone(word):
    """Encode a word with Norphone.

    The reference implementation by Lars Marius Garshol is available in
    :cite:`Garshol:2015`.

    Norphone was designed for Norwegian; this implementation additionally
    handles the Swedish vowels Ä and Ö and incorporates the rules marked
    "not implemented" in the reference file's rule set.

    The word is uppercased, a special spelling at the start of the word is
    recoded, a final DT or vowel+D is rewritten, and the remaining letters are
    replaced longest-spelling-first. Vowels are kept only in first position.

    :param str word: the word to transform
    :returns: the Norphone code
    :rtype: str

    >>> norphone('Hansen')
    'HNSN'
    >>> norphone('Larsen')
    'LRSN'
    >>> norphone('Aagaard')
    'ÅKRT'
    >>> norphone('Braaten')
    'BRTN'
    >>> norphone('Sandvik')
    'SNVK'
    """
    vowels = {'A', 'E', 'I', 'O', 'U', 'Y', 'Å', 'Æ', 'Ø', 'Ä', 'Ö'}

    # Word-initial spellings, tried in this order; the first hit wins.
    initial_rules = (('AA', 'Å'), ('GI', 'J'), ('SKY', 'X'), ('EI', 'Æ'),
                     ('KY', 'X'), ('C', 'K'), ('Ä', 'Æ'), ('Ö', 'Ø'))

    # General replacements, grouped by spelling length, longest first.
    tables = ((4, {'SKEI': 'X'}),
              (3, {'SKJ': 'X', 'KEI': 'X'}),
              (2, {'CH': 'K', 'CK': 'K', 'GJ': 'J', 'GH': 'K', 'HG': 'K',
                   'HJ': 'J', 'HL': 'L', 'HR': 'R', 'KJ': 'X', 'KI': 'X',
                   'LD': 'L', 'ND': 'N', 'PH': 'F', 'TH': 'T', 'SJ': 'X'}),
              (1, {'W': 'V', 'X': 'KS', 'Z': 'S', 'D': 'T', 'G': 'K'}))

    word = word.upper()

    pieces = []
    idx = 0
    for prefix, symbol in initial_rules:
        if word.startswith(prefix):
            pieces.append(symbol)
            idx = len(prefix)
            break

    if word.endswith('DT'):
        word = word[:-2] + 'T'
    # The rule set states this for every position, but the reference
    # implementation applies it in final position only.
    elif word[-2:-1] in vowels and word.endswith('D'):
        word = word[:-2]

    while idx < len(word):
        for length, table in tables:
            chunk = word[idx:idx+length]
            if chunk in table:
                pieces.append(table[chunk])
                idx += length
                break
        else:
            # no replacement applies: keep consonants, and a vowel only when
            # it is the very first letter
            if idx == 0 or word[idx] not in vowels:
                pieces.append(word[idx])
            idx += 1

    return _delete_consecutive_repeats(''.join(pieces))


def dolby(word, max_length=-1, keep_vowels=False, vowel_char='*'):
    r"""Encode a name with the Dolby Code.

    This follows "A Spelling Equivalent Abbreviation Algorithm For Personal
    Names" from :cite:`Dolby:1970` and :cite:`Cunningham:1969`.

    With ``max_length`` of 0 or less, the variable-length code is produced
    and only the first vowel is marked (unless ``keep_vowels`` is set). A
    positive ``max_length`` switches on the fixed-length rules (FL6, FL9,
    FL13-FL16): the code is cut or space-padded to exactly that length.

    :param word: the word to encode
    :param max_length: maximum length of the returned Dolby code -- this also
        activates the fixed-length code mode if it is greater than 0
    :param keep_vowels: if True, retains all vowel markers
    :param vowel_char: the vowel marker character (defaults to \*)
    :returns: the Dolby Code
    :rtype: str

    >>> dolby('Hansen')
    'H*NSN'
    >>> dolby('Larsen')
    'L*RSN'
    >>> dolby('Aagaard')
    '*GR'
    >>> dolby('Braaten')
    'BR*DN'
    >>> dolby('Sandvik')
    'S*NVK'
    >>> dolby('Hansen', max_length=6)
    'H*NS*N'
    >>> dolby('Larsen', max_length=6)
    'L*RS*N'
    >>> dolby('Aagaard', max_length=6)
    '*G*R  '
    >>> dolby('Braaten', max_length=6)
    'BR*D*N'
    >>> dolby('Sandvik', max_length=6)
    'S*NF*K'

    >>> dolby('Smith')
    'SM*D'
    >>> dolby('Waters')
    'W*DRS'
    >>> dolby('James')
    'J*MS'
    >>> dolby('Schmidt')
    'SM*D'
    >>> dolby('Ashcroft')
    '*SKRFD'
    >>> dolby('Smith', max_length=6)
    'SM*D  '
    >>> dolby('Waters', max_length=6)
    'W*D*RS'
    >>> dolby('James', max_length=6)
    'J*M*S '
    >>> dolby('Schmidt', max_length=6)
    'SM*D  '
    >>> dolby('Ashcroft', max_length=6)
    '*SKRFD'
    """
    vowels = {'A', 'E', 'I', 'O', 'U', 'Y'}
    fixed_length = max_length > 0

    # uppercase, decompose, expand eszett and keep nothing but A-Z
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(letter for letter in word if 'A' <= letter <= 'Z')

    # Rule 1 (FL2)
    if word.startswith(('MCG', 'MAG', 'MAC')):
        word = 'MK' + word[3:]
    elif word.startswith('MC'):
        word = 'MK' + word[2:]

    # Rule 2 (FL3): scanning right to left, drop the second letter of each
    # listed pair, then look at the same position again.
    final_pairs = {'DT', 'LD', 'ND', 'NT', 'RC', 'RD', 'RT', 'SC', 'SK', 'ST'}
    idx = len(word) - 2
    while idx >= 0:
        if word[idx:idx+2] in final_pairs:
            word = word[:idx+1] + word[idx+2:]
        else:
            idx -= 1

    # Rule 3 (FL4)
    # The rule says "after the first letter", but the test cases make it
    # clear that the first letter is affected as well.
    # TCH -> CH is not in the rule set, but seems to have been intended.
    for old, new in (('X', 'KS'), ('CE', 'SE'), ('CI', 'SI'), ('CY', 'SI'),
                     ('TCH', 'CH')):
        word = word.replace(old, new)

    # a non-initial CH that does not follow a vowel becomes SH
    idx = word.find('CH', 1)
    while idx != -1:
        if word[idx-1:idx] not in vowels:
            word = word[:idx] + 'S' + word[idx+1:]
        idx = word.find('CH', idx+1)

    for old, new in (('C', 'K'), ('Z', 'S'), ('WR', 'R'), ('DG', 'G'),
                     ('QU', 'K'), ('T', 'D'), ('PH', 'F')):
        word = word.replace(old, new)

    # Rule 4 (FL5)
    # The rule says "after the first letter", but the test cases make it
    # clear that the first letter is affected as well.
    may_precede_k = vowels | {'L', 'N', 'R'}
    idx = word.find('K', 0)
    while idx != -1:
        if idx > 1 and word[idx-1:idx] not in may_precede_k:
            # remove the letter in front of this K
            word = word[:idx-1] + word[idx:]
            idx -= 1
        idx = word.find('K', idx+1)

    # Rule FL6
    if fixed_length and word.endswith('E'):
        word = word[:-1]

    # Rule 5 (FL7)
    word = _delete_consecutive_repeats(word)

    # Rule 6 (FL8)
    if word.startswith('PF'):
        word = word[1:]
    if word.endswith('PF'):
        word = word[:-1]
    elif word.endswith('GH'):
        word = word[:-2] + ('F' if word[-3:-2] in vowels else 'G')
    word = word.replace('GH', '')

    # Rule FL9
    if fixed_length:
        word = word.replace('V', 'F')

    # Rules 7-9 (FL10-FL12): mark vowels (the first one only, or the first two
    # in fixed-length mode, unless keep_vowels) and drop non-initial W and H.
    marks_left = 2 if fixed_length else 1
    symbols = []
    for idx, letter in enumerate(word):
        if letter in vowels:
            if keep_vowels or marks_left:
                symbols.append(vowel_char)
                marks_left -= 1
        elif idx == 0 or letter not in {'W', 'H'}:
            symbols.append(letter)
    code = ''.join(symbols)

    if fixed_length:
        # Rule FL13
        if len(code) > max_length and code.endswith('S'):
            code = code[:-1]
        if keep_vowels:
            code = code[:max_length]
        else:
            # Rule FL14
            code = code[:max_length + 2]
            # Rule FL15: each pass keeps only as many leading vowel markers
            # as the code is too long by, then cuts it one symbol shorter.
            while len(code) > max_length:
                surplus = len(code) - max_length
                allowance = surplus
                kept = []
                for symbol in code:
                    if symbol != vowel_char:
                        kept.append(symbol)
                    elif allowance:
                        kept.append(symbol)
                        allowance -= 1
                code = ''.join(kept)[:max_length + surplus - 1]

        # Rule FL16
        code += ' ' * (max_length - len(code))

    return code


def phonetic_spanish(word, max_length=-1):
    """Return the PhoneticSpanish coding of word.

    This follows the coding described in :cite:`Amon:2012` and
    :cite:`delPilarAngeles:2015`.

    The word is uppercased, NFKD-decomposed and reduced to the consonants
    that carry a code (vowels, W and anything outside A-Z are discarded).
    Doubled L and doubled R are each collapsed to a single letter, and every
    remaining letter is replaced by its digit.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to
        unlimited); when greater than 0 the code is zero-padded or truncated
        to exactly this length
    :returns: the PhoneticSpanish code
    :rtype: str

    >>> phonetic_spanish('Perez')
    '094'
    >>> phonetic_spanish('Martinez')
    '69364'
    >>> phonetic_spanish('Gutierrez')
    '8394'
    >>> phonetic_spanish('Santiago')
    '4638'
    >>> phonetic_spanish('Nicolás')
    '6454'
    """
    _es_soundex_translation = dict(zip((ord(_) for _ in
                                        'BCDFGHJKLMNPQRSTVXYZ'),
                                       '14328287566079431454'))

    # uppercase, normalize, and decompose, filter to A-Z minus vowels & W
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = ''.join(c for c in word if c in
                   {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N',
                    'P', 'Q', 'R', 'S', 'T', 'V', 'X', 'Y', 'Z'})

    # merge repeated Ls & Rs so each digraph yields a single digit
    # (the R case previously replaced 'R' with 'R', which did nothing)
    word = word.replace('LL', 'L')
    word = word.replace('RR', 'R')

    # apply the Soundex algorithm
    sdx = word.translate(_es_soundex_translation)

    if max_length > 0:
        sdx = (sdx+('0'*max_length))[:max_length]

    return sdx


def spanish_metaphone(word, max_length=6, modified=False):
    """Encode a word with the Spanish Metaphone algorithm.

    This is a quick rewrite of the Spanish Metaphone Algorithm, as presented at
    https://github.com/amsqr/Spanish-Metaphone and discussed in
    :cite:`Mosquera:2012`.

    The modified version is based on :cite:`delPilarAngeles:2016`.

    The word is uppercased and NFC-normalized, accented letters and a few
    digraphs are rewritten, and the result is scanned left to right. A vowel
    is copied only in first position (or directly after an H); consonants are
    copied or recoded according to the letter that follows them.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 6);
        scanning stops as soon as the key is at least this long
    :param bool modified: Set to True to use del Pilar Angeles &
        Bailón-Miguel's modified version of the algorithm
    :returns: the Spanish Metaphone code
    :rtype: str

    >>> spanish_metaphone('Perez')
    'PRZ'
    >>> spanish_metaphone('Martinez')
    'MRTNZ'
    >>> spanish_metaphone('Gutierrez')
    'GTRRZ'
    >>> spanish_metaphone('Santiago')
    'SNTG'
    >>> spanish_metaphone('Nicolás')
    'NKLS'
    """
    vowels = {'A', 'E', 'I', 'O', 'U'}
    front_vowels = {'E', 'I'}
    # consonants that are copied as they are
    unmutated = {'D', 'F', 'J', 'K', 'M', 'N', 'P', 'T', 'V', 'L', 'Y'}

    word = unicode_normalize('NFC', text_type(word.upper()))

    # extra replacements of the modified version
    if modified:
        for old, new in (('MB', 'NB'), ('MP', 'NP'), ('BS', 'S')):
            word = word.replace(old, new)
        if word.startswith('PS'):
            word = word[1:]

    # simple replacements, applied strictly in this order
    for old, new in (('Á', 'A'), ('CH', 'X'), ('Ç', 'S'), ('É', 'E'),
                     ('Í', 'I'), ('Ó', 'O'), ('Ú', 'U'), ('Ñ', 'NY'),
                     ('GÜ', 'W'), ('Ü', 'U'), ('B', 'V'), ('LL', 'Y')):
        word = word.replace(old, new)

    key = ''
    idx = 0
    size = len(word)

    while idx < size and len(key) < max_length:
        char = word[idx]
        # the letter after the current one ('' at the end of the word)
        following = word[idx+1:idx+2]
        step = 1

        if idx == 0 and char in vowels:
            # a vowel is only kept at the very start of the word
            key += char
        elif char in unmutated:
            key += char
            # skip doubled consonants
            if following == char:
                step = 2
        elif char == 'C':
            if following == 'C':
                # special case 'acción', 'reacción', etc.
                key += 'X'
                step = 2
            elif following in front_vowels:
                # special case 'cesar', 'cien', 'cid', 'conciencia'
                key += 'Z'
                step = 2
            else:
                key += 'K'
        elif char == 'G':
            if following in front_vowels:
                # special case 'gente', 'ecologia', etc.
                key += 'J'
                step = 2
            else:
                key += 'G'
        elif char == 'H':
            if following in vowels:
                # H is silent in Spanish, so the vowel after it is used
                key += following
                step = 2
            else:
                key += 'H'
        elif char == 'Q':
            key += 'K'
            if following == 'U':
                step = 2
        elif char == 'W':
            key += 'U'
        elif char in {'R', 'Z'}:
            key += char
        elif char == 'S':
            # initial S before a non-vowel gets a prosthetic E
            key += 'ES' if idx == 0 and following not in vowels else 'S'
        elif char == 'X':
            if size > 1 and idx == 0 and following not in vowels:
                key += 'EX'
            else:
                key += 'X'
        # anything else (non-initial vowels, unknown symbols) is skipped

        idx += step

    # Final change from S to Z in modified version
    if modified:
        key = key.replace('S', 'Z')

    return key


def metasoundex(word, lang='en'):
    """Encode a word with MetaSoundex.

    This is based on :cite:`Koneru:2017`. Only English ('en') and Spanish
    ('es') languages are supported, as in the original.

    For Spanish, the Spanish Metaphone of the word is passed to
    PhoneticSpanish. For any other ``lang`` value the English path is taken:
    the Metaphone of the word is passed to Soundex and the first character of
    that result is then replaced through a letter-to-digit table.

    :param str word: the word to transform
    :param str lang: either 'en' for English or 'es' for Spanish
    :returns: the MetaSoundex code
    :rtype: str

    >>> metasoundex('Smith')
    '4500'
    >>> metasoundex('Waters')
    '7362'
    >>> metasoundex('James')
    '1520'
    >>> metasoundex('Schmidt')
    '4530'
    >>> metasoundex('Ashcroft')
    '0261'
    >>> metasoundex('Perez', lang='es')
    '094'
    >>> metasoundex('Martinez', lang='es')
    '69364'
    >>> metasoundex('Gutierrez', lang='es')
    '83994'
    >>> metasoundex('Santiago', lang='es')
    '4638'
    >>> metasoundex('Nicolás', lang='es')
    '6754'
    """
    if lang == 'es':
        return phonetic_spanish(spanish_metaphone(word))

    # digit substituted for the leading letter of the Soundex code
    leading_digit = {ord(letter): digit for letter, digit in
                     zip('ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                         '07430755015866075943077514')}

    sdx = soundex(metaphone(word))
    return sdx[0].translate(leading_digit) + sdx[1:]


def soundex_br(word, max_length=4, zero_pad=True):
    """Encode a word with SoundexBR.

    This is based on :cite:`Marcelino:2015`.

    The word is uppercased, NFKD-decomposed and reduced to the letters A-Z.
    Its first letter is kept, after a few spelling-specific substitutions;
    the remaining letters are turned into digits, the result is passed
    through ``_delete_consecutive_repeats`` and all zeros are removed.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 4)
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        max_length string
    :returns: the SoundexBR code
    :rtype: str

    >>> soundex_br('Oliveira')
    'O416'
    >>> soundex_br('Almeida')
    'A453'
    >>> soundex_br('Barbosa')
    'B612'
    >>> soundex_br('Araújo')
    'A620'
    >>> soundex_br('Gonçalves')
    'G524'
    >>> soundex_br('Goncalves')
    'G524'
    """
    digit_for = {ord(letter): digit for letter, digit in
                 zip('ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                     '01230120022455012623010202')}

    # first letter -> (second letters that trigger the rule, replacement)
    respell = {'W': ({'A'}, 'V'),
               'K': ({'A', 'O', 'U'}, 'C'),
               'C': ({'I', 'E'}, 'S'),
               'G': ({'E', 'I'}, 'J')}

    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = ''.join(letter for letter in word if 'A' <= letter <= 'Z')

    head, follower = word[:1], word[1:2]
    if head in respell and follower in respell[head][0]:
        first = respell[head][1]
    elif head == 'Y':
        first = 'I'
    elif head == 'H':
        # a leading H is discarded; the word then starts at its second letter
        first = follower
        word = word[1:]
    else:
        first = head

    sdx = _delete_consecutive_repeats(first + word[1:].translate(digit_for))
    sdx = sdx.replace('0', '')

    if zero_pad:
        sdx += '0' * max_length

    return sdx[:max_length]


[docs]def nrl(word):
    """Return the Naval Research Laboratory phonetic encoding of a word.

    This is defined by :cite:`Elovitz:1976`.

    :param str word: the word to transform
    :returns: the NRL phonetic encoding
    :rtype: str

    >>> nrl('the')
    'DHAX'
    >>> nrl('round')
    'rAWnd'
    >>> nrl('quick')
    'kwIHk'
    >>> nrl('eaten')
    'IYtEHn'
    >>> nrl('Smith')
    'smIHTH'
    >>> nrl('Larsen')
    'lAArsEHn'
    """
    def _to_regex(pattern, left_match=True):
        new_pattern = ''
        replacements = {'#': '[AEIOU]+',
                        ':': '[BCDFGHJKLMNPQRSTVWXYZ]*',
                        '^': '[BCDFGHJKLMNPQRSTVWXYZ]',
                        '.': '[BDVGJLMNTWZ]',
                        '%': '(ER|E|ES|ED|ING|ELY)',
                        '+': '[EIY]',
                        ' ': '^'}
        for char in pattern:
            new_pattern += (replacements[char] if char in replacements
                            else char)

        if left_match:
            new_pattern += '$'
            if '^' not in pattern:
                new_pattern = '^.*' + new_pattern
        else:
            new_pattern = '^' + new_pattern.replace('^', '$')
            if '$' not in new_pattern:
                new_pattern += '.*$'

        return new_pattern

    rules = {' ': (('', ' ', '', ' '),
                   ('', '-', '', ''),
                   ('.', '\'S', '', 'z'),
                   ('#:.E', '\'S', '', 'z'),
                   ('#', '\'S', '', 'z'),
                   ('', '\'', '', ''),
                   ('', ',', '', ' '),
                   ('', '.', '', ' '),
                   ('', '?', '', ' '),
                   ('', '!', '', ' ')),
             'A': (('', 'A', ' ', 'AX'),
                   (' ', 'ARE', ' ', 'AAr'),
                   (' ', 'AR', 'O', 'AXr'),
                   ('', 'AR', '#', 'EHr'),
                   ('^', 'AS', '#', 'EYs'),
                   ('', 'A', 'WA', 'AX'),
                   ('', 'AW', '', 'AO'),
                   (' :', 'ANY', '', 'EHnIY'),
                   ('', 'A', '^+#', 'EY'),
                   ('#:', 'ALLY', '', 'AXlIY'),
                   (' ', 'AL', '#', 'AXl'),
                   ('', 'AGAIN', '', 'AXgEHn'),
                   ('#:', 'AG', 'E', 'IHj'),
                   ('', 'A', '^+:#', 'AE'),
                   (' :', 'A', '^+ ', 'EY'),
                   ('', 'A', '^%', 'EY'),
                   (' ', 'ARR', '', 'AXr'),
                   ('', 'ARR', '', 'AEr'),
                   (' :', 'AR', ' ', 'AAr'),
                   ('', 'AR', ' ', 'ER'),
                   ('', 'AR', '', 'AAr'),
                   ('', 'AIR', '', 'EHr'),
                   ('', 'AI', '', 'EY'),
                   ('', 'AY', '', 'EY'),
                   ('', 'AU', '', 'AO'),
                   ('#:', 'AL', ' ', 'AXl'),
                   ('#:', 'ALS', ' ', 'AXlz'),
                   ('', 'ALK', '', 'AOk'),
                   ('', 'AL', '^', 'AOl'),
                   (' :', 'ABLE', '', 'EYbAXl'),
                   ('', 'ABLE', '', 'AXbAXl'),
                   ('', 'ANG', '+', 'EYnj'),
                   ('', 'A', '', 'AE')),
             'B': ((' ', 'BE', '^#', 'bIH'),
                   ('', 'BEING', '', 'bIYIHNG'),
                   (' ', 'BOTH', ' ', 'bOWTH'),
                   (' ', 'BUS', '#', 'bIHz'),
                   ('', 'BUIL', '', 'bIHl'),
                   ('', 'B', '', 'b')),
             'C': ((' ', 'CH', '^', 'k'),
                   ('^E', 'CH', '', 'k'),
                   ('', 'CH', '', 'CH'),
                   (' S', 'CI', '#', 'sAY'),
                   ('', 'CI', 'A', 'SH'),
                   ('', 'CI', 'O', 'SH'),
                   ('', 'CI', 'EN', 'SH'),
                   ('', 'C', '+', 's'),
                   ('', 'CK', '', 'k'),
                   ('', 'COM', '%', 'kAHm'),
                   ('', 'C', '', 'k')),
             'D': (('#:', 'DED', ' ', 'dIHd'),
                   ('.E', 'D', ' ', 'd'),
                   ('#:^E', 'D', ' ', 't'),
                   (' ', 'DE', '^#', 'dIH'),
                   (' ', 'DO', ' ', 'dUW'),
                   (' ', 'DOES', '', 'dAHz'),
                   (' ', 'DOING', '', 'dUWIHNG'),
                   (' ', 'DOW', '', 'dAW'),
                   ('', 'DU', 'A', 'jUW'),
                   ('', 'D', '', 'd')),
             'E': (('#:', 'E', ' ', ''),
                   ('\':^', 'E', ' ', ''),
                   (' :', 'E', ' ', 'IY'),
                   ('#', 'ED', ' ', 'd'),
                   ('#:', 'E', 'D ', ''),
                   ('', 'EV', 'ER', 'EHv'),
                   ('', 'E', '^%', 'IY'),
                   ('', 'ERI', '#', 'IYrIY'),
                   ('', 'ERI', '', 'EHrIH'),
                   ('#:', 'ER', '#', 'ER'),
                   ('', 'ER', '#', 'EHr'),
                   ('', 'ER', '', 'ER'),
                   (' ', 'EVEN', '', 'IYvEHn'),
                   ('#:', 'E', 'W', ''),
                   ('T', 'EW', '', 'UW'),
                   ('S', 'EW', '', 'UW'),
                   ('R', 'EW', '', 'UW'),
                   ('D', 'EW', '', 'UW'),
                   ('L', 'EW', '', 'UW'),
                   ('Z', 'EW', '', 'UW'),
                   ('N', 'EW', '', 'UW'),
                   ('J', 'EW', '', 'UW'),
                   ('TH', 'EW', '', 'UW'),
                   ('CH', 'EW', '', 'UW'),
                   ('SH', 'EW', '', 'UW'),
                   ('', 'EW', '', 'yUW'),
                   ('', 'E', 'O', 'IY'),
                   ('#:S', 'ES', ' ', 'IHz'),
                   ('#:C', 'ES', ' ', 'IHz'),
                   ('#:G', 'ES', ' ', 'IHz'),
                   ('#:Z', 'ES', ' ', 'IHz'),
                   ('#:X', 'ES', ' ', 'IHz'),
                   ('#:J', 'ES', ' ', 'IHz'),
                   ('#:CH', 'ES', ' ', 'IHz'),
                   ('#:SH', 'ES', ' ', 'IHz'),
                   ('#:', 'E', 'S ', ''),
                   ('#:', 'ELY', ' ', 'lIY'),
                   ('#:', 'EMENT', '', 'mEHnt'),
                   ('', 'EFUL', '', 'fUHl'),
                   ('', 'EE', '', 'IY'),
                   ('', 'EARN', '', 'ERn'),
                   (' ', 'EAR', '^', 'ER'),
                   ('', 'EAD', '', 'EHd'),
                   ('#:', 'EA', ' ', 'IYAX'),
                   ('', 'EA', 'SU', 'EH'),
                   ('', 'EA', '', 'IY'),
                   ('', 'EIGH', '', 'EY'),
                   ('', 'EI', '', 'IY'),
                   (' ', 'EYE', '', 'AY'),
                   ('', 'EY', '', 'IY'),
                   ('', 'EU', '', 'yUW'),
                   ('', 'E', '', 'EH')),
             'F': (('', 'FUL', '', 'fUHl'),
                   ('', 'F', '', 'f')),
             'G': (('', 'GIV', '', 'gIHv'),
                   (' ', 'G', 'I^', 'g'),
                   ('', 'GE', 'T', 'gEH'),
                   ('SU', 'GGES', '', 'gjEHs'),
                   ('', 'GG', '', 'g'),
                   (' B#', 'G', '', 'g'),
                   ('', 'G', '+', 'j'),
                   ('', 'GREAT', '', 'grEYt'),
                   ('#', 'GH', '', ''),
                   ('', 'G', '', 'g')),
             'H': ((' ', 'HAV', '', 'hAEv'),
                   (' ', 'HERE', '', 'hIYr'),
                   (' ', 'HOUR', '', 'AWER'),
                   ('', 'HOW', '', 'hAW'),
                   ('', 'H', '#', 'h'),
                   ('', 'H', '', '')),
             'I': ((' ', 'IN', '', 'IHn'),
                   (' ', 'I', ' ', 'AY'),
                   ('', 'IN', 'D', 'AYn'),
                   ('', 'IER', '', 'IYER'),
                   ('#:R', 'IED', '', 'IYd'),
                   ('', 'IED', ' ', 'AYd'),
                   ('', 'IEN', '', 'IYEHn'),
                   ('', 'IE', 'T', 'AYEH'),
                   (' :', 'I', '%', 'AY'),
                   ('', 'I', '%', 'IY'),
                   ('', 'IE', '', 'IY'),
                   ('', 'I', '^+:#', 'IH'),
                   ('', 'IR', '#', 'AYr'),
                   ('', 'IZ', '%', 'AYz'),
                   ('', 'IS', '%', 'AYz'),
                   ('', 'I', 'D%', 'AY'),
                   ('+^', 'I', '^+', 'IH'),
                   ('', 'I', 'T%', 'AY'),
                   ('#:^', 'I', '^+', 'IH'),
                   ('', 'I', '^+', 'AY'),
                   ('', 'IR', '', 'ER'),
                   ('', 'IGH', '', 'AY'),
                   ('', 'ILD', '', 'AYld'),
                   ('', 'IGN', ' ', 'AYn'),
                   ('', 'IGN', '^', 'AYn'),
                   ('', 'IGN', '%', 'AYn'),
                   ('', 'IQUE', '', 'IYk'),
                   ('', 'I', '', 'IH')),
             'J': (('', 'J', '', 'j'),),
             'K': ((' ', 'K', 'N', ''),
                   ('', 'K', '', 'k')),
             'L': (('', 'LO', 'C#', 'lOW'),
                   ('L', 'L', '', ''),
                   ('#:^', 'L', '%', 'AXl'),
                   ('', 'LEAD', '', 'lIYd'),
                   ('', 'L', '', 'l')),
             'M': (('', 'MOV', '', 'mUWv'),
                   ('', 'M', '', 'm')),
             'N': (('E', 'NG', '+', 'nj'),
                   ('', 'NG', 'R', 'NGg'),
                   ('', 'NG', '#', 'NGg'),
                   ('', 'NGL', '%', 'NGgAXl'),
                   ('', 'NG', '', 'NG'),
                   ('', 'NK', '', 'NGk'),
                   (' ', 'NOW', ' ', 'nAW'),
                   ('', 'N', '', 'n')),
             'O': (('', 'OF', ' ', 'AXv'),
                   ('', 'OROUGH', '', 'EROW'),
                   ('#:', 'OR', ' ', 'ER'),
                   ('#:', 'ORS', ' ', 'ERz'),
                   ('', 'OR', '', 'AOr'),
                   (' ', 'ONE', '', 'wAHn'),
                   ('', 'OW', '', 'OW'),
                   (' ', 'OVER', '', 'OWvER'),
                   ('', 'OV', '', 'AHv'),
                   ('', 'O', '^%', 'OW'),
                   ('', 'O', '^EN', 'OW'),
                   ('', 'O', '^I#', 'OW'),
                   ('', 'OL', 'D', 'OWl'),
                   ('', 'OUGHT', '', 'AOt'),
                   ('', 'OUGH', '', 'AHf'),
                   (' ', 'OU', '', 'AW'),
                   ('H', 'OU', 'S#', 'AW'),
                   ('', 'OUS', '', 'AXs'),
                   ('', 'OUR', '', 'AOr'),
                   ('', 'OULD', '', 'UHd'),
                   ('^', 'OU', '^L', 'AH'),
                   ('', 'OUP', '', 'UWp'),
                   ('', 'OU', '', 'AW'),
                   ('', 'OY', '', 'OY'),
                   ('', 'OING', '', 'OWIHNG'),
                   ('', 'OI', '', 'OY'),
                   ('', 'OOR', '', 'AOr'),
                   ('', 'OOK', '', 'UHk'),
                   ('', 'OOD', '', 'UHd'),
                   ('', 'OO', '', 'UW'),
                   ('', 'O', 'E', 'OW'),
                   ('', 'O', ' ', 'OW'),
                   ('', 'OA', '', 'OW'),
                   (' ', 'ONLY', '', 'OWnlIY'),
                   (' ', 'ONCE', '', 'wAHns'),
                   ('', 'ON\'T', '', 'OWnt'),
                   ('C', 'O', 'N', 'AA'),
                   ('', 'O', 'NG', 'AO'),
                   (' :^', 'O', 'N', 'AH'),
                   ('I', 'ON', '', 'AXn'),
                   ('#:', 'ON', ' ', 'AXn'),
                   ('#^', 'ON', '', 'AXn'),
                   ('', 'O', 'ST ', 'OW'),
                   ('', 'OF', '^', 'AOf'),
                   ('', 'OTHER', '', 'AHDHER'),
                   ('', 'OSS', ' ', 'AOs'),
                   ('#:^', 'OM', '', 'AHm'),
                   ('', 'O', '', 'AA')),
             'P': (('', 'PH', '', 'f'),
                   ('', 'PEOP', '', 'pIYp'),
                   ('', 'POW', '', 'pAW'),
                   ('', 'PUT', ' ', 'pUHt'),
                   ('', 'P', '', 'p')),
             'Q': (('', 'QUAR', '', 'kwAOr'),
                   ('', 'QU', '', 'kw'),
                   ('', 'Q', '', 'k')),
             'R': ((' ', 'RE', '^#', 'rIY'),
                   ('', 'R', '', 'r')),
             'S': (('', 'SH', '', 'SH'),
                   ('#', 'SION', '', 'ZHAXn'),
                   ('', 'SOME', '', 'sAHm'),
                   ('#', 'SUR', '#', 'ZHER'),
                   ('', 'SUR', '#', 'SHER'),
                   ('#', 'SU', '#', 'ZHUW'),
                   ('#', 'SSU', '#', 'SHUW'),
                   ('#', 'SED', ' ', 'zd'),
                   ('#', 'S', '#', 'z'),
                   ('', 'SAID', '', 'sEHd'),
                   ('^', 'SION', '', 'SHAXn'),
                   ('', 'S', 'S', ''),
                   ('.', 'S', ' ', 'z'),
                   ('#:.E', 'S', ' ', 'z'),
                   ('#:^##', 'S', ' ', 'z'),
                   ('#:^#', 'S', ' ', 's'),
                   ('U', 'S', ' ', 's'),
                   (' :#', 'S', ' ', 'z'),
                   (' ', 'SCH', '', 'sk'),
                   ('', 'S', 'C+', ''),
                   ('#', 'SM', '', 'zm'),
                   ('#', 'SN', '\'', 'zAXn'),
                   ('', 'S', '', 's')),
             'T': ((' ', 'THE', ' ', 'DHAX'),
                   ('', 'TO', ' ', 'tUW'),
                   ('', 'THAT', ' ', 'DHAEt'),
                   (' ', 'THIS', ' ', 'DHIHs'),
                   (' ', 'THEY', '', 'DHEY'),
                   (' ', 'THERE', '', 'DHEHr'),
                   ('', 'THER', '', 'DHER'),
                   ('', 'THEIR', '', 'DHEHr'),
                   (' ', 'THAN', ' ', 'DHAEn'),
                   (' ', 'THEM', ' ', 'DHEHm'),
                   ('', 'THESE', ' ', 'DHIYz'),
                   (' ', 'THEN', '', 'DHEHn'),
                   ('', 'THROUGH', '', 'THrUW'),
                   ('', 'THOSE', '', 'DHOWz'),
                   ('', 'THOUGH', ' ', 'DHOW'),
                   (' ', 'THUS', '', 'DHAHs'),
                   ('', 'TH', '', 'TH'),
                   ('#:', 'TED', ' ', 'tIHd'),
                   ('S', 'TI', '#N', 'CH'),
                   ('', 'TI', 'O', 'SH'),
                   ('', 'TI', 'A', 'SH'),
                   ('', 'TIEN', '', 'SHAXn'),
                   ('', 'TUR', '#', 'CHER'),
                   ('', 'TU', 'A', 'CHUW'),
                   (' ', 'TWO', '', 'tUW'),
                   ('', 'T', '', 't')),
             'U': ((' ', 'UN', 'I', 'yUWn'),
                   (' ', 'UN', '', 'AHn'),
                   (' ', 'UPON', '', 'AXpAOn'),
                   ('T', 'UR', '#', 'UHr'),
                   ('S', 'UR', '#', 'UHr'),
                   ('R', 'UR', '#', 'UHr'),
                   ('D', 'UR', '#', 'UHr'),
                   ('L', 'UR', '#', 'UHr'),
                   ('Z', 'UR', '#', 'UHr'),
                   ('N', 'UR', '#', 'UHr'),
                   ('J', 'UR', '#', 'UHr'),
                   ('TH', 'UR', '#', 'UHr'),
                   ('CH', 'UR', '#', 'UHr'),
                   ('SH', 'UR', '#', 'UHr'),
                   ('', 'UR', '#', 'yUHr'),
                   ('', 'UR', '', 'ER'),
                   ('', 'U', '^ ', 'AH'),
                   ('', 'U', '^^', 'AH'),
                   ('', 'UY', '', 'AY'),
                   (' G', 'U', '#', ''),
                   ('G', 'U', '%', ''),
                   ('G', 'U', '#', 'w'),
                   ('#N', 'U', '', 'yUW'),
                   ('T', 'U', '', 'UW'),
                   ('S', 'U', '', 'UW'),
                   ('R', 'U', '', 'UW'),
                   ('D', 'U', '', 'UW'),
                   ('L', 'U', '', 'UW'),
                   ('Z', 'U', '', 'UW'),
                   ('N', 'U', '', 'UW'),
                   ('J', 'U', '', 'UW'),
                   ('TH', 'U', '', 'UW'),
                   ('CH', 'U', '', 'UW'),
                   ('SH', 'U', '', 'UW'),
                   ('', 'U', '', 'yUW')),
             'V': (('', 'VIEW', '', 'vyUW'),
                   ('', 'V', '', 'v')),
             'W': ((' ', 'WERE', '', 'wER'),
                   ('', 'WA', 'S', 'wAA'),
                   ('', 'WA', 'T', 'wAA'),
                   ('', 'WHERE', '', 'WHEHr'),
                   ('', 'WHAT', '', 'WHAAt'),
                   ('', 'WHOL', '', 'hOWl'),
                   ('', 'WHO', '', 'hUW'),
                   ('', 'WH', '', 'WH'),
                   ('', 'WAR', '', 'wAOr'),
                   ('', 'WOR', '^', 'wER'),
                   ('', 'WR', '', 'r'),
                   ('', 'W', '', 'w')),
             'X': (('', 'X', '', 'ks'),),
             'Y': (('', 'YOUNG', '', 'yAHNG'),
                   (' ', 'YOU', '', 'yUW'),
                   (' ', 'YES', '', 'yEHs'),
                   (' ', 'Y', '', 'y'),
                   ('#:^', 'Y', ' ', 'IY'),
                   ('#:^', 'Y', 'I', 'IY'),
                   (' :', 'Y', ' ', 'AY'),
                   (' :', 'Y', '#', 'AY'),
                   (' :', 'Y', '^+:#', 'IH'),
                   (' :', 'Y', '^#', 'AY'),
                   ('', 'Y', '', 'IH')),
             'Z': (('', 'Z', '', 'z'),)}

    word = word.upper()

    pron = ''
    pos = 0
    while pos < len(word):
        left_orig = word[:pos]
        right_orig = word[pos:]
        first = word[pos] if word[pos] in rules else ' '
        for rule in rules[first]:
            left, match, right, out = rule
            if right_orig.startswith(match):
                if left:
                    l_pattern = _to_regex(left, left_match=True)
                if right:
                    r_pattern = _to_regex(right, left_match=False)
                if ((not left or re_match(l_pattern, left_orig)) and
                        (not right or
                         re_match(r_pattern, right_orig[len(match):]))):
                    pron += out
                    pos += len(match)
                    break
        else:
            pron += word[pos]
            pos += 1

    return pron


def bmpm(word, language_arg=0, name_mode='gen', match_mode='approx',
         concat=False, filter_langs=False):
    """Return the Beider-Morse Phonetic Matching algorithm code for a word.

    The Beider-Morse Phonetic Matching algorithm is described in
    :cite:`Beider:2008`.
    The reference implementation is licensed under GPLv3.

    This is the public entry point: all six arguments are forwarded
    unchanged, in order, to ``_bmpm``, and its result is returned as-is.

    :param str word: the word to transform
    :param language_arg: the language of the term, given by name; the
        default, 0, is forwarded as-is (presumably it means that no language
        is specified -- confirm against ``_bmpm``); supported values
        include:

            - 'any'
            - 'arabic'
            - 'cyrillic'
            - 'czech'
            - 'dutch'
            - 'english'
            - 'french'
            - 'german'
            - 'greek'
            - 'greeklatin'
            - 'hebrew'
            - 'hungarian'
            - 'italian'
            - 'latvian'
            - 'polish'
            - 'portuguese'
            - 'romanian'
            - 'russian'
            - 'spanish'
            - 'turkish'

    :param str name_mode: the name mode of the algorithm:

            - 'gen' -- general (default)
            - 'ash' -- Ashkenazi
            - 'sep' -- Sephardic

    :param str match_mode: matching mode: 'approx' or 'exact'
    :param bool concat: concatenation mode
    :param bool filter_langs: filter out incompatible languages
    :returns: the BMPM value(s), as shown in the examples below: a single
        string in which alternative codes are separated by spaces
    :rtype: str

    Expected values that are wrapped over several lines below are compared
    with ``NORMALIZE_WHITESPACE``, since the actual result is a single line.

    >>> bmpm('Christopher')  # doctest: +NORMALIZE_WHITESPACE
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir xristopi xritopir xritopi xristofi xritofir xritofi tzristopir
    tzristofir zristopir zristopi zritopir zritopi zristofir zristofi zritofir
    zritofi'
    >>> bmpm('Niall')
    'nial niol'
    >>> bmpm('Smith')
    'zmit'
    >>> bmpm('Schmidt')
    'zmit stzmit'

    >>> bmpm('Christopher',
    ...      language_arg='German')  # doctest: +NORMALIZE_WHITESPACE
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir'
    >>> bmpm('Christopher',
    ...      language_arg='English')  # doctest: +NORMALIZE_WHITESPACE
    'tzristofir tzrQstofir tzristafir tzrQstafir xristofir xrQstofir xristafir
    xrQstafir'
    >>> bmpm('Christopher', language_arg='German',
    ...      name_mode='ash')  # doctest: +NORMALIZE_WHITESPACE
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir'

    >>> bmpm('Christopher', language_arg='German', match_mode='exact')
    'xriStopher xriStofer xristopher xristofer'
    """
    # Thin wrapper: the encoding itself is implemented by _bmpm.
    return _bmpm(word, language_arg, name_mode, match_mode,
                 concat, filter_langs)


if __name__ == '__main__':
    # When this module is executed directly as a script (rather than being
    # imported), run the doctest examples embedded in its docstrings.
    import doctest
    doctest.testmod()