# -*- coding: utf-8 -*-
# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.phonetic._soundex.
The phonetic._soundex module implements phonetic algorithms that are generally
Soundex-like, including:
- American Soundex
- Refined Soundex
- Fuzzy Soundex
- Phonex
- Phonix
- Lein
- PSHP Soundex/Viewex Coding
Being Soundex-like, for the purposes of this module means: targeted at English,
returning a code that starts with a letter and continues with (usually 3)
numerals, and mostly based on a simple translation table.
"""
from __future__ import unicode_literals
from unicodedata import normalize as unicode_normalize
from six import text_type
from six.moves import range
from ._util import _delete_consecutive_repeats
__all__ = [
'fuzzy_soundex',
'lein',
'phonex',
'phonix',
'pshp_soundex_first',
'pshp_soundex_last',
'refined_soundex',
'soundex',
]
[docs]def soundex(word, max_length=4, var='American', reverse=False, zero_pad=True):
"""Return the Soundex code for a word.
:param str word: the word to transform
:param int max_length: the length of the code returned (defaults to 4)
:param str var: the variant of the algorithm to employ (defaults to
'American'):
- 'American' follows the American Soundex algorithm, as described at
:cite:`US:2007` and in :cite:`Knuth:1998`; this is also called
Miracode
- 'special' follows the rules from the 1880-1910 US Census
retrospective re-analysis, in which h & w are not treated as blocking
consonants but as vowels. Cf. :cite:`Repici:2013`.
- 'Census' follows the rules laid out in GIL 55 :cite:`US:1997` by the
US Census, including coding prefixed and unprefixed versions of some
names
:param bool reverse: reverse the word before computing the selected Soundex
(defaults to False); This results in "Reverse Soundex", which is useful
for blocking in cases where the initial elements may be in error.
:param bool zero_pad: pad the end of the return value with 0s to achieve a
max_length string
:returns: the Soundex value
:rtype: str
>>> soundex("Christopher")
'C623'
>>> soundex("Niall")
'N400'
>>> soundex('Smith')
'S530'
>>> soundex('Schmidt')
'S530'
>>> soundex('Christopher', max_length=-1)
'C623160000000000000000000000000000000000000000000000000000000000'
>>> soundex('Christopher', max_length=-1, zero_pad=False)
'C62316'
>>> soundex('Christopher', reverse=True)
'R132'
>>> soundex('Ashcroft')
'A261'
>>> soundex('Asicroft')
'A226'
>>> soundex('Ashcroft', var='special')
'A226'
>>> soundex('Asicroft', var='special')
'A226'
"""
_soundex_translation = dict(
zip(
(ord(_) for _ in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
'01230129022455012623019202',
)
)
# Require a max_length of at least 4 and not more than 64
if max_length != -1:
max_length = min(max(4, max_length), 64)
else:
max_length = 64
# uppercase, normalize, decompose, and filter non-A-Z out
word = unicode_normalize('NFKD', text_type(word.upper()))
word = word.replace('ß', 'SS')
if var == 'Census':
# TODO: Should these prefixes be supplemented? (VANDE, DELA, VON)
if word[:3] in {'VAN', 'CON'} and len(word) > 4:
return (
soundex(word, max_length, 'American', reverse, zero_pad),
soundex(word[3:], max_length, 'American', reverse, zero_pad),
)
if word[:2] in {'DE', 'DI', 'LA', 'LE'} and len(word) > 3:
return (
soundex(word, max_length, 'American', reverse, zero_pad),
soundex(word[2:], max_length, 'American', reverse, zero_pad),
)
# Otherwise, proceed as usual (var='American' mode, ostensibly)
word = ''.join(
c
for c in word
if c
in {
'A',
'B',
'C',
'D',
'E',
'F',
'G',
'H',
'I',
'J',
'K',
'L',
'M',
'N',
'O',
'P',
'Q',
'R',
'S',
'T',
'U',
'V',
'W',
'X',
'Y',
'Z',
}
)
# Nothing to convert, return base case
if not word:
if zero_pad:
return '0' * max_length
return '0'
# Reverse word if computing Reverse Soundex
if reverse:
word = word[::-1]
# apply the Soundex algorithm
sdx = word.translate(_soundex_translation)
if var == 'special':
sdx = sdx.replace('9', '0') # special rule for 1880-1910 census
else:
sdx = sdx.replace('9', '') # rule 1
sdx = _delete_consecutive_repeats(sdx) # rule 3
if word[0] in 'HW':
sdx = word[0] + sdx
else:
sdx = word[0] + sdx[1:]
sdx = sdx.replace('0', '') # rule 1
if zero_pad:
sdx += '0' * max_length # rule 4
return sdx[:max_length]
[docs]def refined_soundex(word, max_length=-1, zero_pad=False, retain_vowels=False):
"""Return the Refined Soundex code for a word.
This is Soundex, but with more character classes. It was defined at
:cite:`Boyce:1998`.
:param word: the word to transform
:param max_length: the length of the code returned (defaults to unlimited)
:param zero_pad: pad the end of the return value with 0s to achieve a
max_length string
:param retain_vowels: retain vowels (as 0) in the resulting code
:returns: the Refined Soundex value
:rtype: str
>>> refined_soundex('Christopher')
'C393619'
>>> refined_soundex('Niall')
'N87'
>>> refined_soundex('Smith')
'S386'
>>> refined_soundex('Schmidt')
'S386'
"""
_ref_soundex_translation = dict(
zip(
(ord(_) for _ in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
'01360240043788015936020505',
)
)
# uppercase, normalize, decompose, and filter non-A-Z out
word = unicode_normalize('NFKD', text_type(word.upper()))
word = word.replace('ß', 'SS')
word = ''.join(
c
for c in word
if c
in {
'A',
'B',
'C',
'D',
'E',
'F',
'G',
'H',
'I',
'J',
'K',
'L',
'M',
'N',
'O',
'P',
'Q',
'R',
'S',
'T',
'U',
'V',
'W',
'X',
'Y',
'Z',
}
)
# apply the Soundex algorithm
sdx = word[:1] + word.translate(_ref_soundex_translation)
sdx = _delete_consecutive_repeats(sdx)
if not retain_vowels:
sdx = sdx.replace('0', '') # Delete vowels, H, W, Y
if max_length > 0:
if zero_pad:
sdx += '0' * max_length
sdx = sdx[:max_length]
return sdx
[docs]def fuzzy_soundex(word, max_length=5, zero_pad=True):
"""Return the Fuzzy Soundex code for a word.
Fuzzy Soundex is an algorithm derived from Soundex, defined in
:cite:`Holmes:2002`.
:param str word: the word to transform
:param int max_length: the length of the code returned (defaults to 4)
:param bool zero_pad: pad the end of the return value with 0s to achieve
a max_length string
:returns: the Fuzzy Soundex value
:rtype: str
>>> fuzzy_soundex('Christopher')
'K6931'
>>> fuzzy_soundex('Niall')
'N4000'
>>> fuzzy_soundex('Smith')
'S5300'
>>> fuzzy_soundex('Smith')
'S5300'
"""
_fuzzy_soundex_translation = dict(
zip(
(ord(_) for _ in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
'0193017-07745501769301-7-9',
)
)
word = unicode_normalize('NFKD', text_type(word.upper()))
word = word.replace('ß', 'SS')
# Clamp max_length to [4, 64]
if max_length != -1:
max_length = min(max(4, max_length), 64)
else:
max_length = 64
if not word:
if zero_pad:
return '0' * max_length
return '0'
if word[:2] in {'CS', 'CZ', 'TS', 'TZ'}:
word = 'SS' + word[2:]
elif word[:2] == 'GN':
word = 'NN' + word[2:]
elif word[:2] in {'HR', 'WR'}:
word = 'RR' + word[2:]
elif word[:2] == 'HW':
word = 'WW' + word[2:]
elif word[:2] in {'KN', 'NG'}:
word = 'NN' + word[2:]
if word[-2:] == 'CH':
word = word[:-2] + 'KK'
elif word[-2:] == 'NT':
word = word[:-2] + 'TT'
elif word[-2:] == 'RT':
word = word[:-2] + 'RR'
elif word[-3:] == 'RDT':
word = word[:-3] + 'RR'
word = word.replace('CA', 'KA')
word = word.replace('CC', 'KK')
word = word.replace('CK', 'KK')
word = word.replace('CE', 'SE')
word = word.replace('CHL', 'KL')
word = word.replace('CL', 'KL')
word = word.replace('CHR', 'KR')
word = word.replace('CR', 'KR')
word = word.replace('CI', 'SI')
word = word.replace('CO', 'KO')
word = word.replace('CU', 'KU')
word = word.replace('CY', 'SY')
word = word.replace('DG', 'GG')
word = word.replace('GH', 'HH')
word = word.replace('MAC', 'MK')
word = word.replace('MC', 'MK')
word = word.replace('NST', 'NSS')
word = word.replace('PF', 'FF')
word = word.replace('PH', 'FF')
word = word.replace('SCH', 'SSS')
word = word.replace('TIO', 'SIO')
word = word.replace('TIA', 'SIO')
word = word.replace('TCH', 'CHH')
sdx = word.translate(_fuzzy_soundex_translation)
sdx = sdx.replace('-', '')
# remove repeating characters
sdx = _delete_consecutive_repeats(sdx)
if word[0] in {'H', 'W', 'Y'}:
sdx = word[0] + sdx
else:
sdx = word[0] + sdx[1:]
sdx = sdx.replace('0', '')
if zero_pad:
sdx += '0' * max_length
return sdx[:max_length]
[docs]def phonex(word, max_length=4, zero_pad=True):
"""Return the Phonex code for a word.
Phonex is an algorithm derived from Soundex, defined in :cite:`Lait:1996`.
:param str word: the word to transform
:param int max_length: the length of the code returned (defaults to 4)
:param bool zero_pad: pad the end of the return value with 0s to achieve
a max_length string
:returns: the Phonex value
:rtype: str
>>> phonex('Christopher')
'C623'
>>> phonex('Niall')
'N400'
>>> phonex('Schmidt')
'S253'
>>> phonex('Smith')
'S530'
"""
name = unicode_normalize('NFKD', text_type(word.upper()))
name = name.replace('ß', 'SS')
# Clamp max_length to [4, 64]
if max_length != -1:
max_length = min(max(4, max_length), 64)
else:
max_length = 64
name_code = last = ''
# Deletions effected by replacing with next letter which
# will be ignored due to duplicate handling of Soundex code.
# This is faster than 'moving' all subsequent letters.
# Remove any trailing Ss
while name[-1:] == 'S':
name = name[:-1]
# Phonetic equivalents of first 2 characters
# Works since duplicate letters are ignored
if name[:2] == 'KN':
name = 'N' + name[2:] # KN.. == N..
elif name[:2] == 'PH':
name = 'F' + name[2:] # PH.. == F.. (H ignored anyway)
elif name[:2] == 'WR':
name = 'R' + name[2:] # WR.. == R..
if name:
# Special case, ignore H first letter (subsequent Hs ignored anyway)
# Works since duplicate letters are ignored
if name[0] == 'H':
name = name[1:]
if name:
# Phonetic equivalents of first character
if name[0] in {'A', 'E', 'I', 'O', 'U', 'Y'}:
name = 'A' + name[1:]
elif name[0] in {'B', 'P'}:
name = 'B' + name[1:]
elif name[0] in {'V', 'F'}:
name = 'F' + name[1:]
elif name[0] in {'C', 'K', 'Q'}:
name = 'C' + name[1:]
elif name[0] in {'G', 'J'}:
name = 'G' + name[1:]
elif name[0] in {'S', 'Z'}:
name = 'S' + name[1:]
name_code = last = name[0]
# Modified Soundex code
for i in range(1, len(name)):
code = '0'
if name[i] in {'B', 'F', 'P', 'V'}:
code = '1'
elif name[i] in {'C', 'G', 'J', 'K', 'Q', 'S', 'X', 'Z'}:
code = '2'
elif name[i] in {'D', 'T'}:
if name[i + 1 : i + 2] != 'C':
code = '3'
elif name[i] == 'L':
if name[i + 1 : i + 2] in {
'A',
'E',
'I',
'O',
'U',
'Y',
} or i + 1 == len(name):
code = '4'
elif name[i] in {'M', 'N'}:
if name[i + 1 : i + 2] in {'D', 'G'}:
name = name[: i + 1] + name[i] + name[i + 2 :]
code = '5'
elif name[i] == 'R':
if name[i + 1 : i + 2] in {
'A',
'E',
'I',
'O',
'U',
'Y',
} or i + 1 == len(name):
code = '6'
if code != last and code != '0' and i != 0:
name_code += code
last = name_code[-1]
if zero_pad:
name_code += '0' * max_length
if not name_code:
name_code = '0'
return name_code[:max_length]
[docs]def phonix(word, max_length=4, zero_pad=True):
"""Return the Phonix code for a word.
Phonix is a Soundex-like algorithm defined in :cite:`Gadd:1990`.
This implementation is based on:
- :cite:`Pfeifer:2000`
- :cite:`Christen:2011`
- :cite:`Kollar:2007`
:param str word: the word to transform
:param int max_length: the length of the code returned (defaults to 4)
:param bool zero_pad: pad the end of the return value with 0s to achieve
a max_length string
:returns: the Phonix value
:rtype: str
>>> phonix('Christopher')
'K683'
>>> phonix('Niall')
'N400'
>>> phonix('Smith')
'S530'
>>> phonix('Schmidt')
'S530'
"""
def _start_repl(word, src, tar, post=None):
r"""Replace src with tar at the start of word."""
if post:
for i in post:
if word.startswith(src + i):
return tar + word[len(src) :]
elif word.startswith(src):
return tar + word[len(src) :]
return word
def _end_repl(word, src, tar, pre=None):
r"""Replace src with tar at the end of word."""
if pre:
for i in pre:
if word.endswith(i + src):
return word[: -len(src)] + tar
elif word.endswith(src):
return word[: -len(src)] + tar
return word
def _mid_repl(word, src, tar, pre=None, post=None):
r"""Replace src with tar in the middle of word."""
if pre or post:
if not pre:
return word[0] + _all_repl(word[1:], src, tar, pre, post)
elif not post:
return _all_repl(word[:-1], src, tar, pre, post) + word[-1]
return _all_repl(word, src, tar, pre, post)
return word[0] + _all_repl(word[1:-1], src, tar, pre, post) + word[-1]
def _all_repl(word, src, tar, pre=None, post=None):
r"""Replace src with tar anywhere in word."""
if pre or post:
if post:
post = post
else:
post = frozenset(('',))
if pre:
pre = pre
else:
pre = frozenset(('',))
for i, j in ((i, j) for i in pre for j in post):
word = word.replace(i + src + j, i + tar + j)
return word
else:
return word.replace(src, tar)
_vow = {'A', 'E', 'I', 'O', 'U'}
_con = {
'B',
'C',
'D',
'F',
'G',
'H',
'J',
'K',
'L',
'M',
'N',
'P',
'Q',
'R',
'S',
'T',
'V',
'W',
'X',
'Y',
'Z',
}
_phonix_substitutions = (
(_all_repl, 'DG', 'G'),
(_all_repl, 'CO', 'KO'),
(_all_repl, 'CA', 'KA'),
(_all_repl, 'CU', 'KU'),
(_all_repl, 'CY', 'SI'),
(_all_repl, 'CI', 'SI'),
(_all_repl, 'CE', 'SE'),
(_start_repl, 'CL', 'KL', _vow),
(_all_repl, 'CK', 'K'),
(_end_repl, 'GC', 'K'),
(_end_repl, 'JC', 'K'),
(_start_repl, 'CHR', 'KR', _vow),
(_start_repl, 'CR', 'KR', _vow),
(_start_repl, 'WR', 'R'),
(_all_repl, 'NC', 'NK'),
(_all_repl, 'CT', 'KT'),
(_all_repl, 'PH', 'F'),
(_all_repl, 'AA', 'AR'),
(_all_repl, 'SCH', 'SH'),
(_all_repl, 'BTL', 'TL'),
(_all_repl, 'GHT', 'T'),
(_all_repl, 'AUGH', 'ARF'),
(_mid_repl, 'LJ', 'LD', _vow, _vow),
(_all_repl, 'LOUGH', 'LOW'),
(_start_repl, 'Q', 'KW'),
(_start_repl, 'KN', 'N'),
(_end_repl, 'GN', 'N'),
(_all_repl, 'GHN', 'N'),
(_end_repl, 'GNE', 'N'),
(_all_repl, 'GHNE', 'NE'),
(_end_repl, 'GNES', 'NS'),
(_start_repl, 'GN', 'N'),
(_mid_repl, 'GN', 'N', None, _con),
(_end_repl, 'GN', 'N'),
(_start_repl, 'PS', 'S'),
(_start_repl, 'PT', 'T'),
(_start_repl, 'CZ', 'C'),
(_mid_repl, 'WZ', 'Z', _vow),
(_mid_repl, 'CZ', 'CH'),
(_all_repl, 'LZ', 'LSH'),
(_all_repl, 'RZ', 'RSH'),
(_mid_repl, 'Z', 'S', None, _vow),
(_all_repl, 'ZZ', 'TS'),
(_mid_repl, 'Z', 'TS', _con),
(_all_repl, 'HROUG', 'REW'),
(_all_repl, 'OUGH', 'OF'),
(_mid_repl, 'Q', 'KW', _vow, _vow),
(_mid_repl, 'J', 'Y', _vow, _vow),
(_start_repl, 'YJ', 'Y', _vow),
(_start_repl, 'GH', 'G'),
(_end_repl, 'GH', 'E', _vow),
(_start_repl, 'CY', 'S'),
(_all_repl, 'NX', 'NKS'),
(_start_repl, 'PF', 'F'),
(_end_repl, 'DT', 'T'),
(_end_repl, 'TL', 'TIL'),
(_end_repl, 'DL', 'DIL'),
(_all_repl, 'YTH', 'ITH'),
(_start_repl, 'TJ', 'CH', _vow),
(_start_repl, 'TSJ', 'CH', _vow),
(_start_repl, 'TS', 'T', _vow),
(_all_repl, 'TCH', 'CH'),
(_mid_repl, 'WSK', 'VSKIE', _vow),
(_end_repl, 'WSK', 'VSKIE', _vow),
(_start_repl, 'MN', 'N', _vow),
(_start_repl, 'PN', 'N', _vow),
(_mid_repl, 'STL', 'SL', _vow),
(_end_repl, 'STL', 'SL', _vow),
(_end_repl, 'TNT', 'ENT'),
(_end_repl, 'EAUX', 'OH'),
(_all_repl, 'EXCI', 'ECS'),
(_all_repl, 'X', 'ECS'),
(_end_repl, 'NED', 'ND'),
(_all_repl, 'JR', 'DR'),
(_end_repl, 'EE', 'EA'),
(_all_repl, 'ZS', 'S'),
(_mid_repl, 'R', 'AH', _vow, _con),
(_end_repl, 'R', 'AH', _vow),
(_mid_repl, 'HR', 'AH', _vow, _con),
(_end_repl, 'HR', 'AH', _vow),
(_end_repl, 'HR', 'AH', _vow),
(_end_repl, 'RE', 'AR'),
(_end_repl, 'R', 'AH', _vow),
(_all_repl, 'LLE', 'LE'),
(_end_repl, 'LE', 'ILE', _con),
(_end_repl, 'LES', 'ILES', _con),
(_end_repl, 'E', ''),
(_end_repl, 'ES', 'S'),
(_end_repl, 'SS', 'AS', _vow),
(_end_repl, 'MB', 'M', _vow),
(_all_repl, 'MPTS', 'MPS'),
(_all_repl, 'MPS', 'MS'),
(_all_repl, 'MPT', 'MT'),
)
_phonix_translation = dict(
zip(
(ord(_) for _ in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
'01230720022455012683070808',
)
)
sdx = ''
word = unicode_normalize('NFKD', text_type(word.upper()))
word = word.replace('ß', 'SS')
word = ''.join(
c
for c in word
if c
in {
'A',
'B',
'C',
'D',
'E',
'F',
'G',
'H',
'I',
'J',
'K',
'L',
'M',
'N',
'O',
'P',
'Q',
'R',
'S',
'T',
'U',
'V',
'W',
'X',
'Y',
'Z',
}
)
if word:
for trans in _phonix_substitutions:
word = trans[0](word, *trans[1:])
if word[0] in {'A', 'E', 'I', 'O', 'U', 'Y'}:
sdx = 'v' + word[1:].translate(_phonix_translation)
else:
sdx = word[0] + word[1:].translate(_phonix_translation)
sdx = _delete_consecutive_repeats(sdx)
sdx = sdx.replace('0', '')
# Clamp max_length to [4, 64]
if max_length != -1:
max_length = min(max(4, max_length), 64)
else:
max_length = 64
if zero_pad:
sdx += '0' * max_length
if not sdx:
sdx = '0'
return sdx[:max_length]
[docs]def lein(word, max_length=4, zero_pad=True):
"""Return the Lein code for a word.
This is Lein name coding, described in :cite:`Moore:1977`.
:param str word: the word to transform
:param int max_length: the maximum length (default 4) of the code to return
:param bool zero_pad: pad the end of the return value with 0s to achieve a
max_length string
:returns: the Lein code
:rtype: str
>>> lein('Christopher')
'C351'
>>> lein('Niall')
'N300'
>>> lein('Smith')
'S210'
>>> lein('Schmidt')
'S521'
"""
_lein_translation = dict(
zip((ord(_) for _ in 'BCDFGJKLMNPQRSTVXZ'), '451455532245351455')
)
# uppercase, normalize, decompose, and filter non-A-Z out
word = unicode_normalize('NFKD', text_type(word.upper()))
word = word.replace('ß', 'SS')
word = ''.join(
c
for c in word
if c
in {
'A',
'B',
'C',
'D',
'E',
'F',
'G',
'H',
'I',
'J',
'K',
'L',
'M',
'N',
'O',
'P',
'Q',
'R',
'S',
'T',
'U',
'V',
'W',
'X',
'Y',
'Z',
}
)
code = word[:1] # Rule 1
word = word[1:].translate(
{
32: None,
65: None,
69: None,
72: None,
73: None,
79: None,
85: None,
87: None,
89: None,
}
) # Rule 2
word = _delete_consecutive_repeats(word) # Rule 3
code += word.translate(_lein_translation) # Rule 4
if zero_pad:
code += '0' * max_length # Rule 4
return code[:max_length]
[docs]def pshp_soundex_last(lname, max_length=4, german=False):
"""Calculate the PSHP Soundex/Viewex Coding of a last name.
This coding is based on :cite:`Hershberg:1976`.
Reference was also made to the German version of the same:
:cite:`Hershberg:1979`.
A separate function, pshp_soundex_first() is used for first names.
:param str lname: the last name to encode
:param int max_length: the length of the code returned (defaults to 4)
:param bool german: set to True if the name is German (different rules
apply)
:returns: the PSHP Soundex/Viewex Coding
:rtype: str
>>> pshp_soundex_last('Smith')
'S530'
>>> pshp_soundex_last('Waters')
'W350'
>>> pshp_soundex_last('James')
'J500'
>>> pshp_soundex_last('Schmidt')
'S530'
>>> pshp_soundex_last('Ashcroft')
'A225'
"""
lname = unicode_normalize('NFKD', text_type(lname.upper()))
lname = lname.replace('ß', 'SS')
lname = ''.join(
c
for c in lname
if c
in {
'A',
'B',
'C',
'D',
'E',
'F',
'G',
'H',
'I',
'J',
'K',
'L',
'M',
'N',
'O',
'P',
'Q',
'R',
'S',
'T',
'U',
'V',
'W',
'X',
'Y',
'Z',
}
)
# A. Prefix treatment
if lname[:3] == 'VON' or lname[:3] == 'VAN':
lname = lname[3:].strip()
# The rule implemented below says "MC, MAC become 1". I believe it meant to
# say they become M except in German data (where superscripted 1 indicates
# "except in German data"). It doesn't make sense for them to become 1
# (BPFV -> 1) or to apply outside German. Unfortunately, both articles have
# this error(?).
if not german:
if lname[:3] == 'MAC':
lname = 'M' + lname[3:]
elif lname[:2] == 'MC':
lname = 'M' + lname[2:]
# The non-German-only rule to strip ' is unnecessary due to filtering
if lname[:1] in {'E', 'I', 'O', 'U'}:
lname = 'A' + lname[1:]
elif lname[:2] in {'GE', 'GI', 'GY'}:
lname = 'J' + lname[1:]
elif lname[:2] in {'CE', 'CI', 'CY'}:
lname = 'S' + lname[1:]
elif lname[:3] == 'CHR':
lname = 'K' + lname[1:]
elif lname[:1] == 'C' and lname[:2] != 'CH':
lname = 'K' + lname[1:]
if lname[:2] == 'KN':
lname = 'N' + lname[1:]
elif lname[:2] == 'PH':
lname = 'F' + lname[1:]
elif lname[:3] in {'WIE', 'WEI'}:
lname = 'V' + lname[1:]
if german and lname[:1] in {'W', 'M', 'Y', 'Z'}:
lname = {'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}[lname[0]] + lname[1:]
code = lname[:1]
# B. Postfix treatment
if german: # moved from end of postfix treatment due to blocking
if lname[-3:] == 'TES':
lname = lname[:-3]
elif lname[-2:] == 'TS':
lname = lname[:-2]
if lname[-3:] == 'TZE':
lname = lname[:-3]
elif lname[-2:] == 'ZE':
lname = lname[:-2]
if lname[-1:] == 'Z':
lname = lname[:-1]
elif lname[-2:] == 'TE':
lname = lname[:-2]
if lname[-1:] == 'R':
lname = lname[:-1] + 'N'
elif lname[-2:] in {'SE', 'CE'}:
lname = lname[:-2]
if lname[-2:] == 'SS':
lname = lname[:-2]
elif lname[-1:] == 'S':
lname = lname[:-1]
if not german:
l5_repl = {'STOWN': 'SAWON', 'MPSON': 'MASON'}
l4_repl = {
'NSEN': 'ASEN',
'MSON': 'ASON',
'STEN': 'SAEN',
'STON': 'SAON',
}
if lname[-5:] in l5_repl:
lname = lname[:-5] + l5_repl[lname[-5:]]
elif lname[-4:] in l4_repl:
lname = lname[:-4] + l4_repl[lname[-4:]]
if lname[-2:] in {'NG', 'ND'}:
lname = lname[:-1]
if not german and lname[-3:] in {'GAN', 'GEN'}:
lname = lname[:-3] + 'A' + lname[-2:]
# C. Infix Treatment
lname = lname.replace('CK', 'C')
lname = lname.replace('SCH', 'S')
lname = lname.replace('DT', 'T')
lname = lname.replace('ND', 'N')
lname = lname.replace('NG', 'N')
lname = lname.replace('LM', 'M')
lname = lname.replace('MN', 'M')
lname = lname.replace('WIE', 'VIE')
lname = lname.replace('WEI', 'VEI')
# D. Soundexing
# code for X & Y are unspecified, but presumably are 2 & 0
_pshp_translation = dict(
zip(
(ord(_) for _ in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
'01230120022455012523010202',
)
)
lname = lname.translate(_pshp_translation)
lname = _delete_consecutive_repeats(lname)
code += lname[1:]
code = code.replace('0', '') # rule 1
if max_length != -1:
if len(code) < max_length:
code += '0' * (max_length - len(code))
else:
code = code[:max_length]
return code
[docs]def pshp_soundex_first(fname, max_length=4, german=False):
"""Calculate the PSHP Soundex/Viewex Coding of a first name.
This coding is based on :cite:`Hershberg:1976`.
Reference was also made to the German version of the same:
:cite:`Hershberg:1979`.
A separate function, pshp_soundex_last() is used for last names.
:param str fname: the first name to encode
:param int max_length: the length of the code returned (defaults to 4)
:param bool german: set to True if the name is German (different rules
apply)
:returns: the PSHP Soundex/Viewex Coding
:rtype: str
>>> pshp_soundex_first('Smith')
'S530'
>>> pshp_soundex_first('Waters')
'W352'
>>> pshp_soundex_first('James')
'J700'
>>> pshp_soundex_first('Schmidt')
'S500'
>>> pshp_soundex_first('Ashcroft')
'A220'
>>> pshp_soundex_first('John')
'J500'
>>> pshp_soundex_first('Colin')
'K400'
>>> pshp_soundex_first('Niall')
'N400'
>>> pshp_soundex_first('Sally')
'S400'
>>> pshp_soundex_first('Jane')
'J500'
"""
fname = unicode_normalize('NFKD', text_type(fname.upper()))
fname = fname.replace('ß', 'SS')
fname = ''.join(
c
for c in fname
if c
in {
'A',
'B',
'C',
'D',
'E',
'F',
'G',
'H',
'I',
'J',
'K',
'L',
'M',
'N',
'O',
'P',
'Q',
'R',
'S',
'T',
'U',
'V',
'W',
'X',
'Y',
'Z',
}
)
# special rules
if fname == 'JAMES':
code = 'J7'
elif fname == 'PAT':
code = 'P7'
else:
# A. Prefix treatment
if fname[:2] in {'GE', 'GI', 'GY'}:
fname = 'J' + fname[1:]
elif fname[:2] in {'CE', 'CI', 'CY'}:
fname = 'S' + fname[1:]
elif fname[:3] == 'CHR':
fname = 'K' + fname[1:]
elif fname[:1] == 'C' and fname[:2] != 'CH':
fname = 'K' + fname[1:]
if fname[:2] == 'KN':
fname = 'N' + fname[1:]
elif fname[:2] == 'PH':
fname = 'F' + fname[1:]
elif fname[:3] in {'WIE', 'WEI'}:
fname = 'V' + fname[1:]
if german and fname[:1] in {'W', 'M', 'Y', 'Z'}:
fname = {'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}[fname[0]] + fname[
1:
]
code = fname[:1]
# B. Soundex coding
# code for Y unspecified, but presumably is 0
_pshp_translation = dict(
zip(
(ord(_) for _ in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
'01230120022455012523010202',
)
)
fname = fname.translate(_pshp_translation)
fname = _delete_consecutive_repeats(fname)
code += fname[1:]
syl_ptr = code.find('0')
syl2_ptr = code[syl_ptr + 1 :].find('0')
if syl_ptr != -1 and syl2_ptr != -1 and syl2_ptr - syl_ptr > -1:
code = code[: syl_ptr + 2]
code = code.replace('0', '') # rule 1
if max_length != -1:
if len(code) < max_length:
code += '0' * (max_length - len(code))
else:
code = code[:max_length]
return code
if __name__ == '__main__':
import doctest
doctest.testmod()