# Source code for abydos.phonetic._double_metaphone
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.phonetic._double_metaphone.
Double Metaphone
"""
from deprecation import deprecated
from ._phonetic import _Phonetic
from .. import __version__
__all__ = ['DoubleMetaphone', 'double_metaphone']
class DoubleMetaphone(_Phonetic):
    """Double Metaphone.

    Based on Lawrence Philips' (Visual) C++ code from 1999
    :cite:`Philips:2000`.

    .. versionadded:: 0.3.6
    """

    def __init__(self, max_length=-1):
        """Initialize DoubleMetaphone instance.

        Parameters
        ----------
        max_length : int
            Maximum length of the returned Double Metaphone codes. The
            default of -1 leaves the codes untruncated; any other value is
            raised to a minimum of 4.


        .. versionadded:: 0.4.0

        """
        self._max_length = max_length
        # Require a max_length of at least 4
        if self._max_length != -1:
            self._max_length = max(4, max_length)

    def encode_alpha(self, word):
        """Return the alphabetic Double Metaphone code for a word.

        This is the result of :py:meth:`encode` with every ``0`` (the
        symbol used for the 'th' sound) replaced by ``Þ``.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        tuple
            The alphabetic Double Metaphone value(s)

        Examples
        --------
        >>> pe = DoubleMetaphone()
        >>> pe.encode_alpha('Christopher')
        ('KRSTFR', '')
        >>> pe.encode_alpha('Niall')
        ('NL', '')
        >>> pe.encode_alpha('Smith')
        ('SMÞ', 'XMT')
        >>> pe.encode_alpha('Schmidt')
        ('XMT', 'SMT')


        .. versionadded:: 0.4.0

        """
        return tuple(code.replace('0', 'Þ') for code in self.encode(word))

    def encode(self, word):
        """Return the Double Metaphone code for a word.

        The word is upper-cased and scanned left to right; each rule
        appends to a primary and a secondary code and advances the cursor
        past the letters it consumed. If a positive maximum length was
        configured, both codes are truncated to it. The secondary code is
        returned as an empty string when it equals the primary code.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        tuple
            The Double Metaphone value(s) as ``(primary, secondary)``

        Examples
        --------
        >>> pe = DoubleMetaphone()
        >>> pe.encode('Christopher')
        ('KRSTFR', '')
        >>> pe.encode('Niall')
        ('NL', '')
        >>> pe.encode('Smith')
        ('SM0', 'XMT')
        >>> pe.encode('Schmidt')
        ('XMT', 'SMT')


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        primary = ''
        secondary = ''

        def _metaph_add(pri, sec=''):
            """Return the codes extended by the supplied elements.

            Parameters
            ----------
            pri : str
                The element appended to the primary code
            sec : str
                The element appended to the secondary code; when empty,
                ``pri`` is appended instead, and a single space means
                "append nothing to the secondary code"

            Returns
            -------
            tuple
                The new (primary, secondary) pair

            """
            if not sec:
                sec = pri
            elif sec == ' ':
                sec = ''
            return primary + pri, secondary + sec

        def _is_vowel(pos):
            """Return True if the character at word[pos] is a vowel.

            Parameters
            ----------
            pos : int
                Position in the word (negative positions are never vowels)

            Returns
            -------
            bool
                True if the character is one of A, E, I, O, U, Y

            """
            return pos >= 0 and word[pos] in {'A', 'E', 'I', 'O', 'U', 'Y'}

        def _string_at(pos, slen, substrings):
            """Return True if word[pos:pos+slen] is in substrings.

            Parameters
            ----------
            pos : int
                Position in the word (negative positions never match)
            slen : int
                Substring length
            substrings : set
                Substrings to search

            Returns
            -------
            bool
                True if word[pos:pos+slen] is in substrings

            """
            if pos < 0:
                return False
            return word[pos : pos + slen] in substrings

        current = 0
        if len(word) < 1:
            return '', ''

        # Upper-case BEFORE measuring: case mapping can lengthen the string
        # (e.g. 'ß' becomes 'SS'), and every position test below indexes the
        # upper-cased text. Measuring first would cut the scan short.
        word = word.upper()
        length = len(word)
        last = length - 1

        # The word does not change inside the loop, so this test for a
        # Slavic- or Germanic-looking word is evaluated only once.
        slavo_germanic = 'W' in word or 'K' in word or 'CZ' in word

        # Pad the original string so that we can index beyond the edge of the
        # world
        word += ' '

        # Prefixes treated as "obviously Germanic" by several rules below.
        obvious_germanic = word.startswith(('VAN ', 'VON ', 'SCH'))

        # Skip these when at start of word
        if word[0:2] in {'GN', 'KN', 'PN', 'WR', 'PS'}:
            current += 1

        # Initial 'X' is pronounced 'Z' e.g. 'Xavier'
        if word[0] == 'X':
            primary, secondary = _metaph_add('S')  # 'Z' maps to 'S'
            current += 1

        # Main loop
        while current < length:
            char = word[current]
            # Always a valid index: current <= last and the word is padded.
            following = word[current + 1]

            if char in {'A', 'E', 'I', 'O', 'U', 'Y'}:
                if current == 0:
                    # All init vowels now map to 'A'
                    primary, secondary = _metaph_add('A')
                current += 1
                continue

            elif char == 'B':
                # "-mb", e.g", "dumb", already skipped over...
                primary, secondary = _metaph_add('P')
                if following == 'B':
                    current += 2
                else:
                    current += 1
                continue

            elif char == 'Ç':
                primary, secondary = _metaph_add('S')
                current += 1
                continue

            elif char == 'C':
                # Various Germanic
                if (
                    current > 1
                    and not _is_vowel(current - 2)
                    and _string_at(current - 1, 3, {'ACH'})
                    and word[current + 2] != 'I'
                    and (
                        word[current + 2] != 'E'
                        or _string_at(current - 2, 6, {'BACHER', 'MACHER'})
                    )
                ):
                    primary, secondary = _metaph_add('K')
                    current += 2
                    continue

                # Special case 'caesar'
                elif current == 0 and _string_at(current, 6, {'CAESAR'}):
                    primary, secondary = _metaph_add('S')
                    current += 2
                    continue

                # Italian 'chianti'
                elif _string_at(current, 4, {'CHIA'}):
                    primary, secondary = _metaph_add('K')
                    current += 2
                    continue

                elif _string_at(current, 2, {'CH'}):
                    # Find 'Michael'
                    if current > 0 and _string_at(current, 4, {'CHAE'}):
                        primary, secondary = _metaph_add('K', 'X')
                        current += 2
                        continue

                    # Greek roots e.g. 'chemistry', 'chorus'
                    elif (
                        current == 0
                        and (
                            _string_at(current + 1, 5, {'HARAC', 'HARIS'})
                            or _string_at(
                                current + 1, 3, {'HOR', 'HYM', 'HIA', 'HEM'}
                            )
                        )
                        and not _string_at(0, 5, {'CHORE'})
                    ):
                        primary, secondary = _metaph_add('K')
                        current += 2
                        continue

                    # Germanic, Greek, or otherwise 'ch' for 'kh' sound
                    elif (
                        obvious_germanic
                        # 'architect but not 'arch', 'orchestra', 'orchid'
                        or _string_at(
                            current - 2, 6, {'ORCHES', 'ARCHIT', 'ORCHID'}
                        )
                        or _string_at(current + 2, 1, {'T', 'S'})
                        or (
                            (
                                _string_at(
                                    current - 1, 1, {'A', 'O', 'U', 'E'}
                                )
                                or current == 0
                            )
                            # e.g., 'wachtler', 'wechsler', but not 'tichner'
                            and _string_at(
                                current + 2,
                                1,
                                {
                                    'L', 'R', 'N', 'M', 'B',
                                    'H', 'F', 'V', 'W', ' ',
                                },
                            )
                        )
                    ):
                        primary, secondary = _metaph_add('K')
                    else:
                        if current > 0:
                            if _string_at(0, 2, {'MC'}):
                                # e.g., "McHugh"
                                primary, secondary = _metaph_add('K')
                            else:
                                primary, secondary = _metaph_add('X', 'K')
                        else:
                            primary, secondary = _metaph_add('X')
                    current += 2
                    continue

                # e.g, 'czerny'
                elif _string_at(current, 2, {'CZ'}) and not _string_at(
                    current - 2, 4, {'WICZ'}
                ):
                    primary, secondary = _metaph_add('S', 'X')
                    current += 2
                    continue

                # e.g., 'focaccia'
                elif _string_at(current + 1, 3, {'CIA'}):
                    primary, secondary = _metaph_add('X')
                    current += 3

                # double 'C', but not if e.g. 'McClellan'
                elif _string_at(current, 2, {'CC'}) and not (
                    current == 1 and word[0] == 'M'
                ):
                    # 'bellocchio' but not 'bacchus'
                    if _string_at(
                        current + 2, 1, {'I', 'E', 'H'}
                    ) and not _string_at(current + 2, 2, {'HU'}):
                        # 'accident', 'accede' 'succeed'
                        if (
                            current == 1 and word[current - 1] == 'A'
                        ) or _string_at(current - 1, 5, {'UCCEE', 'UCCES'}):
                            primary, secondary = _metaph_add('KS')
                        # 'bacci', 'bertucci', other italian
                        else:
                            primary, secondary = _metaph_add('X')
                        current += 3
                        continue
                    else:  # Pierce's rule
                        primary, secondary = _metaph_add('K')
                        current += 2
                        continue

                elif _string_at(current, 2, {'CK', 'CG', 'CQ'}):
                    primary, secondary = _metaph_add('K')
                    current += 2
                    continue

                elif _string_at(current, 2, {'CI', 'CE', 'CY'}):
                    # Italian vs. English
                    if _string_at(current, 3, {'CIO', 'CIE', 'CIA'}):
                        primary, secondary = _metaph_add('S', 'X')
                    else:
                        primary, secondary = _metaph_add('S')
                    current += 2
                    continue

                # else
                else:
                    primary, secondary = _metaph_add('K')

                    # name sent in 'mac caffrey', 'mac gregor
                    if _string_at(current + 1, 2, {' C', ' Q', ' G'}):
                        current += 3
                    elif _string_at(
                        current + 1, 1, {'C', 'K', 'Q'}
                    ) and not _string_at(current + 1, 2, {'CE', 'CI'}):
                        current += 2
                    else:
                        current += 1
                    continue

            elif char == 'D':
                if _string_at(current, 2, {'DG'}):
                    if _string_at(current + 2, 1, {'I', 'E', 'Y'}):
                        # e.g. 'edge'
                        primary, secondary = _metaph_add('J')
                        current += 3
                        continue
                    else:
                        # e.g. 'edgar'
                        primary, secondary = _metaph_add('TK')
                        current += 2
                        continue

                elif _string_at(current, 2, {'DT', 'DD'}):
                    primary, secondary = _metaph_add('T')
                    current += 2
                    continue

                # else
                else:
                    primary, secondary = _metaph_add('T')
                    current += 1
                    continue

            elif char == 'F':
                if following == 'F':
                    current += 2
                else:
                    current += 1
                primary, secondary = _metaph_add('F')
                continue

            elif char == 'G':
                if following == 'H':
                    if current > 0 and not _is_vowel(current - 1):
                        primary, secondary = _metaph_add('K')
                        current += 2
                        continue

                    # 'ghislane', ghiradelli
                    elif current == 0:
                        if word[current + 2] == 'I':
                            primary, secondary = _metaph_add('J')
                        else:
                            primary, secondary = _metaph_add('K')
                        current += 2
                        continue

                    # Parker's rule (with some further refinements) -
                    # e.g., 'hugh'
                    elif (
                        (
                            current > 1
                            and _string_at(current - 2, 1, {'B', 'H', 'D'})
                        )
                        # e.g., 'bough'
                        or (
                            current > 2
                            and _string_at(current - 3, 1, {'B', 'H', 'D'})
                        )
                        # e.g., 'broughton'
                        or (
                            current > 3
                            and _string_at(current - 4, 1, {'B', 'H'})
                        )
                    ):
                        current += 2
                        continue
                    else:
                        # e.g. 'laugh', 'McLaughlin', 'cough',
                        #      'gough', 'rough', 'tough'
                        if (
                            current > 2
                            and word[current - 1] == 'U'
                            and _string_at(
                                current - 3, 1, {'C', 'G', 'L', 'R', 'T'}
                            )
                        ):
                            primary, secondary = _metaph_add('F')
                        elif current > 0 and word[current - 1] != 'I':
                            primary, secondary = _metaph_add('K')
                        current += 2
                        continue

                elif following == 'N':
                    if current == 1 and _is_vowel(0) and not slavo_germanic:
                        primary, secondary = _metaph_add('KN', 'N')
                    # not e.g. 'cagney'
                    elif (
                        not _string_at(current + 2, 2, {'EY'})
                        and following != 'Y'
                        and not slavo_germanic
                    ):
                        primary, secondary = _metaph_add('N', 'KN')
                    else:
                        primary, secondary = _metaph_add('KN')
                    current += 2
                    continue

                # 'tagliaro'
                elif (
                    _string_at(current + 1, 2, {'LI'})
                    and not slavo_germanic
                ):
                    primary, secondary = _metaph_add('KL', 'L')
                    current += 2
                    continue

                # -ges-, -gep-, -gel-, -gie- at beginning
                elif current == 0 and (
                    following == 'Y'
                    or _string_at(
                        current + 1,
                        2,
                        {
                            'ES', 'EP', 'EB', 'EL', 'EY', 'IB',
                            'IL', 'IN', 'IE', 'EI', 'ER',
                        },
                    )
                ):
                    primary, secondary = _metaph_add('K', 'J')
                    current += 2
                    continue

                # -ger-,  -gy-
                elif (
                    (_string_at(current + 1, 2, {'ER'}) or following == 'Y')
                    and not _string_at(0, 6, {'DANGER', 'RANGER', 'MANGER'})
                    and not _string_at(current - 1, 1, {'E', 'I'})
                    and not _string_at(current - 1, 3, {'RGY', 'OGY'})
                ):
                    primary, secondary = _metaph_add('K', 'J')
                    current += 2
                    continue

                # italian e.g, 'biaggi'
                elif _string_at(
                    current + 1, 1, {'E', 'I', 'Y'}
                ) or _string_at(current - 1, 4, {'AGGI', 'OGGI'}):
                    # obvious germanic
                    if obvious_germanic or _string_at(
                        current + 1, 2, {'ET'}
                    ):
                        primary, secondary = _metaph_add('K')
                    elif _string_at(current + 1, 4, {'IER '}):
                        primary, secondary = _metaph_add('J')
                    else:
                        primary, secondary = _metaph_add('J', 'K')
                    current += 2
                    continue

                else:
                    if following == 'G':
                        current += 2
                    else:
                        current += 1
                    primary, secondary = _metaph_add('K')
                    continue

            elif char == 'H':
                # only keep if first & before vowel or btw. 2 vowels
                if (current == 0 or _is_vowel(current - 1)) and _is_vowel(
                    current + 1
                ):
                    primary, secondary = _metaph_add('H')
                    current += 2
                else:  # also takes care of 'HH'
                    current += 1
                continue

            elif char == 'J':
                # obvious spanish, 'jose', 'san jacinto'
                if _string_at(current, 4, {'JOSE'}) or _string_at(
                    0, 4, {'SAN '}
                ):
                    if (
                        current == 0 and word[current + 4] == ' '
                    ) or _string_at(0, 4, {'SAN '}):
                        primary, secondary = _metaph_add('H')
                    else:
                        primary, secondary = _metaph_add('J', 'H')
                    current += 1
                    continue

                elif current == 0 and not _string_at(current, 4, {'JOSE'}):
                    # Yankelovich/Jankelowicz
                    primary, secondary = _metaph_add('J', 'A')
                # Spanish pron. of e.g. 'bajador'
                elif (
                    _is_vowel(current - 1)
                    and not slavo_germanic
                    and (following == 'A' or following == 'O')
                ):
                    primary, secondary = _metaph_add('J', 'H')
                elif current == last:
                    # A lone space leaves the secondary code untouched.
                    primary, secondary = _metaph_add('J', ' ')
                elif not _string_at(
                    current + 1, 1, {'L', 'T', 'K', 'S', 'N', 'M', 'B', 'Z'}
                ) and not _string_at(current - 1, 1, {'S', 'K', 'L'}):
                    primary, secondary = _metaph_add('J')

                if following == 'J':  # it could happen!
                    current += 2
                else:
                    current += 1
                continue

            elif char == 'K':
                if following == 'K':
                    current += 2
                else:
                    current += 1
                primary, secondary = _metaph_add('K')
                continue

            elif char == 'L':
                if following == 'L':
                    # Spanish e.g. 'cabrillo', 'gallegos'
                    if (
                        current == (length - 3)
                        and _string_at(
                            current - 1, 4, {'ILLO', 'ILLA', 'ALLE'}
                        )
                    ) or (
                        (
                            _string_at(last - 1, 2, {'AS', 'OS'})
                            or _string_at(last, 1, {'A', 'O'})
                        )
                        and _string_at(current - 1, 4, {'ALLE'})
                    ):
                        primary, secondary = _metaph_add('L', ' ')
                        current += 2
                        continue
                    current += 2
                else:
                    current += 1
                primary, secondary = _metaph_add('L')
                continue

            elif char == 'M':
                if (
                    _string_at(current - 1, 3, {'UMB'})
                    and (
                        (current + 1) == last
                        or _string_at(current + 2, 2, {'ER'})
                    )
                ) or following == 'M':
                    # 'dumb', 'thumb'
                    current += 2
                else:
                    current += 1
                primary, secondary = _metaph_add('M')
                continue

            elif char == 'N':
                if following == 'N':
                    current += 2
                else:
                    current += 1
                primary, secondary = _metaph_add('N')
                continue

            elif char == 'Ñ':
                current += 1
                primary, secondary = _metaph_add('N')
                continue

            elif char == 'P':
                if following == 'H':
                    primary, secondary = _metaph_add('F')
                    current += 2
                    continue

                # also account for "campbell", "raspberry"
                elif _string_at(current + 1, 1, {'P', 'B'}):
                    current += 2
                else:
                    current += 1
                primary, secondary = _metaph_add('P')
                continue

            elif char == 'Q':
                if following == 'Q':
                    current += 2
                else:
                    current += 1
                primary, secondary = _metaph_add('K')
                continue

            elif char == 'R':
                # french e.g. 'rogier', but exclude 'hochmeier'
                if (
                    current == last
                    and not slavo_germanic
                    and _string_at(current - 2, 2, {'IE'})
                    and not _string_at(current - 4, 2, {'ME', 'MA'})
                ):
                    primary, secondary = _metaph_add('', 'R')
                else:
                    primary, secondary = _metaph_add('R')

                if following == 'R':
                    current += 2
                else:
                    current += 1
                continue

            elif char == 'S':
                # special cases 'island', 'isle', 'carlisle', 'carlysle'
                if _string_at(current - 1, 3, {'ISL', 'YSL'}):
                    current += 1
                    continue

                # special case 'sugar-'
                elif current == 0 and _string_at(current, 5, {'SUGAR'}):
                    primary, secondary = _metaph_add('X', 'S')
                    current += 1
                    continue

                elif _string_at(current, 2, {'SH'}):
                    # Germanic
                    if _string_at(
                        current + 1, 4, {'HEIM', 'HOEK', 'HOLM', 'HOLZ'}
                    ):
                        primary, secondary = _metaph_add('S')
                    else:
                        primary, secondary = _metaph_add('X')
                    current += 2
                    continue

                # Italian & Armenian
                elif _string_at(current, 3, {'SIO', 'SIA'}) or _string_at(
                    current, 4, {'SIAN'}
                ):
                    if not slavo_germanic:
                        primary, secondary = _metaph_add('S', 'X')
                    else:
                        primary, secondary = _metaph_add('S')
                    current += 3
                    continue

                # German & anglicisations, e.g. 'smith' match 'schmidt',
                #                               'snider' match 'schneider'
                # also, -sz- in Slavic language although in Hungarian it is
                #       pronounced 's'
                elif (
                    current == 0
                    and _string_at(current + 1, 1, {'M', 'N', 'L', 'W'})
                ) or _string_at(current + 1, 1, {'Z'}):
                    primary, secondary = _metaph_add('S', 'X')
                    if _string_at(current + 1, 1, {'Z'}):
                        current += 2
                    else:
                        current += 1
                    continue

                elif _string_at(current, 2, {'SC'}):
                    # Schlesinger's rule
                    if word[current + 2] == 'H':
                        # dutch origin, e.g. 'school', 'schooner'
                        if _string_at(
                            current + 3,
                            2,
                            {'OO', 'ER', 'EN', 'UY', 'ED', 'EM'},
                        ):
                            # 'schermerhorn', 'schenker'
                            if _string_at(current + 3, 2, {'ER', 'EN'}):
                                primary, secondary = _metaph_add('X', 'SK')
                            else:
                                primary, secondary = _metaph_add('SK')
                            current += 3
                            continue
                        else:
                            if (
                                current == 0
                                and not _is_vowel(3)
                                and word[3] != 'W'
                            ):
                                primary, secondary = _metaph_add('X', 'S')
                            else:
                                primary, secondary = _metaph_add('X')
                            current += 3
                            continue

                    elif _string_at(current + 2, 1, {'I', 'E', 'Y'}):
                        primary, secondary = _metaph_add('S')
                        current += 3
                        continue

                    # else
                    else:
                        primary, secondary = _metaph_add('SK')
                        current += 3
                        continue

                else:
                    # french e.g. 'resnais', 'artois'
                    if current == last and _string_at(
                        current - 2, 2, {'AI', 'OI'}
                    ):
                        primary, secondary = _metaph_add('', 'S')
                    else:
                        primary, secondary = _metaph_add('S')

                    if _string_at(current + 1, 1, {'S', 'Z'}):
                        current += 2
                    else:
                        current += 1
                    continue

            elif char == 'T':
                if _string_at(current, 4, {'TION'}):
                    primary, secondary = _metaph_add('X')
                    current += 3
                    continue

                elif _string_at(current, 3, {'TIA', 'TCH'}):
                    primary, secondary = _metaph_add('X')
                    current += 3
                    continue

                elif _string_at(current, 2, {'TH'}) or _string_at(
                    current, 3, {'TTH'}
                ):
                    # special case 'thomas', 'thames' or germanic
                    if (
                        _string_at(current + 2, 2, {'OM', 'AM'})
                        or obvious_germanic
                    ):
                        primary, secondary = _metaph_add('T')
                    else:
                        primary, secondary = _metaph_add('0', 'T')
                    current += 2
                    continue

                elif _string_at(current + 1, 1, {'T', 'D'}):
                    current += 2
                else:
                    current += 1
                primary, secondary = _metaph_add('T')
                continue

            elif char == 'V':
                if following == 'V':
                    current += 2
                else:
                    current += 1
                primary, secondary = _metaph_add('F')
                continue

            elif char == 'W':
                # can also be in middle of word
                if _string_at(current, 2, {'WR'}):
                    primary, secondary = _metaph_add('R')
                    current += 2
                    continue
                elif current == 0 and (
                    _is_vowel(current + 1) or _string_at(current, 2, {'WH'})
                ):
                    # Wasserman should match Vasserman
                    if _is_vowel(current + 1):
                        primary, secondary = _metaph_add('A', 'F')
                    else:
                        # need Uomo to match Womo
                        primary, secondary = _metaph_add('A')

                # Deliberately a fresh `if`: an initial W handled just above
                # still passes through the checks below to advance the cursor.
                # Arnow should match Arnoff
                if (
                    (current == last and _is_vowel(current - 1))
                    or _string_at(
                        current - 1, 5, {'EWSKI', 'EWSKY', 'OWSKI', 'OWSKY'}
                    )
                    or _string_at(0, 3, {'SCH'})
                ):
                    primary, secondary = _metaph_add('', 'F')
                    current += 1
                    continue
                # Polish e.g. 'filipowicz'
                elif _string_at(current, 4, {'WICZ', 'WITZ'}):
                    primary, secondary = _metaph_add('TS', 'FX')
                    current += 4
                    continue
                # else skip it
                else:
                    current += 1
                    continue

            elif char == 'X':
                # French e.g. breaux
                if not (
                    current == last
                    and (
                        _string_at(current - 3, 3, {'IAU', 'EAU'})
                        or _string_at(current - 2, 2, {'AU', 'OU'})
                    )
                ):
                    primary, secondary = _metaph_add('KS')

                if _string_at(current + 1, 1, {'C', 'X'}):
                    current += 2
                else:
                    current += 1
                continue

            elif char == 'Z':
                # Chinese Pinyin e.g. 'zhao'
                if following == 'H':
                    primary, secondary = _metaph_add('J')
                    current += 2
                    continue
                elif _string_at(current + 1, 2, {'ZO', 'ZI', 'ZA'}) or (
                    slavo_germanic
                    and current > 0
                    and word[current - 1] != 'T'
                ):
                    primary, secondary = _metaph_add('S', 'TS')
                else:
                    primary, secondary = _metaph_add('S')

                if following == 'Z':
                    current += 2
                else:
                    current += 1
                continue

            else:
                # Any other character contributes nothing to the code.
                current += 1

        # Truncate only when a positive maximum length was configured.
        if self._max_length > 0:
            primary = primary[: self._max_length]
            secondary = secondary[: self._max_length]
        # An identical secondary code carries no information; blank it.
        if primary == secondary:
            secondary = ''

        return primary, secondary
@deprecated(
    deprecated_in='0.4.0',
    removed_in='0.6.0',
    current_version=__version__,
    details='Use the DoubleMetaphone.encode method instead.',
)
def double_metaphone(word, max_length=-1):
    """Return the Double Metaphone code for a word.

    Deprecated functional shortcut: builds a :py:class:`DoubleMetaphone`
    with the given ``max_length`` and returns the result of its
    :py:meth:`DoubleMetaphone.encode` method for ``word``.

    Parameters
    ----------
    word : str
        The word to transform
    max_length : int
        The maximum length of the returned Double Metaphone codes (defaults to
        unlimited, but in Philips' original implementation this was 4)

    Returns
    -------
    tuple
        The Double Metaphone value(s)

    Examples
    --------
    >>> double_metaphone('Christopher')
    ('KRSTFR', '')
    >>> double_metaphone('Niall')
    ('NL', '')
    >>> double_metaphone('Smith')
    ('SM0', 'XMT')
    >>> double_metaphone('Schmidt')
    ('XMT', 'SMT')

    .. versionadded:: 0.1.0

    """
    encoder = DoubleMetaphone(max_length)
    return encoder.encode(word)
if __name__ == '__main__':
    # When executed as a script, run the doctest examples embedded in this
    # module's docstrings.
    import doctest

    doctest.testmod()