# Source code for abydos.phonetic._spanish_metaphone

# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._spanish_metaphone.

Spanish Metaphone
"""

from unicodedata import normalize as unicode_normalize

from deprecation import deprecated

from ._phonetic import _Phonetic
from .. import __version__

__all__ = ['SpanishMetaphone', 'spanish_metaphone']


class SpanishMetaphone(_Phonetic):
    """Spanish Metaphone.

    This is a quick rewrite of the Spanish Metaphone Algorithm, as presented at
    https://github.com/amsqr/Spanish-Metaphone and discussed in
    :cite:`Mosquera:2012`.

    Modified version based on :cite:`delPilarAngeles:2016`.

    .. versionadded:: 0.3.6
    """

    def __init__(self, max_length=6, modified=False):
        """Initialize SpanishMetaphone instance.

        Parameters
        ----------
        max_length : int
            The maximum length of the code returned (defaults to 6)
        modified : bool
            Set to True to use del Pilar Angeles & Bailón-Miguel's modified
            version of the algorithm


        .. versionadded:: 0.4.0

        """
        self._max_length = max_length
        self._modified = modified

    def encode(self, word):
        """Return the Spanish Metaphone of a word.

        The word is upper-cased and NFC-normalized, a fixed sequence of
        spelling replacements is applied, and the result is then scanned left
        to right. A vowel is kept only when it is the first character (or
        when it directly follows an 'H'); consonants are mapped according to
        the rules below. The returned code is never longer than
        ``max_length`` characters.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The Spanish Metaphone code

        Examples
        --------
        >>> pe = SpanishMetaphone()
        >>> pe.encode('Perez')
        'PRZ'
        >>> pe.encode('Martinez')
        'MRTNZ'
        >>> pe.encode('Gutierrez')
        'GTRRZ'
        >>> pe.encode('Santiago')
        'SNTG'
        >>> pe.encode('Nicolás')
        'NKLS'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        vowels = {'A', 'E', 'I', 'O', 'U'}
        # consonants copied to the key unchanged; a doubled one is emitted
        # only once
        plain_consonants = {
            'D',
            'F',
            'J',
            'K',
            'M',
            'N',
            'P',
            'T',
            'V',
            'L',
            'Y',
        }
        # consonants with an unconditional one-to-one mapping
        fixed_codes = {'W': 'U', 'R': 'R', 'Z': 'Z'}

        def _is_vowel(pos):
            """Return True if the character at word[pos] is a vowel.

            Parameters
            ----------
            pos : int
                Position to check for a vowel

            Returns
            -------
            bool
                True if word[pos] is a vowel


            .. versionadded:: 0.3.0

            """
            return pos < len(word) and word[pos] in vowels

        # NFC composes base letter + combining accent into the single
        # code points that the replacements below look for
        word = unicode_normalize('NFC', word.upper())
        meta_key = ''
        pos = 0

        # do some replacements for the modified version
        if self._modified:
            word = word.replace('MB', 'NB')
            word = word.replace('MP', 'NP')
            word = word.replace('BS', 'S')
            if word[:2] == 'PS':
                word = word[1:]

        # simple replacements; applied strictly in this order, because later
        # rules see the output of earlier ones (e.g. 'GÜ' before 'Ü')
        for old, new in (
            ('Á', 'A'),
            ('CH', 'X'),
            ('Ç', 'S'),
            ('É', 'E'),
            ('Í', 'I'),
            ('Ó', 'O'),
            ('Ú', 'U'),
            ('Ñ', 'NY'),
            ('GÜ', 'W'),
            ('Ü', 'U'),
            ('B', 'V'),
            ('LL', 'Y'),
        ):
            word = word.replace(old, new)

        while pos < len(word) and len(meta_key) < self._max_length:
            # get the current character and (possibly empty) lookahead
            current_char = word[pos]
            next_char = word[pos + 1 : pos + 2]

            # if a vowel in pos 0, add to key
            if pos == 0 and _is_vowel(pos):
                meta_key += current_char
                pos += 1

            # simple consonants (unmutated)
            elif current_char in plain_consonants:
                meta_key += current_char
                # skip doubled consonants
                pos += 2 if next_char == current_char else 1

            elif current_char == 'C':
                # special case 'acción', 'reacción',etc.
                if next_char == 'C':
                    meta_key += 'X'
                    pos += 2
                # special case 'cesar', 'cien', 'cid', 'conciencia'
                elif next_char in {'E', 'I'}:
                    meta_key += 'Z'
                    pos += 2
                # base case
                else:
                    meta_key += 'K'
                    pos += 1

            elif current_char == 'G':
                # special case 'gente', 'ecologia',etc
                if next_char in {'E', 'I'}:
                    meta_key += 'J'
                    pos += 2
                # base case
                else:
                    meta_key += 'G'
                    pos += 1

            elif current_char == 'H':
                # since the letter 'H' is silent in Spanish,
                # set the meta key to the vowel after the letter 'H'
                if _is_vowel(pos + 1):
                    meta_key += word[pos + 1]
                    pos += 2
                else:
                    meta_key += 'H'
                    pos += 1

            elif current_char == 'Q':
                # 'QU' is consumed as a unit; either way the code is 'K'
                pos += 2 if next_char == 'U' else 1
                meta_key += 'K'

            elif current_char in fixed_codes:
                meta_key += fixed_codes[current_char]
                pos += 1

            elif current_char == 'S':
                # word-initial 'S' not followed by a vowel is coded 'ES'
                if pos == 0 and not _is_vowel(pos + 1):
                    meta_key += 'ES'
                else:
                    meta_key += 'S'
                pos += 1

            elif current_char == 'X':
                # word-initial 'X' before a non-vowel is coded 'EX'
                # (unlike 'S', a lone 'X' stays 'X')
                if len(word) > 1 and pos == 0 and not _is_vowel(pos + 1):
                    meta_key += 'EX'
                else:
                    meta_key += 'X'
                pos += 1

            # anything else (non-initial vowels, unknown symbols) is skipped
            else:
                pos += 1

        # Final change from S to Z in modified version
        if self._modified:
            meta_key = meta_key.replace('S', 'Z')

        # The two-character codes 'ES'/'EX' could otherwise overshoot a
        # max_length of 1, so enforce the limit on the finished key
        return meta_key[: self._max_length]
@deprecated(
    deprecated_in='0.4.0',
    removed_in='0.6.0',
    current_version=__version__,
    details='Use the SpanishMetaphone.encode method instead.',
)
def spanish_metaphone(word, max_length=6, modified=False):
    """Return the Spanish Metaphone of a word.

    This is a wrapper for :py:meth:`SpanishMetaphone.encode`: it builds a
    one-off encoder from the given options and encodes ``word`` with it.

    Parameters
    ----------
    word : str
        The word to transform
    max_length : int
        The length of the code returned (defaults to 6)
    modified : bool
        Set to True to use del Pilar Angeles & Bailón-Miguel's modified
        version of the algorithm

    Returns
    -------
    str
        The Spanish Metaphone code

    Examples
    --------
    >>> spanish_metaphone('Perez')
    'PRZ'
    >>> spanish_metaphone('Martinez')
    'MRTNZ'
    >>> spanish_metaphone('Gutierrez')
    'GTRRZ'
    >>> spanish_metaphone('Santiago')
    'SNTG'
    >>> spanish_metaphone('Nicolás')
    'NKLS'

    .. versionadded:: 0.3.0

    """
    encoder = SpanishMetaphone(max_length, modified)
    return encoder.encode(word)
# Run the module's docstring examples as doctests when executed as a script.
if __name__ == '__main__':
    from doctest import testmod

    testmod()