# Source code for abydos.phonetic._alpha_sis

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._alpha_sis.

The phonetic._alpha_sis module implements IBM's Alpha Search Inquiry System
coding.
"""

from __future__ import unicode_literals

from unicodedata import normalize as unicode_normalize

from six import text_type
from six.moves import range

__all__ = ['alpha_sis']


def alpha_sis(word, max_length=14):
    """Return the IBM Alpha Search Inquiry System code for a word.

    The Alpha Search Inquiry System code is defined in :cite:`IBM:1973`.
    This implementation is based on the description in :cite:`Moore:1977`.

    A collection is necessary since there can be multiple values for a
    single word. But the collection must be ordered since the first value
    is the primary coding.

    The word is upper-cased, NFKD-normalized, and stripped of everything
    except the letters A-Z before encoding. Every returned code is padded
    with ``'0'`` (or truncated) to exactly ``max_length`` characters.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 14);
        values are clamped to the range [4, 64], and -1 selects the maximum
        length of 64
    :returns: the Alpha SIS value
    :rtype: tuple

    >>> alpha_sis('Christopher')
    ('06401840000000', '07040184000000', '04018400000000')
    >>> alpha_sis('Niall')
    ('02500000000000',)
    >>> alpha_sis('Smith')
    ('03100000000000',)
    >>> alpha_sis('Schmidt')
    ('06310000000000',)
    """
    # Codes for substrings that receive special treatment at the start of
    # the word only.
    _alpha_sis_initials = {
        'GF': '08',
        'GM': '03',
        'GN': '02',
        'KN': '02',
        'PF': '08',
        'PN': '02',
        'PS': '00',
        'WR': '04',
        'A': '1',
        'E': '1',
        'H': '2',
        'I': '1',
        'J': '3',
        'O': '1',
        'U': '1',
        'W': '4',
        'Y': '5',
    }
    # Match order for the initial table: digraphs before single letters.
    _alpha_sis_initials_order = (
        'GF',
        'GM',
        'GN',
        'KN',
        'PF',
        'PN',
        'PS',
        'WR',
        'A',
        'E',
        'H',
        'I',
        'J',
        'O',
        'U',
        'W',
        'Y',
    )
    # Codes for the rest of the word. A tuple value means the substring has
    # several alternative codes; each alternative forks the result set.
    _alpha_sis_basic = {
        'SCH': '6',
        'CZ': ('70', '6', '0'),
        'CH': ('6', '70', '0'),
        'CK': ('7', '6'),
        'DS': ('0', '10'),
        'DZ': ('0', '10'),
        'TS': ('0', '10'),
        'TZ': ('0', '10'),
        'CI': '0',
        'CY': '0',
        'CE': '0',
        'SH': '6',
        'DG': '7',
        'PH': '8',
        'C': ('7', '6'),
        'K': ('7', '6'),
        'Z': '0',
        'S': '0',
        'D': '1',
        'T': '1',
        'N': '2',
        'M': '3',
        'R': '4',
        'L': '5',
        'J': '6',
        'G': '7',
        'Q': '7',
        'X': '7',
        'F': '8',
        'V': '8',
        'B': '9',
        'P': '9',
    }
    # Match order for the basic table: longest substrings first. Each key
    # is listed exactly once; a repeated key could never match because the
    # first occurrence always wins.
    _alpha_sis_basic_order = (
        'SCH',
        'CZ',
        'CH',
        'CK',
        'DS',
        'DZ',
        'TS',
        'TZ',
        'CI',
        'CY',
        'CE',
        'SH',
        'DG',
        'PH',
        'C',
        'K',
        'Z',
        'S',
        'D',
        'T',
        'N',
        'M',
        'R',
        'L',
        'J',
        'G',
        'Q',
        'X',
        'F',
        'V',
        'B',
        'P',
    )
    _uc_letters = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    # Normalize: upper-case, decompose (NFKD), expand eszett, and keep only
    # A-Z.
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in _uc_letters)

    # Clamp max_length to [4, 64]
    if max_length != -1:
        max_length = min(max(4, max_length), 64)
    else:
        max_length = 64

    alpha = ['']
    pos = 0

    # Do special processing for initial substrings
    for k in _alpha_sis_initials_order:
        if word.startswith(k):
            alpha[0] += _alpha_sis_initials[k]
            pos += len(k)
            break

    # Add a '0' if alpha is still empty
    if not alpha[0]:
        alpha[0] += '0'

    # Whether or not any special initial codes were encoded, iterate
    # through the length of the word in the main encoding loop
    while pos < len(word):
        for k in _alpha_sis_basic_order:
            # Test in place instead of slicing the tail of the word for
            # every candidate key.
            if word.startswith(k, pos):
                codes = _alpha_sis_basic[k]
                if isinstance(codes, tuple):
                    # Fork: alternatives vary slowest, so all existing
                    # prefixes with the first alternative come first and
                    # the primary coding stays at index 0.
                    alpha = [
                        prefix + code for code in codes for prefix in alpha
                    ]
                else:
                    alpha = [prefix + codes for prefix in alpha]
                pos += len(k)
                break
        else:
            # No table entry matched (e.g. a vowel): emit a placeholder so
            # that equal codes separated by it are not merged as doublets.
            alpha = [prefix + '_' for prefix in alpha]
            pos += 1

    # Trim doublets and placeholders
    trimmed = []
    for code in alpha:
        chars = list(code)
        idx = 1
        while idx < len(chars):
            # NOTE: the index advances even after a deletion, so a run of
            # three identical characters is reduced to two, not one. This
            # is kept as-is so that emitted codes do not change.
            if chars[idx] == chars[idx - 1]:
                del chars[idx]
            idx += 1
        trimmed.append(''.join(chars).replace('_', ''))

    # Pad with zeros / truncate to max_length and return a tuple
    zero_pad = '0' * max_length
    return tuple((code + zero_pad)[:max_length] for code in trimmed)
if __name__ == '__main__':
    # When executed as a script, run the doctest examples embedded in this
    # module's docstrings.
    import doctest

    doctest.testmod()