# -*- coding: utf-8 -*-
# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.phonetic._alpha_sis.
The phonetic._alpha_sis module implements IBM's Alpha Search Inquiry System
coding.
"""
from __future__ import unicode_literals
from unicodedata import normalize as unicode_normalize
from six import text_type
from six.moves import range
__all__ = ['alpha_sis']
# Codes for special word-initial substrings, listed in matching priority
# order: two-letter clusters are tried before single letters.
_ALPHA_SIS_INITIALS = (
    ('GF', '08'),
    ('GM', '03'),
    ('GN', '02'),
    ('KN', '02'),
    ('PF', '08'),
    ('PN', '02'),
    ('PS', '00'),
    ('WR', '04'),
    ('A', '1'),
    ('E', '1'),
    ('H', '2'),
    ('I', '1'),
    ('J', '3'),
    ('O', '1'),
    ('U', '1'),
    ('W', '4'),
    ('Y', '5'),
)

# Codes for substrings in the remainder of the word, listed in matching
# priority order (longer substrings first).  Each substring maps to a tuple
# of alternative codes; every alternative forks the set of candidate codes,
# and the first alternative yields the primary coding.
_ALPHA_SIS_BASIC = (
    ('SCH', ('6',)),
    ('CZ', ('70', '6', '0')),
    ('CH', ('6', '70', '0')),
    ('CK', ('7', '6')),
    ('DS', ('0', '10')),
    ('DZ', ('0', '10')),
    ('TS', ('0', '10')),
    ('TZ', ('0', '10')),
    ('CI', ('0',)),
    ('CY', ('0',)),
    ('CE', ('0',)),
    ('SH', ('6',)),
    ('DG', ('7',)),
    ('PH', ('8',)),
    ('C', ('7', '6')),
    ('K', ('7', '6')),
    ('Z', ('0',)),
    ('S', ('0',)),
    ('D', ('1',)),
    ('T', ('1',)),
    ('N', ('2',)),
    ('M', ('3',)),
    ('R', ('4',)),
    ('L', ('5',)),
    ('J', ('6',)),
    ('G', ('7',)),
    ('Q', ('7',)),
    ('X', ('7',)),
    ('F', ('8',)),
    ('V', ('8',)),
    ('B', ('9',)),
    ('P', ('9',)),
)

# The only characters that survive input normalisation.
_ALPHA_SIS_LETTERS = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')


def _alpha_sis_trim_doublets(code):
    """Remove the second symbol of each adjacent identical pair in a code.

    The scan moves strictly left to right and always advances after
    inspecting a position, so a run of three or more identical symbols is
    shortened but not collapsed to a single symbol (``'0111'`` becomes
    ``'011'``).
    NOTE(review): this mirrors the long-standing behaviour; confirm against
    the Alpha SIS specification whether longer runs should collapse fully.

    :param str code: an untrimmed code, possibly containing ``'_'``
        placeholders
    :returns: the code with doublets trimmed
    :rtype: str
    """
    pos = 1
    while pos < len(code):
        if code[pos] == code[pos - 1]:
            code = code[:pos] + code[pos + 1:]
        pos += 1
    return code


def alpha_sis(word, max_length=14):
    """Return the IBM Alpha Search Inquiry System code for a word.

    The Alpha Search Inquiry System code is defined in :cite:`IBM:1973`.
    This implementation is based on the description in :cite:`Moore:1977`.

    A collection is necessary since there can be multiple values for a
    single word. But the collection must be ordered since the first value
    is the primary coding.

    :param str word: the word to transform
    :param int max_length: the length of each code returned (defaults to
        14); values are clamped to the range [4, 64], and -1 selects the
        maximum of 64
    :returns: the Alpha SIS value(s), each zero-padded or truncated to
        exactly ``max_length`` characters
    :rtype: tuple

    >>> alpha_sis('Christopher')
    ('06401840000000', '07040184000000', '04018400000000')
    >>> alpha_sis('Niall')
    ('02500000000000',)
    >>> alpha_sis('Smith')
    ('03100000000000',)
    >>> alpha_sis('Schmidt')
    ('06310000000000',)
    """
    # Uppercase and decompose (NFKD) so base letters separate from their
    # combining marks, then discard everything that is not A-Z.
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in _ALPHA_SIS_LETTERS)

    # Clamp max_length to [4, 64]
    if max_length != -1:
        max_length = min(max(4, max_length), 64)
    else:
        max_length = 64

    # Do special processing for initial substrings; a word with no special
    # initial starts its code with '0' and is scanned from its first letter.
    codes = ['0']
    pos = 0
    for prefix, code in _ALPHA_SIS_INITIALS:
        if word.startswith(prefix):
            codes = [code]
            pos = len(prefix)
            break

    # Main encoding loop over the remainder of the word.
    while pos < len(word):
        for substring, alternatives in _ALPHA_SIS_BASIC:
            if word.startswith(substring, pos):
                # Fork every candidate once per alternative; iterating
                # alternatives in the outer position keeps all primary
                # codings ahead of the secondary ones.
                codes = [c + alt for alt in alternatives for c in codes]
                pos += len(substring)
                break
        else:
            # Uncoded letter: emit a placeholder so that identical codes on
            # either side of it are not mistaken for a doublet.
            codes = [c + '_' for c in codes]
            pos += 1

    # Trim doublets, drop placeholders, then pad/truncate to max_length.
    padding = '0' * max_length
    return tuple(
        (_alpha_sis_trim_doublets(c).replace('_', '') + padding)[:max_length]
        for c in codes
    )
# Run this module's doctests when it is executed directly as a script.
if __name__ == '__main__':
    from doctest import testmod

    testmod()