# -*- coding: utf-8 -*-
# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.phonetic._spfc.

The phonetic._spfc module implements the Standardized Phonetic Frequency Code
(SPFC) algorithm.
"""
from __future__ import unicode_literals
from unicodedata import normalize as unicode_normalize
from six import text_type
from six.moves import range
from ._util import _delete_consecutive_repeats
__all__ = ['spfc']
def spfc(word):
    """Return the Standardized Phonetic Frequency Code (SPFC) of a word.

    Standardized Phonetic Frequency Code is roughly Soundex-like.
    This implementation is based on page 19-21 of :cite:`Moore:1977`.

    The code is built from a first name and a last name. A string argument
    is split once on its first period or, if it contains no period, on its
    first space; a two-item tuple/list is taken as (first name, last name).

    :param word: the name to transform, either a string with a period or
        space dividing the first and last names, or a tuple/list holding
        the first and last names
    :returns: the SPFC value, or an empty string if ``word`` is empty
    :rtype: str
    :raises AttributeError: if ``word`` cannot be divided into exactly two
        names

    >>> spfc('Christopher Smith')
    '01160'
    >>> spfc('Christopher Schmidt')
    '01160'
    >>> spfc('Niall Smith')
    '01660'
    >>> spfc('Niall Schmidt')
    '01660'

    >>> spfc('L.Smith')
    '01960'
    >>> spfc('R.Miller')
    '65490'

    >>> spfc(('L', 'Smith'))
    '01960'
    >>> spfc(('R', 'Miller'))
    '65490'
    """
    # Translation tables keyed by code point, the form str.translate expects.
    # NOTE: PF1 and PF2 have no entry for 'Y'; str.translate leaves unmapped
    # characters untouched, so a 'Y' in those positions appears in the code
    # as the letter itself.
    _pf1 = dict(
        zip(
            (ord(_) for _ in 'SZCKQVFPUWABLORDHIEMNXGJT'),
            '0011112222334445556666777',
        )
    )
    _pf2 = dict(
        zip(
            (ord(_) for _ in 'SZCKQFPXABORDHIMNGJTUVWEL'),
            '0011122233445556677788899',
        )
    )
    _pf3 = dict(
        zip(
            (ord(_) for _ in 'BCKQVDTFLPGJXMNRSZAEHIOUWY'),
            '00000112223334456677777777',
        )
    )

    _substitutions = (
        ('DK', 'K'),
        ('DT', 'T'),
        ('SC', 'S'),
        ('KN', 'N'),
        ('MN', 'N'),
    )

    # Multi-letter endings recognised in step 5, in priority order: the first
    # matching entry wins, so 'STN' and 'STR' must precede 'TN' and 'TR'.
    _suffix_codes = (
        ('STN', '8'),
        ('PRS', '8'),
        ('SN', '8'),
        ('STR', '9'),
        ('SR', '9'),
        ('TN', '9'),
        ('TD', '9'),
        ('DRS', '7'),
        ('TR', '7'),
        ('MN', '7'),
    )

    _uppercase = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    _dropped = frozenset('AEHIOUWY')

    def _raise_word_ex():
        """Raise an AttributeError."""
        raise AttributeError(
            'word attribute must be a string with a space '
            + 'or period dividing the first and last names '
            + 'or a tuple/list consisting of the first and '
            + 'last names'
        )

    if not word:
        return ''

    names = []
    if isinstance(word, (str, text_type)):
        # Prefer a period as the divider; fall back to a space.
        names = word.split('.', 1)
        if len(names) != 2:
            names = word.split(' ', 1)
            if len(names) != 2:
                _raise_word_ex()
    elif hasattr(word, '__iter__'):
        if len(word) != 2:
            _raise_word_ex()
        names = word
    else:
        _raise_word_ex()

    # Uppercase and decompose (NFKD) so that accented letters reduce to a
    # base letter plus combining marks; the marks are dropped by the A-Z
    # filter below.
    names = [
        unicode_normalize(
            'NFKD', text_type(_.strip().replace('ß', 'SS').upper())
        )
        for _ in names
    ]

    def steps_one_to_three(name):
        """Perform the first three steps of SPFC."""
        # filter out non A-Z
        name = ''.join(_ for _ in name if _ in _uppercase)

        # 1. In the field, convert DK to K, DT to T, SC to S, KN to N,
        # and MN to N
        for old, new in _substitutions:
            name = name.replace(old, new)

        # 2. In the name field, replace multiple letters with a single letter
        name = _delete_consecutive_repeats(name)

        # 3. Remove vowels, W, H, and Y, but keep the first letter in the name
        # field.
        if name:
            name = name[0] + ''.join(
                _ for _ in name[1:] if _ not in _dropped
            )
        return name

    first_name = steps_one_to_three(names[0])
    last_name = steps_one_to_three(names[1])

    code = ''

    # 4. The first digit of the code is obtained using PF1 and the first letter
    # of the name field. Remove this letter after coding.
    if last_name:
        code += last_name[0].translate(_pf1)
        last_name = last_name[1:]

    # 5. Using the last letters of the name, use Table PF3 to obtain the
    # second digit of the code. Use as many letters as possible and remove
    # after coding.
    if last_name:
        for suffix, digit in _suffix_codes:
            if last_name.endswith(suffix):
                code += digit
                last_name = last_name[: -len(suffix)]
                break
        else:
            # No special ending: code the single final letter via PF3.
            code += last_name[-1].translate(_pf3)
            last_name = last_name[:-1]

    # 6. The third digit is found using Table PF2 and the first character of
    # the first name. Remove after coding.
    if first_name:
        code += first_name[0].translate(_pf2)
        first_name = first_name[1:]

    # 7. The fourth digit is found using Table PF2 and the first character of
    # the name field. If no letters remain use zero. After coding remove the
    # letter.
    # 8. The fifth digit is found in the same manner as the fourth using the
    # remaining characters of the name field if any.
    for _ in range(2):
        if last_name:
            code += last_name[0].translate(_pf2)
            last_name = last_name[1:]
        else:
            code += '0'

    return code
# Run the doctests embedded in this module when it is executed as a script.
if __name__ == '__main__':
    import doctest

    doctest.testmod()