Source code for abydos.fingerprint._extract
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.fingerprint._extract.
Taft's extract letter list coding
"""
from ._fingerprint import _Fingerprint
__all__ = ['Extract']
class Extract(_Fingerprint):
    """Extract Letter List fingerprint.

    Based on the extract letter list coding from :cite:`Taft:1970`,
    for lists 1, 2, 3, & 4.

    .. versionadded:: 0.4.1
    """

    def __init__(self, letter_list=1):
        """Initialize Extract instance.

        Parameters
        ----------
        letter_list : int or iterable
            If an integer (1-4) is supplied, Taft's specified letter lists are
            used. If an iterable is supplied, its values will be used as the
            list of letters to remove (in order). Any other value (including
            an integer outside 1-4) falls back to Taft's letter list 1.


        .. versionadded:: 0.4.1

        """
        # Taft's four letter lists; index 0 holds list 1, and so on. Within
        # each list, letters appear in the order they are to be removed.
        letter_lists = (
            'ETAONRISHDLFCMUGYPWBVKXJQZ',
            'ETASIONRHCDLPMFBUWGYKVJQZX',
            'ETAONISRHLDCUMFYWGPKBVXJQZ',
            'EARNLOISTHDMCBGUWYJKPFVZXQ',
        )

        # Name this class (not _Fingerprint) in super(): the lookup starts
        # *after* the named class in the MRO, so naming _Fingerprint would
        # skip _Fingerprint's own initializer.
        super(Extract, self).__init__()

        if isinstance(letter_list, int) and 1 <= letter_list <= 4:
            self._letter_list = list(letter_lists[letter_list - 1])
        elif hasattr(letter_list, '__iter__'):
            self._letter_list = list(letter_list)
        else:
            self._letter_list = list(letter_lists[0])

    def fingerprint(self, word):
        """Return the extract letter list coding.

        The word is uppercased, then letters are deleted in the order given
        by the instance's letter list, right-most occurrences first, until
        the word is down to four characters or the letter list is exhausted.
        Words of four or fewer characters are returned uppercased and
        otherwise unchanged.

        Parameters
        ----------
        word : str
            The word to fingerprint

        Returns
        -------
        str
            The extract letter list coding

        Examples
        --------
        >>> fp = Extract()
        >>> fp.fingerprint('hat')
        'HAT'
        >>> fp.fingerprint('niall')
        'NILL'
        >>> fp.fingerprint('colin')
        'CLIN'
        >>> fp.fingerprint('atcg')
        'ATCG'
        >>> fp.fingerprint('entreatment')
        'NRMN'


        .. versionadded:: 0.4.1

        """
        # Uppercase & reverse: str.replace deletes from the left, so working
        # on the reversed word removes the right-most occurrences first.
        word = word.upper()[::-1]

        for letter in self._letter_list:  # pragma: no branch
            if len(word) < 5:
                break
            # Delete at most the surplus over four characters; if the letter
            # occurs fewer times than that, every occurrence is removed, and
            # if it does not occur at all this is a no-op.
            word = word.replace(letter, '', len(word) - 4)

        # Undo the reversal applied above.
        return word[::-1]
if __name__ == '__main__':
    # When executed as a script, run the doctest examples embedded in this
    # module's docstrings.
    from doctest import testmod

    testmod()