Source code for abydos.stemmer._paice_husk

# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.stemmer._paice_husk.

Paice-Husk Stemmer
"""

from deprecation import deprecated

from ._stemmer import _Stemmer
from .. import __version__

__all__ = ['PaiceHusk', 'paice_husk']


class PaiceHusk(_Stemmer):
    """Paice-Husk stemmer.

    Implementation of the Paice-Husk Stemmer, also known as the Lancaster
    Stemmer, developed by Chris Paice, with the assistance of Gareth Husk.

    This is based on the algorithm's description in :cite:`Paice:1990`.

    .. versionadded:: 0.3.6
    """

    # Rules are grouped by suffix length (outer key), then keyed by the
    # suffix itself. A single rule is a 4-tuple:
    #     (only_intact, del_len, add_str, set_terminate)
    # only_intact   -- rule fires only if no earlier rule changed the word
    # del_len       -- number of trailing characters to strip
    # add_str       -- string appended after stripping (None for nothing)
    # set_terminate -- whether stemming stops once this rule is accepted
    # A suffix may instead map to a tuple of such rules ('i' and 's'); they
    # are tried in order until one is accepted.
    _rule_table = {
        6: {
            'ifiabl': (False, 6, None, True),
            'plicat': (False, 4, 'y', True),
        },
        5: {
            'guish': (False, 5, 'ct', True),
            'sumpt': (False, 2, None, True),
            'istry': (False, 5, None, True),
        },
        4: {
            'ytic': (False, 3, 's', True),
            'ceed': (False, 2, 'ss', True),
            'hood': (False, 4, None, False),
            'lief': (False, 1, 'v', True),
            'verj': (False, 1, 't', True),
            'misj': (False, 2, 't', True),
            'iabl': (False, 4, 'y', True),
            'iful': (False, 4, 'y', True),
            'sion': (False, 4, 'j', False),
            'xion': (False, 4, 'ct', True),
            'ship': (False, 4, None, False),
            'ness': (False, 4, None, False),
            'ment': (False, 4, None, False),
            'ript': (False, 2, 'b', True),
            'orpt': (False, 2, 'b', True),
            'duct': (False, 1, None, True),
            'cept': (False, 2, 'iv', True),
            'olut': (False, 2, 'v', True),
            'sist': (False, 0, None, True),
        },
        3: {
            'ied': (False, 3, 'y', False),
            'eed': (False, 1, None, True),
            'ing': (False, 3, None, False),
            'iag': (False, 3, 'y', True),
            'ish': (False, 3, None, False),
            'fuj': (False, 1, 's', True),
            'hej': (False, 1, 'r', True),
            'abl': (False, 3, None, False),
            'ibl': (False, 3, None, True),
            'bil': (False, 2, 'l', False),
            'ful': (False, 3, None, False),
            'ial': (False, 3, None, False),
            'ual': (False, 3, None, False),
            'ium': (False, 3, None, True),
            'ism': (False, 3, None, False),
            'ion': (False, 3, None, False),
            'ian': (False, 3, None, False),
            'een': (False, 0, None, True),
            'ear': (False, 0, None, True),
            'ier': (False, 3, 'y', False),
            'ies': (False, 3, 'y', False),
            'sis': (False, 2, None, True),
            'ous': (False, 3, None, False),
            'ent': (False, 3, None, False),
            'ant': (False, 3, None, False),
            'ist': (False, 3, None, False),
            'iqu': (False, 3, None, True),
            'ogu': (False, 1, None, True),
            'siv': (False, 3, 'j', False),
            'eiv': (False, 0, None, True),
            'bly': (False, 1, None, False),
            'ily': (False, 3, 'y', False),
            'ply': (False, 0, None, True),
            'ogy': (False, 1, None, True),
            'phy': (False, 1, None, True),
            'omy': (False, 1, None, True),
            'opy': (False, 1, None, True),
            'ity': (False, 3, None, False),
            'ety': (False, 3, None, False),
            'lty': (False, 2, None, True),
            'ary': (False, 3, None, False),
            'ory': (False, 3, None, False),
            'ify': (False, 3, None, True),
            'ncy': (False, 2, 't', False),
            'acy': (False, 3, None, False),
        },
        2: {
            'ia': (True, 2, None, True),
            'bb': (False, 1, None, True),
            'ic': (False, 2, None, False),
            'nc': (False, 1, 't', False),
            'dd': (False, 1, None, True),
            'ed': (False, 2, None, False),
            'if': (False, 2, None, False),
            'ag': (False, 2, None, False),
            'gg': (False, 1, None, True),
            'th': (True, 2, None, True),
            'ij': (False, 1, 'd', True),
            'uj': (False, 1, 'd', True),
            'oj': (False, 1, 'd', True),
            'nj': (False, 1, 'd', True),
            'cl': (False, 1, None, True),
            'ul': (False, 2, None, True),
            'al': (False, 2, None, False),
            'll': (False, 1, None, True),
            'um': (True, 2, None, True),
            'mm': (False, 1, None, True),
            'an': (False, 2, None, False),
            'en': (False, 2, None, False),
            'nn': (False, 1, None, True),
            'pp': (False, 1, None, True),
            'er': (False, 2, None, False),
            'ar': (False, 2, None, True),
            'or': (False, 2, None, False),
            'ur': (False, 2, None, False),
            'rr': (False, 1, None, True),
            'tr': (False, 1, None, False),
            'is': (False, 2, None, False),
            'ss': (False, 0, None, True),
            'us': (True, 2, None, True),
            'at': (False, 2, None, False),
            'tt': (False, 1, None, True),
            'iv': (False, 2, None, False),
            'ly': (False, 2, None, False),
            'iz': (False, 2, None, False),
            'yz': (False, 1, 's', True),
        },
        1: {
            'a': (True, 1, None, True),
            'e': (False, 1, None, False),
            'i': ((True, 1, None, True), (False, 1, 'y', False)),
            'j': (False, 1, 's', True),
            's': ((True, 1, None, False), (False, 0, None, True)),
        },
    }

    def _has_vowel(self, word):
        """Return True if any character of word is a vowel or 'y'."""
        vowels_and_y = {'a', 'e', 'i', 'o', 'u', 'y'}
        return any(letter in vowels_and_y for letter in word)

    def _acceptable(self, word):
        """Return True if word is an acceptable stem.

        A candidate starting with a vowel needs at least two characters.
        Any other candidate needs at least three characters, with a vowel
        or 'y' somewhere after the first one.
        """
        starts_with_vowel = bool(word) and word[0] in {
            'a',
            'e',
            'i',
            'o',
            'u',
        }
        if starts_with_vowel:
            return len(word) >= 2
        if len(word) < 3:
            return False
        return self._has_vowel(word[1:])

    def _apply_rule(self, word, rule, intact, terminate):
        """Try a single rule against word.

        Parameters
        ----------
        word : str
            The current form of the word
        rule : tuple
            A (only_intact, del_len, add_str, set_terminate) rule
        intact : bool
            True if no rule has modified the word so far
        terminate : bool
            The current termination flag

        Returns
        -------
        tuple
            (word, accepted, intact, terminate). When the rule is accepted,
            word is the new stem, intact is False and terminate is the
            rule's own flag. Otherwise the inputs are returned unchanged
            with accepted set to False.

        """
        intact_only, strip_count, suffix, stop_after = rule
        rejected = (word, False, intact, terminate)

        # An intact-only rule cannot fire once the word has been altered.
        if intact_only and not intact:
            return rejected

        candidate = word[:-strip_count] if strip_count else word
        if suffix:
            candidate += suffix

        if self._acceptable(candidate):
            return candidate, True, False, stop_after
        return rejected

    def stem(self, word):
        """Return Paice-Husk stem.

        Parameters
        ----------
        word : str
            The word to stem

        Returns
        -------
        str
            Word stem

        Examples
        --------
        >>> stmr = PaiceHusk()
        >>> stmr.stem('assumption')
        'assum'
        >>> stmr.stem('verifiable')
        'ver'
        >>> stmr.stem('fancies')
        'fant'
        >>> stmr.stem('fanciful')
        'fancy'
        >>> stmr.stem('torment')
        'tor'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        finished = False
        untouched = True

        while not finished:
            applied = False

            # Longest suffixes are tried first.
            for length in range(6, 0, -1):
                entry = self._rule_table[length].get(word[-length:])
                if entry is None:
                    continue

                # A bare rule has four fields; anything shorter is a tuple
                # of alternative rules for the same suffix.
                alternatives = entry if len(entry) < 4 else (entry,)
                for rule in alternatives:
                    word, applied, untouched, finished = self._apply_rule(
                        word, rule, untouched, finished
                    )
                    if applied:
                        break

                if applied:
                    break

            # A full pass with no accepted rule means the stem is final.
            if not applied:
                break

        return word
@deprecated(
    deprecated_in='0.4.0',
    removed_in='0.6.0',
    current_version=__version__,
    details='Use the PaiceHusk.stem method instead.',
)
def paice_husk(word):
    """Return Paice-Husk stem.

    This is a wrapper for :py:meth:`PaiceHusk.stem`; it builds a fresh
    :py:class:`PaiceHusk` instance on every call.

    Parameters
    ----------
    word : str
        The word to stem

    Returns
    -------
    str
        Word stem

    Examples
    --------
    >>> paice_husk('assumption')
    'assum'
    >>> paice_husk('verifiable')
    'ver'
    >>> paice_husk('fancies')
    'fant'
    >>> paice_husk('fanciful')
    'fancy'
    >>> paice_husk('torment')
    'tor'

    .. versionadded:: 0.3.0

    """
    stemmer = PaiceHusk()
    return stemmer.stem(word)
# When executed as a script, run the doctest examples embedded in this
# module's docstrings.
if __name__ == '__main__':
    import doctest

    doctest.testmod()