Source code for abydos.stemmer._lovins

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.stemmer._lovins.

The stemmer._lovins module implements the Lovins stemmer.
"""

from __future__ import unicode_literals

from unicodedata import normalize

from six import text_type
from six.moves import range

__all__ = ['lovins']


[docs]def lovins(word): """Return Lovins stem. Lovins stemmer The Lovins stemmer is described in Julie Beth Lovins's article :cite:`Lovins:1968`. :param str word: the word to stem :returns: word stem :rtype: str >>> lovins('reading') 'read' >>> lovins('suspension') 'suspens' >>> lovins('elusiveness') 'elus' """ # lowercase, normalize, and compose word = normalize('NFC', text_type(word.lower())) def cond_b(word, suffix_len): """Return Lovins' condition B. :param str word: word to check :param int suffix_len: suffix length :rtype: bool """ return len(word) - suffix_len >= 3 def cond_c(word, suffix_len): """Return Lovins' condition C. :param str word: word to check :param int suffix_len: suffix length :rtype: bool """ return len(word) - suffix_len >= 4 def cond_d(word, suffix_len): """Return Lovins' condition D. :param str word: word to check :param int suffix_len: suffix length :rtype: bool """ return len(word) - suffix_len >= 5 def cond_e(word, suffix_len): """Return Lovins' condition E. :param str word: word to check :param int suffix_len: suffix length :rtype: bool """ return word[-suffix_len - 1] != 'e' def cond_f(word, suffix_len): """Return Lovins' condition F. :param str word: word to check :param int suffix_len: suffix length :rtype: bool """ return len(word) - suffix_len >= 3 and word[-suffix_len - 1] != 'e' def cond_g(word, suffix_len): """Return Lovins' condition G. :param str word: word to check :param int suffix_len: suffix length :rtype: bool """ return len(word) - suffix_len >= 3 and word[-suffix_len - 1] == 'f' def cond_h(word, suffix_len): """Return Lovins' condition H. :param str word: word to check :param int suffix_len: suffix length :rtype: bool """ return ( word[-suffix_len - 1] == 't' or word[-suffix_len - 2 : -suffix_len] == 'll' ) def cond_i(word, suffix_len): """Return Lovins' condition I. :param str word: word to check :param int suffix_len: suffix length :rtype: bool """ return word[-suffix_len - 1] not in {'e', 'o'} def cond_j(word, suffix_len): """Return Lovins' condition J. :param str word: word to check :param int suffix_len: suffix length :rtype: bool """ return word[-suffix_len - 1] not in {'a', 'e'} def cond_k(word, suffix_len): """Return Lovins' condition K. :param str word: word to check :param int suffix_len: suffix length :rtype: bool """ return len(word) - suffix_len >= 3 and ( word[-suffix_len - 1] in {'i', 'l'} or (word[-suffix_len - 3] == 'u' and word[-suffix_len - 1] == 'e') ) def cond_l(word, suffix_len): """Return Lovins' condition L. :param str word: word to check :param int suffix_len: suffix length :rtype: bool """ return ( word[-suffix_len - 1] not in {'s', 'u', 'x'} or word[-suffix_len - 1] == 'os' ) def cond_m(word, suffix_len): """Return Lovins' condition M. :param str word: word to check :param int suffix_len: suffix length :rtype: bool """ return word[-suffix_len - 1] not in {'a', 'c', 'e', 'm'} def cond_n(word, suffix_len): """Return Lovins' condition N. :param str word: word to check :param int suffix_len: suffix length :rtype: bool """ if len(word) - suffix_len >= 3: if word[-suffix_len - 3] == 's': if len(word) - suffix_len >= 4: return True else: return True return False def cond_o(word, suffix_len): """Return Lovins' condition O. :param str word: word to check :param int suffix_len: suffix length :rtype: bool """ return word[-suffix_len - 1] in {'i', 'l'} def cond_p(word, suffix_len): """Return Lovins' condition P. :param str word: word to check :param int suffix_len: suffix length :rtype: bool """ return word[-suffix_len - 1] != 'c' def cond_q(word, suffix_len): """Return Lovins' condition Q. :param str word: word to check :param int suffix_len: suffix length :rtype: bool """ return len(word) - suffix_len >= 3 and word[-suffix_len - 1] not in { 'l', 'n', } def cond_r(word, suffix_len): """Return Lovins' condition R. :param str word: word to check :param int suffix_len: suffix length :rtype: bool """ return word[-suffix_len - 1] in {'n', 'r'} def cond_s(word, suffix_len): """Return Lovins' condition S. :param str word: word to check :param int suffix_len: suffix length :rtype: bool """ return word[-suffix_len - 2 : -suffix_len] == 'dr' or ( word[-suffix_len - 1] == 't' and word[-suffix_len - 2 : -suffix_len] != 'tt' ) def cond_t(word, suffix_len): """Return Lovins' condition T. :param str word: word to check :param int suffix_len: suffix length :rtype: bool """ return ( word[-suffix_len - 1] in {'s', 't'} and word[-suffix_len - 2 : -suffix_len] != 'ot' ) def cond_u(word, suffix_len): """Return Lovins' condition U. :param str word: word to check :param int suffix_len: suffix length :rtype: bool """ return word[-suffix_len - 1] in {'l', 'm', 'n', 'r'} def cond_v(word, suffix_len): """Return Lovins' condition V. :param str word: word to check :param int suffix_len: suffix length :rtype: bool """ return word[-suffix_len - 1] == 'c' def cond_w(word, suffix_len): """Return Lovins' condition W. :param str word: word to check :param int suffix_len: suffix length :rtype: bool """ return word[-suffix_len - 1] not in {'s', 'u'} def cond_x(word, suffix_len): """Return Lovins' condition X. :param str word: word to check :param int suffix_len: suffix length :rtype: bool """ return word[-suffix_len - 1] in {'i', 'l'} or ( word[-suffix_len - 3 : -suffix_len] == 'u' and word[-suffix_len - 1] == 'e' ) def cond_y(word, suffix_len): """Return Lovins' condition Y. :param str word: word to check :param int suffix_len: suffix length :rtype: bool """ return word[-suffix_len - 2 : -suffix_len] == 'in' def cond_z(word, suffix_len): """Return Lovins' condition Z. :param str word: word to check :param int suffix_len: suffix length :rtype: bool """ return word[-suffix_len - 1] != 'f' def cond_aa(word, suffix_len): """Return Lovins' condition AA. :param str word: word to check :param int suffix_len: suffix length :rtype: bool """ return word[-suffix_len - 1] in {'d', 'f', 'l', 't'} or word[ -suffix_len - 2 : -suffix_len ] in {'ph', 'th', 'er', 'or', 'es'} def cond_bb(word, suffix_len): """Return Lovins' condition BB. :param str word: word to check :param int suffix_len: suffix length :rtype: bool """ return ( len(word) - suffix_len >= 3 and word[-suffix_len - 3 : -suffix_len] != 'met' and word[-suffix_len - 4 : -suffix_len] != 'ryst' ) def cond_cc(word, suffix_len): """Return Lovins' condition CC. :param str word: word to check :param int suffix_len: suffix length :rtype: bool """ return word[-suffix_len - 1] == 'l' suffix = { 'alistically': cond_b, 'arizability': None, 'izationally': cond_b, 'antialness': None, 'arisations': None, 'arizations': None, 'entialness': None, 'allically': cond_c, 'antaneous': None, 'antiality': None, 'arisation': None, 'arization': None, 'ationally': cond_b, 'ativeness': None, 'eableness': cond_e, 'entations': None, 'entiality': None, 'entialize': None, 'entiation': None, 'ionalness': None, 'istically': None, 'itousness': None, 'izability': None, 'izational': None, 'ableness': None, 'arizable': None, 'entation': None, 'entially': None, 'eousness': None, 'ibleness': None, 'icalness': None, 'ionalism': None, 'ionality': None, 'ionalize': None, 'iousness': None, 'izations': None, 'lessness': None, 'ability': None, 'aically': None, 'alistic': cond_b, 'alities': None, 'ariness': cond_e, 'aristic': None, 'arizing': None, 'ateness': None, 'atingly': None, 'ational': cond_b, 'atively': None, 'ativism': None, 'elihood': cond_e, 'encible': None, 'entally': None, 'entials': None, 'entiate': None, 'entness': None, 'fulness': None, 'ibility': None, 'icalism': None, 'icalist': None, 'icality': None, 'icalize': None, 'ication': cond_g, 'icianry': None, 'ination': None, 'ingness': None, 'ionally': None, 'isation': None, 'ishness': None, 'istical': None, 'iteness': None, 'iveness': None, 'ivistic': None, 'ivities': None, 'ization': cond_f, 'izement': None, 'oidally': None, 'ousness': None, 'aceous': None, 'acious': cond_b, 'action': cond_g, 'alness': None, 'ancial': None, 'ancies': None, 'ancing': cond_b, 'ariser': None, 'arized': None, 'arizer': None, 'atable': None, 'ations': cond_b, 'atives': None, 'eature': cond_z, 'efully': None, 'encies': None, 'encing': None, 'ential': None, 'enting': cond_c, 'entist': None, 'eously': None, 'ialist': None, 'iality': None, 'ialize': None, 'ically': None, 'icance': None, 'icians': None, 'icists': None, 'ifully': None, 'ionals': None, 'ionate': cond_d, 'ioning': None, 'ionist': None, 'iously': None, 'istics': None, 'izable': cond_e, 'lessly': None, 'nesses': None, 'oidism': None, 'acies': None, 'acity': None, 'aging': cond_b, 'aical': None, 'alist': None, 'alism': cond_b, 'ality': None, 'alize': None, 'allic': cond_bb, 'anced': cond_b, 'ances': cond_b, 'antic': cond_c, 'arial': None, 'aries': None, 'arily': None, 'arity': cond_b, 'arize': None, 'aroid': None, 'ately': None, 'ating': cond_i, 'ation': cond_b, 'ative': None, 'ators': None, 'atory': None, 'ature': cond_e, 'early': cond_y, 'ehood': None, 'eless': None, 'elity': None, 'ement': None, 'enced': None, 'ences': None, 'eness': cond_e, 'ening': cond_e, 'ental': None, 'ented': cond_c, 'ently': None, 'fully': None, 'ially': None, 'icant': None, 'ician': None, 'icide': None, 'icism': None, 'icist': None, 'icity': None, 'idine': cond_i, 'iedly': None, 'ihood': None, 'inate': None, 'iness': None, 'ingly': cond_b, 'inism': cond_j, 'inity': cond_cc, 'ional': None, 'ioned': None, 'ished': None, 'istic': None, 'ities': None, 'itous': None, 'ively': None, 'ivity': None, 'izers': cond_f, 'izing': cond_f, 'oidal': None, 'oides': None, 'otide': None, 'ously': None, 'able': None, 'ably': None, 'ages': cond_b, 'ally': cond_b, 'ance': cond_b, 'ancy': cond_b, 'ants': cond_b, 'aric': None, 'arly': cond_k, 'ated': cond_i, 'ates': None, 'atic': cond_b, 'ator': None, 'ealy': cond_y, 'edly': cond_e, 'eful': None, 'eity': None, 'ence': None, 'ency': None, 'ened': cond_e, 'enly': cond_e, 'eous': None, 'hood': None, 'ials': None, 'ians': None, 'ible': None, 'ibly': None, 'ical': None, 'ides': cond_l, 'iers': None, 'iful': None, 'ines': cond_m, 'ings': cond_n, 'ions': cond_b, 'ious': None, 'isms': cond_b, 'ists': None, 'itic': cond_h, 'ized': cond_f, 'izer': cond_f, 'less': None, 'lily': None, 'ness': None, 'ogen': None, 'ward': None, 'wise': None, 'ying': cond_b, 'yish': None, 'acy': None, 'age': cond_b, 'aic': None, 'als': cond_bb, 'ant': cond_b, 'ars': cond_o, 'ary': cond_f, 'ata': None, 'ate': None, 'eal': cond_y, 'ear': cond_y, 'ely': cond_e, 'ene': cond_e, 'ent': cond_c, 'ery': cond_e, 'ese': None, 'ful': None, 'ial': None, 'ian': None, 'ics': None, 'ide': cond_l, 'ied': None, 'ier': None, 'ies': cond_p, 'ily': None, 'ine': cond_m, 'ing': cond_n, 'ion': cond_q, 'ish': cond_c, 'ism': cond_b, 'ist': None, 'ite': cond_aa, 'ity': None, 'ium': None, 'ive': None, 'ize': cond_f, 'oid': None, 'one': cond_r, 'ous': None, 'ae': None, 'al': cond_bb, 'ar': cond_x, 'as': cond_b, 'ed': cond_e, 'en': cond_f, 'es': cond_e, 'ia': None, 'ic': None, 'is': None, 'ly': cond_b, 'on': cond_s, 'or': cond_t, 'um': cond_u, 'us': cond_v, 'yl': cond_r, '\'s': None, 's\'': None, 'a': None, 'e': None, 'i': None, 'o': None, 's': cond_w, 'y': cond_b, } for suffix_len in range(11, 0, -1): ending = word[-suffix_len:] if ( ending in suffix and len(word) - suffix_len >= 2 and (suffix[ending] is None or suffix[ending](word, suffix_len)) ): word = word[:-suffix_len] break def recode9(stem): """Return Lovins' conditional recode rule 9.""" if stem[-3:-2] in {'a', 'i', 'o'}: return stem return stem[:-2] + 'l' def recode24(stem): """Return Lovins' conditional recode rule 24.""" if stem[-4:-3] == 's': return stem return stem[:-1] + 's' def recode28(stem): """Return Lovins' conditional recode rule 28.""" if stem[-4:-3] in {'p', 't'}: return stem return stem[:-1] + 's' def recode30(stem): """Return Lovins' conditional recode rule 30.""" if stem[-4:-3] == 'm': return stem return stem[:-1] + 's' def recode32(stem): """Return Lovins' conditional recode rule 32.""" if stem[-3:-2] == 'n': return stem return stem[:-1] + 's' if word[-2:] in { 'bb', 'dd', 'gg', 'll', 'mm', 'nn', 'pp', 'rr', 'ss', 'tt', }: word = word[:-1] recode = ( ('iev', 'ief'), ('uct', 'uc'), ('umpt', 'um'), ('rpt', 'rb'), ('urs', 'ur'), ('istr', 'ister'), ('metr', 'meter'), ('olv', 'olut'), ('ul', recode9), ('bex', 'bic'), ('dex', 'dic'), ('pex', 'pic'), ('tex', 'tic'), ('ax', 'ac'), ('ex', 'ec'), ('ix', 'ic'), ('lux', 'luc'), ('uad', 'uas'), ('vad', 'vas'), ('cid', 'cis'), ('lid', 'lis'), ('erid', 'eris'), ('pand', 'pans'), ('end', recode24), ('ond', 'ons'), ('lud', 'lus'), ('rud', 'rus'), ('her', recode28), ('mit', 'mis'), ('ent', recode30), ('ert', 'ers'), ('et', recode32), ('yt', 'ys'), ('yz', 'ys'), ) for ending, replacement in recode: if word.endswith(ending): if callable(replacement): word = replacement(word) else: word = word[: -len(ending)] + replacement return word
if __name__ == '__main__': import doctest doctest.testmod()