Source code for abydos.stemmer

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.


"""The stemmer module defines word stemmers including:

    - the Lovins stemmer
    - the Porter and Porter2 (Snowball English) stemmers
    - Snowball stemmers for German, Dutch, Norwegian, Swedish, and Danish
    - CLEF German, German plus, and Swedish stemmers
    - Caumanns German stemmer
    - UEA-Lite Stemmer
    - Paice-Husk Stemmer
    - Schinke Latin stemmer
    - S stemmer
"""

from __future__ import unicode_literals

from re import match as re_match
from unicodedata import normalize

from six import text_type
from six.moves import range

__all__ = ['caumanns', 'clef_german', 'clef_german_plus', 'clef_swedish',
           'lovins', 'paice_husk', 'porter', 'porter2', 's_stemmer',
           'sb_danish', 'sb_dutch', 'sb_german', 'sb_norwegian', 'sb_swedish',
           'schinke', 'uealite']

def lovins(word):
    """Return Lovins stem.

    Lovins stemmer

    The Lovins stemmer is described in Julie Beth Lovins's article
    :cite:`Lovins:1968`.

    The word is lowercased and NFC-normalized, the longest listed ending
    whose context condition holds (and that leaves a stem of at least two
    letters) is removed, a final doubled consonant is undoubled, and the
    recoding rules are then applied to the stem.

    :param str word: the word to stem
    :returns: word stem
    :rtype: str

    >>> lovins('reading')
    'read'
    >>> lovins('suspension')
    'suspens'
    >>> lovins('elusiveness')
    'elus'
    """
    # lowercase, normalize, and compose
    word = normalize('NFC', text_type(word.lower()))

    # Context conditions.  Each receives the full word and the length of the
    # candidate ending.  They are only called once the stem is known to be at
    # least two letters long, so word[-suffix_len-1] always exists; anything
    # reaching further back is either length-guarded or uses a slice (which
    # cannot raise IndexError).

    def cond_b(word, suffix_len):
        """Return Lovins' condition B: stem length >= 3.

        :param str word: word to check
        :param int suffix_len: suffix length
        :rtype: bool
        """
        return len(word)-suffix_len >= 3

    def cond_c(word, suffix_len):
        """Return Lovins' condition C: stem length >= 4.

        :param str word: word to check
        :param int suffix_len: suffix length
        :rtype: bool
        """
        return len(word)-suffix_len >= 4

    def cond_d(word, suffix_len):
        """Return Lovins' condition D: stem length >= 5.

        :param str word: word to check
        :param int suffix_len: suffix length
        :rtype: bool
        """
        return len(word)-suffix_len >= 5

    def cond_e(word, suffix_len):
        """Return Lovins' condition E: stem does not end in 'e'.

        :param str word: word to check
        :param int suffix_len: suffix length
        :rtype: bool
        """
        return word[-suffix_len-1] != 'e'

    def cond_f(word, suffix_len):
        """Return Lovins' condition F: stem length >= 3, not ending in 'e'.

        :param str word: word to check
        :param int suffix_len: suffix length
        :rtype: bool
        """
        return (len(word)-suffix_len >= 3 and
                word[-suffix_len-1] != 'e')

    def cond_g(word, suffix_len):
        """Return Lovins' condition G: stem length >= 3, ending in 'f'.

        :param str word: word to check
        :param int suffix_len: suffix length
        :rtype: bool
        """
        return (len(word)-suffix_len >= 3 and
                word[-suffix_len-1] == 'f')

    def cond_h(word, suffix_len):
        """Return Lovins' condition H: stem ends in 't' or 'll'.

        :param str word: word to check
        :param int suffix_len: suffix length
        :rtype: bool
        """
        return (word[-suffix_len-1] == 't' or
                word[-suffix_len-2:-suffix_len] == 'll')

    def cond_i(word, suffix_len):
        """Return Lovins' condition I: stem does not end in 'e' or 'o'.

        :param str word: word to check
        :param int suffix_len: suffix length
        :rtype: bool
        """
        return word[-suffix_len-1] not in {'e', 'o'}

    def cond_j(word, suffix_len):
        """Return Lovins' condition J: stem does not end in 'a' or 'e'.

        :param str word: word to check
        :param int suffix_len: suffix length
        :rtype: bool
        """
        return word[-suffix_len-1] not in {'a', 'e'}

    def cond_k(word, suffix_len):
        """Return Lovins' condition K.

        Stem length >= 3 and stem ends in 'i', 'l', or 'u?e' (where ? is
        any single letter).

        :param str word: word to check
        :param int suffix_len: suffix length
        :rtype: bool
        """
        return (len(word)-suffix_len >= 3 and
                (word[-suffix_len-1] in {'i', 'l'} or
                 (word[-suffix_len-3] == 'u' and
                  word[-suffix_len-1] == 'e')))

    def cond_l(word, suffix_len):
        """Return Lovins' condition L.

        Stem does not end in 's', 'u', or 'x' -- except that a final 's'
        is acceptable when it follows 'o'.

        :param str word: word to check
        :param int suffix_len: suffix length
        :rtype: bool
        """
        # The 'os' exception has to look at the last TWO stem letters; a
        # single character can never equal 'os'.
        return (word[-suffix_len-1] not in {'s', 'u', 'x'} or
                word[-suffix_len-2:-suffix_len] == 'os')

    def cond_m(word, suffix_len):
        """Return Lovins' condition M: stem does not end in a, c, e, or m.

        :param str word: word to check
        :param int suffix_len: suffix length
        :rtype: bool
        """
        return word[-suffix_len-1] not in {'a', 'c', 'e', 'm'}

    def cond_n(word, suffix_len):
        """Return Lovins' condition N.

        Stem length >= 3, or >= 4 when the third-from-last stem letter
        is 's'.

        :param str word: word to check
        :param int suffix_len: suffix length
        :rtype: bool
        """
        if len(word)-suffix_len >= 3:
            if word[-suffix_len-3] == 's':
                if len(word)-suffix_len >= 4:
                    return True
            else:
                return True
        return False

    def cond_o(word, suffix_len):
        """Return Lovins' condition O: stem ends in 'i' or 'l'.

        :param str word: word to check
        :param int suffix_len: suffix length
        :rtype: bool
        """
        return word[-suffix_len-1] in {'i', 'l'}

    def cond_p(word, suffix_len):
        """Return Lovins' condition P: stem does not end in 'c'.

        :param str word: word to check
        :param int suffix_len: suffix length
        :rtype: bool
        """
        return word[-suffix_len-1] != 'c'

    def cond_q(word, suffix_len):
        """Return Lovins' condition Q.

        Stem length >= 3 and stem does not end in 'l' or 'n'.

        :param str word: word to check
        :param int suffix_len: suffix length
        :rtype: bool
        """
        return (len(word)-suffix_len >= 3 and
                word[-suffix_len-1] not in {'l', 'n'})

    def cond_r(word, suffix_len):
        """Return Lovins' condition R: stem ends in 'n' or 'r'.

        :param str word: word to check
        :param int suffix_len: suffix length
        :rtype: bool
        """
        return word[-suffix_len-1] in {'n', 'r'}

    def cond_s(word, suffix_len):
        """Return Lovins' condition S.

        Stem ends in 'dr', or in 't' that is not part of 'tt'.

        :param str word: word to check
        :param int suffix_len: suffix length
        :rtype: bool
        """
        return (word[-suffix_len-2:-suffix_len] == 'dr' or
                (word[-suffix_len-1] == 't' and
                 word[-suffix_len-2:-suffix_len] != 'tt'))

    def cond_t(word, suffix_len):
        """Return Lovins' condition T.

        Stem ends in 's' or 't', but not in 'ot'.

        :param str word: word to check
        :param int suffix_len: suffix length
        :rtype: bool
        """
        return (word[-suffix_len-1] in {'s', 't'} and
                word[-suffix_len-2:-suffix_len] != 'ot')

    def cond_u(word, suffix_len):
        """Return Lovins' condition U: stem ends in l, m, n, or r.

        :param str word: word to check
        :param int suffix_len: suffix length
        :rtype: bool
        """
        return word[-suffix_len-1] in {'l', 'm', 'n', 'r'}

    def cond_v(word, suffix_len):
        """Return Lovins' condition V: stem ends in 'c'.

        :param str word: word to check
        :param int suffix_len: suffix length
        :rtype: bool
        """
        return word[-suffix_len-1] == 'c'

    def cond_w(word, suffix_len):
        """Return Lovins' condition W: stem does not end in 's' or 'u'.

        :param str word: word to check
        :param int suffix_len: suffix length
        :rtype: bool
        """
        return word[-suffix_len-1] not in {'s', 'u'}

    def cond_x(word, suffix_len):
        """Return Lovins' condition X.

        Stem ends in 'i', 'l', or 'u?e' (where ? is any single letter).
        This is condition K without the minimum stem length.

        :param str word: word to check
        :param int suffix_len: suffix length
        :rtype: bool
        """
        # A one-character slice is used for the 'u' test so that a
        # two-letter stem yields '' instead of raising IndexError.
        return (word[-suffix_len-1] in {'i', 'l'} or
                (word[-suffix_len-3:-suffix_len-2] == 'u' and
                 word[-suffix_len-1] == 'e'))

    def cond_y(word, suffix_len):
        """Return Lovins' condition Y: stem ends in 'in'.

        :param str word: word to check
        :param int suffix_len: suffix length
        :rtype: bool
        """
        return word[-suffix_len-2:-suffix_len] == 'in'

    def cond_z(word, suffix_len):
        """Return Lovins' condition Z: stem does not end in 'f'.

        :param str word: word to check
        :param int suffix_len: suffix length
        :rtype: bool
        """
        return word[-suffix_len-1] != 'f'

    def cond_aa(word, suffix_len):
        """Return Lovins' condition AA.

        Stem ends in d, f, l, t, ph, th, er, or, or es.

        :param str word: word to check
        :param int suffix_len: suffix length
        :rtype: bool
        """
        return (word[-suffix_len-1] in {'d', 'f', 'l', 't'} or
                word[-suffix_len-2:-suffix_len] in {'ph', 'th', 'er', 'or',
                                                    'es'})

    def cond_bb(word, suffix_len):
        """Return Lovins' condition BB.

        Stem length >= 3 and stem does not end in 'met' or 'ryst'.

        :param str word: word to check
        :param int suffix_len: suffix length
        :rtype: bool
        """
        return (len(word)-suffix_len >= 3 and
                word[-suffix_len-3:-suffix_len] != 'met' and
                word[-suffix_len-4:-suffix_len] != 'ryst')

    def cond_cc(word, suffix_len):
        """Return Lovins' condition CC: stem ends in 'l'.

        :param str word: word to check
        :param int suffix_len: suffix length
        :rtype: bool
        """
        return word[-suffix_len-1] == 'l'

    # Ending -> context condition (None means the ending is removable
    # without any further condition).  Grouped by ending length, longest
    # first.
    suffix = {
        # 11 letters
        'alistically': cond_b, 'arizability': None, 'izationally': cond_b,
        # 10 letters
        'antialness': None, 'arisations': None, 'arizations': None,
        'entialness': None,
        # 9 letters
        'allically': cond_c, 'antaneous': None, 'antiality': None,
        'arisation': None, 'arization': None, 'ationally': cond_b,
        'ativeness': None, 'eableness': cond_e, 'entations': None,
        'entiality': None, 'entialize': None, 'entiation': None,
        'ionalness': None, 'istically': None, 'itousness': None,
        'izability': None, 'izational': None,
        # 8 letters
        'ableness': None, 'arizable': None, 'entation': None,
        'entially': None, 'eousness': None, 'ibleness': None,
        'icalness': None, 'ionalism': None, 'ionality': None,
        'ionalize': None, 'iousness': None, 'izations': None,
        'lessness': None,
        # 7 letters
        'ability': None, 'aically': None, 'alistic': cond_b,
        'alities': None, 'ariness': cond_e, 'aristic': None,
        'arizing': None, 'ateness': None, 'atingly': None,
        'ational': cond_b, 'atively': None, 'ativism': None,
        'elihood': cond_e, 'encible': None, 'entally': None,
        'entials': None, 'entiate': None, 'entness': None,
        'fulness': None, 'ibility': None, 'icalism': None,
        'icalist': None, 'icality': None, 'icalize': None,
        'ication': cond_g, 'icianry': None, 'ination': None,
        'ingness': None, 'ionally': None, 'isation': None,
        'ishness': None, 'istical': None, 'iteness': None,
        'iveness': None, 'ivistic': None, 'ivities': None,
        'ization': cond_f, 'izement': None, 'oidally': None,
        'ousness': None,
        # 6 letters
        'aceous': None, 'acious': cond_b, 'action': cond_g,
        'alness': None, 'ancial': None, 'ancies': None,
        'ancing': cond_b, 'ariser': None, 'arized': None,
        'arizer': None, 'atable': None, 'ations': cond_b,
        'atives': None, 'eature': cond_z, 'efully': None,
        'encies': None, 'encing': None, 'ential': None,
        'enting': cond_c, 'entist': None, 'eously': None,
        'ialist': None, 'iality': None, 'ialize': None,
        'ically': None, 'icance': None, 'icians': None,
        'icists': None, 'ifully': None, 'ionals': None,
        'ionate': cond_d, 'ioning': None, 'ionist': None,
        'iously': None, 'istics': None, 'izable': cond_e,
        'lessly': None, 'nesses': None, 'oidism': None,
        # 5 letters
        'acies': None, 'acity': None, 'aging': cond_b, 'aical': None,
        'alist': None, 'alism': cond_b, 'ality': None, 'alize': None,
        'allic': cond_bb, 'anced': cond_b, 'ances': cond_b,
        'antic': cond_c, 'arial': None, 'aries': None, 'arily': None,
        'arity': cond_b, 'arize': None, 'aroid': None, 'ately': None,
        'ating': cond_i, 'ation': cond_b, 'ative': None, 'ators': None,
        'atory': None, 'ature': cond_e, 'early': cond_y, 'ehood': None,
        'eless': None, 'elity': None, 'ement': None, 'enced': None,
        'ences': None, 'eness': cond_e, 'ening': cond_e, 'ental': None,
        'ented': cond_c, 'ently': None, 'fully': None, 'ially': None,
        'icant': None, 'ician': None, 'icide': None, 'icism': None,
        'icist': None, 'icity': None, 'idine': cond_i, 'iedly': None,
        'ihood': None, 'inate': None, 'iness': None, 'ingly': cond_b,
        'inism': cond_j, 'inity': cond_cc, 'ional': None, 'ioned': None,
        'ished': None, 'istic': None, 'ities': None, 'itous': None,
        'ively': None, 'ivity': None, 'izers': cond_f, 'izing': cond_f,
        'oidal': None, 'oides': None, 'otide': None, 'ously': None,
        # 4 letters
        'able': None, 'ably': None, 'ages': cond_b, 'ally': cond_b,
        'ance': cond_b, 'ancy': cond_b, 'ants': cond_b, 'aric': None,
        'arly': cond_k, 'ated': cond_i, 'ates': None, 'atic': cond_b,
        'ator': None, 'ealy': cond_y, 'edly': cond_e, 'eful': None,
        'eity': None, 'ence': None, 'ency': None, 'ened': cond_e,
        'enly': cond_e, 'eous': None, 'hood': None, 'ials': None,
        'ians': None, 'ible': None, 'ibly': None, 'ical': None,
        'ides': cond_l, 'iers': None, 'iful': None, 'ines': cond_m,
        'ings': cond_n, 'ions': cond_b, 'ious': None, 'isms': cond_b,
        'ists': None, 'itic': cond_h, 'ized': cond_f, 'izer': cond_f,
        'less': None, 'lily': None, 'ness': None, 'ogen': None,
        'ward': None, 'wise': None, 'ying': cond_b, 'yish': None,
        # 3 letters
        'acy': None, 'age': cond_b, 'aic': None, 'als': cond_bb,
        'ant': cond_b, 'ars': cond_o, 'ary': cond_f, 'ata': None,
        'ate': None, 'eal': cond_y, 'ear': cond_y, 'ely': cond_e,
        'ene': cond_e, 'ent': cond_c, 'ery': cond_e, 'ese': None,
        'ful': None, 'ial': None, 'ian': None, 'ics': None,
        'ide': cond_l, 'ied': None, 'ier': None, 'ies': cond_p,
        'ily': None, 'ine': cond_m, 'ing': cond_n, 'ion': cond_q,
        'ish': cond_c, 'ism': cond_b, 'ist': None, 'ite': cond_aa,
        'ity': None, 'ium': None, 'ive': None, 'ize': cond_f,
        'oid': None, 'one': cond_r, 'ous': None,
        # 2 letters
        'ae': None, 'al': cond_bb, 'ar': cond_x, 'as': cond_b,
        'ed': cond_e, 'en': cond_f, 'es': cond_e, 'ia': None,
        'ic': None, 'is': None, 'ly': cond_b, 'on': cond_s,
        'or': cond_t, 'um': cond_u, 'us': cond_v, 'yl': cond_r,
        '\'s': None, 's\'': None,
        # 1 letter
        'a': None, 'e': None, 'i': None, 'o': None, 's': cond_w,
        'y': cond_b}

    # Remove the longest ending that is listed, leaves a stem of at least
    # two letters, and satisfies its context condition (if it has one).
    for suffix_len in range(11, 0, -1):
        ending = word[-suffix_len:]
        if (ending in suffix and
                len(word)-suffix_len >= 2 and
                (suffix[ending] is None or
                 suffix[ending](word, suffix_len))):
            word = word[:-suffix_len]
            break

    # Conditional recoding rules: each returns the stem unchanged when the
    # letter preceding the matched ending blocks the rule.

    def recode9(stem):
        """Return Lovins' conditional recode rule 9."""
        if stem[-3:-2] in {'a', 'i', 'o'}:
            return stem
        return stem[:-2]+'l'

    def recode24(stem):
        """Return Lovins' conditional recode rule 24."""
        if stem[-4:-3] == 's':
            return stem
        return stem[:-1]+'s'

    def recode28(stem):
        """Return Lovins' conditional recode rule 28."""
        if stem[-4:-3] in {'p', 't'}:
            return stem
        return stem[:-1]+'s'

    def recode30(stem):
        """Return Lovins' conditional recode rule 30."""
        if stem[-4:-3] == 'm':
            return stem
        return stem[:-1]+'s'

    def recode32(stem):
        """Return Lovins' conditional recode rule 32."""
        if stem[-3:-2] == 'n':
            return stem
        return stem[:-1]+'s'

    # Undouble a final doubled consonant
    if word[-2:] in {'bb', 'dd', 'gg', 'll', 'mm', 'nn', 'pp', 'rr', 'ss',
                     'tt'}:
        word = word[:-1]

    recode = (('iev', 'ief'),
              ('uct', 'uc'),
              ('umpt', 'um'),
              ('rpt', 'rb'),
              ('urs', 'ur'),
              ('istr', 'ister'),
              ('metr', 'meter'),
              ('olv', 'olut'),
              ('ul', recode9),
              ('bex', 'bic'),
              ('dex', 'dic'),
              ('pex', 'pic'),
              ('tex', 'tic'),
              ('ax', 'ac'),
              ('ex', 'ec'),
              ('ix', 'ic'),
              ('lux', 'luc'),
              ('uad', 'uas'),
              ('vad', 'vas'),
              ('cid', 'cis'),
              ('lid', 'lis'),
              ('erid', 'eris'),
              ('pand', 'pans'),
              ('end', recode24),
              ('ond', 'ons'),
              ('lud', 'lus'),
              ('rud', 'rus'),
              ('her', recode28),
              ('mit', 'mis'),
              ('ent', recode30),
              ('ert', 'ers'),
              ('et', recode32),
              ('yt', 'ys'),
              ('yz', 'ys'))

    # The rules are tried in table order against the current stem; the
    # loop does not stop after the first rule that fires.
    for ending, replacement in recode:
        if word.endswith(ending):
            if callable(replacement):
                word = replacement(word)
            else:
                word = word[:-len(ending)] + replacement

    return word
def _m_degree(term, vowels): """Return Porter helper function _m_degree value. m-degree is equal to the number of V to C transitions :param str term: the word for which to calculate the m-degree :param set vowels: the set of vowels in the language :returns: the m-degree as defined in the Porter stemmer definition :rtype: int """ mdeg = 0 last_was_vowel = False for letter in term: if letter in vowels: last_was_vowel = True else: if last_was_vowel: mdeg += 1 last_was_vowel = False return mdeg def _sb_has_vowel(term, vowels): """Return Porter helper function _sb_has_vowel value. :param str term: the word to scan for vowels :param set vowels: the set of vowels in the language :returns: true iff a vowel exists in the term (as defined in the Porter stemmer definition) :rtype: bool """ for letter in term: if letter in vowels: return True return False def _ends_in_doubled_cons(term, vowels): """Return Porter helper function _ends_in_doubled_cons value. :param str term: the word to check for a final doubled consonant :param set vowels: the set of vowels in the language :returns: true iff the stem ends in a doubled consonant (as defined in the Porter stemmer definition) :rtype: bool """ return len(term) > 1 and term[-1] not in vowels and term[-2] == term[-1] def _ends_in_cvc(term, vowels): """Return Porter helper function _ends_in_cvc value. :param str term: the word to scan for cvc :param set vowels: the set of vowels in the language :returns: true iff the stem ends in cvc (as defined in the Porter stemmer definition) :rtype: bool """ return (len(term) > 2 and (term[-1] not in vowels and term[-2] in vowels and term[-3] not in vowels and term[-1] not in tuple('wxY')))
def porter(word, early_english=False):
    """Return Porter stem.

    The Porter stemmer is described in :cite:`Porter:1980`.

    :param str word: the word to calculate the stem of
    :param bool early_english: set to True in order to remove -eth & -est
        (2nd & 3rd person singular verbal agreement suffixes)
    :returns: word stem
    :rtype: str

    >>> porter('reading')
    'read'
    >>> porter('suspension')
    'suspens'
    >>> porter('elusiveness')
    'elus'

    >>> porter('eateth', early_english=True)
    'eat'
    """
    # lowercase, normalize, and compose
    word = normalize('NFC', text_type(word.lower()))

    # Words of fewer than three letters are returned as they are
    if len(word) < 3:
        return word

    _vowels = {'a', 'e', 'i', 'o', 'u', 'y'}

    # Re-map consonantal y to Y (Y will be C, y will be V): a leading y, and
    # any y that directly follows a vowel.
    letters = list(word)
    if letters[0] == 'y':
        letters[0] = 'Y'
    for pos in range(1, len(letters)):
        if letters[pos] == 'y' and letters[pos-1] in _vowels:
            letters[pos] = 'Y'
    word = ''.join(letters)

    # Step 1a
    if word.endswith('s'):
        if word.endswith('sses') or word.endswith('ies'):
            word = word[:-2]
        elif not word.endswith('ss'):
            word = word[:-1]

    # Step 1b
    stripped_verbal_ending = False
    if word.endswith('eed'):
        if _m_degree(word[:-3], _vowels) > 0:
            word = word[:-1]
    else:
        verbal_endings = ['ed', 'ing']
        if early_english:
            verbal_endings += ['est', 'eth']
        for ending in verbal_endings:
            if word.endswith(ending):
                stem = word[:-len(ending)]
                if _sb_has_vowel(stem, _vowels):
                    word = stem
                    stripped_verbal_ending = True
                # only the first matching ending is ever considered
                break

    if stripped_verbal_ending:
        if word[-2:] in {'at', 'bl', 'iz'}:
            word += 'e'
        elif (_ends_in_doubled_cons(word, _vowels) and
              word[-1] not in {'l', 's', 'z'}):
            word = word[:-1]
        elif _m_degree(word, _vowels) == 1 and _ends_in_cvc(word, _vowels):
            word += 'e'

    # Step 1c
    if word[-1] in {'Y', 'y'} and _sb_has_vowel(word[:-1], _vowels):
        word = word[:-1] + 'i'

    # Step 2: (suffix, replacement); the first suffix that matches is the
    # only one considered, and it is rewritten when the remaining stem has
    # m-degree > 0.  Where one suffix is the tail of another, the longer one
    # is listed first.
    step2_rules = (('ational', 'ate'), ('tional', 'tion'),
                   ('enci', 'ence'), ('anci', 'ance'),
                   ('izer', 'ize'),
                   ('logi', 'log'),
                   ('bli', 'ble'), ('alli', 'al'), ('entli', 'ent'),
                   ('eli', 'e'), ('ousli', 'ous'),
                   ('ization', 'ize'), ('ation', 'ate'), ('ator', 'ate'),
                   ('alism', 'al'), ('iveness', 'ive'), ('fulness', 'ful'),
                   ('ousness', 'ous'),
                   ('aliti', 'al'), ('iviti', 'ive'), ('biliti', 'ble'))
    for ending, replacement in step2_rules:
        if word.endswith(ending):
            stem = word[:-len(ending)]
            if _m_degree(stem, _vowels) > 0:
                word = stem + replacement
            break

    # Step 3: same scheme as step 2
    step3_rules = (('icate', 'ic'), ('ative', ''), ('alize', 'al'),
                   ('iciti', 'ic'), ('ical', 'ic'), ('ful', ''),
                   ('ness', ''))
    for ending, replacement in step3_rules:
        if word.endswith(ending):
            stem = word[:-len(ending)]
            if _m_degree(stem, _vowels) > 0:
                word = stem + replacement
            break

    # Step 4: (suffix, number of letters to cut).  The first matching suffix
    # is cut when what remains has m-degree > 1.  For -sion/-tion only
    # 'ion' is cut, so the s/t stays with (and is measured as part of) the
    # stem.
    step4_rules = (('al', 2), ('ance', 4), ('ence', 4), ('er', 2),
                   ('ic', 2), ('able', 4), ('ible', 4), ('ant', 3),
                   ('ement', 5), ('ment', 4), ('ent', 3),
                   ('sion', 3), ('tion', 3),
                   ('ou', 2), ('ism', 3), ('ate', 3), ('iti', 3),
                   ('ous', 3), ('ive', 3), ('ize', 3))
    for ending, cut in step4_rules:
        if word.endswith(ending):
            stem = word[:-cut]
            if _m_degree(stem, _vowels) > 1:
                word = stem
            break

    # Step 5a
    if word[-1] == 'e':
        stem = word[:-1]
        stem_m = _m_degree(stem, _vowels)
        if stem_m > 1 or (stem_m == 1 and not _ends_in_cvc(stem, _vowels)):
            word = stem

    # Step 5b
    if word[-2:] == 'll' and _m_degree(word, _vowels) > 1:
        word = word[:-1]

    # Change 'Y' back to 'y' if it survived stemming
    return word.replace('Y', 'y')
def _sb_r1(term, vowels, r1_prefixes=None):
    """Return the start index of R1, per the Porter2 specification.

    R1 begins after the first non-vowel that follows a vowel. If the term
    starts with one of the supplied prefixes, R1 begins right after that
    prefix instead.

    :param str term: the term to examine
    :param set vowels: the characters counted as vowels
    :param r1_prefixes: optional iterable of prefixes that fix the start
        of R1 (checked in the order given)
    :returns: index at which R1 starts (``len(term)`` if there is no R1)
    :rtype: int
    """
    if hasattr(r1_prefixes, '__iter__'):
        for prefix in r1_prefixes:
            if term.startswith(prefix):
                return len(prefix)

    seen_vowel = False
    for pos, char in enumerate(term):
        if char in vowels:
            seen_vowel = True
        elif seen_vowel:
            # first non-vowel after a vowel: R1 starts just past it
            return pos + 1
    return len(term)


def _sb_r2(term, vowels, r1_prefixes=None):
    """Return the start index of R2, per the Porter2 specification.

    R2 is found by applying the R1 rule (without prefixes) to R1 itself.

    :param str term: the term to examine
    :param set vowels: the characters counted as vowels
    :param r1_prefixes: optional iterable of prefixes passed on to the R1
        computation
    :returns: index at which R2 starts
    :rtype: int
    """
    offset = _sb_r1(term, vowels, r1_prefixes)
    return offset + _sb_r1(term[offset:], vowels)


def _sb_ends_in_short_syllable(term, vowels, codanonvowels):
    """Return True iff term ends in a short syllable.

    (...according to the Porter2 specification.)

    NB: This is akin to the CVC test from the Porter stemmer. The
    description is unfortunately poor/ambiguous.

    :param str term: the term to examine
    :param set vowels: the characters counted as vowels
    :param set codanonvowels: the non-vowels allowed to close a short
        syllable
    :rtype: bool
    """
    if len(term) == 2:
        # two-letter term: vowel followed by any non-vowel
        return term[0] in vowels and term[1] not in vowels
    if len(term) >= 3:
        # non-vowel, vowel, permitted coda non-vowel
        return (term[-3] not in vowels and
                term[-2] in vowels and
                term[-1] in codanonvowels)
    return False


def _sb_short_word(term, vowels, codanonvowels, r1_prefixes=None):
    """Return True iff term is a short word.

    (...according to the Porter2 specification.)

    A short word has an empty R1 and ends in a short syllable.

    :param str term: the term to examine
    :param set vowels: the characters counted as vowels
    :param set codanonvowels: the non-vowels allowed to close a short
        syllable
    :param r1_prefixes: optional iterable of prefixes passed on to the R1
        computation
    :rtype: bool
    """
    r1_is_empty = _sb_r1(term, vowels, r1_prefixes) == len(term)
    return r1_is_empty and _sb_ends_in_short_syllable(term, vowels,
                                                      codanonvowels)
def porter2(word, early_english=False):
    """Return the Porter2 (Snowball English) stem.

    The Porter2 (Snowball English) stemmer is defined in :cite:`Porter:2002`.

    The word is lowercased and NFC-normalized first. Consonantal ``y`` is
    temporarily marked as ``Y`` during stemming; the marker is always
    converted back before a stem is returned.

    :param str word: the word to calculate the stem of
    :param bool early_english: set to True in order to remove -eth & -est
        (2nd & 3rd person singular verbal agreement suffixes)
    :returns: word stem
    :rtype: str

    >>> porter2('reading')
    'read'
    >>> porter2('suspension')
    'suspens'
    >>> porter2('elusiveness')
    'elus'

    >>> porter2('eateth', early_english=True)
    'eat'
    """
    _vowels = {'a', 'e', 'i', 'o', 'u', 'y'}
    _codanonvowels = {"'", 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm',
                      'n', 'p', 'q', 'r', 's', 't', 'v', 'z'}
    _doubles = {'bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt'}
    _li = {'c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't'}

    # R1 prefixes should be in order from longest to shortest to prevent
    # masking
    _r1_prefixes = ('commun', 'gener', 'arsen')
    _exception1dict = {  # special changes:
                       'skis': 'ski', 'skies': 'sky', 'dying': 'die',
                       'lying': 'lie', 'tying': 'tie',
                       # special -LY cases:
                       'idly': 'idl', 'gently': 'gentl', 'ugly': 'ugli',
                       'early': 'earli', 'only': 'onli', 'singly': 'singl'}
    _exception1set = {'sky', 'news', 'howe', 'atlas', 'cosmos', 'bias',
                      'andes'}
    _exception2set = {'inning', 'outing', 'canning', 'herring', 'earring',
                      'proceed', 'exceed', 'succeed'}

    # lowercase, normalize, and compose
    word = normalize('NFC', text_type(word.lower()))

    # replace apostrophe-like characters (U+2018, U+2019, U+201B) with
    # U+0027, as recommended by the Snowball project
    for apostrophe in ('\u2018', '\u2019', '\u201b'):
        word = word.replace(apostrophe, '\'')

    # Exceptions 1
    if word in _exception1dict:
        return _exception1dict[word]
    elif word in _exception1set:
        return word

    # Return word if stem is shorter than 3
    if len(word) < 3:
        return word

    # Remove initial ', if present.
    while word and word[0] == '\'':
        word = word[1:]
        # Return word if stem is shorter than 2
        if len(word) < 2:
            return word

    # Mark consonantal y (initial y, or y after a vowel) as Y
    # (Y will be C, y will be V)
    if word[0] == 'y':
        word = 'Y' + word[1:]
    for i in range(1, len(word)):
        if word[i] == 'y' and word[i-1] in _vowels:
            word = word[:i] + 'Y' + word[i+1:]

    r1_start = _sb_r1(word, _vowels, _r1_prefixes)
    r2_start = _sb_r2(word, _vowels, _r1_prefixes)

    # Step 0
    if word[-3:] == '\'s\'':
        word = word[:-3]
    elif word[-2:] == '\'s':
        word = word[:-2]
    elif word[-1:] == '\'':
        word = word[:-1]

    # Return word if stem is shorter than 3; the input was lowercased, so
    # any 'Y' here is the internal marker and must be mapped back
    if len(word) < 3:
        return word.replace('Y', 'y')

    # Step 1a
    if word[-4:] == 'sses':
        word = word[:-2]
    elif word[-3:] in {'ied', 'ies'}:
        if len(word) > 4:
            word = word[:-2]
        else:
            word = word[:-1]
    elif word[-2:] in {'us', 'ss'}:
        pass
    elif word[-1] == 's':
        if _sb_has_vowel(word[:-2], _vowels):
            word = word[:-1]

    # Exceptions 2
    if word in _exception2set:
        return word

    # Step 1b
    step1b_flag = False
    if word[-5:] == 'eedly':
        if len(word[r1_start:]) >= 5:
            word = word[:-3]
    elif word[-5:] == 'ingly':
        if _sb_has_vowel(word[:-5], _vowels):
            word = word[:-5]
            step1b_flag = True
    elif word[-4:] == 'edly':
        if _sb_has_vowel(word[:-4], _vowels):
            word = word[:-4]
            step1b_flag = True
    elif word[-3:] == 'eed':
        if len(word[r1_start:]) >= 3:
            word = word[:-1]
    elif word[-3:] == 'ing':
        if _sb_has_vowel(word[:-3], _vowels):
            word = word[:-3]
            step1b_flag = True
    elif word[-2:] == 'ed':
        if _sb_has_vowel(word[:-2], _vowels):
            word = word[:-2]
            step1b_flag = True
    elif early_english:
        if word[-3:] == 'est':
            if _sb_has_vowel(word[:-3], _vowels):
                word = word[:-3]
                step1b_flag = True
        elif word[-3:] == 'eth':
            if _sb_has_vowel(word[:-3], _vowels):
                word = word[:-3]
                step1b_flag = True

    if step1b_flag:
        if word[-2:] in {'at', 'bl', 'iz'}:
            word += 'e'
        elif word[-2:] in _doubles:
            word = word[:-1]
        elif _sb_short_word(word, _vowels, _codanonvowels, _r1_prefixes):
            word += 'e'

    # Step 1c
    if ((len(word) > 2 and word[-1] in {'Y', 'y'} and
         word[-2] not in _vowels)):
        word = word[:-1] + 'i'

    # Step 2
    # Step 1b can leave a one-letter stem, so take the penultimate letter
    # by slicing (empty string if absent) rather than by indexing
    penult = word[-2:-1]
    if penult == 'a':
        if word[-7:] == 'ational':
            if len(word[r1_start:]) >= 7:
                word = word[:-5] + 'e'
        elif word[-6:] == 'tional':
            if len(word[r1_start:]) >= 6:
                word = word[:-2]
    elif penult == 'c':
        if word[-4:] in {'enci', 'anci'}:
            if len(word[r1_start:]) >= 4:
                word = word[:-1] + 'e'
    elif penult == 'e':
        if word[-4:] == 'izer':
            if len(word[r1_start:]) >= 4:
                word = word[:-1]
    elif penult == 'g':
        if word[-3:] == 'ogi':
            if ((r1_start >= 1 and len(word[r1_start:]) >= 3 and
                 word[-4] == 'l')):
                word = word[:-1]
    elif penult == 'l':
        if word[-6:] == 'lessli':
            if len(word[r1_start:]) >= 6:
                word = word[:-2]
        elif word[-5:] in {'entli', 'fulli', 'ousli'}:
            if len(word[r1_start:]) >= 5:
                word = word[:-2]
        elif word[-4:] == 'abli':
            if len(word[r1_start:]) >= 4:
                word = word[:-1] + 'e'
        elif word[-4:] == 'alli':
            if len(word[r1_start:]) >= 4:
                word = word[:-2]
        elif word[-3:] == 'bli':
            if len(word[r1_start:]) >= 3:
                word = word[:-1] + 'e'
        elif word[-2:] == 'li':
            if ((r1_start >= 1 and len(word[r1_start:]) >= 2 and
                 word[-3] in _li)):
                word = word[:-2]
    elif penult == 'o':
        if word[-7:] == 'ization':
            if len(word[r1_start:]) >= 7:
                word = word[:-5] + 'e'
        elif word[-5:] == 'ation':
            if len(word[r1_start:]) >= 5:
                word = word[:-3] + 'e'
        elif word[-4:] == 'ator':
            if len(word[r1_start:]) >= 4:
                word = word[:-2] + 'e'
    elif penult == 's':
        if word[-7:] in {'fulness', 'ousness', 'iveness'}:
            if len(word[r1_start:]) >= 7:
                word = word[:-4]
        elif word[-5:] == 'alism':
            if len(word[r1_start:]) >= 5:
                word = word[:-3]
    elif penult == 't':
        if word[-6:] == 'biliti':
            if len(word[r1_start:]) >= 6:
                word = word[:-5] + 'le'
        elif word[-5:] == 'aliti':
            if len(word[r1_start:]) >= 5:
                word = word[:-3]
        elif word[-5:] == 'iviti':
            if len(word[r1_start:]) >= 5:
                word = word[:-3] + 'e'

    # Step 3
    if word[-7:] == 'ational':
        if len(word[r1_start:]) >= 7:
            word = word[:-5] + 'e'
    elif word[-6:] == 'tional':
        if len(word[r1_start:]) >= 6:
            word = word[:-2]
    elif word[-5:] in {'alize', 'icate', 'iciti'}:
        if len(word[r1_start:]) >= 5:
            word = word[:-3]
    elif word[-5:] == 'ative':
        if len(word[r2_start:]) >= 5:
            word = word[:-5]
    elif word[-4:] == 'ical':
        if len(word[r1_start:]) >= 4:
            word = word[:-2]
    elif word[-4:] == 'ness':
        if len(word[r1_start:]) >= 4:
            word = word[:-4]
    elif word[-3:] == 'ful':
        if len(word[r1_start:]) >= 3:
            word = word[:-3]

    # Step 4
    # Only the first (longest) matching suffix is considered, whether or
    # not it lies in R2; 'ion' is tried only if nothing else matched.
    for suffix in ('ement', 'ance', 'ence', 'able', 'ible', 'ment', 'ant',
                   'ent', 'ism', 'ate', 'iti', 'ous', 'ive', 'ize', 'al',
                   'er', 'ic'):
        if word[-len(suffix):] == suffix:
            if len(word[r2_start:]) >= len(suffix):
                word = word[:-len(suffix)]
            break
    else:
        if word[-3:] == 'ion':
            if ((len(word[r2_start:]) >= 3 and len(word) >= 4 and
                 word[-4] in tuple('st'))):
                word = word[:-3]

    # Step 5
    if word[-1] == 'e':
        if (len(word[r2_start:]) >= 1 or
                (len(word[r1_start:]) >= 1 and
                 not _sb_ends_in_short_syllable(word[:-1], _vowels,
                                                _codanonvowels))):
            word = word[:-1]
    elif word[-1] == 'l':
        if len(word[r2_start:]) >= 1 and word[-2] == 'l':
            word = word[:-1]

    # Change 'Y' back to 'y' if it survived stemming
    return word.replace('Y', 'y')
def sb_german(word, alternate_vowels=False):
    """Return Snowball German stem.

    This follows the German stemming algorithm of the Snowball project.

    :param str word: the word to calculate the stem of
    :param bool alternate_vowels: composes ae as ä, oe as ö, and ue as ü
        before running the algorithm
    :returns: word stem
    :rtype: str

    >>> sb_german('lesen')
    'les'
    >>> sb_german('graues')
    'grau'
    >>> sb_german('buchstabieren')
    'buchstabi'
    """
    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'ä', 'ö', 'ü'}
    _s_endings = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 'r', 't'}
    _st_endings = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't'}

    def _in_r1(size):
        """Return True iff the last size letters of word lie in R1."""
        return len(word[r1_start:]) >= size

    def _in_r2(size):
        """Return True iff the last size letters of word lie in R2."""
        return len(word[r2_start:]) >= size

    # lowercase, normalize, and compose
    word = normalize('NFC', word.lower()).replace('ß', 'ss')

    # Mark u and y standing between two vowels as consonants (U, Y)
    letters = list(word)
    for pos in range(2, len(letters)):
        if letters[pos] in _vowels and letters[pos-2] in _vowels:
            if letters[pos-1] == 'u':
                letters[pos-1] = 'U'
            elif letters[pos-1] == 'y':
                letters[pos-1] = 'Y'
    word = ''.join(letters)

    if alternate_vowels:
        # 'que' is shielded (as Q) so that its 'ue' is not composed to ü
        for old, new in (('ae', 'ä'), ('oe', 'ö'), ('que', 'Q'),
                         ('ue', 'ü'), ('Q', 'que')):
            word = word.replace(old, new)

    r1_start = max(3, _sb_r1(word, _vowels))
    r2_start = _sb_r2(word, _vowels)

    # Step 1
    niss_flag = False
    if word.endswith('ern'):
        if _in_r1(3):
            word = word[:-3]
    elif word.endswith(('em', 'er')):
        if _in_r1(2):
            word = word[:-2]
    elif word.endswith(('en', 'es')):
        if _in_r1(2):
            word = word[:-2]
            niss_flag = True
    elif word.endswith('e'):
        if _in_r1(1):
            word = word[:-1]
            niss_flag = True
    elif word.endswith('s'):
        if _in_r1(1) and len(word) >= 2 and word[-2] in _s_endings:
            word = word[:-1]

    # after removing e/en/es, a stem ending in niss loses its final s
    if niss_flag and word.endswith('niss'):
        word = word[:-1]

    # Step 2
    if word.endswith('est'):
        if _in_r1(3):
            word = word[:-3]
    elif word.endswith(('en', 'er')):
        if _in_r1(2):
            word = word[:-2]
    elif word.endswith('st'):
        if _in_r1(2) and len(word) >= 6 and word[-3] in _st_endings:
            word = word[:-2]

    # Step 3
    if word.endswith('isch'):
        if _in_r2(4) and word[-5] != 'e':
            word = word[:-4]
    elif word.endswith(('lich', 'heit')):
        if _in_r2(4):
            word = word[:-4]
            if word.endswith(('er', 'en')) and _in_r1(2):
                word = word[:-2]
    elif word.endswith('keit'):
        if _in_r2(4):
            word = word[:-4]
            if word.endswith('lich') and _in_r2(4):
                word = word[:-4]
            elif word.endswith('ig') and _in_r2(2):
                word = word[:-2]
    elif word.endswith(('end', 'ung')):
        if _in_r2(3):
            word = word[:-3]
            if word.endswith('ig') and _in_r2(2) and word[-3] != 'e':
                word = word[:-2]
    elif word.endswith(('ig', 'ik')):
        if _in_r2(2) and word[-3] != 'e':
            word = word[:-2]

    # Change 'Y' and 'U' back to lowercase if survived stemming
    word = word.replace('Y', 'y').replace('U', 'u')

    # Remove umlauts
    return word.translate({ord('ä'): 'a', ord('ö'): 'o', ord('ü'): 'u'})
def sb_dutch(word):
    """Return Snowball Dutch stem.

    This follows the Dutch stemming algorithm of the Snowball project.

    The word is lowercased and NFC-normalized, and umlauts and acute
    accents are removed before stemming. Consonantal ``y`` and ``i`` are
    temporarily marked as ``Y`` and ``I`` and restored at the end.

    :param str word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> sb_dutch('lezen')
    'lez'
    >>> sb_dutch('opschorting')
    'opschort'
    >>> sb_dutch('ongrijpbaarheid')
    'ongrijp'
    """
    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'è'}
    _not_s_endings = {'a', 'e', 'i', 'j', 'o', 'u', 'y', 'è'}

    def _undouble(word):
        """Undouble endings -kk, -dd, and -tt."""
        if ((len(word) > 1 and word[-1] == word[-2] and
             word[-1] in {'d', 'k', 't'})):
            return word[:-1]
        return word

    # lowercase, normalize, decompose, filter umlauts & acutes out, and compose
    word = normalize('NFC', text_type(word.lower()))
    _accented = dict(zip((ord(_) for _ in 'äëïöüáéíóú'), 'aeiouaeiou'))
    word = word.translate(_accented)

    # Put initial y, y after a vowel, and i between vowels into upper case.
    # The scan starts at index 1: an initial letter has no predecessor, and
    # looking at word[i-1] for i == 0 would wrap around to the last letter.
    if word[:1] == 'y':
        word = 'Y' + word[1:]
    for i in range(1, len(word)):
        if word[i] == 'y' and word[i-1] in _vowels:
            word = word[:i] + 'Y' + word[i+1:]
        elif (word[i] == 'i' and word[i-1] in _vowels and
              i+1 < len(word) and word[i+1] in _vowels):
            word = word[:i] + 'I' + word[i+1:]

    r1_start = max(3, _sb_r1(word, _vowels))
    r2_start = _sb_r2(word, _vowels)

    # Step 1
    if word[-5:] == 'heden':
        if len(word[r1_start:]) >= 5:
            word = word[:-3] + 'id'
    elif word[-3:] == 'ene':
        if ((len(word[r1_start:]) >= 3 and
             (word[-4] not in _vowels and word[-6:-3] != 'gem'))):
            word = _undouble(word[:-3])
    elif word[-2:] == 'en':
        if ((len(word[r1_start:]) >= 2 and
             (word[-3] not in _vowels and word[-5:-2] != 'gem'))):
            word = _undouble(word[:-2])
    elif word[-2:] == 'se':
        if len(word[r1_start:]) >= 2 and word[-3] not in _not_s_endings:
            word = word[:-2]
    elif word[-1:] == 's':
        if len(word[r1_start:]) >= 1 and word[-2] not in _not_s_endings:
            word = word[:-1]

    # Step 2
    e_removed = False
    if word[-1:] == 'e':
        if len(word[r1_start:]) >= 1 and word[-2] not in _vowels:
            word = _undouble(word[:-1])
            e_removed = True

    # Step 3a
    if word[-4:] == 'heid':
        if len(word[r2_start:]) >= 4 and word[-5] != 'c':
            word = word[:-4]
            if word[-2:] == 'en':
                if ((len(word[r1_start:]) >= 2 and
                     (word[-3] not in _vowels and word[-5:-2] != 'gem'))):
                    word = _undouble(word[:-2])

    # Step 3b
    if word[-4:] == 'lijk':
        if len(word[r2_start:]) >= 4:
            word = word[:-4]
            # Repeat step 2
            if word[-1:] == 'e':
                if len(word[r1_start:]) >= 1 and word[-2] not in _vowels:
                    word = _undouble(word[:-1])
    elif word[-4:] == 'baar':
        if len(word[r2_start:]) >= 4:
            word = word[:-4]
    elif word[-3:] in ('end', 'ing'):
        if len(word[r2_start:]) >= 3:
            word = word[:-3]
            if ((word[-2:] == 'ig' and len(word[r2_start:]) >= 2 and
                 word[-3] != 'e')):
                word = word[:-2]
            else:
                word = _undouble(word)
    elif word[-3:] == 'bar':
        # 'bar' is only removed if step 2 actually removed an 'e'
        if len(word[r2_start:]) >= 3 and e_removed:
            word = word[:-3]
    elif word[-2:] == 'ig':
        if len(word[r2_start:]) >= 2 and word[-3] != 'e':
            word = word[:-2]

    # Step 4
    # Undouble aa, ee, oo, uu between two non-vowels (the final one not I)
    if ((len(word) >= 4 and
         word[-3] == word[-2] and word[-2] in {'a', 'e', 'o', 'u'} and
         word[-4] not in _vowels and
         word[-1] not in _vowels and word[-1] != 'I')):
        word = word[:-2] + word[-1]

    # Change 'Y' and 'I' back to lowercase if survived stemming
    return word.replace('Y', 'y').replace('I', 'i')
def sb_norwegian(word):
    """Return Snowball Norwegian stem.

    This follows the Norwegian stemming algorithm of the Snowball project.

    :param str word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> sb_norwegian('lese')
    'les'
    >>> sb_norwegian('suspensjon')
    'suspensjon'
    >>> sb_norwegian('sikkerhet')
    'sikker'
    """
    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'å', 'æ', 'ø'}
    _s_endings = {'b', 'c', 'd', 'f', 'g', 'h', 'j', 'l', 'm', 'n', 'o', 'p',
                  'r', 't', 'v', 'y', 'z'}

    # Step 1 endings grouped by length, longest first
    _step1_endings = (
        (7, {'hetenes'}),
        (6, {'hetene', 'hetens'}),
        (5, {'heten', 'heter', 'endes'}),
        (4, {'ande', 'ende', 'edes', 'enes', 'erte'}),
        (3, {'ede', 'ane', 'ene', 'ens', 'ers', 'ets', 'het', 'ast', 'ert'}),
        (2, {'en', 'ar', 'er', 'as', 'es', 'et'}),
        (1, {'a', 'e'}),
    )
    # erte and ert are reduced to er rather than removed outright
    _step1_partial_cuts = {'erte': 2, 'ert': 1}
    # Step 3 endings grouped by length, longest first
    _step3_endings = (
        (7, {'hetslov'}),
        (4, {'eleg', 'elig', 'elov', 'slov'}),
        (3, {'leg', 'eig', 'lig', 'els', 'lov'}),
        (2, {'ig'}),
    )

    # lowercase, normalize, and compose
    word = normalize('NFC', text_type(word.lower()))

    r1_start = min(max(3, _sb_r1(word, _vowels)), len(word))

    # Step 1
    region = word[r1_start:]
    for size, endings in _step1_endings:
        ending = region[-size:]
        if ending in endings:
            word = word[:-_step1_partial_cuts.get(ending, size)]
            break
    else:
        if region[-1:] == 's':
            after_s_ending = len(word) > 1 and word[-2] in _s_endings
            after_consonant_k = (len(word) > 2 and word[-2] == 'k' and
                                 word[-3] not in _vowels)
            if after_s_ending or after_consonant_k:
                word = word[:-1]

    # Step 2
    if word[r1_start:][-2:] in {'dt', 'vt'}:
        word = word[:-1]

    # Step 3
    region = word[r1_start:]
    for size, endings in _step3_endings:
        if region[-size:] in endings:
            word = word[:-size]
            break

    return word
def sb_swedish(word):
    """Return Snowball Swedish stem.

    This follows the Swedish stemming algorithm of the Snowball project.

    :param str word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> sb_swedish('undervisa')
    'undervis'
    >>> sb_swedish('suspension')
    'suspension'
    >>> sb_swedish('visshet')
    'viss'
    """
    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'ä', 'å', 'ö'}
    _s_endings = {'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n',
                  'o', 'p', 'r', 't', 'v', 'y'}

    # Step 1 endings, one tuple per length, longest first; every ending in
    # a tuple has the same length, which is the number of letters removed
    _step1_endings = (
        ('heterna',),
        ('hetens',),
        ('anden', 'heten', 'heter', 'arnas', 'ernas', 'ornas', 'andes',
         'arens', 'andet'),
        ('arna', 'erna', 'orna', 'ande', 'arne', 'aste', 'aren', 'ades',
         'erns'),
        ('ade', 'are', 'ern', 'ens', 'het', 'ast'),
        ('ad', 'en', 'ar', 'er', 'or', 'as', 'es', 'at'),
        ('a', 'e'),
    )
    # Step 3 endings paired with the number of letters to remove
    _step3_endings = (
        (('fullt',), 1),
        (('löst',), 1),
        (('lig', 'els'), 3),
        (('ig',), 2),
    )

    # lowercase, normalize, and compose
    word = normalize('NFC', text_type(word.lower()))

    r1_start = min(max(3, _sb_r1(word, _vowels)), len(word))

    # Step 1
    region = word[r1_start:]
    for endings in _step1_endings:
        if region.endswith(endings):
            word = word[:-len(endings[0])]
            break
    else:
        if region.endswith('s'):
            if len(word) > 1 and word[-2] in _s_endings:
                word = word[:-1]

    # Step 2
    if word[r1_start:][-2:] in {'dd', 'gd', 'nn', 'dt', 'gt', 'kt', 'tt'}:
        word = word[:-1]

    # Step 3
    region = word[r1_start:]
    for endings, cut in _step3_endings:
        if region.endswith(endings):
            word = word[:-cut]
            break

    return word
def sb_danish(word):
    """Return Snowball Danish stem.

    This follows the Danish stemming algorithm of the Snowball project.

    :param str word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> sb_danish('underviser')
    'undervis'
    >>> sb_danish('suspension')
    'suspension'
    >>> sb_danish('sikkerhed')
    'sikker'
    """
    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'å', 'æ', 'ø'}
    _s_endings = {'a', 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n',
                  'o', 'p', 'r', 't', 'v', 'y', 'z', 'å'}

    def _step2(term):
        """Drop the final letter of gd, dt, gt, or kt when found in R1."""
        if term[r1_start:][-2:] in {'gd', 'dt', 'gt', 'kt'}:
            return term[:-1]
        return term

    # lowercase, normalize, and compose
    word = normalize('NFC', text_type(word.lower()))

    r1_start = min(max(3, _sb_r1(word, _vowels)), len(word))

    # Step 1
    region = word[r1_start:]
    if region.endswith('erendes'):
        word = word[:-7]
    elif region.endswith(('erende', 'hedens')):
        word = word[:-6]
    elif region.endswith(('ethed', 'erede', 'heden', 'heder', 'endes',
                          'ernes', 'erens', 'erets')):
        word = word[:-5]
    elif region.endswith(('ered', 'ende', 'erne', 'eren', 'erer', 'heds',
                          'enes', 'eres', 'eret')):
        word = word[:-4]
    elif region.endswith(('hed', 'ene', 'ere', 'ens', 'ers', 'ets')):
        word = word[:-3]
    elif region.endswith(('en', 'er', 'es', 'et')):
        word = word[:-2]
    elif region.endswith('e'):
        word = word[:-1]
    elif region.endswith('s'):
        if len(word) > 1 and word[-2] in _s_endings:
            word = word[:-1]

    # Step 2
    word = _step2(word)

    # Step 3
    # igst loses its st anywhere in the word, not only in R1
    if word.endswith('igst'):
        word = word[:-2]

    region = word[r1_start:]
    repeat_step2 = True
    if region.endswith('elig'):
        word = word[:-4]
    elif region.endswith('løst'):
        word = word[:-1]
        repeat_step2 = False
    elif region.endswith(('lig', 'els')):
        word = word[:-3]
    elif region.endswith('ig'):
        word = word[:-2]
    else:
        repeat_step2 = False

    if repeat_step2:
        word = _step2(word)

    # Step 4
    if ((len(word[r1_start:]) >= 1 and len(word) >= 2 and
         word[-1] == word[-2] and word[-1] not in _vowels)):
        word = word[:-1]

    return word
def clef_german(word):
    """Return CLEF German stem.

    The CLEF German stemmer is defined at :cite:`Savoy:2005`.

    :param str word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> clef_german('lesen')
    'lese'
    >>> clef_german('graues')
    'grau'
    >>> clef_german('buchstabieren')
    'buchstabier'
    """
    # lowercase, normalize, and compose
    word = normalize('NFC', text_type(word.lower()))

    # remove umlauts
    word = word.translate({ord('ä'): 'a', ord('ö'): 'o', ord('ü'): 'u'})

    # remove plurals; words of four letters or fewer are left untouched
    size = len(word)
    if size <= 4:
        return word
    if size > 6 and word.endswith('nen'):
        return word[:-3]
    if size > 5 and word.endswith(('en', 'se', 'es', 'er')):
        return word[:-2]
    if word[-1] in {'e', 'n', 'r', 's'}:
        return word[:-1]
    return word
def clef_german_plus(word):
    """Return 'CLEF German stemmer plus' stem.

    The CLEF German stemmer plus is defined at :cite:`Savoy:2005`.

    :param str word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> clef_german_plus('lesen')
    'les'
    >>> clef_german_plus('graues')
    'grau'
    >>> clef_german_plus('buchstabieren')
    'buchstabi'
    """
    _st_ending = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't'}

    # lowercase, normalize, and compose
    word = normalize('NFC', text_type(word.lower()))

    # strip umlauts and other accents from a, o, i, and u
    _accents = {}
    for accented, plain in (('äàáâ', 'a'), ('öòóô', 'o'),
                            ('ïìíî', 'i'), ('üùúû', 'u')):
        for char in accented:
            _accents[ord(char)] = plain
    word = word.translate(_accents)

    # Step 1
    size = len(word)
    if size > 5 and word.endswith('ern'):
        word = word[:-3]
    elif size > 4 and word.endswith(('em', 'en', 'er', 'es')):
        word = word[:-2]
    elif size > 3:
        if word[-1] == 'e':
            word = word[:-1]
        elif word[-1] == 's' and word[-2] in _st_ending:
            word = word[:-1]

    # Step 2
    size = len(word)
    if size > 5 and word.endswith('est'):
        word = word[:-3]
    elif size > 4:
        if word.endswith(('er', 'en')):
            word = word[:-2]
        elif word.endswith('st') and word[-3] in _st_ending:
            word = word[:-2]

    return word
def clef_swedish(word):
    """Return CLEF Swedish stem.

    The CLEF Swedish stemmer is defined at :cite:`Savoy:2005`.

    The word is used as given: no case folding or normalization is applied.

    :param str word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> clef_swedish('undervisa')
    'undervis'
    >>> clef_swedish('suspension')
    'suspensio'
    >>> clef_swedish('visshet')
    'viss'
    """
    # endings grouped by length, longest first
    _endings = (
        (5, {'elser', 'heten'}),
        (4, {'arne', 'erna', 'ande', 'else', 'aste', 'orna', 'aren'}),
        (3, {'are', 'ast', 'het'}),
        (2, {'ar', 'er', 'or', 'en', 'at', 'te', 'et'}),
        (1, {'a', 'e', 'n', 't'}),
    )

    last = len(word) - 1

    # drop a final s first
    if last > 3 and word[-1] == 's':
        word = word[:-1]
        last -= 1

    # an ending of a given length is only removed from a word whose last
    # index exceeds that length by more than one
    for size, endings in _endings:
        if last > size + 1 and word[-size:] in endings:
            return word[:-size]
    return word
def caumanns(word):
    """Return Caumanns German stem.

    Jörg Caumanns' stemmer is described in his article in
    :cite:`Caumanns:1999`.

    This implementation is based on the GermanStemFilter described at
    :cite:`Lang:2013`.

    :param str word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> caumanns('lesen')
    'les'
    >>> caumanns('graues')
    'grau'
    >>> caumanns('buchstabieren')
    'buchstabier'
    """
    if not word:
        return ''

    # letter groups and the single placeholder symbols standing in for them
    # while suffixes are stripped
    _substitutions = (('sch', '$'), ('ch', '§'), ('ei', '%'), ('ie', '&'),
                      ('ig', '#'), ('st', '!'))

    # a final t is only stripped from words that were not capitalized
    upper_initial = word[0].isupper()
    word = normalize('NFC', text_type(word.lower()))

    # # Part 2: Substitution
    # 1. Change umlauts to corresponding vowels & ß to ss
    word = word.translate({ord('ä'): 'a', ord('ö'): 'o', ord('ü'): 'u'})
    word = word.replace('ß', 'ss')

    # 2. Change second of doubled characters to *
    masked = [word[0]]
    for char in word[1:]:
        masked.append('*' if masked[-1] == char else char)
    word = ''.join(masked)

    # 3. Replace sch, ch, ei, ie, ig, st with $, §, %, &, #, !
    for group, symbol in _substitutions:
        word = word.replace(group, symbol)

    # # Part 1: Recursive Context-Free Stripping
    # 1. Remove the following 7 suffixes recursively
    while len(word) > 3:
        if len(word) > 4 and word.endswith(('em', 'er')):
            word = word[:-2]
        elif len(word) > 5 and word.endswith('nd'):
            word = word[:-2]
        elif word[-1] in {'e', 's', 'n'}:
            word = word[:-1]
        elif not upper_initial and word[-1] in {'t', '!'}:
            word = word[:-1]
        else:
            break

    # Additional optimizations:
    if len(word) > 5 and word.endswith('erin*'):
        word = word[:-1]
    if word[-1] == 'z':
        word = word[:-1] + 'x'

    # Reverse substitutions:
    for group, symbol in _substitutions:
        word = word.replace(symbol, group)

    # Expand doubled
    word = word[0] + ''.join(prev if cur == '*' else cur
                             for prev, cur in zip(word, word[1:]))

    # Finally, convert gege to ge
    if len(word) > 4:
        word = word.replace('gege', 'ge', 1)

    return word
def uealite(word, max_word_length=20, max_acro_length=8, return_rule_no=False,
            var=None):
    """Return UEA-Lite stem.

    The UEA-Lite stemmer is discussed in :cite:`Jenkins:2005`.

    This is chiefly based on the Java implementation of the algorithm, with
    variants based on the Perl implementation and Jason Adams' Ruby port.

    Java version: :cite:`Churchill:2005`
    Perl version: :cite:`Jenkins:2005`
    Ruby version: :cite:`Adams:2017`

    :param str word: the word to calculate the stem of
    :param int max_word_length: the maximum word length allowed
    :param int max_acro_length: the maximum acronym length allowed
    :param bool return_rule_no: if True, returns the stem along with rule
        number
    :param str var: variant to use (set to 'Adams' to use Jason Adams' rules,
        or 'Perl' to use the original Perl set of rules)
    :returns: word stem
    :rtype: str or (str, int)

    >>> uealite('readings')
    'read'
    >>> uealite('insulted')
    'insult'
    >>> uealite('cussed')
    'cuss'
    >>> uealite('fancies')
    'fancy'
    >>> uealite('eroded')
    'erode'
    """
    problem_words = {'is', 'as', 'this', 'has', 'was', 'during'}

    # rule table format:
    # top-level dictionary: length-of-suffix: dict-of-rules
    # dict-of-rules: suffix: (rule_no, suffix_length_to_delete,
    #                         suffix_to_append)
    rule_table = {7: {'titudes': (30, 1, None),
                      'fulness': (34, 4, None),
                      'ousness': (35, 4, None),
                      'eadings': (40.7, 4, None),
                      'oadings': (40.6, 4, None),
                      'ealings': (42.4, 4, None),
                      'ailings': (42.2, 4, None),
                      },
                  6: {'aceous': (1, 6, None),
                      'aining': (24, 3, None),
                      'acting': (25, 3, None),
                      'ttings': (26, 5, None),
                      'viding': (27, 3, 'e'),
                      'ssings': (37, 4, None),
                      'ulting': (38, 3, None),
                      'eading': (40.7, 3, None),
                      'oading': (40.6, 3, None),
                      'edings': (40.5, 4, None),
                      'ddings': (40.4, 5, None),
                      'ldings': (40.3, 4, None),
                      'rdings': (40.2, 4, None),
                      'ndings': (40.1, 4, None),
                      'llings': (41, 5, None),
                      'ealing': (42.4, 3, None),
                      'olings': (42.3, 4, None),
                      'ailing': (42.2, 3, None),
                      'elings': (42.1, 4, None),
                      'mmings': (44.3, 5, None),
                      'ngings': (45.2, 4, None),
                      'ggings': (45.1, 5, None),
                      'stings': (47, 4, None),
                      'etings': (48.4, 4, None),
                      'ntings': (48.2, 4, None),
                      'irings': (54.4, 4, 'e'),
                      'urings': (54.3, 4, 'e'),
                      'ncings': (54.2, 4, 'e'),
                      'things': (58.1, 1, None),
                      },
                  5: {'iases': (11.4, 2, None),
                      'ained': (13.6, 2, None),
                      'erned': (13.5, 2, None),
                      'ifted': (14, 2, None),
                      'ected': (15, 2, None),
                      'vided': (16, 1, None),
                      'erred': (19, 3, None),
                      'urred': (20.5, 3, None),
                      'lored': (20.4, 2, None),
                      'eared': (20.3, 2, None),
                      'tored': (20.2, 1, None),
                      'noted': (22.4, 1, None),
                      'leted': (22.3, 1, None),
                      'anges': (23, 1, None),
                      'tting': (26, 4, None),
                      'ulted': (32, 2, None),
                      'uming': (33, 3, 'e'),
                      'rabed': (36.1, 1, None),
                      'rebed': (36.1, 1, None),
                      'ribed': (36.1, 1, None),
                      'robed': (36.1, 1, None),
                      'rubed': (36.1, 1, None),
                      'ssing': (37, 3, None),
                      'vings': (39, 4, 'e'),
                      'eding': (40.5, 3, None),
                      'dding': (40.4, 4, None),
                      'lding': (40.3, 3, None),
                      'rding': (40.2, 3, None),
                      'nding': (40.1, 3, None),
                      'dings': (40, 4, 'e'),
                      'lling': (41, 4, None),
                      'oling': (42.3, 3, None),
                      'eling': (42.1, 3, None),
                      'lings': (42, 4, 'e'),
                      'mming': (44.3, 4, None),
                      'rming': (44.2, 3, None),
                      'lming': (44.1, 3, None),
                      'mings': (44, 4, 'e'),
                      'nging': (45.2, 3, None),
                      'gging': (45.1, 4, None),
                      'gings': (45, 4, 'e'),
                      'aning': (46.6, 3, None),
                      'ening': (46.5, 3, None),
                      'gning': (46.4, 3, None),
                      'nning': (46.3, 4, None),
                      'oning': (46.2, 3, None),
                      'rning': (46.1, 3, None),
                      'sting': (47, 3, None),
                      'eting': (48.4, 3, None),
                      'pting': (48.3, 3, None),
                      'nting': (48.2, 3, None),
                      'cting': (48.1, 3, None),
                      'tings': (48, 4, 'e'),
                      'iring': (54.4, 3, 'e'),
                      'uring': (54.3, 3, 'e'),
                      'ncing': (54.2, 3, 'e'),
                      'sings': (54, 4, 'e'),
                      # 'lling': (55, 3, None),  # masked by 41
                      'ating': (57, 3, 'e'),
                      'thing': (58.1, 0, None),
                      },
                  4: {'eeds': (7, 1, None),
                      'uses': (11.3, 1, None),
                      'sses': (11.2, 2, None),
                      'eses': (11.1, 2, 'is'),
                      'tled': (12.5, 1, None),
                      'pled': (12.4, 1, None),
                      'bled': (12.3, 1, None),
                      'eled': (12.2, 2, None),
                      'lled': (12.1, 2, None),
                      'ened': (13.7, 2, None),
                      'rned': (13.4, 2, None),
                      'nned': (13.3, 3, None),
                      'oned': (13.2, 2, None),
                      'gned': (13.1, 2, None),
                      'ered': (20.1, 2, None),
                      'reds': (20, 2, None),
                      'tted': (21, 3, None),
                      'uted': (22.2, 1, None),
                      'ated': (22.1, 1, None),
                      'ssed': (28, 2, None),
                      'umed': (31, 1, None),
                      'beds': (36, 3, None),
                      'ving': (39, 3, 'e'),
                      'ding': (40, 3, 'e'),
                      'ling': (42, 3, 'e'),
                      'nged': (43.2, 1, None),
                      'gged': (43.1, 3, None),
                      'ming': (44, 3, 'e'),
                      'ging': (45, 3, 'e'),
                      'ning': (46, 3, 'e'),
                      'ting': (48, 3, 'e'),
                      # 'ssed': (49, 2, None),  # masked by 28
                      # 'lled': (53, 2, None),  # masked by 12.1
                      'zing': (54.1, 3, 'e'),
                      'sing': (54, 3, 'e'),
                      'lves': (60.1, 3, 'f'),
                      'aped': (61.3, 1, None),
                      'uded': (61.2, 1, None),
                      'oded': (61.1, 1, None),
                      # 'ated': (61, 1, None),  # masked by 22.1
                      'ones': (63.6, 1, None),
                      'izes': (63.5, 1, None),
                      'ures': (63.4, 1, None),
                      'ines': (63.3, 1, None),
                      'ides': (63.2, 1, None),
                      },
                  3: {'ces': (2, 1, None),
                      'sis': (4, 0, None),
                      'tis': (5, 0, None),
                      'eed': (7, 0, None),
                      'ued': (8, 1, None),
                      'ues': (9, 1, None),
                      'ees': (10, 1, None),
                      'ses': (11, 1, None),
                      'led': (12, 2, None),
                      'ned': (13, 1, None),
                      'ved': (17, 1, None),
                      'ced': (18, 1, None),
                      'red': (20, 1, None),
                      'ted': (22, 2, None),
                      'sed': (29, 1, None),
                      'bed': (36, 2, None),
                      'ged': (43, 1, None),
                      'les': (50, 1, None),
                      'tes': (51, 1, None),
                      'zed': (52, 1, None),
                      'ied': (56, 3, 'y'),
                      'ies': (59, 3, 'y'),
                      'ves': (60, 1, None),
                      'pes': (63.8, 1, None),
                      'mes': (63.7, 1, None),
                      'ges': (63.1, 1, None),
                      'ous': (65, 0, None),
                      'ums': (66, 0, None),
                      },
                  2: {'cs': (3, 0, None),
                      'ss': (6, 0, None),
                      'es': (63, 2, None),
                      'is': (64, 2, 'e'),
                      'us': (67, 0, None),
                      }}

    if var == 'Perl':
        perl_deletions = {7: ['eadings', 'oadings', 'ealings', 'ailings'],
                          6: ['ttings', 'ssings', 'edings', 'ddings',
                              'ldings', 'rdings', 'ndings', 'llings',
                              'olings', 'elings', 'mmings', 'ngings',
                              'ggings', 'stings', 'etings', 'ntings',
                              'irings', 'urings', 'ncings', 'things'],
                          5: ['vings', 'dings', 'lings', 'mings', 'gings',
                              'tings', 'sings'],
                          4: ['eeds', 'reds', 'beds']}

        # Delete the above rules from rule_table
        for suffix_len, suffixes in perl_deletions.items():
            for suffix in suffixes:
                del rule_table[suffix_len][suffix]

    elif var == 'Adams':
        adams_additions = {6: {'chited': (22.8, 1, None)},
                           5: {'dying': (58.2, 4, 'ie'),
                               'tying': (58.2, 4, 'ie'),
                               'vited': (22.6, 1, None),
                               'mited': (22.5, 1, None),
                               'vided': (22.9, 1, None),
                               'mided': (22.10, 1, None),
                               'lying': (58.2, 4, 'ie'),
                               'arred': (19.1, 3, None),
                               },
                           4: {'ited': (22.7, 2, None),
                               'oked': (31.1, 1, None),
                               'aked': (31.1, 1, None),
                               'iked': (31.1, 1, None),
                               'uked': (31.1, 1, None),
                               'amed': (31, 1, None),
                               'imed': (31, 1, None),
                               'does': (31.2, 2, None),
                               },
                           3: {'oed': (31.3, 1, None),
                               'oes': (31.2, 1, None),
                               'kes': (63.1, 1, None),
                               'des': (63.10, 1, None),
                               'res': (63.9, 1, None),
                               }}

        # Add the above additional rules to rule_table
        # (an addition replaces an existing rule for the same suffix)
        for suffix_len, extra_rules in adams_additions.items():
            rule_table[suffix_len].update(extra_rules)
        # Add additional problem word
        problem_words.add('menses')

    def _stem_with_duplicate_character_check(term, del_len):
        """Strip del_len letters (plus a final s) and undouble the end."""
        if term[-1] == 's':
            del_len += 1
        stripped = term[:-del_len]
        if re_match(r'.*(\w)\1$', stripped):
            stripped = stripped[:-1]
        return stripped

    def _stem(term):
        """Return the stem of term together with the number of the rule."""
        if not term:
            return term, 0
        if term in problem_words:
            return term, 90
        if max_word_length and len(term) > max_word_length:
            return term, 95

        # contractions and possessives
        if "'" in term:
            expanded = term
            if term[-2:] in {"'s", "'S"}:
                expanded = term[:-2]
            if term[-1:] == "'":
                expanded = term[:-1]
            for contraction, full_form in (("n't", 'not'), ("'ve", 'have'),
                                           ("'re", 'are'), ("'m", 'am')):
                expanded = expanded.replace(contraction, full_form)
            return expanded, 94

        if term.isdigit():
            return term, 90.3

        # words with an inner or final hyphen are never stemmed
        hyphen = term.find('-')
        if len(term) > hyphen > 0:
            if term[:hyphen].isalpha() and term[hyphen+1:].isalpha():
                return term, 90.2
            return term, 90.1
        if '_' in term:
            return term, 90

        # acronyms (optionally pluralized) and other capitalized words
        if term[-1] == 's' and term[:-1].isupper():
            if var == 'Adams' and len(term)-1 > max_acro_length:
                return term, 96
            return term[:-1], 91.1
        if term.isupper():
            if var == 'Adams' and len(term) > max_acro_length:
                return term, 96
            return term, 91
        if re_match(r'^.*[A-Z].*[A-Z].*$', term):
            return term, 92
        if term[0].isupper():
            return term, 93
        if var == 'Adams' and re_match(r'^[a-z](|[rl])(ing|ed)$', term):
            return term, 97

        # longest matching suffix in the rule table wins
        for suffix_len in range(7, 1, -1):
            rule = rule_table[suffix_len].get(term[-suffix_len:])
            if rule is not None:
                rule_no, del_len, add_str = rule
                stemmed = term[:-del_len] if del_len else term
                if add_str:
                    stemmed += add_str
                return stemmed, rule_no

        # generic fallbacks
        if re_match(r'.*\w\wings?$', term):  # rule 58
            return _stem_with_duplicate_character_check(term, 3), 58
        if re_match(r'.*\w\weds?$', term):  # rule 62
            return _stem_with_duplicate_character_check(term, 2), 62
        if term[-1] == 's':  # rule 68
            return term[:-1], 68
        return term, 0

    stem, rule_no = _stem(word)
    if return_rule_no:
        return stem, rule_no
    return stem
def paice_husk(word):
    """Return the Paice-Husk stem of a word.

    This implements the Paice-Husk Stemmer, also known as the Lancaster
    Stemmer, developed by Chris Paice with the assistance of Gareth Husk.
    It follows the description of the algorithm in :cite:`Paice:1990`.

    The word is repeatedly matched against a table of endings, longest
    ending first. The first rule that yields an acceptable stem is applied;
    stemming stops when an applied rule is marked as terminating or when no
    rule can be applied any more.

    :param str word: the word to stem
    :returns: the stemmed word
    :rtype: str

    >>> paice_husk('assumption')
    'assum'
    >>> paice_husk('verifiable')
    'ver'
    >>> paice_husk('fancies')
    'fant'
    >>> paice_husk('fanciful')
    'fancy'
    >>> paice_husk('torment')
    'tor'
    """
    # Rules are grouped by ending length. Each rule is a 4-tuple:
    #   (intact_only, strip_len, append_str, terminates)
    # intact_only -- the rule applies only while no rule has been applied yet
    # strip_len   -- number of trailing characters to remove
    # append_str  -- string appended after stripping (None for nothing)
    # terminates  -- stemming stops once this rule has been applied
    # An ending may also map to a tuple of such rules, tried in order.
    rules_by_length = {
        6: {
            'ifiabl': (False, 6, None, True),
            'plicat': (False, 4, 'y', True),
        },
        5: {
            'guish': (False, 5, 'ct', True),
            'sumpt': (False, 2, None, True),
            'istry': (False, 5, None, True),
        },
        4: {
            'ytic': (False, 3, 's', True),
            'ceed': (False, 2, 'ss', True),
            'hood': (False, 4, None, False),
            'lief': (False, 1, 'v', True),
            'verj': (False, 1, 't', True),
            'misj': (False, 2, 't', True),
            'iabl': (False, 4, 'y', True),
            'iful': (False, 4, 'y', True),
            'sion': (False, 4, 'j', False),
            'xion': (False, 4, 'ct', True),
            'ship': (False, 4, None, False),
            'ness': (False, 4, None, False),
            'ment': (False, 4, None, False),
            'ript': (False, 2, 'b', True),
            'orpt': (False, 2, 'b', True),
            'duct': (False, 1, None, True),
            'cept': (False, 2, 'iv', True),
            'olut': (False, 2, 'v', True),
            'sist': (False, 0, None, True),
        },
        3: {
            'ied': (False, 3, 'y', False),
            'eed': (False, 1, None, True),
            'ing': (False, 3, None, False),
            'iag': (False, 3, 'y', True),
            'ish': (False, 3, None, False),
            'fuj': (False, 1, 's', True),
            'hej': (False, 1, 'r', True),
            'abl': (False, 3, None, False),
            'ibl': (False, 3, None, True),
            'bil': (False, 2, 'l', False),
            'ful': (False, 3, None, False),
            'ial': (False, 3, None, False),
            'ual': (False, 3, None, False),
            'ium': (False, 3, None, True),
            'ism': (False, 3, None, False),
            'ion': (False, 3, None, False),
            'ian': (False, 3, None, False),
            'een': (False, 0, None, True),
            'ear': (False, 0, None, True),
            'ier': (False, 3, 'y', False),
            'ies': (False, 3, 'y', False),
            'sis': (False, 2, None, True),
            'ous': (False, 3, None, False),
            'ent': (False, 3, None, False),
            'ant': (False, 3, None, False),
            'ist': (False, 3, None, False),
            'iqu': (False, 3, None, True),
            'ogu': (False, 1, None, True),
            'siv': (False, 3, 'j', False),
            'eiv': (False, 0, None, True),
            'bly': (False, 1, None, False),
            'ily': (False, 3, 'y', False),
            'ply': (False, 0, None, True),
            'ogy': (False, 1, None, True),
            'phy': (False, 1, None, True),
            'omy': (False, 1, None, True),
            'opy': (False, 1, None, True),
            'ity': (False, 3, None, False),
            'ety': (False, 3, None, False),
            'lty': (False, 2, None, True),
            'ary': (False, 3, None, False),
            'ory': (False, 3, None, False),
            'ify': (False, 3, None, True),
            'ncy': (False, 2, 't', False),
            'acy': (False, 3, None, False),
        },
        2: {
            'ia': (True, 2, None, True),
            'bb': (False, 1, None, True),
            'ic': (False, 2, None, False),
            'nc': (False, 1, 't', False),
            'dd': (False, 1, None, True),
            'ed': (False, 2, None, False),
            'if': (False, 2, None, False),
            'ag': (False, 2, None, False),
            'gg': (False, 1, None, True),
            'th': (True, 2, None, True),
            'ij': (False, 1, 'd', True),
            'uj': (False, 1, 'd', True),
            'oj': (False, 1, 'd', True),
            'nj': (False, 1, 'd', True),
            'cl': (False, 1, None, True),
            'ul': (False, 2, None, True),
            'al': (False, 2, None, False),
            'll': (False, 1, None, True),
            'um': (True, 2, None, True),
            'mm': (False, 1, None, True),
            'an': (False, 2, None, False),
            'en': (False, 2, None, False),
            'nn': (False, 1, None, True),
            'pp': (False, 1, None, True),
            'er': (False, 2, None, False),
            'ar': (False, 2, None, True),
            'or': (False, 2, None, False),
            'ur': (False, 2, None, False),
            'rr': (False, 1, None, True),
            'tr': (False, 1, None, False),
            'is': (False, 2, None, False),
            'ss': (False, 0, None, True),
            'us': (True, 2, None, True),
            'at': (False, 2, None, False),
            'tt': (False, 1, None, True),
            'iv': (False, 2, None, False),
            'ly': (False, 2, None, False),
            'iz': (False, 2, None, False),
            'yz': (False, 1, 's', True),
        },
        1: {
            'a': (True, 1, None, True),
            'e': (False, 1, None, False),
            'i': ((True, 1, None, True), (False, 1, 'y', False)),
            'j': (False, 1, 's', True),
            's': ((True, 1, None, False), (False, 0, None, True)),
        },
    }

    initial_vowels = {'a', 'e', 'i', 'o', 'u'}
    stem_vowels = {'a', 'e', 'i', 'o', 'u', 'y'}

    def _acceptable(stem):
        """Return True if stem is a permissible result of applying a rule.

        A stem starting with a, e, i, o, or u needs more than one character.
        Any other stem needs more than two characters and a vowel (y
        included) somewhere after its first character.
        """
        if stem and stem[0] in initial_vowels:
            return len(stem) > 1
        if len(stem) <= 2:
            return False
        return any(char in stem_vowels for char in stem[1:])

    def _try_rule(stem, rule, is_intact):
        """Return the stem produced by rule, or None if it cannot be applied.

        A rule cannot be applied when it is restricted to intact words and
        the word has already been changed, or when its result is not an
        acceptable stem.
        """
        intact_only, strip_len, append_str, _ = rule
        if intact_only and not is_intact:
            return None
        candidate = stem[:-strip_len] if strip_len else stem
        if append_str:
            candidate += append_str
        return candidate if _acceptable(candidate) else None

    # Ending lengths from longest (6) to shortest (1)
    suffix_lengths = sorted(rules_by_length, reverse=True)

    intact = True
    finished = False
    while not finished:
        for size in suffix_lengths:
            ending = word[-size:]
            if ending not in rules_by_length[size]:
                continue

            entry = rules_by_length[size][ending]
            # A single rule has four fields; anything shorter is a tuple of
            # alternative rules for the same ending.
            alternatives = entry if len(entry) < 4 else (entry,)

            applied = False
            for rule in alternatives:
                result = _try_rule(word, rule, intact)
                if result is not None:
                    word = result
                    intact = False
                    finished = rule[3]
                    applied = True
                    break

            if applied:
                # Restart matching from the longest endings on the new stem
                break
        else:
            # No rule could be applied at any ending length
            break

    return word
def schinke(word):
    """Return the noun and verb stems of a word per the Schinke stemmer.

    The stemmer is defined in :cite:`Schinke:1996`. The word is lowercased,
    decomposed (NFKD), and reduced to the letters a-z; 'j' is then mapped to
    'i' and 'v' to 'u'. An enclitic '-que' is removed unless the remainder is
    one of a fixed list of words, and finally one noun ending and one verb
    ending are removed or rewritten independently.

    :param str word: the word to stem
    :returns: a dict of the noun- and verb-stemmed word
    :rtype: dict

    >>> schinke('atque')
    {'n': 'atque', 'v': 'atque'}
    >>> schinke('census')
    {'n': 'cens', 'v': 'censu'}
    >>> schinke('virum')
    {'n': 'uir', 'v': 'uiru'}
    >>> schinke('populusque')
    {'n': 'popul', 'v': 'populu'}
    >>> schinke('senatus')
    {'n': 'senat', 'v': 'senatu'}
    """
    latin_letters = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
                     'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
                     'w', 'x', 'y', 'z'}

    # Words that keep a trailing 'que' (the part before 'que' is listed)
    keep_que = {'at', 'quo', 'ne', 'ita', 'abs', 'aps', 'abus', 'adae',
                'adus', 'deni', 'de', 'sus', 'obli', 'perae', 'plenis',
                'quando', 'quis', 'quae', 'cuius', 'cui', 'quem', 'quam',
                'qua', 'qui', 'quorum', 'quarum', 'quibus', 'quos', 'quas',
                'quotusquis', 'quous', 'ubi', 'undi', 'us', 'uter', 'uti',
                'utro', 'utribi', 'tor', 'co', 'conco', 'contor', 'detor',
                'deco', 'exco', 'extor', 'obtor', 'optor', 'retor', 'reco',
                'attor', 'inco', 'intor', 'praetor'}

    # Noun endings, keyed by length
    noun_endings = {4: {'ibus'},
                    3: {'ius'},
                    2: {'is', 'nt', 'ae', 'os', 'am', 'ud', 'as', 'um', 'em',
                        'us', 'es', 'ia'},
                    1: {'a', 'e', 'i', 'o', 'u'}}

    # Verb endings that are simply removed, keyed by length
    verb_endings_strip = {6: set(),
                          5: set(),
                          4: {'mini', 'ntur', 'stis'},
                          3: {'mur', 'mus', 'ris', 'sti', 'tis', 'tur'},
                          2: {'ns', 'nt', 'ri'},
                          1: {'m', 'r', 's', 't'}}

    # Verb endings that are replaced by another ending, keyed by length
    verb_endings_alter = {6: {'iuntur'},
                          5: {'beris', 'erunt', 'untur'},
                          4: {'iunt'},
                          3: {'bor', 'ero', 'unt'},
                          2: {'bo'},
                          1: set()}

    # Lowercase, decompose, and discard everything that is not a-z
    word = normalize('NFKD', text_type(word.lower()))
    word = ''.join(char for char in word if char in latin_letters)

    # Rule 2
    word = word.replace('j', 'i').replace('v', 'u')

    # Rule 3
    if word.endswith('que'):
        # This diverges from the paper by also returning 'que' itself
        # unstemmed
        if word == 'que' or word[:-3] in keep_que:
            return {'n': word, 'v': word}
        word = word[:-3]

    # Rule 4: the noun stem stays the word itself unless an ending matches
    # and at least two characters would remain after removing it
    noun = word
    for size in (4, 3, 2, 1):
        if word[-size:] in noun_endings[size]:
            if len(word) - size >= 2:
                noun = word[:-size]
            break

    # The verb stem likewise defaults to the word itself
    verb = word
    for size in (6, 5, 4, 3, 2, 1):
        ending = word[-size:]

        if ending in verb_endings_strip[size]:
            if len(word) - size >= 2:
                verb = word[:-size]
            break

        if ending in verb_endings_alter[size]:
            if ending in {'iuntur', 'erunt', 'untur', 'iunt', 'unt'}:
                replacement = 'i'
            elif ending in {'beris', 'bor', 'bo'}:
                replacement = 'bi'
            else:
                replacement = 'eri'
            candidate = word[:-size] + replacement

            # Technically this diverges from the paper by considering the
            # length of the stem without the new suffix
            if len(candidate) >= 2 + len(replacement):
                verb = candidate
            break

    return {'n': noun, 'v': verb}
def s_stemmer(word):
    """Return the S-stemmed form of a word.

    The S stemmer is defined in :cite:`Harman:1991`. Endings are compared
    case-insensitively, and the returned stem is cut from the word as it was
    passed in. Three rules are tried in order, and the first one that matches
    determines the result:

        - '-ies' (but not '-eies' or '-aies') is replaced by 'y', or by 'Y'
          if the word's final character is uppercase
        - '-es' (but not '-aes', '-ees', or '-oes') loses its final character
        - '-s' (but not '-us' or '-ss') loses its final character

    :param str word: the word to stem
    :returns: the stemmed word
    :rtype: str

    >>> s_stemmer('summaries')
    'summary'
    >>> s_stemmer('summary')
    'summary'
    >>> s_stemmer('towers')
    'tower'
    >>> s_stemmer('reading')
    'reading'
    >>> s_stemmer('census')
    'census'
    """
    folded = word.lower()

    if folded.endswith('ies') and not folded.endswith(('eies', 'aies')):
        # Match the case of the replacement letter to the word's last letter
        final_letter = 'Y' if word[-1:].isupper() else 'y'
        return word[:-3] + final_letter

    if folded.endswith('es') and not folded.endswith(('aes', 'ees', 'oes')):
        return word[:-1]

    if folded.endswith('s') and not folded.endswith(('us', 'ss')):
        return word[:-1]

    return word
if __name__ == '__main__':
    # When executed as a script, run the doctest examples in this module.
    from doctest import testmod
    testmod()