# Source code for abydos.stemmer._porter

# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.stemmer._porter.

Porter stemmer
"""

from unicodedata import normalize

from deprecation import deprecated

from ._stemmer import _Stemmer
from .. import __version__

__all__ = ['Porter', 'porter']


class Porter(_Stemmer):
    """Porter stemmer.

    The Porter stemmer is described in :cite:`Porter:1980`.

    .. versionadded:: 0.3.6
    """

    # Lowercase 'y' is treated as a vowel; consonantal y is re-mapped to the
    # uppercase 'Y' inside stem() so that it falls outside this set.
    _vowels = {'a', 'e', 'i', 'o', 'u', 'y'}

    # Step 2 rules as (suffix, replacement). A rule fires only when the stem
    # preceding the suffix has m-degree > 0. Only the first matching suffix is
    # considered, so a longer suffix must precede any shorter suffix that it
    # ends with ('ational' before 'tional', 'ization' before 'ation').
    _step2_rules = (
        ('ational', 'ate'),
        ('tional', 'tion'),
        ('enci', 'ence'),
        ('anci', 'ance'),
        ('izer', 'ize'),
        ('logi', 'log'),
        ('bli', 'ble'),
        ('alli', 'al'),
        ('entli', 'ent'),
        ('eli', 'e'),
        ('ousli', 'ous'),
        ('ization', 'ize'),
        ('ation', 'ate'),
        ('ator', 'ate'),
        ('alism', 'al'),
        ('iveness', 'ive'),
        ('fulness', 'ful'),
        ('ousness', 'ous'),
        ('aliti', 'al'),
        ('iviti', 'ive'),
        ('biliti', 'ble'),
    )

    # Step 3 rules, same format and same m-degree > 0 condition as Step 2.
    _step3_rules = (
        ('icate', 'ic'),
        ('ative', ''),
        ('alize', 'al'),
        ('iciti', 'ic'),
        ('ical', 'ic'),
        ('ful', ''),
        ('ness', ''),
    )

    # Step 4 suffixes, removed when the remaining stem has m-degree > 1.
    # Order matters for 'ement' / 'ment' / 'ent': the first match is the only
    # one tested. 'sion' and 'tion' are special-cased in _step4().
    _step4_suffixes = (
        'al',
        'ance',
        'ence',
        'er',
        'ic',
        'able',
        'ible',
        'ant',
        'ement',
        'ment',
        'ent',
        'sion',
        'tion',
        'ou',
        'ism',
        'ate',
        'iti',
        'ous',
        'ive',
        'ize',
    )

    def __init__(self, early_english=False):
        """Initialize Porter instance.

        Parameters
        ----------
        early_english : bool
            Set to True in order to remove -eth & -est (2nd & 3rd person
            singular verbal agreement suffixes)


        .. versionadded:: 0.4.0

        """
        self._early_english = early_english

    def _m_degree(self, term):
        """Return Porter helper function _m_degree value.

        m-degree is equal to the number of V to C transitions

        Parameters
        ----------
        term : str
            The word for which to calculate the m-degree

        Returns
        -------
        int
            The m-degree as defined in the Porter stemmer definition


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        vowels = self._vowels
        # Count every adjacent (vowel, non-vowel) pair.
        return sum(
            1
            for prev, cur in zip(term, term[1:])
            if prev in vowels and cur not in vowels
        )

    def _has_vowel(self, term):
        """Return Porter helper function _has_vowel value.

        Parameters
        ----------
        term : str
            The word to scan for vowels

        Returns
        -------
        bool
            True iff a vowel exists in the term (as defined in the Porter
            stemmer definition)


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        return any(letter in self._vowels for letter in term)

    def _ends_in_doubled_cons(self, term):
        """Return Porter helper function _ends_in_doubled_cons value.

        Parameters
        ----------
        term : str
            The word to check for a final doubled consonant

        Returns
        -------
        bool
            True iff the stem ends in a doubled consonant (as defined in the
            Porter stemmer definition)


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        return (
            len(term) > 1
            and term[-1] not in self._vowels
            and term[-2] == term[-1]
        )

    def _ends_in_cvc(self, term):
        """Return Porter helper function _ends_in_cvc value.

        Parameters
        ----------
        term : str
            The word to scan for cvc

        Returns
        -------
        bool
            True iff the stem ends in cvc (as defined in the Porter stemmer
            definition)


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        return len(term) > 2 and (
            term[-1] not in self._vowels
            and term[-2] in self._vowels
            and term[-3] not in self._vowels
            # a final w, x, or consonantal Y does not count as the last C
            and term[-1] not in {'w', 'x', 'Y'}
        )

    def _replace_suffix(self, word, rules):
        """Apply the first matching (suffix, replacement) rule to word.

        Parameters
        ----------
        word : str
            The word to transform
        rules : iterable of (str, str)
            Ordered (suffix, replacement) pairs

        Returns
        -------
        str
            The word with the first matching suffix replaced if the stem
            preceding that suffix has m-degree > 0; otherwise the word
            unchanged. Rules after the first matching suffix are never tried.

        """
        for suffix, replacement in rules:
            if word.endswith(suffix):
                stem = word[: -len(suffix)]
                if self._m_degree(stem) > 0:
                    return stem + replacement
                return word
        return word

    def _step1a(self, word):
        """Return word after Step 1a (-sses, -ies, and -s endings)."""
        if word.endswith(('sses', 'ies')):
            return word[:-2]
        # A final -ss is kept intact; any other final -s is dropped.
        if word.endswith('s') and not word.endswith('ss'):
            return word[:-1]
        return word

    def _step1b(self, word):
        """Return word after Step 1b (-eed, -ed, -ing; optionally -est, -eth).

        The -est and -eth suffixes are only considered when the instance was
        created with early_english=True.
        """
        if word.endswith('eed'):
            if self._m_degree(word[:-3]) > 0:
                word = word[:-1]
            return word

        suffixes = ('ed', 'ing')
        if self._early_english:
            suffixes += ('est', 'eth')

        for suffix in suffixes:
            if word.endswith(suffix):
                stem = word[: -len(suffix)]
                if not self._has_vowel(stem):
                    # First matching suffix failed its condition: stop here.
                    return word
                word = stem
                break
        else:
            return word

        # A suffix was removed above; tidy up the resulting stem.
        if word[-2:] in {'at', 'bl', 'iz'}:
            word += 'e'
        elif self._ends_in_doubled_cons(word) and word[-1] not in {
            'l',
            's',
            'z',
        }:
            word = word[:-1]
        elif self._m_degree(word) == 1 and self._ends_in_cvc(word):
            word += 'e'
        return word

    def _step4(self, word):
        """Return word after Step 4 (suffix removal when m-degree > 1)."""
        for suffix in self._step4_suffixes:
            if word.endswith(suffix):
                # For -sion/-tion only 'ion' is removed; the s or t stays in
                # the stem and is included when measuring the m-degree.
                removed = 3 if suffix in {'sion', 'tion'} else len(suffix)
                stem = word[:-removed]
                return stem if self._m_degree(stem) > 1 else word
        return word

    def _step5(self, word):
        """Return word after Step 5 (final -e removal and -ll reduction)."""
        # Step 5a
        if word.endswith('e'):
            stem = word[:-1]
            measure = self._m_degree(stem)
            if measure > 1 or (measure == 1 and not self._ends_in_cvc(stem)):
                word = stem

        # Step 5b
        if word.endswith('ll') and self._m_degree(word) > 1:
            word = word[:-1]
        return word

    def stem(self, word):
        """Return Porter stem.

        Parameters
        ----------
        word : str
            The word to stem

        Returns
        -------
        str
            Word stem

        Examples
        --------
        >>> stmr = Porter()
        >>> stmr.stem('reading')
        'read'
        >>> stmr.stem('suspension')
        'suspens'
        >>> stmr.stem('elusiveness')
        'elus'

        >>> stmr = Porter(early_english=True)
        >>> stmr.stem('eateth')
        'eat'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # lowercase, normalize, and compose
        word = normalize('NFC', word.lower())

        # Words shorter than 3 characters are returned as-is (lowercased and
        # normalized, but otherwise unstemmed)
        if len(word) < 3:
            return word

        # Re-map consonantal y to Y (Y will be C, y will be V): an initial y,
        # or a y that directly follows a vowel. The pass is sequential, so a
        # y that has just become Y no longer counts as a vowel for the next
        # character.
        letters = list(word)
        if letters[0] == 'y':
            letters[0] = 'Y'
        for i in range(1, len(letters)):
            if letters[i] == 'y' and letters[i - 1] in self._vowels:
                letters[i] = 'Y'
        word = ''.join(letters)

        word = self._step1a(word)
        word = self._step1b(word)

        # Step 1c
        if word[-1:] in {'Y', 'y'} and self._has_vowel(word[:-1]):
            word = word[:-1] + 'i'

        # Steps 2 & 3
        word = self._replace_suffix(word, self._step2_rules)
        word = self._replace_suffix(word, self._step3_rules)

        word = self._step4(word)
        word = self._step5(word)

        # Change 'Y' back to 'y' if it survived stemming
        return word.replace('Y', 'y')
@deprecated(
    deprecated_in='0.4.0',
    removed_in='0.6.0',
    current_version=__version__,
    details='Use the Porter.stem method instead.',
)
def porter(word, early_english=False):
    """Return Porter stem.

    This is a wrapper for :py:meth:`Porter.stem`: it builds a
    :py:class:`Porter` instance with the given ``early_english`` setting and
    returns the result of calling its ``stem`` method on ``word``.

    Parameters
    ----------
    word : str
        The word to stem
    early_english : bool
        Set to True in order to remove -eth & -est (2nd & 3rd person
        singular verbal agreement suffixes)

    Returns
    -------
    str
        Word stem

    Examples
    --------
    >>> porter('reading')
    'read'
    >>> porter('suspension')
    'suspens'
    >>> porter('elusiveness')
    'elus'

    >>> porter('eateth', early_english=True)
    'eat'

    .. versionadded:: 0.1.0

    """
    stemmer = Porter(early_english)
    return stemmer.stem(word)
if __name__ == '__main__':
    # When executed as a script, run the doctest examples embedded in this
    # module's docstrings.
    from doctest import testmod

    testmod()