# Source code for abydos.stemmer._snowball

# -*- coding: utf-8 -*-

# Copyright 2014-2019 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.stemmer._snowball.

Snowball Stemmer base class
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from six.moves import range

from ._stemmer import _Stemmer

__all__ = ['_Snowball']


class _Snowball(_Stemmer):
    """Snowball stemmer base class.

    Collects the region (R1/R2), short-syllable, short-word, and vowel
    helper methods described by the Porter2 specification.

    .. versionadded:: 0.3.6
    """

    # Letters the helper methods treat as vowels.
    _vowels = set('aeiouy')
    # Non-vowels allowed to close a short syllable in the three-letter test;
    # note that 'w' and 'x' are deliberately absent from this set.
    _codanonvowels = set("'bcdfghjklmnpqrstvz")

    def _sb_r1(self, term, r1_prefixes=None):
        """Return the start of the R1 region, as defined in Porter2.

        R1 begins just after the first non-vowel that follows a vowel. If
        the term begins with one of the supplied prefixes, R1 instead begins
        immediately after that prefix.

        Parameters
        ----------
        term : str
            The term to examine
        r1_prefixes : set
            Prefixes to consider

        Returns
        -------
        int
            Index in ``term`` at which the R1 region begins; ``len(term)``
            when the region is empty

        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # Exceptional prefixes take precedence over the vowel scan below.
        if hasattr(r1_prefixes, '__iter__'):
            for prefix in r1_prefixes:
                if term.startswith(prefix):
                    return len(prefix)

        vowel_found = False
        for pos, letter in enumerate(term):
            if letter in self._vowels:
                vowel_found = True
            elif vowel_found:
                # First non-vowel after a vowel: R1 starts right after it.
                return pos + 1
        return len(term)

    def _sb_r2(self, term, r1_prefixes=None):
        """Return the start of the R2 region, as defined in Porter2.

        R2 is found by applying the R1 rule a second time, to the R1 region
        itself. The prefixes are only considered when locating R1.

        Parameters
        ----------
        term : str
            The term to examine
        r1_prefixes : set
            Prefixes to consider

        Returns
        -------
        int
            Index in ``term`` at which the R2 region begins; ``len(term)``
            when the region is empty

        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        r1_start = self._sb_r1(term, r1_prefixes)
        # The nested call returns an offset relative to R1, so shift it back
        # into the coordinates of the full term.
        return r1_start + self._sb_r1(term[r1_start:])

    def _sb_ends_in_short_syllable(self, term):
        """Return True iff term ends in a short syllable.

        (...according to the Porter2 specification.)

        NB: This is akin to the CVC test from the Porter stemmer. The
        description is unfortunately poor/ambiguous.

        Parameters
        ----------
        term : str
            The term to examine

        Returns
        -------
        bool
            True iff term ends in a short syllable

        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        if not term:
            return False
        if len(term) == 2:
            # Two-letter term: a vowel followed by any non-vowel.
            return term[0] in self._vowels and term[1] not in self._vowels
        if len(term) >= 3:
            # Longer term: non-vowel, vowel, then a permitted closing
            # non-vowel (see _codanonvowels).
            return (
                term[-3] not in self._vowels
                and term[-2] in self._vowels
                and term[-1] in self._codanonvowels
            )
        # A single letter can never form a short syllable.
        return False

    def _sb_short_word(self, term, r1_prefixes=None):
        """Return True iff term is a short word.

        (...according to the Porter2 specification.)

        A term is short when its R1 region is empty and it ends in a short
        syllable.

        Parameters
        ----------
        term : str
            The term to examine
        r1_prefixes : set
            Prefixes to consider

        Returns
        -------
        bool
            True iff term is a short word

        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        r1_is_empty = self._sb_r1(term, r1_prefixes) == len(term)
        return bool(r1_is_empty and self._sb_ends_in_short_syllable(term))

    def _sb_has_vowel(self, term):
        """Return True iff the term contains a vowel.

        Parameters
        ----------
        term : str
            The term to examine

        Returns
        -------
        bool
            True iff a vowel exists in the term (as defined in the Porter
            stemmer definition)

        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        return any(letter in self._vowels for letter in term)
if __name__ == '__main__':
    # Executed as a script: run any doctests found in this module.
    import doctest

    doctest.testmod()