# Source code for abydos.stemmer._clef_german_plus

# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.stemmer._clef_german_plus.

CLEF German plus stemmer
"""

from unicodedata import normalize

from deprecation import deprecated

from ._stemmer import _Stemmer
from .. import __version__

__all__ = ['CLEFGermanPlus', 'clef_german_plus']


class CLEFGermanPlus(_Stemmer):
    """CLEF German stemmer plus.

    The CLEF German stemmer plus is defined at :cite:`Savoy:2005`.

    .. versionadded:: 0.3.6
    """

    # Consonants after which a trailing 's' (Step 1) or 'st' (Step 2) is
    # treated as a removable ending.
    _st_ending = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't'}

    # Translation table for str.translate: code point of each accented
    # vowel -> its unaccented base letter.
    _accents = dict(
        zip((ord(_) for _ in 'äàáâöòóôïìíîüùúû'), 'aaaaooooiiiiuuuu')
    )

    def stem(self, word):
        """Return 'CLEF German stemmer plus' stem.

        The word is lowercased, NFC-normalized and stripped of the accents
        listed in ``_accents``; then at most one suffix is removed in each
        of two successive steps.

        Parameters
        ----------
        word : str
            The word to stem

        Returns
        -------
        str
            Word stem

        Examples
        --------
        >>> stmr = CLEFGermanPlus()
        >>> stmr.stem('lesen')
        'les'
        >>> stmr.stem('graues')
        'grau'
        >>> stmr.stem('buchstabieren')
        'buchstabi'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # lowercase, normalize, and compose (NFC yields the precomposed
        # characters that the _accents table is keyed on)
        word = normalize('NFC', word.lower())

        # remove umlauts and other accents
        word = word.translate(self._accents)

        # Step 1: strip one of -ern, -em/-en/-er/-es, -e, or -s (the last
        # only after a consonant in _st_ending).
        # wlen is the index of the last character (length minus one); the
        # guards below are expressed against it, so a word shorter than
        # four characters is never touched.
        wlen = len(word) - 1
        if wlen > 4 and word.endswith('ern'):
            word = word[:-3]
        elif wlen > 3 and word.endswith(('em', 'en', 'er', 'es')):
            word = word[:-2]
        elif wlen > 2 and (
            word[-1] == 'e'
            or (word[-1] == 's' and word[-2] in self._st_ending)
        ):
            word = word[:-1]

        # Step 2: strip one of -est, -er/-en, or -st (the last only after
        # a consonant in _st_ending). The length is recomputed because
        # Step 1 may have shortened the word.
        wlen = len(word) - 1
        if wlen > 4 and word.endswith('est'):
            word = word[:-3]
        elif wlen > 3 and (
            word.endswith(('er', 'en'))
            or (word.endswith('st') and word[-3] in self._st_ending)
        ):
            word = word[:-2]

        return word
@deprecated(
    deprecated_in='0.4.0',
    removed_in='0.6.0',
    current_version=__version__,
    details='Use the CLEFGermanPlus.stem method instead.',
)
def clef_german_plus(word):
    """Return 'CLEF German stemmer plus' stem.

    This is a wrapper for :py:meth:`CLEFGermanPlus.stem`: it builds a new
    :py:class:`CLEFGermanPlus` instance on every call and returns the
    result of stemming ``word`` with it.

    Parameters
    ----------
    word : str
        The word to stem

    Returns
    -------
    str
        Word stem

    Examples
    --------
    >>> clef_german_plus('lesen')
    'les'
    >>> clef_german_plus('graues')
    'grau'
    >>> clef_german_plus('buchstabieren')
    'buchstabi'

    .. versionadded:: 0.1.0

    """
    stemmer = CLEFGermanPlus()
    return stemmer.stem(word)
# Run this module's docstring examples when it is executed as a script.
if __name__ == '__main__':
    from doctest import testmod

    testmod()