Source code for abydos.stemmer._clef_german_plus

# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.stemmer._clef_german_plus.

CLEF German plus stemmer
"""

from unicodedata import normalize

from deprecation import deprecated

from ._stemmer import _Stemmer
from .. import __version__

__all__ = ['CLEFGermanPlus', 'clef_german_plus']


[docs]class CLEFGermanPlus(_Stemmer):
    """CLEF German stemmer plus.

    The CLEF German stemmer plus is defined at :cite:`Savoy:2005`.

    .. versionadded:: 0.3.6
    """

    _st_ending = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't'}

    _accents = dict(
        zip((ord(_) for _ in 'äàáâöòóôïìíîüùúû'), 'aaaaooooiiiiuuuu')
    )

[docs]    def stem(self, word):
        """Return 'CLEF German stemmer plus' stem.

        Parameters
        ----------
        word : str
            The word to stem

        Returns
        -------
        str
            Word stem

        Examples
        --------
        >>> stmr = CLEFGermanPlus()
        >>> clef_german_plus('lesen')
        'les'
        >>> clef_german_plus('graues')
        'grau'
        >>> clef_german_plus('buchstabieren')
        'buchstabi'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # lowercase, normalize, and compose
        word = normalize('NFC', word.lower())

        # remove umlauts
        word = word.translate(self._accents)

        # Step 1
        wlen = len(word) - 1
        if wlen > 4 and word[-3:] == 'ern':
            word = word[:-3]
        elif wlen > 3 and word[-2:] in {'em', 'en', 'er', 'es'}:
            word = word[:-2]
        elif wlen > 2 and (
            word[-1] == 'e'
            or (word[-1] == 's' and word[-2] in self._st_ending)
        ):
            word = word[:-1]

        # Step 2
        wlen = len(word) - 1
        if wlen > 4 and word[-3:] == 'est':
            word = word[:-3]
        elif wlen > 3 and (
            word[-2:] in {'er', 'en'}
            or (word[-2:] == 'st' and word[-3] in self._st_ending)
        ):
            word = word[:-2]

        return word


[docs]@deprecated(
    deprecated_in='0.4.0',
    removed_in='0.6.0',
    current_version=__version__,
    details='Use the CLEFGermanPlus.stem method instead.',
)
def clef_german_plus(word):
    """Return 'CLEF German stemmer plus' stem.

    This is a wrapper for :py:meth:`CLEFGermanPlus.stem`.

    Parameters
    ----------
    word : str
        The word to stem

    Returns
    -------
    str
        Word stem

    Examples
    --------
    >>> stmr = CLEFGermanPlus()
    >>> clef_german_plus('lesen')
    'les'
    >>> clef_german_plus('graues')
    'grau'
    >>> clef_german_plus('buchstabieren')
    'buchstabi'

    .. versionadded:: 0.1.0

    """
    return CLEFGermanPlus().stem(word)


if __name__ == '__main__':
    import doctest

    doctest.testmod()