Source code for abydos.stemmer._schinke

# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.stemmer._schinke.

Schinke Latin stemmer.
"""

from unicodedata import normalize

from deprecation import deprecated

from ._stemmer import _Stemmer
from .. import __version__

__all__ = ['Schinke', 'schinke']


[docs]class Schinke(_Stemmer):
    """Schinke stemmer.

    This is defined in :cite:`Schinke:1996`.

    .. versionadded:: 0.3.6
    """

    _keep_que = {
        'at',
        'quo',
        'ne',
        'ita',
        'abs',
        'aps',
        'abus',
        'adae',
        'adus',
        'deni',
        'de',
        'sus',
        'obli',
        'perae',
        'plenis',
        'quando',
        'quis',
        'quae',
        'cuius',
        'cui',
        'quem',
        'quam',
        'qua',
        'qui',
        'quorum',
        'quarum',
        'quibus',
        'quos',
        'quas',
        'quotusquis',
        'quous',
        'ubi',
        'undi',
        'us',
        'uter',
        'uti',
        'utro',
        'utribi',
        'tor',
        'co',
        'conco',
        'contor',
        'detor',
        'deco',
        'exco',
        'extor',
        'obtor',
        'optor',
        'retor',
        'reco',
        'attor',
        'inco',
        'intor',
        'praetor',
    }

    _n_endings = {
        4: {'ibus'},
        3: {'ius'},
        2: {
            'is',
            'nt',
            'ae',
            'os',
            'am',
            'ud',
            'as',
            'um',
            'em',
            'us',
            'es',
            'ia',
        },
        1: {'a', 'e', 'i', 'o', 'u'},
    }

    _v_endings_strip = {
        6: {},
        5: {},
        4: {'mini', 'ntur', 'stis'},
        3: {'mur', 'mus', 'ris', 'sti', 'tis', 'tur'},
        2: {'ns', 'nt', 'ri'},
        1: {'m', 'r', 's', 't'},
    }
    _v_endings_alter = {
        6: {'iuntur'},
        5: {'beris', 'erunt', 'untur'},
        4: {'iunt'},
        3: {'bor', 'ero', 'unt'},
        2: {'bo'},
        1: {},
    }

[docs]    def stem(self, word):
        """Return the stem of a word according to the Schinke stemmer.

        Parameters
        ----------
        word : str
            The word to stem

        Returns
        -------
        str
            Word stem

        Examples
        --------
        >>> stmr = Schinke()
        >>> stmr.stem('atque')
        {'n': 'atque', 'v': 'atque'}
        >>> stmr.stem('census')
        {'n': 'cens', 'v': 'censu'}
        >>> stmr.stem('virum')
        {'n': 'uir', 'v': 'uiru'}
        >>> stmr.stem('populusque')
        {'n': 'popul', 'v': 'populu'}
        >>> stmr.stem('senatus')
        {'n': 'senat', 'v': 'senatu'}


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        word = normalize('NFKD', word.lower())
        word = ''.join(
            c
            for c in word
            if c
            in {
                'a',
                'b',
                'c',
                'd',
                'e',
                'f',
                'g',
                'h',
                'i',
                'j',
                'k',
                'l',
                'm',
                'n',
                'o',
                'p',
                'q',
                'r',
                's',
                't',
                'u',
                'v',
                'w',
                'x',
                'y',
                'z',
            }
        )

        # Rule 2
        word = word.replace('j', 'i').replace('v', 'u')

        # Rule 3
        if word[-3:] == 'que':
            # This diverges from the paper by also returning 'que' itself
            #  unstemmed
            if word[:-3] in self._keep_que or word == 'que':
                return {'n': word, 'v': word}
            else:
                word = word[:-3]

        # Base case will mean returning the words as is
        noun = word
        verb = word

        # Rule 4
        for endlen in range(4, 0, -1):
            if word[-endlen:] in self._n_endings[endlen]:
                if len(word) - 2 >= endlen:
                    noun = word[:-endlen]
                else:
                    noun = word
                break

        for endlen in range(6, 0, -1):
            if word[-endlen:] in self._v_endings_strip[endlen]:
                if len(word) - 2 >= endlen:
                    verb = word[:-endlen]
                else:
                    verb = word
                break
            if word[-endlen:] in self._v_endings_alter[endlen]:
                if word[-endlen:] in {
                    'iuntur',
                    'erunt',
                    'untur',
                    'iunt',
                    'unt',
                }:
                    new_word = word[:-endlen] + 'i'
                    addlen = 1
                elif word[-endlen:] in {'beris', 'bor', 'bo'}:
                    new_word = word[:-endlen] + 'bi'
                    addlen = 2
                else:
                    new_word = word[:-endlen] + 'eri'
                    addlen = 3

                # Technically this diverges from the paper by considering the
                # length of the stem without the new suffix
                if len(new_word) >= 2 + addlen:
                    verb = new_word
                else:
                    verb = word
                break

        return {'n': noun, 'v': verb}


[docs]@deprecated(
    deprecated_in='0.4.0',
    removed_in='0.6.0',
    current_version=__version__,
    details='Use the Schinke.stem method instead.',
)
def schinke(word):
    """Return the stem of a word according to the Schinke stemmer.

    This is a wrapper for :py:meth:`Schinke.stem`.

    Parameters
    ----------
    word : str
        The word to stem

    Returns
    -------
    str
        Word stem

    Examples
    --------
    >>> schinke('atque')
    {'n': 'atque', 'v': 'atque'}
    >>> schinke('census')
    {'n': 'cens', 'v': 'censu'}
    >>> schinke('virum')
    {'n': 'uir', 'v': 'uiru'}
    >>> schinke('populusque')
    {'n': 'popul', 'v': 'populu'}
    >>> schinke('senatus')
    {'n': 'senat', 'v': 'senatu'}

    .. versionadded:: 0.3.0

    """
    return Schinke().stem(word)


if __name__ == '__main__':
    import doctest

    doctest.testmod()