# -*- coding: utf-8 -*-
# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.stemmer._clef.
The stemmer._clef module defines CLEF stemmers for:
- German
- German plus
- Swedish
"""
from __future__ import unicode_literals
from unicodedata import normalize
from six import text_type
__all__ = ['clef_german', 'clef_german_plus', 'clef_swedish']
[docs]def clef_german(word):
"""Return CLEF German stem.
The CLEF German stemmer is defined at :cite:`Savoy:2005`.
:param str word: the word to calculate the stem of
:returns: word stem
:rtype: str
>>> clef_german('lesen')
'lese'
>>> clef_german('graues')
'grau'
>>> clef_german('buchstabieren')
'buchstabier'
"""
# lowercase, normalize, and compose
word = normalize('NFC', text_type(word.lower()))
# remove umlauts
_umlauts = dict(zip((ord(_) for _ in 'äöü'), 'aou'))
word = word.translate(_umlauts)
# remove plurals
wlen = len(word) - 1
if wlen > 3:
if wlen > 5:
if word[-3:] == 'nen':
return word[:-3]
if wlen > 4:
if word[-2:] in {'en', 'se', 'es', 'er'}:
return word[:-2]
if word[-1] in {'e', 'n', 'r', 's'}:
return word[:-1]
return word
[docs]def clef_german_plus(word):
"""Return 'CLEF German stemmer plus' stem.
The CLEF German stemmer plus is defined at :cite:`Savoy:2005`.
:param str word: the word to calculate the stem of
:returns: word stem
:rtype: str
>>> clef_german_plus('lesen')
'les'
>>> clef_german_plus('graues')
'grau'
>>> clef_german_plus('buchstabieren')
'buchstabi'
"""
_st_ending = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't'}
# lowercase, normalize, and compose
word = normalize('NFC', text_type(word.lower()))
# remove umlauts
_accents = dict(
zip((ord(_) for _ in 'äàáâöòóôïìíîüùúû'), 'aaaaooooiiiiuuuu')
)
word = word.translate(_accents)
# Step 1
wlen = len(word) - 1
if wlen > 4 and word[-3:] == 'ern':
word = word[:-3]
elif wlen > 3 and word[-2:] in {'em', 'en', 'er', 'es'}:
word = word[:-2]
elif wlen > 2 and (
word[-1] == 'e' or (word[-1] == 's' and word[-2] in _st_ending)
):
word = word[:-1]
# Step 2
wlen = len(word) - 1
if wlen > 4 and word[-3:] == 'est':
word = word[:-3]
elif wlen > 3 and (
word[-2:] in {'er', 'en'}
or (word[-2:] == 'st' and word[-3] in _st_ending)
):
word = word[:-2]
return word
[docs]def clef_swedish(word):
"""Return CLEF Swedish stem.
The CLEF Swedish stemmer is defined at :cite:`Savoy:2005`.
:param str word: the word to calculate the stem of
:returns: word stem
:rtype: str
>>> clef_swedish('undervisa')
'undervis'
>>> clef_swedish('suspension')
'suspensio'
>>> clef_swedish('visshet')
'viss'
"""
wlen = len(word) - 1
if wlen > 3 and word[-1] == 's':
word = word[:-1]
wlen -= 1
if wlen > 6:
if word[-5:] in {'elser', 'heten'}:
return word[:-5]
if wlen > 5:
if word[-4:] in {
'arne',
'erna',
'ande',
'else',
'aste',
'orna',
'aren',
}:
return word[:-4]
if wlen > 4:
if word[-3:] in {'are', 'ast', 'het'}:
return word[:-3]
if wlen > 3:
if word[-2:] in {'ar', 'er', 'or', 'en', 'at', 'te', 'et'}:
return word[:-2]
if wlen > 2:
if word[-1] in {'a', 'e', 'n', 't'}:
return word[:-1]
return word
if __name__ == '__main__':
import doctest
doctest.testmod()