Source code for abydos.stemmer._caumanns

# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.stemmer._caumanns.

Caumanns German stemmer
"""

from unicodedata import normalize

from deprecation import deprecated

from ._stemmer import _Stemmer
from .. import __version__

__all__ = ['Caumanns', 'caumanns']


[docs]class Caumanns(_Stemmer): """Caumanns stemmer. Jörg Caumanns' stemmer is described in his article in :cite:`Caumanns:1999`. This implementation is based on the GermanStemFilter described at :cite:`Lang:2013`. .. versionadded:: 0.3.6 """ _umlauts = dict(zip((ord(_) for _ in 'äöü'), 'aou'))
[docs] def stem(self, word): """Return Caumanns German stem. Parameters ---------- word : str The word to stem Returns ------- str Word stem Examples -------- >>> stmr = Caumanns() >>> stmr.stem('lesen') 'les' >>> stmr.stem('graues') 'grau' >>> stmr.stem('buchstabieren') 'buchstabier' .. versionadded:: 0.2.0 .. versionchanged:: 0.3.6 Encapsulated in class """ if not word: return '' upper_initial = word[0].isupper() word = normalize('NFC', word.lower()) # # Part 2: Substitution # 1. Change umlauts to corresponding vowels & ß to ss word = word.translate(self._umlauts) word = word.replace('ß', 'ss') # 2. Change second of doubled characters to * new_word = word[0] for i in range(1, len(word)): if new_word[i - 1] == word[i]: new_word += '*' else: new_word += word[i] word = new_word # 3. Replace sch, ch, ei, ie with $, §, %, & word = word.replace('sch', '$') word = word.replace('ch', '§') word = word.replace('ei', '%') word = word.replace('ie', '&') word = word.replace('ig', '#') word = word.replace('st', '!') # # Part 1: Recursive Context-Free Stripping # 1. Remove the following 7 suffixes recursively while len(word) > 3: if (len(word) > 4 and word[-2:] in {'em', 'er'}) or ( len(word) > 5 and word[-2:] == 'nd' ): word = word[:-2] elif (word[-1] in {'e', 's', 'n'}) or ( not upper_initial and word[-1] in {'t', '!'} ): word = word[:-1] else: break # Additional optimizations: if len(word) > 5 and word[-5:] == 'erin*': word = word[:-1] if word[-1] == 'z': word = word[:-1] + 'x' # Reverse substitutions: word = word.replace('$', 'sch') word = word.replace('§', 'ch') word = word.replace('%', 'ei') word = word.replace('&', 'ie') word = word.replace('#', 'ig') word = word.replace('!', 'st') # Expand doubled word = ''.join( [word[0]] + [ word[i - 1] if word[i] == '*' else word[i] for i in range(1, len(word)) ] ) # Finally, convert gege to ge if len(word) > 4: word = word.replace('gege', 'ge', 1) return word
[docs]@deprecated( deprecated_in='0.4.0', removed_in='0.6.0', current_version=__version__, details='Use the Caumanns.stem method instead.', ) def caumanns(word): """Return Caumanns German stem. This is a wrapper for :py:meth:`Caumanns.stem`. Parameters ---------- word : str The word to stem Returns ------- str Word stem Examples -------- >>> caumanns('lesen') 'les' >>> caumanns('graues') 'grau' >>> caumanns('buchstabieren') 'buchstabier' .. versionadded:: 0.2.0 """ return Caumanns().stem(word)
if __name__ == '__main__': import doctest doctest.testmod()