Source code for abydos.stemmer._snowball_dutch

# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.stemmer._snowball_dutch.

Snowball Dutch stemmer
"""

from unicodedata import normalize

from deprecation import deprecated

from ._snowball import _Snowball
from .. import __version__

__all__ = ['SnowballDutch', 'sb_dutch']


[docs]class SnowballDutch(_Snowball): """Snowball Dutch stemmer. The Snowball Dutch stemmer is defined at: http://snowball.tartarus.org/algorithms/dutch/stemmer.html .. versionadded:: 0.3.6 """ _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'è'} _not_s_endings = {'a', 'e', 'i', 'j', 'o', 'u', 'y', 'è'} _accented = dict(zip((ord(_) for _ in 'äëïöüáéíóú'), 'aeiouaeiou')) def _undouble(self, word): """Undouble endings -kk, -dd, and -tt. Parameters ---------- word : str The word to stem Returns ------- str The word with doubled endings undoubled .. versionadded:: 0.1.0 .. versionchanged:: 0.3.6 Encapsulated in class """ if ( len(word) > 1 and word[-1] == word[-2] and word[-1] in {'d', 'k', 't'} ): return word[:-1] return word
[docs] def stem(self, word): """Return Snowball Dutch stem. Parameters ---------- word : str The word to stem Returns ------- str Word stem Examples -------- >>> stmr = SnowballDutch() >>> stmr.stem('lezen') 'lez' >>> stmr.stem('opschorting') 'opschort' >>> stmr.stem('ongrijpbaarheid') 'ongrijp' .. versionadded:: 0.1.0 .. versionchanged:: 0.3.6 Encapsulated in class """ # lowercase, normalize, decompose, filter umlauts & acutes out, and # compose word = normalize('NFC', word.lower()) word = word.translate(self._accented) for i in range(len(word)): if i == 0 and word[0] == 'y': word = 'Y' + word[1:] elif word[i] == 'y' and word[i - 1] in self._vowels: word = word[:i] + 'Y' + word[i + 1 :] elif ( word[i] == 'i' and word[i - 1] in self._vowels and i + 1 < len(word) and word[i + 1] in self._vowels ): word = word[:i] + 'I' + word[i + 1 :] r1_start = max(3, self._sb_r1(word)) r2_start = self._sb_r2(word) # Step 1 if word[-5:] == 'heden': if len(word[r1_start:]) >= 5: word = word[:-3] + 'id' elif word[-3:] == 'ene': if len(word[r1_start:]) >= 3 and ( word[-4] not in self._vowels and word[-6:-3] != 'gem' ): word = self._undouble(word[:-3]) elif word[-2:] == 'en': if len(word[r1_start:]) >= 2 and ( word[-3] not in self._vowels and word[-5:-2] != 'gem' ): word = self._undouble(word[:-2]) elif word[-2:] == 'se': if ( len(word[r1_start:]) >= 2 and word[-3] not in self._not_s_endings ): word = word[:-2] elif word[-1:] == 's': if ( len(word[r1_start:]) >= 1 and word[-2] not in self._not_s_endings ): word = word[:-1] # Step 2 e_removed = False if word[-1:] == 'e': if len(word[r1_start:]) >= 1 and word[-2] not in self._vowels: word = self._undouble(word[:-1]) e_removed = True # Step 3a if word[-4:] == 'heid': if len(word[r2_start:]) >= 4 and word[-5] != 'c': word = word[:-4] if word[-2:] == 'en': if len(word[r1_start:]) >= 2 and ( word[-3] not in self._vowels and word[-5:-2] != 'gem' ): word = self._undouble(word[:-2]) # Step 3b if word[-4:] == 'lijk': if len(word[r2_start:]) >= 4: word = word[:-4] # Repeat step 2 if word[-1:] == 'e': if ( len(word[r1_start:]) >= 1 and word[-2] not in self._vowels ): word = self._undouble(word[:-1]) elif word[-4:] == 'baar': if len(word[r2_start:]) >= 4: word = word[:-4] elif word[-3:] in ('end', 'ing'): if len(word[r2_start:]) >= 3: word = word[:-3] if ( word[-2:] == 'ig' and len(word[r2_start:]) >= 2 and word[-3] != 'e' ): word = word[:-2] else: word = self._undouble(word) elif word[-3:] == 'bar': if len(word[r2_start:]) >= 3 and e_removed: word = word[:-3] elif word[-2:] == 'ig': if len(word[r2_start:]) >= 2 and word[-3] != 'e': word = word[:-2] # Step 4 if ( len(word) >= 4 and word[-3] == word[-2] and word[-2] in {'a', 'e', 'o', 'u'} and word[-4] not in self._vowels and word[-1] not in self._vowels and word[-1] != 'I' ): word = word[:-2] + word[-1] # Change 'Y' and 'U' back to lowercase if survived stemming for i in range(0, len(word)): if word[i] == 'Y': word = word[:i] + 'y' + word[i + 1 :] elif word[i] == 'I': word = word[:i] + 'i' + word[i + 1 :] return word
[docs]@deprecated( deprecated_in='0.4.0', removed_in='0.6.0', current_version=__version__, details='Use the SnowballDutch.stem method instead.', ) def sb_dutch(word): """Return Snowball Dutch stem. This is a wrapper for :py:meth:`SnowballDutch.stem`. Parameters ---------- word : str The word to stem Returns ------- str Word stem Examples -------- >>> sb_dutch('lezen') 'lez' >>> sb_dutch('opschorting') 'opschort' >>> sb_dutch('ongrijpbaarheid') 'ongrijp' .. versionadded:: 0.1.0 """ return SnowballDutch().stem(word)
if __name__ == '__main__': import doctest doctest.testmod()