# -*- coding: utf-8 -*-
# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.stemmer._snowball.

The stemmer._snowball module defines the following stemmers:

    - Porter
    - Porter2 (Snowball English)
    - Snowball German
    - Snowball Dutch
    - Snowball Norwegian
    - Snowball Swedish
    - Snowball Danish
"""
from __future__ import unicode_literals
from unicodedata import normalize
from six import text_type
from six.moves import range
# Public API of this module: the stemmer functions exported by a star-import.
__all__ = [
    'porter',
    'porter2',
    'sb_danish',
    'sb_dutch',
    'sb_german',
    'sb_norwegian',
    'sb_swedish',
]
def _m_degree(term, vowels):
"""Return Porter helper function _m_degree value.
m-degree is equal to the number of V to C transitions
:param str term: the word for which to calculate the m-degree
:param set vowels: the set of vowels in the language
:returns: the m-degree as defined in the Porter stemmer definition
:rtype: int
"""
mdeg = 0
last_was_vowel = False
for letter in term:
if letter in vowels:
last_was_vowel = True
else:
if last_was_vowel:
mdeg += 1
last_was_vowel = False
return mdeg
def _sb_has_vowel(term, vowels):
"""Return Porter helper function _sb_has_vowel value.
:param str term: the word to scan for vowels
:param set vowels: the set of vowels in the language
:returns: true iff a vowel exists in the term (as defined in the Porter
stemmer definition)
:rtype: bool
"""
for letter in term:
if letter in vowels:
return True
return False
def _ends_in_doubled_cons(term, vowels):
"""Return Porter helper function _ends_in_doubled_cons value.
:param str term: the word to check for a final doubled consonant
:param set vowels: the set of vowels in the language
:returns: true iff the stem ends in a doubled consonant (as defined in the
Porter stemmer definition)
:rtype: bool
"""
return len(term) > 1 and term[-1] not in vowels and term[-2] == term[-1]
def _ends_in_cvc(term, vowels):
"""Return Porter helper function _ends_in_cvc value.
:param str term: the word to scan for cvc
:param set vowels: the set of vowels in the language
:returns: true iff the stem ends in cvc (as defined in the Porter stemmer
definition)
:rtype: bool
"""
return len(term) > 2 and (
term[-1] not in vowels
and term[-2] in vowels
and term[-3] not in vowels
and term[-1] not in tuple('wxY')
)
def porter(word, early_english=False):
    """Return Porter stem.

    The Porter stemmer is described in :cite:`Porter:1980`.

    :param str word: the word to calculate the stem of
    :param bool early_english: set to True in order to remove -eth & -est
        (2nd & 3rd person singular verbal agreement suffixes)
    :returns: word stem
    :rtype: str

    >>> porter('reading')
    'read'
    >>> porter('suspension')
    'suspens'
    >>> porter('elusiveness')
    'elus'
    >>> porter('eateth', early_english=True)
    'eat'
    """
    vowels = {'a', 'e', 'i', 'o', 'u', 'y'}

    # Steps 2 and 3: (suffix, replacement) pairs. The first suffix that
    # matches is the only one considered; it is replaced only if the stem
    # preceding it has m > 0. Order matters where one suffix is the tail of
    # another (e.g. 'ization' must precede 'ation').
    step2_rules = (
        ('ational', 'ate'),
        ('tional', 'tion'),
        ('enci', 'ence'),
        ('anci', 'ance'),
        ('izer', 'ize'),
        ('logi', 'log'),
        ('bli', 'ble'),
        ('alli', 'al'),
        ('entli', 'ent'),
        ('eli', 'e'),
        ('ousli', 'ous'),
        ('ization', 'ize'),
        ('ation', 'ate'),
        ('ator', 'ate'),
        ('alism', 'al'),
        ('iveness', 'ive'),
        ('fulness', 'ful'),
        ('ousness', 'ous'),
        ('aliti', 'al'),
        ('iviti', 'ive'),
        ('biliti', 'ble'),
    )
    step3_rules = (
        ('icate', 'ic'),
        ('ative', ''),
        ('alize', 'al'),
        ('iciti', 'ic'),
        ('ical', 'ic'),
        ('ful', ''),
        ('ness', ''),
    )
    # Step 4: (ending to match, number of trailing characters to drop). The
    # characters are dropped only if what remains has m > 1. For -sion/-tion
    # only 'ion' is dropped, so the s/t stays part of the measured stem.
    step4_rules = (
        ('al', 2),
        ('ance', 4),
        ('ence', 4),
        ('er', 2),
        ('ic', 2),
        ('able', 4),
        ('ible', 4),
        ('ant', 3),
        ('ement', 5),
        ('ment', 4),
        ('ent', 3),
        ('sion', 3),
        ('tion', 3),
        ('ou', 2),
        ('ism', 3),
        ('ate', 3),
        ('iti', 3),
        ('ous', 3),
        ('ive', 3),
        ('ize', 3),
    )

    # lowercase, normalize, and compose
    word = normalize('NFC', text_type(word.lower()))

    # Words of fewer than three letters are returned unstemmed
    if len(word) < 3:
        return word

    # Re-map consonantal y to Y (Y will be C, y will be V). The scan is
    # sequential: a y that has just been marked Y no longer counts as a
    # vowel for the letter that follows it.
    chars = list(word)
    if chars[0] == 'y':
        chars[0] = 'Y'
    for i in range(1, len(chars)):
        if chars[i] == 'y' and chars[i - 1] in vowels:
            chars[i] = 'Y'
    word = ''.join(chars)

    # Step 1a
    if word.endswith('s'):
        if word.endswith(('sses', 'ies')):
            word = word[:-2]
        elif not word.endswith('ss'):
            word = word[:-1]

    # Step 1b
    step1b_flag = False
    if word.endswith('eed'):
        if _m_degree(word[:-3], vowels) > 0:
            word = word[:-1]
    else:
        endings = ('ed', 'ing')
        if early_english:
            endings += ('est', 'eth')
        for ending in endings:
            if word.endswith(ending):
                stem = word[: -len(ending)]
                if _sb_has_vowel(stem, vowels):
                    word = stem
                    step1b_flag = True
                break

    # Clean-up after a successful -ed/-ing (or -est/-eth) removal
    if step1b_flag:
        if word[-2:] in {'at', 'bl', 'iz'}:
            word += 'e'
        elif _ends_in_doubled_cons(word, vowels) and word[-1] not in {
            'l',
            's',
            'z',
        }:
            word = word[:-1]
        elif _m_degree(word, vowels) == 1 and _ends_in_cvc(word, vowels):
            word += 'e'

    # Step 1c
    if word[-1] in {'Y', 'y'} and _sb_has_vowel(word[:-1], vowels):
        word = word[:-1] + 'i'

    # Steps 2 and 3
    for rules in (step2_rules, step3_rules):
        for suffix, replacement in rules:
            if word.endswith(suffix):
                stem = word[: -len(suffix)]
                if _m_degree(stem, vowels) > 0:
                    word = stem + replacement
                break

    # Step 4
    for ending, cut in step4_rules:
        if word.endswith(ending):
            if _m_degree(word[:-cut], vowels) > 1:
                word = word[:-cut]
            break

    # Step 5a
    if word.endswith('e'):
        stem = word[:-1]
        measure = _m_degree(stem, vowels)
        if measure > 1 or (measure == 1 and not _ends_in_cvc(stem, vowels)):
            word = stem

    # Step 5b
    if word.endswith('ll') and _m_degree(word, vowels) > 1:
        word = word[:-1]

    # Change 'Y' back to 'y' if it survived stemming
    return word.replace('Y', 'y')
def _sb_r1(term, vowels, r1_prefixes=None):
"""Return the R1 region, as defined in the Porter2 specification."""
vowel_found = False
if hasattr(r1_prefixes, '__iter__'):
for prefix in r1_prefixes:
if term[: len(prefix)] == prefix:
return len(prefix)
for i in range(len(term)):
if not vowel_found and term[i] in vowels:
vowel_found = True
elif vowel_found and term[i] not in vowels:
return i + 1
return len(term)
def _sb_r2(term, vowels, r1_prefixes=None):
    """Return the start index of R2, as defined in the Porter2 specification.

    R2 is located by applying the R1 rule (without prefixes) to the R1
    region itself.

    :param str term: the word to locate R2 in
    :param set vowels: the set of vowels in the language
    :param r1_prefixes: optional prefixes passed on to the R1 computation
    :returns: the index at which R2 begins
    :rtype: int
    """
    offset = _sb_r1(term, vowels, r1_prefixes)
    remainder = term[offset:]
    return offset + _sb_r1(remainder, vowels)
def _sb_ends_in_short_syllable(term, vowels, codanonvowels):
"""Return True iff term ends in a short syllable.
(...according to the Porter2 specification.)
NB: This is akin to the CVC test from the Porter stemmer. The description
is unfortunately poor/ambiguous.
"""
if not term:
return False
if len(term) == 2:
if term[-2] in vowels and term[-1] not in vowels:
return True
elif len(term) >= 3:
if (
term[-3] not in vowels
and term[-2] in vowels
and term[-1] in codanonvowels
):
return True
return False
def _sb_short_word(term, vowels, codanonvowels, r1_prefixes=None):
    """Tell whether a term is a short word (Porter2 specification).

    A term is short when its R1 region is empty and it ends in a short
    syllable.

    :param str term: the word to check
    :param set vowels: the set of vowels in the language
    :param set codanonvowels: the non-vowels allowed to close a short syllable
    :param r1_prefixes: optional prefixes passed on to the R1 computation
    :returns: True iff the term is a short word
    :rtype: bool
    """
    r1_is_empty = _sb_r1(term, vowels, r1_prefixes) == len(term)
    return r1_is_empty and _sb_ends_in_short_syllable(
        term, vowels, codanonvowels
    )
def porter2(word, early_english=False):
    """Return the Porter2 (Snowball English) stem.

    The Porter2 (Snowball English) stemmer is defined in :cite:`Porter:2002`.

    :param str word: the word to calculate the stem of
    :param bool early_english: set to True in order to remove -eth & -est
        (2nd & 3rd person singular verbal agreement suffixes)
    :returns: word stem
    :rtype: str

    >>> porter2('reading')
    'read'
    >>> porter2('suspension')
    'suspens'
    >>> porter2('elusiveness')
    'elus'
    >>> porter2('eateth', early_english=True)
    'eat'
    """
    vowels = {'a', 'e', 'i', 'o', 'u', 'y'}
    codanonvowels = set("'bcdfghjklmnpqrstvz")
    doubles = {'bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt'}
    li_endings = {'c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't'}
    # R1 prefixes should be in order from longest to shortest to prevent
    # masking
    r1_prefixes = ('commun', 'gener', 'arsen')
    exception1dict = {
        # special changes:
        'skis': 'ski',
        'skies': 'sky',
        'dying': 'die',
        'lying': 'lie',
        'tying': 'tie',
        # special -LY cases:
        'idly': 'idl',
        'gently': 'gentl',
        'ugly': 'ugli',
        'early': 'earli',
        'only': 'onli',
        'singly': 'singl',
    }
    exception1set = {'sky', 'news', 'howe', 'atlas', 'cosmos', 'bias', 'andes'}
    exception2set = {
        'inning',
        'outing',
        'canning',
        'herring',
        'earring',
        'proceed',
        'exceed',
        'succeed',
    }

    # Step 2: (suffix, replacement, required preceding letters or None).
    # Only the first matching suffix is considered; it is replaced if it
    # lies in R1 and, where given, is preceded by one of the listed letters.
    step2_rules = (
        ('ational', 'ate', None),
        ('tional', 'tion', None),
        ('enci', 'ence', None),
        ('anci', 'ance', None),
        ('izer', 'ize', None),
        ('ogi', 'og', {'l'}),
        ('lessli', 'less', None),
        ('entli', 'ent', None),
        ('fulli', 'ful', None),
        ('ousli', 'ous', None),
        ('abli', 'able', None),
        ('alli', 'al', None),
        ('bli', 'ble', None),
        ('li', '', li_endings),
        ('ization', 'ize', None),
        ('ation', 'ate', None),
        ('ator', 'ate', None),
        ('fulness', 'ful', None),
        ('ousness', 'ous', None),
        ('iveness', 'ive', None),
        ('alism', 'al', None),
        ('biliti', 'ble', None),
        ('aliti', 'al', None),
        ('iviti', 'ive', None),
    )
    # Step 3: (suffix, replacement, True if the suffix must lie in R2
    # rather than R1).
    step3_rules = (
        ('ational', 'ate', False),
        ('tional', 'tion', False),
        ('alize', 'al', False),
        ('icate', 'ic', False),
        ('iciti', 'ic', False),
        ('ative', '', True),
        ('ical', 'ic', False),
        ('ness', '', False),
        ('ful', '', False),
    )
    # Step 4: suffixes deleted when they lie in R2 (longest listed first)
    step4_suffixes = (
        'ement',
        'ance',
        'ence',
        'able',
        'ible',
        'ment',
        'ant',
        'ent',
        'ism',
        'ate',
        'iti',
        'ous',
        'ive',
        'ize',
        'al',
        'er',
        'ic',
    )

    # lowercase, normalize, and compose
    word = normalize('NFC', text_type(word.lower()))
    # replace apostrophe-like characters (U+2018, U+2019, U+201B) with
    # U+0027, per http://snowball.tartarus.org/texts/apostrophe.html
    for mark in ('\u2018', '\u2019', '\u201b'):
        word = word.replace(mark, '\'')

    # Exceptions 1
    if word in exception1dict:
        return exception1dict[word]
    if word in exception1set:
        return word

    # Words of fewer than three letters are returned unstemmed
    if len(word) < 3:
        return word

    # Remove initial ', if present.
    word = word.lstrip('\'')

    # Nothing left to stem if fewer than two letters remain
    if len(word) < 2:
        return word

    # Re-map consonantal y to Y (Y will be C, y will be V). The scan is
    # sequential: a y that has just been marked Y no longer counts as a
    # vowel for the letter that follows it.
    chars = list(word)
    if chars[0] == 'y':
        chars[0] = 'Y'
    for i in range(1, len(chars)):
        if chars[i] == 'y' and chars[i - 1] in vowels:
            chars[i] = 'Y'
    word = ''.join(chars)

    r1_start = _sb_r1(word, vowels, r1_prefixes)
    r2_start = _sb_r2(word, vowels, r1_prefixes)

    # Step 0
    for ending in ('\'s\'', '\'s', '\''):
        if word.endswith(ending):
            word = word[: -len(ending)]
            break

    # Return what is left if fewer than three letters remain; the internal
    # Y marker must not leak to the caller.
    if len(word) < 3:
        return word.replace('Y', 'y')

    # Step 1a
    if word.endswith('sses'):
        word = word[:-2]
    elif word.endswith(('ied', 'ies')):
        word = word[:-2] if len(word) > 4 else word[:-1]
    elif word.endswith(('us', 'ss')):
        pass
    elif word.endswith('s'):
        # the vowel must not be the letter immediately before the s
        if _sb_has_vowel(word[:-2], vowels):
            word = word[:-1]

    # Exceptions 2
    if word in exception2set:
        return word

    # Step 1b
    step1b_flag = False
    if word.endswith('eedly'):
        if len(word[r1_start:]) >= 5:
            word = word[:-3]
    elif word.endswith('eed'):
        if len(word[r1_start:]) >= 3:
            word = word[:-1]
    else:
        endings = ('ingly', 'edly', 'ing', 'ed')
        if early_english:
            endings += ('est', 'eth')
        for ending in endings:
            if word.endswith(ending):
                stem = word[: -len(ending)]
                if _sb_has_vowel(stem, vowels):
                    word = stem
                    step1b_flag = True
                break

    # Clean-up after a successful removal in step 1b
    if step1b_flag:
        if word[-2:] in {'at', 'bl', 'iz'}:
            word += 'e'
        elif word[-2:] in doubles:
            word = word[:-1]
        elif _sb_short_word(word, vowels, codanonvowels, r1_prefixes):
            word += 'e'

    # Step 1c
    if len(word) > 2 and word[-1] in {'Y', 'y'} and word[-2] not in vowels:
        word = word[:-1] + 'i'

    # Step 2 (suffix matching is safe even for a one-letter stem such as
    # the 'a' left over from 'aed')
    for suffix, replacement, preceders in step2_rules:
        if word.endswith(suffix):
            cut = len(suffix)
            if len(word[r1_start:]) >= cut and (
                preceders is None or word[-cut - 1 : -cut] in preceders
            ):
                word = word[:-cut] + replacement
            break

    # Step 3
    for suffix, replacement, needs_r2 in step3_rules:
        if word.endswith(suffix):
            region_start = r2_start if needs_r2 else r1_start
            if len(word[region_start:]) >= len(suffix):
                word = word[: -len(suffix)] + replacement
            break

    # Step 4
    for suffix in step4_suffixes:
        if word.endswith(suffix):
            if len(word[r2_start:]) >= len(suffix):
                word = word[: -len(suffix)]
            break
    else:
        # -ion is only considered when no other step 4 suffix matched, and
        # is deleted only after s or t
        if (
            word.endswith('ion')
            and len(word[r2_start:]) >= 3
            and len(word) >= 4
            and word[-4] in {'s', 't'}
        ):
            word = word[:-3]

    # Step 5
    if word.endswith('e'):
        if len(word[r2_start:]) >= 1 or (
            len(word[r1_start:]) >= 1
            and not _sb_ends_in_short_syllable(
                word[:-1], vowels, codanonvowels
            )
        ):
            word = word[:-1]
    elif word.endswith('ll') and len(word[r2_start:]) >= 1:
        word = word[:-1]

    # Change 'Y' back to 'y' if it survived stemming
    return word.replace('Y', 'y')
def sb_german(word, alternate_vowels=False):
    """Return Snowball German stem.

    The Snowball German stemmer is defined at:
    http://snowball.tartarus.org/algorithms/german/stemmer.html

    :param str word: the word to calculate the stem of
    :param bool alternate_vowels: composes ae as ä, oe as ö, and ue as ü before
        running the algorithm
    :returns: word stem
    :rtype: str

    >>> sb_german('lesen')
    'les'
    >>> sb_german('graues')
    'grau'
    >>> sb_german('buchstabieren')
    'buchstabi'
    """
    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'ä', 'ö', 'ü'}
    _s_endings = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 'r', 't'}
    _st_endings = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't'}

    # lowercase, normalize, and compose (coerced to text like the other
    # stemmers in this module)
    word = normalize('NFC', text_type(word.lower()))
    word = word.replace('ß', 'ss')

    # Mark u and y between vowels as U and Y so they are treated as
    # consonants. The scan is sequential: a letter already marked no longer
    # counts as a vowel for later positions.
    chars = list(word)
    for i in range(2, len(chars)):
        if chars[i] in _vowels and chars[i - 2] in _vowels:
            if chars[i - 1] == 'u':
                chars[i - 1] = 'U'
            elif chars[i - 1] == 'y':
                chars[i - 1] = 'Y'
    word = ''.join(chars)

    if alternate_vowels:
        word = word.replace('ae', 'ä')
        word = word.replace('oe', 'ö')
        # protect 'que' while 'ue' is composed to ü
        word = word.replace('que', 'Q')
        word = word.replace('ue', 'ü')
        word = word.replace('Q', 'que')

    # R1 is adjusted so that at least three letters precede it
    r1_start = max(3, _sb_r1(word, _vowels))
    r2_start = _sb_r2(word, _vowels)

    # Step 1: (suffix, whether a following -niss is reduced to -nis)
    niss_flag = False
    for suffix, allows_niss in (
        ('ern', False),
        ('em', False),
        ('er', False),
        ('en', True),
        ('es', True),
        ('e', True),
    ):
        if word.endswith(suffix):
            if len(word[r1_start:]) >= len(suffix):
                word = word[: -len(suffix)]
                niss_flag = allows_niss
            break
    else:
        if (
            word.endswith('s')
            and len(word[r1_start:]) >= 1
            and len(word) >= 2
            and word[-2] in _s_endings
        ):
            word = word[:-1]

    if niss_flag and word[-4:] == 'niss':
        word = word[:-1]

    # Step 2
    for suffix in ('est', 'en', 'er'):
        if word.endswith(suffix):
            if len(word[r1_start:]) >= len(suffix):
                word = word[: -len(suffix)]
            break
    else:
        # -st needs a valid st-ending before it, itself preceded by at
        # least three letters
        if (
            word.endswith('st')
            and len(word[r1_start:]) >= 2
            and len(word) >= 6
            and word[-3] in _st_endings
        ):
            word = word[:-2]

    # Step 3
    if word[-4:] == 'isch':
        if len(word[r2_start:]) >= 4 and word[-5] != 'e':
            word = word[:-4]
    elif word[-4:] in {'lich', 'heit'}:
        if len(word[r2_start:]) >= 4:
            word = word[:-4]
            if word[-2:] in {'er', 'en'} and len(word[r1_start:]) >= 2:
                word = word[:-2]
    elif word[-4:] == 'keit':
        if len(word[r2_start:]) >= 4:
            word = word[:-4]
            if word[-4:] == 'lich' and len(word[r2_start:]) >= 4:
                word = word[:-4]
            elif word[-2:] == 'ig' and len(word[r2_start:]) >= 2:
                word = word[:-2]
    elif word[-3:] in {'end', 'ung'}:
        if len(word[r2_start:]) >= 3:
            word = word[:-3]
            if (
                word[-2:] == 'ig'
                and len(word[r2_start:]) >= 2
                and word[-3] != 'e'
            ):
                word = word[:-2]
    elif word[-2:] in {'ig', 'ik'}:
        if len(word[r2_start:]) >= 2 and word[-3] != 'e':
            word = word[:-2]

    # Change 'Y' and 'U' back to lowercase if survived stemming
    word = word.replace('Y', 'y').replace('U', 'u')

    # Remove umlauts
    _umlauts = dict(zip((ord(_) for _ in 'äöü'), 'aou'))
    word = word.translate(_umlauts)

    return word
def sb_dutch(word):
    """Return Snowball Dutch stem.

    The Snowball Dutch stemmer is defined at:
    http://snowball.tartarus.org/algorithms/dutch/stemmer.html

    :param str word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> sb_dutch('lezen')
    'lez'
    >>> sb_dutch('opschorting')
    'opschort'
    >>> sb_dutch('ongrijpbaarheid')
    'ongrijp'
    """
    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'è'}
    _not_s_endings = {'a', 'e', 'i', 'j', 'o', 'u', 'y', 'è'}

    def _undouble(word):
        """Undouble endings -kk, -dd, and -tt."""
        if (
            len(word) > 1
            and word[-1] == word[-2]
            and word[-1] in {'d', 'k', 't'}
        ):
            return word[:-1]
        return word

    def _drop_en(word, cut):
        """Drop a final -en/-ene of length cut if in R1 after a valid ending.

        A valid en-ending is a non-vowel that is not the end of 'gem'.
        """
        if len(word[r1_start:]) >= cut and (
            word[-cut - 1] not in _vowels
            and word[-cut - 3 : -cut] != 'gem'
        ):
            return _undouble(word[:-cut])
        return word

    def _drop_e(word):
        """Drop a final -e if it is in R1 and preceded by a non-vowel."""
        if (
            word[-1:] == 'e'
            and len(word[r1_start:]) >= 1
            and word[-2] not in _vowels
        ):
            return _undouble(word[:-1])
        return word

    # lowercase, normalize, decompose, filter umlauts & acutes out, and compose
    word = normalize('NFC', text_type(word.lower()))
    _accented = dict(zip((ord(_) for _ in 'äëïöüáéíóú'), 'aeiouaeiou'))
    word = word.translate(_accented)

    # Put initial y, y after a vowel, and i between vowels into upper case.
    # An i can only be "between" vowels when it is neither the first nor the
    # last letter; index 0 must not wrap around to the end of the word.
    chars = list(word)
    last = len(chars) - 1
    for i in range(len(chars)):
        if chars[i] == 'y':
            if i == 0 or chars[i - 1] in _vowels:
                chars[i] = 'Y'
        elif (
            chars[i] == 'i'
            and 0 < i < last
            and chars[i - 1] in _vowels
            and chars[i + 1] in _vowels
        ):
            chars[i] = 'I'
    word = ''.join(chars)

    # R1 is adjusted so that at least three letters precede it
    r1_start = max(3, _sb_r1(word, _vowels))
    r2_start = _sb_r2(word, _vowels)

    # Step 1
    if word[-5:] == 'heden':
        if len(word[r1_start:]) >= 5:
            word = word[:-3] + 'id'
    elif word[-3:] == 'ene':
        word = _drop_en(word, 3)
    elif word[-2:] == 'en':
        word = _drop_en(word, 2)
    elif word[-2:] == 'se':
        if len(word[r1_start:]) >= 2 and word[-3] not in _not_s_endings:
            word = word[:-2]
    elif word[-1:] == 's':
        if len(word[r1_start:]) >= 1 and word[-2] not in _not_s_endings:
            word = word[:-1]

    # Step 2
    without_e = _drop_e(word)
    e_removed = without_e != word
    word = without_e

    # Step 3a
    if word[-4:] == 'heid':
        if len(word[r2_start:]) >= 4 and word[-5] != 'c':
            word = word[:-4]
            if word[-2:] == 'en':
                word = _drop_en(word, 2)

    # Step 3b
    if word[-4:] == 'lijk':
        if len(word[r2_start:]) >= 4:
            word = word[:-4]
            # Repeat step 2
            word = _drop_e(word)
    elif word[-4:] == 'baar':
        if len(word[r2_start:]) >= 4:
            word = word[:-4]
    elif word[-3:] in ('end', 'ing'):
        if len(word[r2_start:]) >= 3:
            word = word[:-3]
            if (
                word[-2:] == 'ig'
                and len(word[r2_start:]) >= 2
                and word[-3] != 'e'
            ):
                word = word[:-2]
            else:
                word = _undouble(word)
    elif word[-3:] == 'bar':
        # only removed if step 2 actually removed an e
        if len(word[r2_start:]) >= 3 and e_removed:
            word = word[:-3]
    elif word[-2:] == 'ig':
        if len(word[r2_start:]) >= 2 and word[-3] != 'e':
            word = word[:-2]

    # Step 4: undouble the vowel in a final C-VV-C (VV one of aa ee oo uu,
    # final C not I)
    if (
        len(word) >= 4
        and word[-3] == word[-2]
        and word[-2] in {'a', 'e', 'o', 'u'}
        and word[-4] not in _vowels
        and word[-1] not in _vowels
        and word[-1] != 'I'
    ):
        word = word[:-2] + word[-1]

    # Change 'Y' and 'I' back to lowercase if survived stemming
    return word.replace('Y', 'y').replace('I', 'i')
def sb_norwegian(word):
    """Return Snowball Norwegian stem.

    The Snowball Norwegian stemmer is defined at:
    http://snowball.tartarus.org/algorithms/norwegian/stemmer.html

    :param str word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> sb_norwegian('lese')
    'les'
    >>> sb_norwegian('suspensjon')
    'suspensjon'
    >>> sb_norwegian('sikkerhet')
    'sikker'
    """
    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'å', 'æ', 'ø'}
    _s_endings = set('bcdfghjlmnoprtvyz')

    # lowercase, normalize, and compose
    word = normalize('NFC', text_type(word.lower()))

    # R1 is adjusted so that at least three letters precede it, but it can
    # never start past the end of the word
    r1_start = min(max(3, _sb_r1(word, _vowels)), len(word))

    # Step 1: remove the longest matching suffix found in R1
    _r1 = word[r1_start:]
    if _r1.endswith('hetenes'):
        word = word[:-7]
    elif _r1.endswith(('hetene', 'hetens')):
        word = word[:-6]
    elif _r1.endswith(('heten', 'heter', 'endes')):
        word = word[:-5]
    elif _r1.endswith(('ande', 'ende', 'edes', 'enes')):
        word = word[:-4]
    elif _r1.endswith('erte'):
        # erte -> er
        word = word[:-2]
    elif _r1.endswith(
        ('ede', 'ane', 'ene', 'ens', 'ers', 'ets', 'het', 'ast')
    ):
        word = word[:-3]
    elif _r1.endswith('ert'):
        # ert -> er
        word = word[:-1]
    elif _r1.endswith(('en', 'ar', 'er', 'as', 'es', 'et')):
        word = word[:-2]
    elif _r1.endswith(('a', 'e')):
        word = word[:-1]
    elif _r1.endswith('s'):
        # s is removed after a valid s-ending, or after k preceded by a
        # non-vowel
        if (len(word) > 1 and word[-2] in _s_endings) or (
            len(word) > 2 and word[-2] == 'k' and word[-3] not in _vowels
        ):
            word = word[:-1]

    # Step 2
    if word[r1_start:][-2:] in {'dt', 'vt'}:
        word = word[:-1]

    # Step 3
    _r1 = word[r1_start:]
    if _r1.endswith('hetslov'):
        word = word[:-7]
    elif _r1.endswith(('eleg', 'elig', 'elov', 'slov')):
        word = word[:-4]
    elif _r1.endswith(('leg', 'eig', 'lig', 'els', 'lov')):
        word = word[:-3]
    elif _r1.endswith('ig'):
        word = word[:-2]

    return word
def sb_swedish(word):
    """Return Snowball Swedish stem.

    The Snowball Swedish stemmer is defined at:
    http://snowball.tartarus.org/algorithms/swedish/stemmer.html

    :param str word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> sb_swedish('undervisa')
    'undervis'
    >>> sb_swedish('suspension')
    'suspension'
    >>> sb_swedish('visshet')
    'viss'
    """
    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'ä', 'å', 'ö'}
    _s_endings = set('bcdfghjklmnoprtvy')

    # lowercase, normalize, and compose
    word = normalize('NFC', text_type(word.lower()))

    # R1 is adjusted so that at least three letters precede it, but it can
    # never start past the end of the word
    r1_start = min(max(3, _sb_r1(word, _vowels)), len(word))

    # Step 1: remove the longest matching suffix found in R1
    _r1 = word[r1_start:]
    if _r1.endswith('heterna'):
        word = word[:-7]
    elif _r1.endswith('hetens'):
        word = word[:-6]
    elif _r1.endswith(
        (
            'anden',
            'heten',
            'heter',
            'arnas',
            'ernas',
            'ornas',
            'andes',
            'arens',
            'andet',
        )
    ):
        word = word[:-5]
    elif _r1.endswith(
        (
            'arna',
            'erna',
            'orna',
            'ande',
            'arne',
            'aste',
            'aren',
            'ades',
            'erns',
        )
    ):
        word = word[:-4]
    elif _r1.endswith(('ade', 'are', 'ern', 'ens', 'het', 'ast')):
        word = word[:-3]
    elif _r1.endswith(('ad', 'en', 'ar', 'er', 'or', 'as', 'es', 'at')):
        word = word[:-2]
    elif _r1.endswith(('a', 'e')):
        word = word[:-1]
    elif _r1.endswith('s'):
        # s is removed only after a valid s-ending
        if len(word) > 1 and word[-2] in _s_endings:
            word = word[:-1]

    # Step 2
    if word[r1_start:][-2:] in {'dd', 'gd', 'nn', 'dt', 'gt', 'kt', 'tt'}:
        word = word[:-1]

    # Step 3
    _r1 = word[r1_start:]
    if _r1.endswith('fullt'):
        # fullt -> full
        word = word[:-1]
    elif _r1.endswith('löst'):
        # löst -> lös
        word = word[:-1]
    elif _r1.endswith(('lig', 'els')):
        word = word[:-3]
    elif _r1.endswith('ig'):
        word = word[:-2]

    return word
def sb_danish(word):
    """Return Snowball Danish stem.

    The Snowball Danish stemmer is defined at:
    http://snowball.tartarus.org/algorithms/danish/stemmer.html

    :param str word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> sb_danish('underviser')
    'undervis'
    >>> sb_danish('suspension')
    'suspension'
    >>> sb_danish('sikkerhed')
    'sikker'
    """
    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'å', 'æ', 'ø'}
    _s_endings = set('abcdfghjklmnoprtvyzå')
    _step2_endings = {'gd', 'dt', 'gt', 'kt'}

    # lowercase, normalize, and compose
    word = normalize('NFC', text_type(word.lower()))

    # R1 is adjusted so that at least three letters precede it, but it can
    # never start past the end of the word
    r1_start = min(max(3, _sb_r1(word, _vowels)), len(word))

    # Step 1: remove the longest matching suffix found in R1
    _r1 = word[r1_start:]
    if _r1.endswith('erendes'):
        word = word[:-7]
    elif _r1.endswith(('erende', 'hedens')):
        word = word[:-6]
    elif _r1.endswith(
        (
            'ethed',
            'erede',
            'heden',
            'heder',
            'endes',
            'ernes',
            'erens',
            'erets',
        )
    ):
        word = word[:-5]
    elif _r1.endswith(
        (
            'ered',
            'ende',
            'erne',
            'eren',
            'erer',
            'heds',
            'enes',
            'eres',
            'eret',
        )
    ):
        word = word[:-4]
    elif _r1.endswith(('hed', 'ene', 'ere', 'ens', 'ers', 'ets')):
        word = word[:-3]
    elif _r1.endswith(('en', 'er', 'es', 'et')):
        word = word[:-2]
    elif _r1.endswith('e'):
        word = word[:-1]
    elif _r1.endswith('s'):
        # s is removed only after a valid s-ending
        if len(word) > 1 and word[-2] in _s_endings:
            word = word[:-1]

    # Step 2
    if word[r1_start:][-2:] in _step2_endings:
        word = word[:-1]

    # Step 3: a final -igst first loses its -st, wherever it occurs
    if word[-4:] == 'igst':
        word = word[:-2]

    _r1 = word[r1_start:]
    repeat_step2 = False
    if _r1.endswith('elig'):
        word = word[:-4]
        repeat_step2 = True
    elif _r1.endswith('løst'):
        # løst -> løs
        word = word[:-1]
    elif _r1.endswith(('lig', 'els')):
        word = word[:-3]
        repeat_step2 = True
    elif _r1.endswith('ig'):
        word = word[:-2]
        repeat_step2 = True

    if repeat_step2:
        if word[r1_start:][-2:] in _step2_endings:
            word = word[:-1]

    # Step 4: undouble a final double consonant that reaches into R1
    if (
        len(word[r1_start:]) >= 1
        and len(word) >= 2
        and word[-1] == word[-2]
        and word[-1] not in _vowels
    ):
        word = word[:-1]

    return word
if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in the docstrings.
    from doctest import testmod

    testmod()