
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.tokenizer._q_grams.

QGrams multi-set class.
"""

from collections.abc import Iterable

from ._tokenizer import _Tokenizer

__all__ = ['QGrams']


class QGrams(_Tokenizer):
    """A q-gram class, which functions like a bag/multiset.

    A q-gram is here defined as all sequences of q characters. Q-grams are
    also known as k-grams and n-grams, but the term n-gram more typically
    refers to sequences of whitespace-delimited words in a string, while
    q-gram refers to sequences of characters in a word or string.

    .. versionadded:: 0.1.0
    """

    def __init__(self, qval=2, start_stop='$#', skip=0, scaler=None):
        """Initialize QGrams.

        Parameters
        ----------
        qval : int or Iterable
            The q-gram length (defaults to 2), can be an integer, range
            object, or list
        start_stop : str
            A string of length >= 0 indicating start & stop symbols.
            If the string is '', q-grams will be calculated without start &
            stop symbols appended to each end.
            Otherwise, the first character of start_stop will pad the
            beginning of the string and the last character of start_stop
            will pad the end of the string before q-grams are calculated.
            (In the case that start_stop is only 1 character long, the same
            symbol will be used for both.)
        skip : int or Iterable
            The number of characters to skip, can be an integer, range
            object, or list
        scaler : None, str, or function
            A scaling function for the Counter:

                - None : no scaling
                - 'set' : All non-zero values are set to 1.
                - 'length' : Each token has weight equal to its length.
                - 'length-log' : Each token has weight equal to the log of
                  its length + 1.
                - 'length-exp' : Each token has weight equal to e raised
                  to its length.
                - a callable function : The function is applied to each
                  value in the Counter. Some useful functions include
                  math.exp, math.log1p, math.sqrt, and indexes into
                  interesting integer sequences such as the Fibonacci
                  sequence.

        Raises
        ------
        ValueError
            Use WhitespaceTokenizer instead of qval=0.

        Examples
        --------
        >>> qg = QGrams().tokenize('AATTATAT')
        >>> qg
        QGrams({'AT': 3, 'TA': 2, '$A': 1, 'AA': 1, 'TT': 1, 'T#': 1})

        >>> qg = QGrams(qval=1, start_stop='').tokenize('AATTATAT')
        >>> qg
        QGrams({'A': 4, 'T': 4})

        >>> qg = QGrams(qval=3, start_stop='').tokenize('AATTATAT')
        >>> qg
        QGrams({'TAT': 2, 'AAT': 1, 'ATT': 1, 'TTA': 1, 'ATA': 1})

        >>> QGrams(qval=2, start_stop='$#').tokenize('interning')
        QGrams({'in': 2, '$i': 1, 'nt': 1, 'te': 1, 'er': 1, 'rn': 1,
        'ni': 1, 'ng': 1, 'g#': 1})

        >>> QGrams(start_stop='', skip=1).tokenize('AACTAGAAC')
        QGrams({'AC': 2, 'AT': 1, 'CA': 1, 'TG': 1, 'AA': 1, 'GA': 1,
        'A': 1})

        >>> QGrams(start_stop='', skip=[0, 1]).tokenize('AACTAGAAC')
        QGrams({'AC': 4, 'AA': 3, 'GA': 2, 'CT': 1, 'TA': 1, 'AG': 1,
        'AT': 1, 'CA': 1, 'TG': 1, 'A': 1})

        >>> QGrams(qval=range(3), skip=[0, 1]).tokenize('interdisciplinarian')
        QGrams({'i': 10, 'n': 7, 'r': 4, 'a': 4, 'in': 3, 't': 2, 'e': 2,
        'd': 2, 's': 2, 'c': 2, 'p': 2, 'l': 2, 'ri': 2, 'ia': 2, '$i': 1,
        'nt': 1, 'te': 1, 'er': 1, 'rd': 1, 'di': 1, 'is': 1, 'sc': 1,
        'ci': 1, 'ip': 1, 'pl': 1, 'li': 1, 'na': 1, 'ar': 1, 'an': 1,
        'n#': 1, '$n': 1, 'it': 1, 'ne': 1, 'tr': 1, 'ed': 1, 'ds': 1,
        'ic': 1, 'si': 1, 'cp': 1, 'il': 1, 'pi': 1, 'ln': 1, 'nr': 1,
        'ai': 1, 'ra': 1, 'a#': 1})


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.4.0
            Broke tokenization functions out into tokenize method

        """
        if qval == 0:
            raise ValueError('Use WhitespaceTokenizer instead of qval=0.')

        super(QGrams, self).__init__(scaler)

        # Save parameters
        self.qval = qval
        self.start_stop = start_stop
        if qval == 1:
            # Start & stop symbols are meaningless for unigrams
            self.start_stop = ''
        self.skip = skip

        self._string_ss = self._string
    def tokenize(self, string):
        """Tokenize the term and store it.

        The tokenized term is stored as an ordered list and as a Counter
        object.

        Parameters
        ----------
        string : str
            The string to tokenize


        .. versionadded:: 0.4.0

        """
        self._string = string
        self._ordered_tokens = []

        if not isinstance(self.qval, Iterable):
            self.qval = (self.qval,)
        if not isinstance(self.skip, Iterable):
            self.skip = (self.skip,)

        if string:
            for qval_i in self.qval:
                for skip_i in self.skip:
                    if qval_i < 1:
                        continue

                    if self.start_stop:
                        string = (
                            self.start_stop[0] * (qval_i - 1)
                            + self._string
                            + self.start_stop[-1] * (qval_i - 1)
                        )
                    else:
                        string = self._string

                    if qval_i > 1 and len(string) < qval_i:
                        continue

                    # Having appended start & stop symbols (or not), save
                    # the result, but only for the longest valid qval_i
                    if len(string) > len(self._string_ss):
                        self._string_ss = string

                    skip_i += 1
                    self._ordered_tokens += [
                        string[i : i + (qval_i * skip_i) : skip_i]
                        for i in range(len(string) - (qval_i - 1))
                    ]

        super(QGrams, self).tokenize()
        return self
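# A worked example of the skip-gram slicing in tokenize() above (added
# commentary, not original Abydos code). With qval=2 and skip=1, skip_i is
# incremented to 2, so each token is string[i : i + 4 : 2], i.e. two
# characters taken two positions apart. For 'AACTAGAAC' this gives
# 'AACTAGAAC'[0:4:2] == 'AC', [1:5:2] == 'AT', and so on; at i=7 the slice
# runs off the end of the string and yields the lone 'A', which matches the
# QGrams(start_stop='', skip=1) doctest in __init__ above.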
if __name__ == '__main__':
    import doctest

    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
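
# Illustrative usage sketch (an addition to this listing, not original
# Abydos code): demonstrates a callable scaler, as documented in
# QGrams.__init__. It assumes the get_counter() accessor of the _Tokenizer
# base class returns the token Counter with the scaler applied.
if __name__ == '__main__':
    from math import log1p

    # With a callable scaler, each raw q-gram count c is replaced by
    # log1p(c), damping the weight of high-frequency q-grams.
    _qg = QGrams(qval=2, scaler=log1p).tokenize('AATTATAT')
    print(_qg.get_counter())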