
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.tokenizer._q_grams.

QGrams multi-set class.
"""

from collections.abc import Iterable

from ._tokenizer import _Tokenizer

__all__ = ['QGrams']


class QGrams(_Tokenizer):
    """A q-gram class, which functions like a bag/multiset.

    A q-gram is here defined as all sequences of q characters. Q-grams are
    also known as k-grams and n-grams, but the term n-gram more typically
    refers to sequences of whitespace-delimited words in a string, while
    q-gram refers to sequences of characters in a word or string.

    .. versionadded:: 0.1.0
    """

    def __init__(self, qval=2, start_stop='$#', skip=0, scaler=None):
        """Initialize QGrams.

        Parameters
        ----------
        qval : int or Iterable
            The q-gram length (defaults to 2), can be an integer, range
            object, or list
        start_stop : str
            A string of length >= 0 indicating start & stop symbols.
            If the string is '', q-grams will be calculated without start &
            stop symbols appended to each end.
            Otherwise, the first character of start_stop will pad the
            beginning of the string and the last character of start_stop
            will pad the end of the string before q-grams are calculated.
            (In the case that start_stop is only 1 character long, the same
            symbol will be used for both.)
        skip : int or Iterable
            The number of characters to skip, can be an integer, range
            object, or list
        scaler : None, str, or function
            A scaling function for the Counter:

                - None : no scaling
                - 'set' : All non-zero values are set to 1.
                - 'length' : Each token has weight equal to its length.
                - 'length-log' : Each token has weight equal to the log of
                  its length + 1.
                - 'length-exp' : Each token has weight equal to e raised
                  to its length.
                - a callable function : The function is applied to each
                  value in the Counter. Some useful functions include
                  math.exp, math.log1p, math.sqrt, and indexes into
                  interesting integer sequences such as the Fibonacci
                  sequence.

        Raises
        ------
        ValueError
            Use WhitespaceTokenizer instead of qval=0.

        Examples
        --------
        >>> qg = QGrams().tokenize('AATTATAT')
        >>> qg
        QGrams({'AT': 3, 'TA': 2, '$A': 1, 'AA': 1, 'TT': 1, 'T#': 1})

        >>> qg = QGrams(qval=1, start_stop='').tokenize('AATTATAT')
        >>> qg
        QGrams({'A': 4, 'T': 4})

        >>> qg = QGrams(qval=3, start_stop='').tokenize('AATTATAT')
        >>> qg
        QGrams({'TAT': 2, 'AAT': 1, 'ATT': 1, 'TTA': 1, 'ATA': 1})

        >>> QGrams(qval=2, start_stop='$#').tokenize('interning')
        QGrams({'in': 2, '$i': 1, 'nt': 1, 'te': 1, 'er': 1, 'rn': 1,
        'ni': 1, 'ng': 1, 'g#': 1})

        >>> QGrams(start_stop='', skip=1).tokenize('AACTAGAAC')
        QGrams({'AC': 2, 'AT': 1, 'CA': 1, 'TG': 1, 'AA': 1, 'GA': 1,
        'A': 1})

        >>> QGrams(start_stop='', skip=[0, 1]).tokenize('AACTAGAAC')
        QGrams({'AC': 4, 'AA': 3, 'GA': 2, 'CT': 1, 'TA': 1, 'AG': 1,
        'AT': 1, 'CA': 1, 'TG': 1, 'A': 1})

        >>> QGrams(qval=range(3), skip=[0, 1]).tokenize('interdisciplinarian')
        QGrams({'i': 10, 'n': 7, 'r': 4, 'a': 4, 'in': 3, 't': 2, 'e': 2,
        'd': 2, 's': 2, 'c': 2, 'p': 2, 'l': 2, 'ri': 2, 'ia': 2, '$i': 1,
        'nt': 1, 'te': 1, 'er': 1, 'rd': 1, 'di': 1, 'is': 1, 'sc': 1,
        'ci': 1, 'ip': 1, 'pl': 1, 'li': 1, 'na': 1, 'ar': 1, 'an': 1,
        'n#': 1, '$n': 1, 'it': 1, 'ne': 1, 'tr': 1, 'ed': 1, 'ds': 1,
        'ic': 1, 'si': 1, 'cp': 1, 'il': 1, 'pi': 1, 'ln': 1, 'nr': 1,
        'ai': 1, 'ra': 1, 'a#': 1})


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.4.0
            Broke tokenization functions out into tokenize method

        """
        if qval == 0:
            raise ValueError('Use WhitespaceTokenizer instead of qval=0.')

        super(QGrams, self).__init__(scaler)

        # Save parameters
        self.qval = qval
        self.start_stop = start_stop
        if qval == 1:
            # Start & stop symbols are meaningless for unigrams
            self.start_stop = ''
        self.skip = skip

        self._string_ss = self._string
    def tokenize(self, string):
        """Tokenize the term and store it.

        The tokenized term is stored as an ordered list and as a Counter
        object.

        Parameters
        ----------
        string : str
            The string to tokenize


        .. versionadded:: 0.4.0

        """
        self._string = string
        self._ordered_tokens = []

        if not isinstance(self.qval, Iterable):
            self.qval = (self.qval,)
        if not isinstance(self.skip, Iterable):
            self.skip = (self.skip,)

        if string:
            for qval_i in self.qval:
                for skip_i in self.skip:
                    if qval_i < 1:
                        continue

                    if self.start_stop:
                        string = (
                            self.start_stop[0] * (qval_i - 1)
                            + self._string
                            + self.start_stop[-1] * (qval_i - 1)
                        )
                    else:
                        string = self._string

                    if qval_i > 1 and len(string) < qval_i:
                        continue

                    # Having appended start & stop symbols (or not), save
                    # the result, but only for the longest valid qval_i
                    if len(string) > len(self._string_ss):
                        self._string_ss = string

                    skip_i += 1
                    self._ordered_tokens += [
                        string[i : i + (qval_i * skip_i) : skip_i]
                        for i in range(len(string) - (qval_i - 1))
                    ]

        super(QGrams, self).tokenize()
        return self
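# A worked example of the skip-gram slicing in tokenize() above (added
# commentary, not original Abydos code). With qval=2 and skip=1, skip_i is
# incremented to 2, so each token is string[i : i + 4 : 2], i.e. two
# characters taken two positions apart. For 'AACTAGAAC' this gives
# 'AACTAGAAC'[0:4:2] == 'AC', [1:5:2] == 'AT', and so on; at i=7 the slice
# runs off the end of the string and yields the lone 'A', which matches the
# QGrams(start_stop='', skip=1) doctest in __init__ above.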
if __name__ == '__main__':
    import doctest

    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
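
# Illustrative usage sketch (an addition to this listing, not original
# Abydos code): demonstrates a callable scaler, as documented in
# QGrams.__init__. It assumes the get_counter() accessor of the _Tokenizer
# base class returns the token Counter with the scaler applied.
if __name__ == '__main__':
    from math import log1p

    # With a callable scaler, each raw q-gram count c is replaced by
    # log1p(c), damping the weight of high-frequency q-grams.
    _qg = QGrams(qval=2, scaler=log1p).tokenize('AATTATAT')
    print(_qg.get_counter())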