# Source code for abydos.fingerprint._qgram

# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.fingerprint._q_gram_fingerprint.

q-gram fingerprint
"""

from unicodedata import normalize as unicode_normalize

from deprecation import deprecated

from ._fingerprint import _Fingerprint
from .. import __version__
from ..tokenizer import QGrams

__all__ = ['QGram', 'qgram_fingerprint']


class QGram(_Fingerprint):
    """Q-Gram Fingerprint.

    A q-gram fingerprint is a string consisting of all of the unique q-grams
    in a string, alphabetized & concatenated. This fingerprint is described at
    :cite:`OpenRefine:2012`.

    .. versionadded:: 0.3.6
    """

    def __init__(self, qval=2, start_stop='', joiner='', skip=0):
        """Initialize Q-Gram fingerprinter.

        Parameters
        ----------
        qval : int
            The length of each q-gram (by default 2)
        start_stop : str
            The start & stop symbol(s) to concatenate on either end of the
            phrase, as defined in :py:class:`tokenizer.QGrams`
        joiner : str
            The string that will be placed between each q-gram in the
            resulting fingerprint
        skip : int or Iterable
            The number of characters to skip, can be an integer, range object,
            or list


        .. versionadded:: 0.4.0

        """
        # The tokenizer is built once here and reused by every call to
        # fingerprint().
        self._tokenizer = QGrams(qval, start_stop, skip)
        self._joiner = joiner

    def fingerprint(self, phrase):
        """Return Q-Gram fingerprint.

        The phrase is stripped, lowercased, NFKD-normalized and reduced to
        its alphanumeric characters. It is then tokenized into q-grams; the
        unique q-grams are sorted and concatenated using the joiner string.

        Parameters
        ----------
        phrase : str
            The string from which to calculate the q-gram fingerprint

        Returns
        -------
        str
            The q-gram fingerprint of the phrase

        Examples
        --------
        >>> qf = QGram()
        >>> qf.fingerprint('The quick brown fox jumped over the lazy dog.')
        'azbrckdoedeleqerfoheicjukblampnfogovowoxpequrortthuiumvewnxjydzy'
        >>> qf.fingerprint('Christopher')
        'cherhehrisopphristto'
        >>> qf.fingerprint('Niall')
        'aliallni'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # NFKD splits composed characters into a base character followed by
        # combining marks, so the alphanumeric filter below keeps the base
        # letter and drops the marks along with spaces and punctuation.
        normalized = unicode_normalize('NFKD', phrase.strip().lower())
        cleaned = ''.join(c for c in normalized if c.isalnum())

        # Only the set of distinct q-grams matters; sorting makes the
        # fingerprint independent of the order in which they occurred.
        unique_qgrams = self._tokenizer.tokenize(cleaned).get_set()
        return self._joiner.join(sorted(unique_qgrams))


@deprecated(
    deprecated_in='0.4.0',
    removed_in='0.6.0',
    current_version=__version__,
    details='Use the QGram.fingerprint method instead.',
)
def qgram_fingerprint(phrase, qval=2, start_stop='', joiner=''):
    """Return Q-Gram fingerprint.

    This is a wrapper for :py:meth:`QGram.fingerprint`. It builds a new
    :py:class:`QGram` instance from the given arguments on every call.

    Parameters
    ----------
    phrase : str
        The string from which to calculate the q-gram fingerprint
    qval : int
        The length of each q-gram (by default 2)
    start_stop : str
        The start & stop symbol(s) to concatenate on either end of the phrase,
        as defined in :py:class:`tokenizer.QGrams`
    joiner : str
        The string that will be placed between each q-gram in the resulting
        fingerprint

    Returns
    -------
    str
        The q-gram fingerprint of the phrase

    Examples
    --------
    >>> qgram_fingerprint('The quick brown fox jumped over the lazy dog.')
    'azbrckdoedeleqerfoheicjukblampnfogovowoxpequrortthuiumvewnxjydzy'
    >>> qgram_fingerprint('Christopher')
    'cherhehrisopphristto'
    >>> qgram_fingerprint('Niall')
    'aliallni'

    .. versionadded:: 0.1.0

    """
    # skip is not exposed by this legacy wrapper, so QGram's default applies.
    fingerprinter = QGram(qval=qval, start_stop=start_stop, joiner=joiner)
    return fingerprinter.fingerprint(phrase)


if __name__ == '__main__':
    # When executed as a script, run the examples embedded in this module's
    # docstrings as doctests.
    from doctest import testmod

    testmod()