Source code for abydos.compression

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.compression.

The compression module defines compression and compression-related functions
for use within Abydos, including implementations of the following:

    - arithmetic coding functions (ac_train, ac_encode, & ac_decode)
    - Burrows-Wheeler transform encoder/decoder (bwt_encode & bwt_decode)
    - Run-Length Encoding encoder/decoder (rle_encode & rle_decode)
"""

from __future__ import division, unicode_literals

from collections import Counter
from fractions import Fraction
from itertools import groupby

from six import PY3, text_type
from six.moves import range


if PY3:
    long = int

__all__ = ['ac_decode', 'ac_encode', 'ac_train', 'bwt_decode', 'bwt_encode',
           'rle_decode', 'rle_encode']


[docs]def ac_train(text):
    r"""Generate a probability dict from the provided text.

    Text -> 0-order probability statistics as a dict

    This is based on Andrew Dalke's public domain implementation
    :cite:`Dalke:2005`. It has been ported to use the fractions.Fraction class.

    :param str text: The text data over which to calculate probability
        statistics. This must not contain the NUL (0x00) character because
        that's used to indicate the end of data.
    :returns: a probability dict
    :rtype: dict

    >>> ac_train('the quick brown fox jumped over the lazy dog')
    {' ': (Fraction(0, 1), Fraction(8, 45)),
     'o': (Fraction(8, 45), Fraction(4, 15)),
     'e': (Fraction(4, 15), Fraction(16, 45)),
     'u': (Fraction(16, 45), Fraction(2, 5)),
     't': (Fraction(2, 5), Fraction(4, 9)),
     'r': (Fraction(4, 9), Fraction(22, 45)),
     'h': (Fraction(22, 45), Fraction(8, 15)),
     'd': (Fraction(8, 15), Fraction(26, 45)),
     'z': (Fraction(26, 45), Fraction(3, 5)),
     'y': (Fraction(3, 5), Fraction(28, 45)),
     'x': (Fraction(28, 45), Fraction(29, 45)),
     'w': (Fraction(29, 45), Fraction(2, 3)),
     'v': (Fraction(2, 3), Fraction(31, 45)),
     'q': (Fraction(31, 45), Fraction(32, 45)),
     'p': (Fraction(32, 45), Fraction(11, 15)),
     'n': (Fraction(11, 15), Fraction(34, 45)),
     'm': (Fraction(34, 45), Fraction(7, 9)),
     'l': (Fraction(7, 9), Fraction(4, 5)),
     'k': (Fraction(4, 5), Fraction(37, 45)),
     'j': (Fraction(37, 45), Fraction(38, 45)),
     'i': (Fraction(38, 45), Fraction(13, 15)),
     'g': (Fraction(13, 15), Fraction(8, 9)),
     'f': (Fraction(8, 9), Fraction(41, 45)),
     'c': (Fraction(41, 45), Fraction(14, 15)),
     'b': (Fraction(14, 15), Fraction(43, 45)),
     'a': (Fraction(43, 45), Fraction(44, 45)),
     '\x00': (Fraction(44, 45), Fraction(1, 1))}
    """
    text = text_type(text)
    if '\x00' in text:
        text = text.replace('\x00', ' ')
    counts = Counter(text)
    counts['\x00'] = 1
    tot_letters = sum(counts.values())

    tot = 0
    prob_range = {}
    prev = Fraction(0)
    for char, count in sorted(counts.items(), key=lambda x: (x[1], x[0]),
                              reverse=True):
        follow = Fraction(tot + count, tot_letters)
        prob_range[char] = (prev, follow)
        prev = follow
        tot = tot + count
    # assert tot == tot_letters

    return prob_range


[docs]def ac_encode(text, probs):
    """Encode a text using arithmetic coding with the provided probabilities.

    Text and the 0-order probability statistics -> longval, nbits

    The encoded number is Fraction(longval, 2**nbits)

    This is based on Andrew Dalke's public domain implementation
    :cite:`Dalke:2005`. It has been ported to use the fractions.Fraction class.

    :param str text: A string to encode
    :param dict probs: A probability statistics dictionary generated by
        ac_train
    :returns: The arithmetically coded text
    :rtype: tuple

    >>> pr = ac_train('the quick brown fox jumped over the lazy dog')
    >>> ac_encode('align', pr)
    (16720586181, 34)
    """
    text = text_type(text)
    if '\x00' in text:
        text = text.replace('\x00', ' ')
    minval = Fraction(0)
    maxval = Fraction(1)
    for char in text + '\x00':
        prob_range = probs[char]
        delta = maxval - minval
        maxval = minval + prob_range[1] * delta
        minval = minval + prob_range[0] * delta

    # I tried without the /2 just to check.  Doesn't work.
    # Keep scaling up until the error range is >= 1.  That
    # gives me the minimum number of bits needed to resolve
    # down to the end-of-data character.
    delta = (maxval - minval) / 2
    nbits = long(0)
    while delta < 1:
        nbits += 1
        delta *= 2
    # The below condition shouldn't ever be false
    if nbits == 0:  # pragma: no cover
        return 0, 0
    # using -1 instead of /2
    avg = (maxval + minval) * 2**(nbits-1)
    # Could return a rational instead ...
    # the division truncation is deliberate
    return avg.numerator//avg.denominator, nbits


[docs]def ac_decode(longval, nbits, probs):
    """Decode the number to a string using the given statistics.

    This is based on Andrew Dalke's public domain implementation
    :cite:`Dalke:2005`. It has been ported to use the fractions.Fraction class.

    :param int longval: The first part of an encoded tuple from ac_encode
    :param int nbits: The second part of an encoded tuple from ac_encode
    :param dict probs: A probability statistics dictionary generated by
        ac_train
    :returns: The arithmetically decoded text
    :rtype: str

    >>> pr = ac_train('the quick brown fox jumped over the lazy dog')
    >>> ac_decode(16720586181, 34, pr)
    'align'
    """
    val = Fraction(longval, long(1) << nbits)
    letters = []
    probs_items = [(char, minval, maxval) for (char, (minval, maxval))
                   in probs.items()]

    char = '\x00'
    while True:
        for (char, minval, maxval) in probs_items:  # noqa: B007
            if minval <= val < maxval:
                break

        if char == '\x00':
            break
        letters.append(char)
        delta = maxval - minval
        val = (val - minval)/delta
    return ''.join(letters)


[docs]def bwt_encode(word, terminator='\0'):
    r"""Return the Burrows-Wheeler transformed form of a word.

    The Burrows-Wheeler transform is an attempt at placing similar characters
    together to improve compression.
    Cf. :cite:`Burrows:1994`.

    :param str word: the word to transform using BWT
    :param str terminator: a character to add to word to signal the end of the
        string
    :returns: word encoded by BWT
    :rtype: str

    >>> bwt_encode('align')
    'n\x00ilag'
    >>> bwt_encode('banana')
    'annb\x00aa'
    >>> bwt_encode('banana', '@')
    'annb@aa'
    """
    if word:
        if terminator in word:
            raise ValueError('Specified terminator, %s, already in word.'
                             .format(terminator if
                                     terminator != '\0' else '\\0'))
        else:
            word += terminator
            wordlist = sorted(word[i:] + word[:i] for i in range(len(word)))
            return ''.join([w[-1] for w in wordlist])
    else:
        return terminator


[docs]def bwt_decode(code, terminator='\0'):
    r"""Return a word decoded from BWT form.

    The Burrows-Wheeler transform is an attempt at placing similar characters
    together to improve compression. This function reverses the transform.
    Cf. :cite:`Burrows:1994`.

    :param str code: the word to transform from BWT form
    :param str terminator: a character added to word to signal the end of the
        string
    :returns: word decoded by BWT
    :rtype: str

    >>> bwt_decode('n\x00ilag')
    'align'
    >>> bwt_decode('annb\x00aa')
    'banana'
    >>> bwt_decode('annb@aa', '@')
    'banana'
    """
    if code:
        if terminator not in code:
            raise ValueError('Specified terminator, %s, absent from code.'
                             .format(terminator if
                                     terminator != '\0' else '\\0'))
        else:
            wordlist = [''] * len(code)
            for i in range(len(code)):
                wordlist = sorted(code[i] + wordlist[i] for i in
                                  range(len(code)))
            rows = [w for w in wordlist if w[-1] == terminator][0]
            return rows.rstrip(terminator)
    else:
        return ''


[docs]def rle_encode(text, use_bwt=True):
    r"""Perform encoding of run-length-encoding (RLE).

    Cf. :cite:`Robinson:1967`.

    Based on http://rosettacode.org/wiki/Run-length_encoding#Python
    :cite:`rosettacode:2018`. This is licensed GFDL 1.2.

    Digits 0-9 cannot be in text.

    :param str text: a text string to encode
    :param bool use_bwt: boolean indicating whether to perform BWT encoding
        before RLE encoding
    :returns: word decoded by BWT
    :rtype: str

    >>> rle_encode('align')
    'n\x00ilag'
    >>> rle_encode('align', use_bwt=False)
    'align'

    >>> rle_encode('banana')
    'annb\x00aa'
    >>> rle_encode('banana', use_bwt=False)
    'banana'

    >>> rle_encode('aaabaabababa')
    'ab\x00abbab5a'
    >>> rle_encode('aaabaabababa', False)
    '3abaabababa'
    """
    if use_bwt:
        text = bwt_encode(text)
    if text:
        text = ((len(list(g)), k) for k, g in groupby(text))
        text = ((str(n) + k if n > 2 else (k if n == 1 else 2*k)) for
                n, k in text)
    return ''.join(text)


[docs]def rle_decode(text, use_bwt=True):
    r"""Perform decoding of run-length-encoding (RLE).

    Cf. :cite:`Robinson:1967`.

    Based on http://rosettacode.org/wiki/Run-length_encoding#Python
    :cite:`rosettacode:2018`. This is licensed GFDL 1.2.

    Digits 0-9 cannot have been in the original text.

    :param str text: a text string to decode
    :param bool use_bwt: boolean indicating whether to perform BWT decoding
        after RLE decoding
    :returns: word decoded by BWT
    :rtype: str

    >>> rle_decode('n\x00ilag')
    'align'
    >>> rle_decode('align', use_bwt=False)
    'align'

    >>> rle_decode('annb\x00aa')
    'banana'
    >>> rle_decode('banana', use_bwt=False)
    'banana'

    >>> rle_decode('ab\x00abbab5a')
    'aaabaabababa'
    >>> rle_decode('3abaabababa', False)
    'aaabaabababa'
    """
    mult = ''
    decoded = []
    for letter in list(text):
        if not letter.isdigit():
            if mult:
                decoded.append(int(mult)*letter)
                mult = ''
            else:
                decoded.append(letter)
        else:
            mult += letter

    text = ''.join(decoded)
    if use_bwt:
        text = bwt_decode(text)
    return text


if __name__ == '__main__':
    import doctest
    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)