Source code for abydos.compression

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.compression.

The compression module defines compression and compression-related functions
for use within Abydos, including implementations of the following:

    - arithmetic coding functions (ac_train, ac_encode, & ac_decode)
    - Burrows-Wheeler transform encoder/decoder (bwt_encode & bwt_decode)
    - Run-Length Encoding encoder/decoder (rle_encode & rle_decode)
"""

from __future__ import division, unicode_literals

from collections import Counter
from fractions import Fraction
from itertools import groupby

from six import PY3, text_type
from six.moves import range


if PY3:
    long = int

__all__ = ['ac_decode', 'ac_encode', 'ac_train', 'bwt_decode', 'bwt_encode',
           'rle_decode', 'rle_encode']


[docs]def ac_train(text): r"""Generate a probability dict from the provided text. Text -> 0-order probability statistics as a dict This is based on Andrew Dalke's public domain implementation :cite:`Dalke:2005`. It has been ported to use the fractions.Fraction class. :param str text: The text data over which to calculate probability statistics. This must not contain the NUL (0x00) character because that's used to indicate the end of data. :returns: a probability dict :rtype: dict >>> ac_train('the quick brown fox jumped over the lazy dog') {' ': (Fraction(0, 1), Fraction(8, 45)), 'o': (Fraction(8, 45), Fraction(4, 15)), 'e': (Fraction(4, 15), Fraction(16, 45)), 'u': (Fraction(16, 45), Fraction(2, 5)), 't': (Fraction(2, 5), Fraction(4, 9)), 'r': (Fraction(4, 9), Fraction(22, 45)), 'h': (Fraction(22, 45), Fraction(8, 15)), 'd': (Fraction(8, 15), Fraction(26, 45)), 'z': (Fraction(26, 45), Fraction(3, 5)), 'y': (Fraction(3, 5), Fraction(28, 45)), 'x': (Fraction(28, 45), Fraction(29, 45)), 'w': (Fraction(29, 45), Fraction(2, 3)), 'v': (Fraction(2, 3), Fraction(31, 45)), 'q': (Fraction(31, 45), Fraction(32, 45)), 'p': (Fraction(32, 45), Fraction(11, 15)), 'n': (Fraction(11, 15), Fraction(34, 45)), 'm': (Fraction(34, 45), Fraction(7, 9)), 'l': (Fraction(7, 9), Fraction(4, 5)), 'k': (Fraction(4, 5), Fraction(37, 45)), 'j': (Fraction(37, 45), Fraction(38, 45)), 'i': (Fraction(38, 45), Fraction(13, 15)), 'g': (Fraction(13, 15), Fraction(8, 9)), 'f': (Fraction(8, 9), Fraction(41, 45)), 'c': (Fraction(41, 45), Fraction(14, 15)), 'b': (Fraction(14, 15), Fraction(43, 45)), 'a': (Fraction(43, 45), Fraction(44, 45)), '\x00': (Fraction(44, 45), Fraction(1, 1))} """ text = text_type(text) if '\x00' in text: text = text.replace('\x00', ' ') counts = Counter(text) counts['\x00'] = 1 tot_letters = sum(counts.values()) tot = 0 prob_range = {} prev = Fraction(0) for char, count in sorted(counts.items(), key=lambda x: (x[1], x[0]), reverse=True): follow = Fraction(tot + count, tot_letters) prob_range[char] = (prev, follow) prev = follow tot = tot + count # assert tot == tot_letters return prob_range
[docs]def ac_encode(text, probs): """Encode a text using arithmetic coding with the provided probabilities. Text and the 0-order probability statistics -> longval, nbits The encoded number is Fraction(longval, 2**nbits) This is based on Andrew Dalke's public domain implementation :cite:`Dalke:2005`. It has been ported to use the fractions.Fraction class. :param str text: A string to encode :param dict probs: A probability statistics dictionary generated by ac_train :returns: The arithmetically coded text :rtype: tuple >>> pr = ac_train('the quick brown fox jumped over the lazy dog') >>> ac_encode('align', pr) (16720586181, 34) """ text = text_type(text) if '\x00' in text: text = text.replace('\x00', ' ') minval = Fraction(0) maxval = Fraction(1) for char in text + '\x00': prob_range = probs[char] delta = maxval - minval maxval = minval + prob_range[1] * delta minval = minval + prob_range[0] * delta # I tried without the /2 just to check. Doesn't work. # Keep scaling up until the error range is >= 1. That # gives me the minimum number of bits needed to resolve # down to the end-of-data character. delta = (maxval - minval) / 2 nbits = long(0) while delta < 1: nbits += 1 delta *= 2 # The below condition shouldn't ever be false if nbits == 0: # pragma: no cover return 0, 0 # using -1 instead of /2 avg = (maxval + minval) * 2**(nbits-1) # Could return a rational instead ... # the division truncation is deliberate return avg.numerator//avg.denominator, nbits
[docs]def ac_decode(longval, nbits, probs): """Decode the number to a string using the given statistics. This is based on Andrew Dalke's public domain implementation :cite:`Dalke:2005`. It has been ported to use the fractions.Fraction class. :param int longval: The first part of an encoded tuple from ac_encode :param int nbits: The second part of an encoded tuple from ac_encode :param dict probs: A probability statistics dictionary generated by ac_train :returns: The arithmetically decoded text :rtype: str >>> pr = ac_train('the quick brown fox jumped over the lazy dog') >>> ac_decode(16720586181, 34, pr) 'align' """ val = Fraction(longval, long(1) << nbits) letters = [] probs_items = [(char, minval, maxval) for (char, (minval, maxval)) in probs.items()] char = '\x00' while True: for (char, minval, maxval) in probs_items: # noqa: B007 if minval <= val < maxval: break if char == '\x00': break letters.append(char) delta = maxval - minval val = (val - minval)/delta return ''.join(letters)
[docs]def bwt_encode(word, terminator='\0'): r"""Return the Burrows-Wheeler transformed form of a word. The Burrows-Wheeler transform is an attempt at placing similar characters together to improve compression. Cf. :cite:`Burrows:1994`. :param str word: the word to transform using BWT :param str terminator: a character to add to word to signal the end of the string :returns: word encoded by BWT :rtype: str >>> bwt_encode('align') 'n\x00ilag' >>> bwt_encode('banana') 'annb\x00aa' >>> bwt_encode('banana', '@') 'annb@aa' """ if word: if terminator in word: raise ValueError('Specified terminator, %s, already in word.' .format(terminator if terminator != '\0' else '\\0')) else: word += terminator wordlist = sorted(word[i:] + word[:i] for i in range(len(word))) return ''.join([w[-1] for w in wordlist]) else: return terminator
[docs]def bwt_decode(code, terminator='\0'): r"""Return a word decoded from BWT form. The Burrows-Wheeler transform is an attempt at placing similar characters together to improve compression. This function reverses the transform. Cf. :cite:`Burrows:1994`. :param str code: the word to transform from BWT form :param str terminator: a character added to word to signal the end of the string :returns: word decoded by BWT :rtype: str >>> bwt_decode('n\x00ilag') 'align' >>> bwt_decode('annb\x00aa') 'banana' >>> bwt_decode('annb@aa', '@') 'banana' """ if code: if terminator not in code: raise ValueError('Specified terminator, %s, absent from code.' .format(terminator if terminator != '\0' else '\\0')) else: wordlist = [''] * len(code) for i in range(len(code)): wordlist = sorted(code[i] + wordlist[i] for i in range(len(code))) rows = [w for w in wordlist if w[-1] == terminator][0] return rows.rstrip(terminator) else: return ''
[docs]def rle_encode(text, use_bwt=True): r"""Perform encoding of run-length-encoding (RLE). Cf. :cite:`Robinson:1967`. Based on http://rosettacode.org/wiki/Run-length_encoding#Python :cite:`rosettacode:2018`. This is licensed GFDL 1.2. Digits 0-9 cannot be in text. :param str text: a text string to encode :param bool use_bwt: boolean indicating whether to perform BWT encoding before RLE encoding :returns: word decoded by BWT :rtype: str >>> rle_encode('align') 'n\x00ilag' >>> rle_encode('align', use_bwt=False) 'align' >>> rle_encode('banana') 'annb\x00aa' >>> rle_encode('banana', use_bwt=False) 'banana' >>> rle_encode('aaabaabababa') 'ab\x00abbab5a' >>> rle_encode('aaabaabababa', False) '3abaabababa' """ if use_bwt: text = bwt_encode(text) if text: text = ((len(list(g)), k) for k, g in groupby(text)) text = ((str(n) + k if n > 2 else (k if n == 1 else 2*k)) for n, k in text) return ''.join(text)
[docs]def rle_decode(text, use_bwt=True): r"""Perform decoding of run-length-encoding (RLE). Cf. :cite:`Robinson:1967`. Based on http://rosettacode.org/wiki/Run-length_encoding#Python :cite:`rosettacode:2018`. This is licensed GFDL 1.2. Digits 0-9 cannot have been in the original text. :param str text: a text string to decode :param bool use_bwt: boolean indicating whether to perform BWT decoding after RLE decoding :returns: word decoded by BWT :rtype: str >>> rle_decode('n\x00ilag') 'align' >>> rle_decode('align', use_bwt=False) 'align' >>> rle_decode('annb\x00aa') 'banana' >>> rle_decode('banana', use_bwt=False) 'banana' >>> rle_decode('ab\x00abbab5a') 'aaabaabababa' >>> rle_decode('3abaabababa', False) 'aaabaabababa' """ mult = '' decoded = [] for letter in list(text): if not letter.isdigit(): if mult: decoded.append(int(mult)*letter) mult = '' else: decoded.append(letter) else: mult += letter text = ''.join(decoded) if use_bwt: text = bwt_decode(text) return text
if __name__ == '__main__': import doctest doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)