# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.compression._arithmetic.
Arithmetic coder/decoder
"""
from collections import Counter
from fractions import Fraction
from deprecation import deprecated
from .. import __version__
__all__ = ['Arithmetic', 'ac_decode', 'ac_encode', 'ac_train']
[docs]class Arithmetic(object):
"""Arithmetic Coder.
This is based on Andrew Dalke's public domain implementation
:cite:`Dalke:2005`. It has been ported to use the fractions.Fraction class.
.. versionadded:: 0.3.6
"""
_probs = {}
def __init__(self, text=None):
"""Initialize arithmetic coder object.
Parameters
----------
text : str
The training text
.. versionadded:: 0.3.6
"""
if text is not None:
self.train(text)
[docs] def get_probs(self):
"""Return the probs dictionary.
Returns
-------
dict
The dictionary of probabilities
.. versionadded:: 0.3.6
"""
return self._probs
[docs] def set_probs(self, probs):
"""Set the probs dictionary.
Parameters
----------
probs : dict
The dictionary of probabilities
.. versionadded:: 0.3.6
"""
self._probs = probs
[docs] def train(self, text):
r"""Generate a probability dict from the provided text.
Text to 0-order probability statistics as a dict
Parameters
----------
text : str
The text data over which to calculate probability statistics. This
must not contain the NUL (0x00) character because that is used to
indicate the end of data.
Example
-------
>>> ac = Arithmetic()
>>> ac.train('the quick brown fox jumped over the lazy dog')
>>> ac.get_probs()
{' ': (Fraction(0, 1), Fraction(8, 45)),
'o': (Fraction(8, 45), Fraction(4, 15)),
'e': (Fraction(4, 15), Fraction(16, 45)),
'u': (Fraction(16, 45), Fraction(2, 5)),
't': (Fraction(2, 5), Fraction(4, 9)),
'r': (Fraction(4, 9), Fraction(22, 45)),
'h': (Fraction(22, 45), Fraction(8, 15)),
'd': (Fraction(8, 15), Fraction(26, 45)),
'z': (Fraction(26, 45), Fraction(3, 5)),
'y': (Fraction(3, 5), Fraction(28, 45)),
'x': (Fraction(28, 45), Fraction(29, 45)),
'w': (Fraction(29, 45), Fraction(2, 3)),
'v': (Fraction(2, 3), Fraction(31, 45)),
'q': (Fraction(31, 45), Fraction(32, 45)),
'p': (Fraction(32, 45), Fraction(11, 15)),
'n': (Fraction(11, 15), Fraction(34, 45)),
'm': (Fraction(34, 45), Fraction(7, 9)),
'l': (Fraction(7, 9), Fraction(4, 5)),
'k': (Fraction(4, 5), Fraction(37, 45)),
'j': (Fraction(37, 45), Fraction(38, 45)),
'i': (Fraction(38, 45), Fraction(13, 15)),
'g': (Fraction(13, 15), Fraction(8, 9)),
'f': (Fraction(8, 9), Fraction(41, 45)),
'c': (Fraction(41, 45), Fraction(14, 15)),
'b': (Fraction(14, 15), Fraction(43, 45)),
'a': (Fraction(43, 45), Fraction(44, 45)),
'\x00': (Fraction(44, 45), Fraction(1, 1))}
.. versionadded:: 0.1.0
.. versionchanged:: 0.3.6
Encapsulated in class
"""
if '\x00' in text:
text = text.replace('\x00', ' ')
counts = Counter(text)
counts['\x00'] = 1
tot_letters = sum(counts.values())
tot = 0
self._probs = {}
prev = Fraction(0)
for char, count in sorted(
counts.items(), key=lambda x: (x[1], x[0]), reverse=True
):
follow = Fraction(tot + count, tot_letters)
self._probs[char] = (prev, follow)
prev = follow
tot = tot + count
[docs] def encode(self, text):
"""Encode a text using arithmetic coding.
Text and the 0-order probability statistics -> longval, nbits
The encoded number is Fraction(longval, 2**nbits)
Parameters
----------
text : str
A string to encode
Returns
-------
tuple
The arithmetically coded text
Example
-------
>>> ac = Arithmetic('the quick brown fox jumped over the lazy dog')
>>> ac.encode('align')
(16720586181, 34)
.. versionadded:: 0.1.0
.. versionchanged:: 0.3.6
Encapsulated in class
"""
if '\x00' in text:
text = text.replace('\x00', ' ')
minval = Fraction(0)
maxval = Fraction(1)
for char in text + '\x00':
prob_range = self._probs[char]
delta = maxval - minval
maxval = minval + prob_range[1] * delta
minval = minval + prob_range[0] * delta
# I tried without the /2 just to check. Doesn't work.
# Keep scaling up until the error range is >= 1. That
# gives me the minimum number of bits needed to resolve
# down to the end-of-data character.
delta = (maxval - minval) / 2
nbits = int(0)
while delta < 1:
nbits += 1
delta *= 2
# The below condition shouldn't ever be false
if nbits == 0: # pragma: no cover
return 0, 0
# using -1 instead of /2
avg = (maxval + minval) * 2 ** (nbits - 1)
# Could return a rational instead ...
# the division truncation is deliberate
return avg.numerator // avg.denominator, nbits
[docs] def decode(self, longval, nbits):
"""Decode the number to a string using the given statistics.
Parameters
----------
longval : int
The first part of an encoded tuple from encode
nbits : int
The second part of an encoded tuple from encode
Returns
-------
str
The arithmetically decoded text
Example
-------
>>> ac = Arithmetic('the quick brown fox jumped over the lazy dog')
>>> ac.decode(16720586181, 34)
'align'
.. versionadded:: 0.1.0
.. versionchanged:: 0.3.6
Encapsulated in class
"""
val = Fraction(longval, int(1) << nbits)
letters = []
probs_items = [
(char, minval, maxval)
for (char, (minval, maxval)) in self._probs.items()
]
char = '\x00'
while True:
for (char, minval, maxval) in probs_items: # noqa: B007
if minval <= val < maxval:
break
if char == '\x00':
break
letters.append(char)
delta = maxval - minval
val = (val - minval) / delta
return ''.join(letters)
[docs]@deprecated(
deprecated_in='0.4.0',
removed_in='0.6.0',
current_version=__version__,
details='Use the Arithmetic.train method instead.',
)
def ac_train(text):
r"""Generate a probability dict from the provided text.
This is a wrapper for :py:meth:`Arithmetic.train`.
Parameters
----------
text : str
The text data over which to calculate probability statistics. This must
not contain the NUL (0x00) character because that's used to indicate
the end of data.
Returns
-------
dict
A probability dict
Example
-------
>>> ac_train('the quick brown fox jumped over the lazy dog')
{' ': (Fraction(0, 1), Fraction(8, 45)),
'o': (Fraction(8, 45), Fraction(4, 15)),
'e': (Fraction(4, 15), Fraction(16, 45)),
'u': (Fraction(16, 45), Fraction(2, 5)),
't': (Fraction(2, 5), Fraction(4, 9)),
'r': (Fraction(4, 9), Fraction(22, 45)),
'h': (Fraction(22, 45), Fraction(8, 15)),
'd': (Fraction(8, 15), Fraction(26, 45)),
'z': (Fraction(26, 45), Fraction(3, 5)),
'y': (Fraction(3, 5), Fraction(28, 45)),
'x': (Fraction(28, 45), Fraction(29, 45)),
'w': (Fraction(29, 45), Fraction(2, 3)),
'v': (Fraction(2, 3), Fraction(31, 45)),
'q': (Fraction(31, 45), Fraction(32, 45)),
'p': (Fraction(32, 45), Fraction(11, 15)),
'n': (Fraction(11, 15), Fraction(34, 45)),
'm': (Fraction(34, 45), Fraction(7, 9)),
'l': (Fraction(7, 9), Fraction(4, 5)),
'k': (Fraction(4, 5), Fraction(37, 45)),
'j': (Fraction(37, 45), Fraction(38, 45)),
'i': (Fraction(38, 45), Fraction(13, 15)),
'g': (Fraction(13, 15), Fraction(8, 9)),
'f': (Fraction(8, 9), Fraction(41, 45)),
'c': (Fraction(41, 45), Fraction(14, 15)),
'b': (Fraction(14, 15), Fraction(43, 45)),
'a': (Fraction(43, 45), Fraction(44, 45)),
'\x00': (Fraction(44, 45), Fraction(1, 1))}
.. versionadded:: 0.1.0
"""
return Arithmetic(text).get_probs()
[docs]@deprecated(
deprecated_in='0.4.0',
removed_in='0.6.0',
current_version=__version__,
details='Use the Arithmetic.encode method instead.',
)
def ac_encode(text, probs):
"""Encode a text using arithmetic coding with the provided probabilities.
This is a wrapper for :py:meth:`Arithmetic.encode`.
Parameters
----------
text : str
A string to encode
probs : dict
A probability statistics dictionary generated by
:py:meth:`Arithmetic.train`
Returns
-------
tuple
The arithmetically coded text
Example
-------
>>> pr = ac_train('the quick brown fox jumped over the lazy dog')
>>> ac_encode('align', pr)
(16720586181, 34)
.. versionadded:: 0.1.0
"""
coder = Arithmetic()
coder.set_probs(probs)
return coder.encode(text)
[docs]@deprecated(
deprecated_in='0.4.0',
removed_in='0.6.0',
current_version=__version__,
details='Use the Arithmetic.decode method instead.',
)
def ac_decode(longval, nbits, probs):
"""Decode the number to a string using the given statistics.
This is a wrapper for :py:meth:`Arithmetic.decode`.
Parameters
----------
longval : int
The first part of an encoded tuple from ac_encode
nbits : int
The second part of an encoded tuple from ac_encode
probs : dict
A probability statistics dictionary generated by
:py:meth:`Arithmetic.train`
Returns
-------
str
The arithmetically decoded text
Example
-------
>>> pr = ac_train('the quick brown fox jumped over the lazy dog')
>>> ac_decode(16720586181, 34, pr)
'align'
.. versionadded:: 0.1.0
"""
coder = Arithmetic()
coder.set_probs(probs)
return coder.decode(longval, nbits)
if __name__ == '__main__':
import doctest
doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)