diff options
Diffstat (limited to 'src/epy_reader/tools/KindleUnpack/mobi_uncompress.py')
-rw-r--r-- | src/epy_reader/tools/KindleUnpack/mobi_uncompress.py | 131 |
1 files changed, 131 insertions, 0 deletions
diff --git a/src/epy_reader/tools/KindleUnpack/mobi_uncompress.py b/src/epy_reader/tools/KindleUnpack/mobi_uncompress.py new file mode 100644 index 0000000..c5fad85 --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_uncompress.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import PY2, bchr, lmap, bstr + +if PY2: + range = xrange + +import struct +# note: struct pack, unpack, unpack_from all require bytestring format +# data all the way up to at least python 2.7.5, python 3 okay with bytestring + + +class unpackException(Exception): + pass + +class UncompressedReader: + + def unpack(self, data): + return data + +class PalmdocReader: + + def unpack(self, i): + o, p = b'', 0 + while p < len(i): + # for python 3 must use slice since i[p] returns int while slice returns character + c = ord(i[p:p+1]) + p += 1 + if (c >= 1 and c <= 8): + o += i[p:p+c] + p += c + elif (c < 128): + o += bchr(c) + elif (c >= 192): + o += b' ' + bchr(c ^ 128) + else: + if p < len(i): + c = (c << 8) | ord(i[p:p+1]) + p += 1 + m = (c >> 3) & 0x07ff + n = (c & 7) + 3 + if (m > n): + o += o[-m:n-m] + else: + for _ in range(n): + # because of completely ass-backwards decision by python mainters for python 3 + # we must use slice for bytes as i[p] returns int while slice returns character + if m == 1: + o += o[-m:] + else: + o += o[-m:-m+1] + return o + +class HuffcdicReader: + q = struct.Struct(b'>Q').unpack_from + + def loadHuff(self, huff): + if huff[0:8] != b'HUFF\x00\x00\x00\x18': + raise unpackException('invalid huff header') + off1, off2 = struct.unpack_from(b'>LL', huff, 8) + + def dict1_unpack(v): + codelen, term, maxcode = v&0x1f, v&0x80, v>>8 + assert codelen != 0 + if codelen <= 8: + assert term + maxcode = ((maxcode + 1) << (32 - codelen)) - 1 + return (codelen, term, maxcode) + self.dict1 = lmap(dict1_unpack, struct.unpack_from(b'>256L', huff, off1)) + + dict2 = struct.unpack_from(b'>64L', huff, off2) + self.mincode, self.maxcode = (), () + for codelen, mincode in enumerate((0,) + dict2[0::2]): + self.mincode += (mincode << (32 - codelen), ) + for codelen, maxcode in enumerate((0,) + dict2[1::2]): + self.maxcode += (((maxcode + 1) << (32 - codelen)) - 1, ) + + self.dictionary = [] + + def loadCdic(self, cdic): + if cdic[0:8] != b'CDIC\x00\x00\x00\x10': + raise unpackException('invalid cdic header') + phrases, bits = struct.unpack_from(b'>LL', cdic, 8) + n = min(1<<bits, phrases-len(self.dictionary)) + h = struct.Struct(b'>H').unpack_from + def getslice(off): + blen, = h(cdic, 16+off) + slice = cdic[18+off:18+off+(blen&0x7fff)] + return (slice, blen&0x8000) + self.dictionary += lmap(getslice, struct.unpack_from(bstr('>%dH' % n), cdic, 16)) + + def unpack(self, data): + q = HuffcdicReader.q + + bitsleft = len(data) * 8 + data += b"\x00\x00\x00\x00\x00\x00\x00\x00" + pos = 0 + x, = q(data, pos) + n = 32 + + s = b'' + while True: + if n <= 0: + pos += 4 + x, = q(data, pos) + n += 32 + code = (x >> n) & ((1 << 32) - 1) + + codelen, term, maxcode = self.dict1[code >> 24] + if not term: + while code < self.mincode[codelen]: + codelen += 1 + maxcode = self.maxcode[codelen] + + n -= codelen + bitsleft -= codelen + if bitsleft < 0: + break + + r = (maxcode - code) >> (32 - codelen) + slice, flag = self.dictionary[r] + if not flag: + self.dictionary[r] = None + slice = self.unpack(slice) + self.dictionary[r] = (slice, 1) + s += slice + return s |