author    Benawi Adha <benawiadha@gmail.com>  2022-10-02 21:22:38 +0700
committer Benawi Adha <benawiadha@gmail.com>  2022-10-02 21:22:38 +0700
commit    258c30d2e088cd4ab091a53794da3f93af79915d (patch)
tree      f49340bf565deb20c730358af74a01bcc231de53 /src/epy_reader/tools/KindleUnpack
parent    d43533f01d9d5baf5f78b71f832641382bd5962a (diff)
download  epy-258c30d2e088cd4ab091a53794da3f93af79915d.tar.gz
Major refactor: breakdown epy.py script
into package project structure for easier development

Squashed commit of the following:

commit 01309b961a4ab32394bff0d90949b57435dfda47
Author: Benawi Adha <benawiadha@gmail.com>
Date:   Sun Oct 2 21:15:04 2022 +0700

    Fix missing objects

commit aab2e773c30b255c81b1250b3b20967d5da40338
Author: Benawi Adha <benawiadha@gmail.com>
Date:   Sun Oct 2 21:09:31 2022 +0700

    Update README.md

commit d4e98926bcd9b00ce0410ad71249d24e6315abc5
Author: Benawi Adha <benawiadha@gmail.com>
Date:   Sun Oct 2 21:07:28 2022 +0700

    Add keywords in pyproject.toml

commit 432055af8245560a3ff2e046aef0b4e87da44930
Author: Benawi Adha <benawiadha@gmail.com>
Date:   Sun Oct 2 21:04:34 2022 +0700

    Bump version and deprecete setup.py

commit 51dd15aab8f8ff5996f822f8378e813f0b9fb80d
Author: Benawi Adha <benawiadha@gmail.com>
Date:   Sun Oct 2 20:56:38 2022 +0700

    Formatting

commit 81fb35e3b6fa0e27d79ef1da77202ed81eb99500
Author: Benawi Adha <benawiadha@gmail.com>
Date:   Sun Oct 2 20:55:08 2022 +0700

    Fix speakers module

commit 3b852e7c59b38d5a28520038e35f50a95270d2f1
Author: Benawi Adha <benawiadha@gmail.com>
Date:   Sat Oct 1 20:52:46 2022 +0700

    Fix circular import

commit 061e8a2649dabacd28a9e2f972559475316c654c
Author: Benawi Adha <benawiadha@gmail.com>
Date:   Sat Oct 1 20:39:27 2022 +0700

    Run formatting

commit abc2d0ab156992c63dc04745d14a69679a60accb
Author: Benawi Adha <benawiadha@gmail.com>
Date:   Sat Oct 1 20:39:00 2022 +0700

    Update isort and black config in pyproject

commit 5dc2e41bab5b997bd719bdc1561eb51ba0c17a83
Author: Benawi Adha <benawiadha@gmail.com>
Date:   Sat Oct 1 20:31:00 2022 +0700

    Add app Config

commit ed485a2ea8281585bf86dc5772f0c6dd9c803cc4
Author: Benawi Adha <benawiadha@gmail.com>
Date:   Sat Oct 1 20:23:02 2022 +0700

    Update debugpy script

commit 68b0553dd4d63eb4b847132c68ea4018587fa8ec
Author: Benawi Adha <benawiadha@gmail.com>
Date:   Sat Oct 1 20:14:11 2022 +0700

    Connect reader to main script

commit 63c3dd176f18a784a4ed2e88aa72b13d1c2b0990
Author: Benawi Adha <benawiadha@gmail.com>
Date:   Sat Oct 1 20:11:17 2022 +0700

    Implement reader

commit ce5eec8fb4e1db3870a16a07541365cd777d6c4c
Author: Benawi Adha <benawiadha@gmail.com>
Date:   Sat Oct 1 19:29:49 2022 +0700

    Fix script in pyproject.toml

commit 941e8e49f1593731fb582d92084206772b3f0442
Author: Benawi Adha <benawiadha@gmail.com>
Date:   Sat Oct 1 19:28:39 2022 +0700

    Rename modules

commit 5a3e7f766aee774c09b3b5336f3a2968e9cb1d0c
Author: Benawi Adha <benawiadha@gmail.com>
Date:   Sat Oct 1 19:28:20 2022 +0700

    Rename tool method

commit 3c0503ff475cb7eff8b12d3be0bda7a38efe1072
Author: Benawi Adha <benawiadha@gmail.com>
Date:   Sat Oct 1 19:27:03 2022 +0700

    Add ebooks lib

commit b5f71c3296a7d6f36454f6e1cbe84e15a45092ee
Author: Benawi Adha <benawiadha@gmail.com>
Date:   Sat Oct 1 17:25:11 2022 +0700

    Initial reorganization
Diffstat (limited to 'src/epy_reader/tools/KindleUnpack')
-rw-r--r--  src/epy_reader/tools/KindleUnpack/__init__.py                 2
-rwxr-xr-x  src/epy_reader/tools/KindleUnpack/compatibility_utils.py    278
-rw-r--r--  src/epy_reader/tools/KindleUnpack/kindleunpack.py           1029
-rw-r--r--  src/epy_reader/tools/KindleUnpack/mobi_cover.py              238
-rw-r--r--  src/epy_reader/tools/KindleUnpack/mobi_dict.py               377
-rw-r--r--  src/epy_reader/tools/KindleUnpack/mobi_header.py             936
-rw-r--r--  src/epy_reader/tools/KindleUnpack/mobi_html.py               439
-rw-r--r--  src/epy_reader/tools/KindleUnpack/mobi_index.py              276
-rw-r--r--  src/epy_reader/tools/KindleUnpack/mobi_k8proc.py             496
-rw-r--r--  src/epy_reader/tools/KindleUnpack/mobi_k8resc.py             271
-rw-r--r--  src/epy_reader/tools/KindleUnpack/mobi_nav.py                187
-rw-r--r--  src/epy_reader/tools/KindleUnpack/mobi_ncx.py                275
-rw-r--r--  src/epy_reader/tools/KindleUnpack/mobi_opf.py                686
-rw-r--r--  src/epy_reader/tools/KindleUnpack/mobi_pagemap.py            158
-rw-r--r--  src/epy_reader/tools/KindleUnpack/mobi_sectioner.py          120
-rwxr-xr-x  src/epy_reader/tools/KindleUnpack/mobi_split.py              438
-rw-r--r--  src/epy_reader/tools/KindleUnpack/mobi_uncompress.py         131
-rw-r--r--  src/epy_reader/tools/KindleUnpack/mobi_utils.py              191
-rwxr-xr-x  src/epy_reader/tools/KindleUnpack/mobiml2xhtml.py            527
-rwxr-xr-x  src/epy_reader/tools/KindleUnpack/unipath.py                  93
-rw-r--r--  src/epy_reader/tools/KindleUnpack/unpack_structure.py        167
21 files changed, 7315 insertions, 0 deletions
diff --git a/src/epy_reader/tools/KindleUnpack/__init__.py b/src/epy_reader/tools/KindleUnpack/__init__.py
new file mode 100644
index 0000000..0077258
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/__init__.py
@@ -0,0 +1,2 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
diff --git a/src/epy_reader/tools/KindleUnpack/compatibility_utils.py b/src/epy_reader/tools/KindleUnpack/compatibility_utils.py
new file mode 100755
index 0000000..c46c0bb
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/compatibility_utils.py
@@ -0,0 +1,278 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+# Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without modification,
+# are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this list of
+# conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice, this list
+# of conditions and the following disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
+# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+import sys
+import codecs
+
+PY2 = sys.version_info[0] == 2
+PY3 = sys.version_info[0] == 3
+
+iswindows = sys.platform.startswith('win')
+
+try:
+ from urllib.parse import unquote
+except ImportError:
+ from urllib import unquote
+
+if PY2:
+ from HTMLParser import HTMLParser
+ _h = HTMLParser()
+elif sys.version_info[1] < 4:
+ import html.parser
+ _h = html.parser.HTMLParser()
+else:
+ import html as _h
+
+if PY3:
+ text_type = str
+ binary_type = bytes
+    # if you will be printing arbitrary binary data to stdout on python 3
+ # sys.stdin = sys.stdin.detach()
+ # sys.stdout = sys.stdout.detach()
+ # sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
+else:
+ range = xrange
+ text_type = unicode
+ binary_type = str
+    # if you will be printing unicode under python 2 you need to protect
+    # against sys.stdout.encoding being None, which stupidly forces ascii encoding of unicode
+ # sys.stdout = codecs.getwriter("utf-8")(sys.stdout)
+ # alternatively set environment variable as follows **before** launching python: export PYTHONIOENCODING=UTF-8
+
+# NOTE: Python 3 is completely broken when accessing single bytes in bytes strings
+# (and they amazingly claim it is by design and not a bug!)
+
+# To illustrate: this works for unicode in Python 3 and for all Python 2.X for both bytestrings and unicode
+# >>> o = '123456789'
+# >>> o[-3]
+# '7'
+# >>> type(o[-3])
+# <class 'str'>
+# >>> type(o)
+# <class 'str'>
+
+# Unfortunately, this is what Python 3 does for no sane reason and only for bytestrings
+# >>> o = b'123456789'
+# >>> o[-3]
+# 55
+# >>> type(o[-3])
+# <class 'int'>
+# >>> type(o)
+# <class 'bytes'>
+
+# This mind boggling behaviour also happens when indexing a bytestring and/or
+# iterating over a bytestring. In other words it will return an int but not
+# the byte itself!!!!!!!
+
+# The only way to access a single byte as a byte in bytestring and get the byte in both
+# Python 2 and Python 3 is to use a slice
+
+# This problem is so common there are horrible hacks floating around the net to **try**
+# to work around it, so that code that works on both Python 2 and Python 3 is possible.
+
+# So in order to write code that works on both Python 2 and Python 3
+# if you index or access a single byte and want its ord() then use the bord() function.
+# If instead you want it as a single character byte use the bchar() function
+# both of which are defined below.
+
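+# For illustration, how the helpers defined below behave (Python 3 shown):
+#   >>> data = b'123456789'
+#   >>> data[-3]         # indexing a bytestring yields an int on Python 3
+#   55
+#   >>> data[-3:-2]      # slicing always yields a bytestring
+#   b'7'
+#   >>> bord(data[-3])   # 55 on both Python 2 and Python 3
+#   >>> bchar(data[-3])  # b'7' on both Python 2 and Python 3
+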
+if PY3:
+    # Also note: if you decode a bytestring using 'latin-1' (or any other full range 0-255 encoding)
+    # in place of ascii you get a one-to-one mapping from byte values to
+    # integer code points (in the 0-255 range)
+
+ def bchr(s):
+ return bytes([s])
+
+ def bstr(s):
+ if isinstance(s, str):
+ return bytes(s, 'latin-1')
+ else:
+ return bytes(s)
+
+ def bord(s):
+ return s
+
+ def bchar(s):
+ return bytes([s])
+
+else:
+ def bchr(s):
+ return chr(s)
+
+ def bstr(s):
+ return str(s)
+
+ def bord(s):
+ return ord(s)
+
+ def bchar(s):
+ return s
+
+if PY3:
+ # list-producing versions of the major Python iterating functions
+ def lrange(*args, **kwargs):
+ return list(range(*args, **kwargs))
+
+ def lzip(*args, **kwargs):
+ return list(zip(*args, **kwargs))
+
+ def lmap(*args, **kwargs):
+ return list(map(*args, **kwargs))
+
+ def lfilter(*args, **kwargs):
+ return list(filter(*args, **kwargs))
+else:
+ import __builtin__
+ # Python 2-builtin ranges produce lists
+ lrange = __builtin__.range
+ lzip = __builtin__.zip
+ lmap = __builtin__.map
+ lfilter = __builtin__.filter
+
+# In Python 3 you can no longer use .encode('hex') on a bytestring
+# instead use the following on both platforms
+import binascii
+def hexlify(bdata):
+ return (binascii.hexlify(bdata)).decode('ascii')
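+# For example: hexlify(b'\xe9\x8e') == 'e98e' on both Python 2 and Python 3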
+
+# If you: import struct
+# Note: struct pack, unpack, unpack_from all *require* bytestring format
+# data all the way up to at least Python 2.7.5, Python 3 is okay with either
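+# For example, struct.unpack(b'>H', b'\x01\x02') == (258,) on both platforms,
+# whereas a unicode format string may raise TypeError on older Python 2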
+
+# If you: import re
+# note: Python 3 "re" requires the pattern to be the exact same type as the data to be
+# searched ... but u"" is not allowed for the pattern itself only b""
+# Python 2.X allows the pattern to be any type and converts it to match the data
+# and returns the same type as the data
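+# For example, on Python 3: re.search(b'abc', b'xabcy') matches, while
+# re.search('abc', b'xabcy') raises TypeError (string pattern on bytes data)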
+
+# convert string to be utf-8 encoded
+def utf8_str(p, enc='utf-8'):
+ if p is None:
+ return None
+ if isinstance(p, text_type):
+ return p.encode('utf-8')
+ if enc != 'utf-8':
+ return p.decode(enc).encode('utf-8')
+ return p
+
+# convert string to unicode
+def unicode_str(p, enc='utf-8'):
+ if p is None:
+ return None
+ if isinstance(p, text_type):
+ return p
+ return p.decode(enc)
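+# For example: utf8_str(u'caf\xe9') == b'caf\xc3\xa9'
+#         and: unicode_str(b'caf\xc3\xa9') == u'caf\xe9'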
+
+ASCII_CHARS = set(chr(x) for x in range(128))
+URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+ 'abcdefghijklmnopqrstuvwxyz'
+ '0123456789' '#' '_.-/~')
+IRI_UNSAFE = ASCII_CHARS - URL_SAFE
+
+# returns a quoted IRI (not a URI)
+def quoteurl(href):
+ if isinstance(href,binary_type):
+ href = href.decode('utf-8')
+ result = []
+ for char in href:
+ if char in IRI_UNSAFE:
+ char = "%%%02x" % ord(char)
+ result.append(char)
+ return ''.join(result)
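+# For example: quoteurl('Text/Chapter 1.xhtml') == 'Text/Chapter%201.xhtml'
+# (non-ascii characters are left as-is, since an IRI permits them)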
+
+# unquotes url/iri
+def unquoteurl(href):
+ if isinstance(href,binary_type):
+ href = href.decode('utf-8')
+ href = unquote(href)
+ return href
+
+# unescape html
+def unescapeit(sval):
+ return _h.unescape(sval)
+
+# Python 2.X commandline parsing under Windows has been horribly broken for years!
+# Use the following code to emulate full unicode commandline parsing on Python 2,
+# i.e. to get the sys.argv arguments and properly decode them into unicode
+
+def unicode_argv():
+ global iswindows
+ global PY3
+ if PY3:
+ return sys.argv
+ if iswindows:
+    # Versions 2.x of Python don't support Unicode in sys.argv on
+    # Windows, with the underlying Windows API instead replacing multi-byte
+    # characters with '?'. So use shell32.CommandLineToArgvW to get sys.argv
+    # as a list of Unicode strings
+ from ctypes import POINTER, byref, cdll, c_int, windll
+ from ctypes.wintypes import LPCWSTR, LPWSTR
+
+ GetCommandLineW = cdll.kernel32.GetCommandLineW
+ GetCommandLineW.argtypes = []
+ GetCommandLineW.restype = LPCWSTR
+
+ CommandLineToArgvW = windll.shell32.CommandLineToArgvW
+ CommandLineToArgvW.argtypes = [LPCWSTR, POINTER(c_int)]
+ CommandLineToArgvW.restype = POINTER(LPWSTR)
+
+ cmd = GetCommandLineW()
+ argc = c_int(0)
+ argv = CommandLineToArgvW(cmd, byref(argc))
+ if argc.value > 0:
+ # Remove Python executable and commands if present
+ start = argc.value - len(sys.argv)
+ return [argv[i] for i in
+ range(start, argc.value)]
+ # this should never happen
+ return None
+ else:
+ argv = []
+ argvencoding = sys.stdin.encoding
+ if argvencoding is None:
+ argvencoding = sys.getfilesystemencoding()
+ if argvencoding is None:
+ argvencoding = 'utf-8'
+ for arg in sys.argv:
+ if isinstance(arg, text_type):
+ argv.append(arg)
+ else:
+ argv.append(arg.decode(argvencoding))
+ return argv
+
+
+# Python 2.X is broken in that it does not recognize CP65001 as UTF-8
+def add_cp65001_codec():
+ if PY2:
+ try:
+ codecs.lookup('cp65001')
+ except LookupError:
+ codecs.register(
+ lambda name: name == 'cp65001' and codecs.lookup('utf-8') or None)
+ return
diff --git a/src/epy_reader/tools/KindleUnpack/kindleunpack.py b/src/epy_reader/tools/KindleUnpack/kindleunpack.py
new file mode 100644
index 0000000..317941a
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/kindleunpack.py
@@ -0,0 +1,1029 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+import os
+
+__path__ = ["lib", os.path.dirname(os.path.realpath(__file__)), "kindleunpack"]
+
+import sys
+import codecs
+import traceback
+
+from .compatibility_utils import PY2, binary_type, utf8_str, unicode_str
+from .compatibility_utils import unicode_argv, add_cp65001_codec
+from .compatibility_utils import hexlify
+
+add_cp65001_codec()
+
+from .unipath import pathof
+
+if PY2:
+ range = xrange
+    # since we will be printing unicode under python 2 we need to protect
+    # against sys.stdout.encoding being None, which stupidly forces ascii encoding
+ if sys.stdout.encoding is None:
+ sys.stdout = codecs.getwriter("utf-8")(sys.stdout)
+ else:
+ encoding = sys.stdout.encoding
+ sys.stdout = codecs.getwriter(encoding)(sys.stdout)
+
+# Changelog
+# 0.11 - Version by adamselene
+# 0.11pd - Tweaked version by pdurrant
+# 0.12 - extracts pictures too, and all into a folder.
+# 0.13 - added back in optional output dir for those who don't want it based on infile
+# 0.14 - auto flush stdout and wrapped in main, added proper return codes
+# 0.15 - added support for metadata
+# 0.16 - metadata now starting to be output as an opf file (PD)
+# 0.17 - Also created tweaked text as source for Mobipocket Creator
+# 0.18 - removed raw mobi file completely but kept _meta.html file for ease of conversion
+# 0.19 - added in metadata for ASIN, Updated Title and Rights to the opf
+# 0.20 - remove _meta.html since no longer needed
+# 0.21 - Fixed some typos in the opf output, and also updated handling
+# of test for trailing data/multibyte characters
+# 0.22 - Fixed problem with > 9 images
+# 0.23 - Now output Start guide item
+# 0.24 - Set firstaddl value for 'TEXtREAd'
+# 0.25 - Now added character set metadata to html file for utf-8 files.
+# 0.26 - Dictionary support added. Image handling speed improved.
+# For huge files create temp files to speed up decoding.
+# Language decoding fixed. Metadata is now converted to utf-8 when written to opf file.
+# 0.27 - Add idx:entry attribute "scriptable" if dictionary contains entry length tags.
+# Don't save non-image sections as images. Extract and save source zip file
+# included by kindlegen as kindlegensrc.zip.
+# 0.28 - Added back correct image file name extensions, created FastConcat class to simplify and clean up
+# 0.29 - Metadata handling reworked, multiple entries of the same type are now supported.
+# Several missing types added.
+# FastConcat class has been removed as in-memory handling with lists is faster, even for huge files.
+# 0.30 - Add support for outputting **all** metadata values - encode content with hex if of unknown type
+# 0.31 - Now supports Print Replica ebooks, outputting PDF and mysterious data sections
+# 0.32 - Now supports NCX file extraction/building.
+# Overhauled the structure of mobiunpack to be more class oriented.
+# 0.33 - Split Classes into separate files and added prelim support for KF8 format eBooks
+# 0.34 - Improved KF8 support, guide support, bug fixes
+# 0.35 - Added splitting combo mobi7/mobi8 into standalone mobi7 and mobi8 files
+# Also handle mobi8-only file properly
+# 0.36 - very minor changes to support KF8 mobis with no flow items, no ncx, etc
+# 0.37 - separate output, add command line switches to control, interface to Mobi_Unpack.pyw
+# 0.38 - improve split function by resetting flags properly, fix bug in Thumbnail Images
+# 0.39 - improve split function so that ToC info is not lost for standalone mobi8s
+# 0.40 - make mobi7 split match official versions, add support for graphic novel metadata,
+# improve debug for KF8
+# 0.41 - fix when StartOffset set to 0xffffffff, fix to work with older mobi versions,
+# fix other minor metadata issues
+# 0.42 - add new class interface to allow it to integrate more easily with internal calibre routines
+# 0.43 - bug fixes for new class interface
+# 0.44 - more bug fixes and fix for potential bug caused by not properly closing created zip archive
+# 0.45 - sync to version in the new Mobi_Unpack plugin
+# 0.46 - fixes for: obfuscated fonts, improper toc links and ncx, add support for opentype fonts
+# 0.47 - minor opf improvements
+# 0.48 - ncx link fixes
+# 0.49 - use azw3 when splitting mobis
+# 0.50 - unknown change
+# 0.51 - fix for converting filepos links to hrefs, Added GPL3 notice, made KF8 extension just '.azw3'
+# 0.52 - fix for cover metadata (no support for Mobipocket Creator)
+# 0.53 - fix for proper identification of embedded fonts, added new metadata items
+# 0.54 - Added error-handling so wonky embedded fonts don't bomb the whole unpack process,
+# entity escape KF8 metadata to ensure valid OPF.
+# 0.55 - Strip extra StartOffset EXTH from the mobi8 header when splitting, keeping only the relevant one
+# For mobi8 files, don't generate duplicate guide entries from the metadata if we could extract one
+# from the OTH table.
+# 0.56 - Added further entity escaping of OPF text.
+# Allow unicode string file paths to be passed as arguments to the unpackBook method without blowing up later
+# when the attempt to "re"-unicode a portion of that filename occurs in the process_all_mobi_headers method.
+# 0.57 - Fixed error when splitting Preview files downloaded from KDP website
+# 0.58 - Output original kindlegen build log ('CMET' record) if included in the package.
+# 0.58 - Include and extend functionality of DumpMobiHeader, replacing DEBUG with DUMP
+# 0.59 - Much added DUMP functionality, including full dumping and descriptions of sections
+# 0.60 - Bug fixes in opf, div tables, bad links, page breaks, section descriptions
+# - plus a number of other bugs fixed that were found by Sergey Dubinets
+# - fixes for file/paths that require full unicode to work properly
+# - replace subprocess with multiprocessing to remove need for unbuffered stdout
+# 0.61 - renamed to be KindleUnpack and more unicode/utf-8 path bug fixes and other minor fixes
+# 0.62 - fix for multiprocessing on Windows, split fixes, opf improvements
+# 0.63 - Modified to process right to left page progression books properly.
+# - Added some id_map_strings and RESC section processing; metadata and
+# - spine in the RESC are integrated partly to content.opf.
+# 0.63a- Separated K8 RESC processor to an individual file. Bug fixes. Added cover page creation.
+# 0.64 - minor bug fixes to more properly handle unicode command lines, and support for more jpeg types
+# 0.64a- Modified to handle some irregular mobi and azw3 files.
+# 0.64b- Modified to create k8resc.spine for files with no RESC section.
+# 0.65 - Bug fixes to shorten title and remove epub3 "properties" to make the output epub2 compliant
+# 0.65a- Bug fixes to extract RESC section correctly, to prevent item id conflicts
+# - and to process multiline comments in RESC.
+# 0.66 - Bug fix to deal with missing first resource information sometimes generated by calibre
+# 0.66a- Fixed minor bugs, which probably do not affect the output at all
+# 0.67 - Fixed Mobi Split functionality bug with azw3 images not being properly copied
+# 0.68 - preliminary support for handling PAGE sections to create page-map.xml
+# 0.69 - preliminary support for CONT and CRES for HD Images
+# 0.70 - preliminary support for decoding apnx files when used with azw3 ebooks
+# 0.71 - extensive refactoring of kindleunpack.py to make it more manageable
+# 0.72 - many bug fixes from tkeo: fix pageProcessing, fix print replica, fix resc usage, fix font mangling, etc.
+# 0.72a- fix for still broken PrintReplica support
+# 0.72b- preview for primary epub3 support. A parameter epubver(default='2') is added to process_all_mobi_headers(), unpackBook().
+# 0.72c- preview for apnx page support
+# 0.72d- more bugs fixed in preview features, much improved GUI with ability to dynamically grow the Log Window with preference support
+# 0.72e- more bug fixes, Tk GUI adds support for epub version and HDImage use
+# 0.72f- more bug fixes, implement use hd images if present
+# 0.72g- minor bug fixes and cleanups from tkeo
+# 0.72h- updated mobi_header and mobi_k8proc to use the correct fragment and guide terms in place of div and other
+# to better match the terms that both Calibre and Amazon use internally to their own software
+# 0.72x- very experimental conversion to use new mobi_k8resc.py and some of its associated changes
+# 0.72y- more changes to simplify and integrate in epub3 support in a simpler manner
+# 0.72z- remove redundancy in mobi_opf.py and bug fixes for mobi_k8resc.py
+# 0.73 faster mobi split, numerous bug fixes in mobi_k8proc, mobi_header, mobi_opf, mobi_k8resc, etc
+# 0.74 added refines metadata, fixed language code in ncx and title in nav, added support for opf: from refines
+# 0.75 much improved dictionary support including support for multiple inflection sections, minor mobi_opf fixes
+# 0.76 pre-release version only fix name related issues in opf by not using original file name in mobi7
+# 0.77 bug fix for unpacking HDImages with included Fonts
+# 0.80 converted to work with both python 2.7 and Python 3.3 and later
+# 0.81 various fixes
+# 0.82 Handle calibre-generated mobis that can have skeletons with no fragments
+# 0.83 Fix header item 114 being mistakenly treated as a string instead of a value
+
+DUMP = False
+""" Set to True to dump all possible information. """
+
+WRITE_RAW_DATA = False
+""" Set to True to create additional files with raw data for debugging/reverse engineering. """
+
+SPLIT_COMBO_MOBIS = False
+""" Set to True to split combination mobis into mobi7 and mobi8 pieces. """
+
+CREATE_COVER_PAGE = True # XXX experimental
+""" Create and insert a cover xhtml page. """
+
+EOF_RECORD = b'\xe9\x8e' + b'\r\n'
+""" The EOF record content. """
+
+TERMINATION_INDICATOR1 = b'\x00'
+TERMINATION_INDICATOR2 = b'\x00\x00'
+TERMINATION_INDICATOR3 = b'\x00\x00\x00'
+
+KINDLEGENSRC_FILENAME = "kindlegensrc.zip"
+""" The name for the kindlegen source archive. """
+
+KINDLEGENLOG_FILENAME = "kindlegenbuild.log"
+""" The name for the kindlegen build log. """
+
+K8_BOUNDARY = b'BOUNDARY'
+""" The section data that divides K8 mobi ebooks. """
+
+import os
+import struct
+import re
+import zlib
+import getopt
+
+class unpackException(Exception):
+ pass
+
+
+# import the kindleunpack support libraries
+from .unpack_structure import fileNames
+from .mobi_sectioner import Sectionizer, describe
+from .mobi_header import MobiHeader, dump_contexth
+from .mobi_utils import toBase32
+from .mobi_opf import OPFProcessor
+from .mobi_html import HTMLProcessor, XHTMLK8Processor
+from .mobi_ncx import ncxExtract
+from .mobi_k8proc import K8Processor
+from .mobi_split import mobi_split
+from .mobi_k8resc import K8RESCProcessor
+from .mobi_nav import NAVProcessor
+from .mobi_cover import CoverProcessor, get_image_type
+from .mobi_pagemap import PageMapProcessor
+from .mobi_dict import dictSupport
+
+
+def processSRCS(i, files, rscnames, sect, data):
+ # extract the source zip archive and save it.
+ print("File contains kindlegen source archive, extracting as %s" % KINDLEGENSRC_FILENAME)
+ srcname = os.path.join(files.outdir, KINDLEGENSRC_FILENAME)
+ with open(pathof(srcname), 'wb') as f:
+ f.write(data[16:])
+ rscnames.append(None)
+ sect.setsectiondescription(i,"Zipped Source Files")
+ return rscnames
+
+
+def processPAGE(i, files, rscnames, sect, data, mh, pagemapproc):
+ # process any page map information and create an apnx file
+ pagemapproc = PageMapProcessor(mh, data)
+ rscnames.append(None)
+ sect.setsectiondescription(i,"PageMap")
+ apnx_meta = {}
+ acr = sect.palmname.decode('latin-1').rstrip('\x00')
+ apnx_meta['acr'] = acr
+ apnx_meta['cdeType'] = mh.metadata['cdeType'][0]
+ apnx_meta['contentGuid'] = hex(int(mh.metadata['UniqueID'][0]))[2:]
+ apnx_meta['asin'] = mh.metadata['ASIN'][0]
+ apnx_meta['pageMap'] = pagemapproc.getPageMap()
+ if mh.version == 8:
+ apnx_meta['format'] = 'MOBI_8'
+ else:
+ apnx_meta['format'] = 'MOBI_7'
+ apnx_data = pagemapproc.generateAPNX(apnx_meta)
+ if mh.isK8():
+ outname = os.path.join(files.outdir, 'mobi8-'+files.getInputFileBasename() + '.apnx')
+ else:
+ outname = os.path.join(files.outdir, 'mobi7-'+files.getInputFileBasename() + '.apnx')
+ with open(pathof(outname), 'wb') as f:
+ f.write(apnx_data)
+ return rscnames, pagemapproc
+
+
+def processCMET(i, files, rscnames, sect, data):
+ # extract the build log
+ print("File contains kindlegen build log, extracting as %s" % KINDLEGENLOG_FILENAME)
+ srcname = os.path.join(files.outdir, KINDLEGENLOG_FILENAME)
+ with open(pathof(srcname), 'wb') as f:
+ f.write(data[10:])
+ rscnames.append(None)
+ sect.setsectiondescription(i,"Kindlegen log")
+ return rscnames
+
+
+# fonts only exist in KF8 ebooks
+# Format: bytes 0 - 3: 'FONT'
+# bytes 4 - 7: uncompressed size
+# bytes 8 - 11: flags
+# flag bit 0x0001 - zlib compression
+# flag bit 0x0002 - obfuscated with xor string
+# bytes 12 - 15: offset to start of compressed font data
+# bytes 16 - 19: length of xor string stored before the start of the compressed font data
+# bytes 20 - 23: start of xor string
+def processFONT(i, files, rscnames, sect, data, obfuscate_data, beg, rsc_ptr):
+ fontname = "font%05d" % i
+ ext = '.dat'
+ font_error = False
+ font_data = data
+ try:
+ usize, fflags, dstart, xor_len, xor_start = struct.unpack_from(b'>LLLLL',data,4)
+ except:
+ print("Failed to extract font: {0:s} from section {1:d}".format(fontname,i))
+ font_error = True
+ ext = '.failed'
+ pass
+ if not font_error:
+ print("Extracting font:", fontname)
+ font_data = data[dstart:]
+ extent = len(font_data)
+ extent = min(extent, 1040)
+ if fflags & 0x0002:
+ # obfuscated so need to de-obfuscate the first 1040 bytes
+ key = bytearray(data[xor_start: xor_start+ xor_len])
+ buf = bytearray(font_data)
+ for n in range(extent):
+ buf[n] ^= key[n%xor_len]
+ font_data = bytes(buf)
+ if fflags & 0x0001:
+ # ZLIB compressed data
+ font_data = zlib.decompress(font_data)
+ hdr = font_data[0:4]
+ if hdr == b'\0\1\0\0' or hdr == b'true' or hdr == b'ttcf':
+ ext = '.ttf'
+ elif hdr == b'OTTO':
+ ext = '.otf'
+ else:
+ print("Warning: unknown font header %s" % hexlify(hdr))
+ if (ext == '.ttf' or ext == '.otf') and (fflags & 0x0002):
+ obfuscate_data.append(fontname + ext)
+ fontname += ext
+ outfnt = os.path.join(files.imgdir, fontname)
+ with open(pathof(outfnt), 'wb') as f:
+ f.write(font_data)
+ rscnames.append(fontname)
+ sect.setsectiondescription(i,"Font {0:s}".format(fontname))
+ if rsc_ptr == -1:
+ rsc_ptr = i - beg
+ return rscnames, obfuscate_data, rsc_ptr
+
+
+def processCRES(i, files, rscnames, sect, data, beg, rsc_ptr, use_hd):
+ # extract an HDImage
+ global DUMP
+ data = data[12:]
+ imgtype = get_image_type(None, data)
+
+ if imgtype is None:
+ print("Warning: CRES Section %s does not contain a recognised resource" % i)
+ rscnames.append(None)
+ sect.setsectiondescription(i,"Mysterious CRES data, first four bytes %s" % describe(data[0:4]))
+ if DUMP:
+ fname = "unknown%05d.dat" % i
+ outname= os.path.join(files.outdir, fname)
+ with open(pathof(outname), 'wb') as f:
+ f.write(data)
+ sect.setsectiondescription(i,"Mysterious CRES data, first four bytes %s extracting as %s" % (describe(data[0:4]), fname))
+ rsc_ptr += 1
+ return rscnames, rsc_ptr
+
+ if use_hd:
+ # overwrite corresponding lower res image with hd version
+ imgname = rscnames[rsc_ptr]
+ imgdest = files.imgdir
+ else:
+ imgname = "HDimage%05d.%s" % (i, imgtype)
+ imgdest = files.hdimgdir
+ print("Extracting HD image: {0:s} from section {1:d}".format(imgname,i))
+ outimg = os.path.join(imgdest, imgname)
+ with open(pathof(outimg), 'wb') as f:
+ f.write(data)
+ rscnames.append(None)
+ sect.setsectiondescription(i,"Optional HD Image {0:s}".format(imgname))
+ rsc_ptr += 1
+ return rscnames, rsc_ptr
+
+
+def processCONT(i, files, rscnames, sect, data):
+ global DUMP
+ # process a container header, most of this is unknown
+ # right now only extract its EXTH
+ dt = data[0:12]
+ if dt == b"CONTBOUNDARY":
+ rscnames.append(None)
+ sect.setsectiondescription(i,"CONTAINER BOUNDARY")
+ else:
+ sect.setsectiondescription(i,"CONT Header")
+ rscnames.append(None)
+ if DUMP:
+ cpage, = struct.unpack_from(b'>L', data, 12)
+ contexth = data[48:]
+ print("\n\nContainer EXTH Dump")
+ dump_contexth(cpage, contexth)
+ fname = "CONT_Header%05d.dat" % i
+ outname= os.path.join(files.outdir, fname)
+ with open(pathof(outname), 'wb') as f:
+ f.write(data)
+ return rscnames
+
+
+def processkind(i, files, rscnames, sect, data):
+ global DUMP
+ dt = data[0:12]
+ if dt == b"kindle:embed":
+ if DUMP:
+ print("\n\nHD Image Container Description String")
+ print(data)
+ sect.setsectiondescription(i,"HD Image Container Description String")
+ rscnames.append(None)
+ return rscnames
+
+
+# spine information from the original content.opf
+def processRESC(i, files, rscnames, sect, data, k8resc):
+ global DUMP
+ if DUMP:
+ rescname = "RESC%05d.dat" % i
+ print("Extracting Resource: ", rescname)
+ outrsc = os.path.join(files.outdir, rescname)
+ with open(pathof(outrsc), 'wb') as f:
+ f.write(data)
+ if True: # try:
+ # parse the spine and metadata from RESC
+ k8resc = K8RESCProcessor(data[16:], DUMP)
+ else: # except:
+ print("Warning: cannot extract information from RESC.")
+ k8resc = None
+ rscnames.append(None)
+ sect.setsectiondescription(i,"K8 RESC section")
+ return rscnames, k8resc
+
+
+def processImage(i, files, rscnames, sect, data, beg, rsc_ptr, cover_offset, thumb_offset):
+ global DUMP
+ # Extract an Image
+ imgtype = get_image_type(None, data)
+ if imgtype is None:
+ print("Warning: Section %s does not contain a recognised resource" % i)
+ rscnames.append(None)
+ sect.setsectiondescription(i,"Mysterious Section, first four bytes %s" % describe(data[0:4]))
+ if DUMP:
+ fname = "unknown%05d.dat" % i
+ outname= os.path.join(files.outdir, fname)
+ with open(pathof(outname), 'wb') as f:
+ f.write(data)
+ sect.setsectiondescription(i,"Mysterious Section, first four bytes %s extracting as %s" % (describe(data[0:4]), fname))
+ return rscnames, rsc_ptr
+
+ imgname = "image%05d.%s" % (i, imgtype)
+ if cover_offset is not None and i == beg + cover_offset:
+ imgname = "cover%05d.%s" % (i, imgtype)
+ if thumb_offset is not None and i == beg + thumb_offset:
+ imgname = "thumb%05d.%s" % (i, imgtype)
+ print("Extracting image: {0:s} from section {1:d}".format(imgname,i))
+ outimg = os.path.join(files.imgdir, imgname)
+ with open(pathof(outimg), 'wb') as f:
+ f.write(data)
+ rscnames.append(imgname)
+ sect.setsectiondescription(i,"Image {0:s}".format(imgname))
+ if rsc_ptr == -1:
+ rsc_ptr = i - beg
+ return rscnames, rsc_ptr
+
+
+def processPrintReplica(metadata, files, rscnames, mh):
+ global DUMP
+ global WRITE_RAW_DATA
+ rawML = mh.getRawML()
+ if DUMP or WRITE_RAW_DATA:
+ outraw = os.path.join(files.outdir,files.getInputFileBasename() + '.rawpr')
+ with open(pathof(outraw),'wb') as f:
+ f.write(rawML)
+
+ fileinfo = []
+ print("Print Replica ebook detected")
+ try:
+ numTables, = struct.unpack_from(b'>L', rawML, 0x04)
+ tableIndexOffset = 8 + 4*numTables
+ # for each table, read in count of sections, assume first section is a PDF
+ # and output other sections as binary files
+ for i in range(numTables):
+ sectionCount, = struct.unpack_from(b'>L', rawML, 0x08 + 4*i)
+ for j in range(sectionCount):
+ sectionOffset, sectionLength, = struct.unpack_from(b'>LL', rawML, tableIndexOffset)
+ tableIndexOffset += 8
+ if j == 0:
+ entryName = os.path.join(files.outdir, files.getInputFileBasename() + ('.%03d.pdf' % (i+1)))
+ else:
+ entryName = os.path.join(files.outdir, files.getInputFileBasename() + ('.%03d.%03d.data' % ((i+1),j)))
+ with open(pathof(entryName), 'wb') as f:
+ f.write(rawML[sectionOffset:(sectionOffset+sectionLength)])
+ except Exception as e:
+ print('Error processing Print Replica: ' + str(e))
+
+ fileinfo.append([None,'', files.getInputFileBasename() + '.pdf'])
+ usedmap = {}
+ for name in rscnames:
+ if name is not None:
+ usedmap[name] = 'used'
+ opf = OPFProcessor(files, metadata, fileinfo, rscnames, False, mh, usedmap)
+ opf.writeOPF()
+
+
+def processMobi8(mh, metadata, sect, files, rscnames, pagemapproc, k8resc, obfuscate_data, apnxfile=None, epubver='2'):
+ global DUMP
+ global WRITE_RAW_DATA
+
+    # extract raw markup language
+ rawML = mh.getRawML()
+ if DUMP or WRITE_RAW_DATA:
+ outraw = os.path.join(files.k8dir,files.getInputFileBasename() + '.rawml')
+ with open(pathof(outraw),'wb') as f:
+ f.write(rawML)
+
+ # KF8 require other indexes which contain parsing information and the FDST info
+ # to process the rawml back into the xhtml files, css files, svg image files, etc
+ k8proc = K8Processor(mh, sect, files, DUMP)
+ k8proc.buildParts(rawML)
+
+ # collect information for the guide first
+ guidetext = unicode_str(k8proc.getGuideText())
+
+ # if the guide was empty, add in any guide info from metadata, such as StartOffset
+ if not guidetext and 'StartOffset' in metadata:
+ # Apparently, KG 2.5 carries over the StartOffset from the mobi7 part...
+ # Taking that into account, we only care about the *last* StartOffset, which
+ # should always be the correct one in these cases (the one actually pointing
+ # to the right place in the mobi8 part).
+ starts = metadata['StartOffset']
+ last_start = starts[-1]
+ last_start = int(last_start)
+ if last_start == 0xffffffff:
+ last_start = 0
+ seq, idtext = k8proc.getFragTblInfo(last_start)
+ filename, idtext = k8proc.getIDTagByPosFid(toBase32(seq), b'0000000000')
+ linktgt = filename
+ idtext = unicode_str(idtext, mh.codec)
+ if idtext != '':
+ linktgt += '#' + idtext
+ guidetext += '<reference type="text" href="Text/%s" />\n' % linktgt
+
+ # if apnxfile is passed in use it for page map information
+ if apnxfile is not None and pagemapproc is None:
+ with open(apnxfile, 'rb') as f:
+ apnxdata = b"00000000" + f.read()
+ pagemapproc = PageMapProcessor(mh, apnxdata)
+
+ # generate the page map
+ pagemapxml = ''
+ if pagemapproc is not None:
+ pagemapxml = pagemapproc.generateKF8PageMapXML(k8proc)
+ outpm = os.path.join(files.k8oebps,'page-map.xml')
+ with open(pathof(outpm),'wb') as f:
+ f.write(pagemapxml.encode('utf-8'))
+ if DUMP:
+ print(pagemapproc.getNames())
+ print(pagemapproc.getOffsets())
+ print("\n\nPage Map")
+ print(pagemapxml)
+
+ # process the toc ncx
+ # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
+ print("Processing ncx / toc")
+ ncx = ncxExtract(mh, files)
+ ncx_data = ncx.parseNCX()
+ # extend the ncx data with filenames and proper internal idtags
+ for i in range(len(ncx_data)):
+ ncxmap = ncx_data[i]
+ [junk1, junk2, junk3, fid, junk4, off] = ncxmap['pos_fid'].split(':')
+ filename, idtag = k8proc.getIDTagByPosFid(fid, off)
+ ncxmap['filename'] = filename
+ ncxmap['idtag'] = unicode_str(idtag)
+ ncx_data[i] = ncxmap
+
+ # convert the rawML to a set of xhtml files
+ print("Building an epub-like structure")
+ htmlproc = XHTMLK8Processor(rscnames, k8proc)
+ usedmap = htmlproc.buildXHTML()
+
+ # write out the xhtml svg, and css files
+ # fileinfo = [skelid|coverpage, dir, name]
+ fileinfo = []
+ # first create a cover page if none exists
+ if CREATE_COVER_PAGE:
+ cover = CoverProcessor(files, metadata, rscnames)
+ cover_img = utf8_str(cover.getImageName())
+ need_to_create_cover_page = False
+ if cover_img is not None:
+ if k8resc is None or not k8resc.hasSpine():
+ part = k8proc.getPart(0)
+ if part.find(cover_img) == -1:
+ need_to_create_cover_page = True
+ else:
+ if "coverpage" not in k8resc.spine_idrefs:
+ part = k8proc.getPart(int(k8resc.spine_order[0]))
+ if part.find(cover_img) == -1:
+ k8resc.prepend_to_spine("coverpage", "inserted", "no", None)
+ if k8resc.spine_order[0] == "coverpage":
+ need_to_create_cover_page = True
+ if need_to_create_cover_page:
+ filename = cover.getXHTMLName()
+ fileinfo.append(["coverpage", 'Text', filename])
+ guidetext += cover.guide_toxml()
+ cover.writeXHTML()
+
+ n = k8proc.getNumberOfParts()
+ for i in range(n):
+ part = k8proc.getPart(i)
+ [skelnum, dir, filename, beg, end, aidtext] = k8proc.getPartInfo(i)
+ fileinfo.append([str(skelnum), dir, filename])
+ fname = os.path.join(files.k8oebps,dir,filename)
+ with open(pathof(fname),'wb') as f:
+ f.write(part)
+ n = k8proc.getNumberOfFlows()
+ for i in range(1, n):
+ [ptype, pformat, pdir, filename] = k8proc.getFlowInfo(i)
+ flowpart = k8proc.getFlow(i)
+ if pformat == b'file':
+ fileinfo.append([None, pdir, filename])
+ fname = os.path.join(files.k8oebps,pdir,filename)
+ with open(pathof(fname),'wb') as f:
+ f.write(flowpart)
+
+ # create the opf
+ opf = OPFProcessor(files, metadata.copy(), fileinfo, rscnames, True, mh, usedmap,
+ pagemapxml=pagemapxml, guidetext=guidetext, k8resc=k8resc, epubver=epubver)
+ uuid = opf.writeOPF(bool(obfuscate_data))
+
+ if opf.hasNCX():
+ # Create a toc.ncx.
+ ncx.writeK8NCX(ncx_data, metadata)
+ if opf.hasNAV():
+ # Create a navigation document.
+ nav = NAVProcessor(files)
+ nav.writeNAV(ncx_data, guidetext, metadata)
+
+ # make an epub-like structure of it all
+ print("Creating an epub-like file")
+ files.makeEPUB(usedmap, obfuscate_data, uuid)
+
+
+def processMobi7(mh, metadata, sect, files, rscnames):
+ global DUMP
+ global WRITE_RAW_DATA
+ # An original Mobi
+ rawML = mh.getRawML()
+ if DUMP or WRITE_RAW_DATA:
+ outraw = os.path.join(files.mobi7dir,files.getInputFileBasename() + '.rawml')
+ with open(pathof(outraw),'wb') as f:
+ f.write(rawML)
+
+ # process the toc ncx
+ # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
+ ncx = ncxExtract(mh, files)
+ ncx_data = ncx.parseNCX()
+ ncx.writeNCX(metadata)
+
+ positionMap = {}
+
+ # if Dictionary build up the positionMap
+ if mh.isDictionary():
+ if mh.DictInLanguage():
+ metadata['DictInLanguage'] = [mh.DictInLanguage()]
+ if mh.DictOutLanguage():
+ metadata['DictOutLanguage'] = [mh.DictOutLanguage()]
+ positionMap = dictSupport(mh, sect).getPositionMap()
+
+ # convert the rawml back to Mobi ml
+ proc = HTMLProcessor(files, metadata, rscnames)
+ srctext = proc.findAnchors(rawML, ncx_data, positionMap)
+ srctext, usedmap = proc.insertHREFS()
+
+ # write the proper mobi html
+ fileinfo=[]
+ # fname = files.getInputFileBasename() + '.html'
+ fname = 'book.html'
+ fileinfo.append([None,'', fname])
+ outhtml = os.path.join(files.mobi7dir, fname)
+ with open(pathof(outhtml), 'wb') as f:
+ f.write(srctext)
+
+ # extract guidetext from srctext
+    guidetext = b''
+ # no pagemap support for older mobis
+ # pagemapxml = None
+ guidematch = re.search(br'''<guide>(.*)</guide>''',srctext,re.IGNORECASE+re.DOTALL)
+ if guidematch:
+ guidetext = guidematch.group(1)
+        # sometimes the old mobi guide in srctext is horribly written so it needs cleaning up
+ guidetext = guidetext.replace(b"\r", b"")
+ guidetext = guidetext.replace(b'<REFERENCE', b'<reference')
+ guidetext = guidetext.replace(b' HREF=', b' href=')
+ guidetext = guidetext.replace(b' TITLE=', b' title=')
+ guidetext = guidetext.replace(b' TYPE=', b' type=')
+ # reference must be a self-closing tag
+ # and any href must be replaced with filepos information
+ ref_tag_pattern = re.compile(br'''(<reference [^>]*>)''', re.IGNORECASE)
+ guidepieces = ref_tag_pattern.split(guidetext)
+ for i in range(1,len(guidepieces), 2):
+ reftag = guidepieces[i]
+ # remove any href there now to replace with filepos
+ reftag = re.sub(br'''href\s*=[^'"]*['"][^'"]*['"]''',b'', reftag)
+ # make sure the reference tag ends properly
+ if not reftag.endswith(b"/>"):
+ reftag = reftag[0:-1] + b"/>"
+ guidepieces[i] = reftag
+ guidetext = b''.join(guidepieces)
+ replacetext = br'''href="'''+utf8_str(fileinfo[0][2])+ br'''#filepos\1"'''
+ guidetext = re.sub(br'''filepos=['"]{0,1}0*(\d+)['"]{0,1}''', replacetext, guidetext)
+ guidetext += b'\n'
+
+ if 'StartOffset' in metadata:
+ for value in metadata['StartOffset']:
+ if int(value) == 0xffffffff:
+ value = '0'
+ starting_offset = value
+ # get guide items from metadata
+ metaguidetext = b'<reference type="text" href="'+utf8_str(fileinfo[0][2])+b'#filepos'+utf8_str(starting_offset)+b'" />\n'
+ guidetext += metaguidetext
+
+ if isinstance(guidetext, binary_type):
+ guidetext = guidetext.decode(mh.codec)
+
+ # create an OPF
+ opf = OPFProcessor(files, metadata, fileinfo, rscnames, ncx.isNCX, mh, usedmap, guidetext=guidetext)
+ opf.writeOPF()
+
+
+def processUnknownSections(mh, sect, files, K8Boundary):
+ global DUMP
+ global TERMINATION_INDICATOR1
+ global TERMINATION_INDICATOR2
+ global TERMINATION_INDICATOR3
+ if DUMP:
+ print("Unpacking any remaining unknown records")
+ beg = mh.start
+ end = sect.num_sections
+ if beg < K8Boundary:
+ # then we're processing the first part of a combination file
+ end = K8Boundary
+ for i in range(beg, end):
+ if sect.sectiondescriptions[i] == "":
+ data = sect.loadSection(i)
+ type = data[0:4]
+ if type == TERMINATION_INDICATOR3:
+ description = "Termination Marker 3 Nulls"
+ elif type == TERMINATION_INDICATOR2:
+ description = "Termination Marker 2 Nulls"
+ elif type == TERMINATION_INDICATOR1:
+ description = "Termination Marker 1 Null"
+            elif type == b"INDX":
+ fname = "Unknown%05d_INDX.dat" % i
+ description = "Unknown INDX section"
+ if DUMP:
+ outname= os.path.join(files.outdir, fname)
+ with open(pathof(outname), 'wb') as f:
+ f.write(data)
+ print("Extracting %s: %s from section %d" % (description, fname, i))
+ description = description + ", extracting as %s" % fname
+ else:
+ fname = "unknown%05d.dat" % i
+ description = "Mysterious Section, first four bytes %s" % describe(data[0:4])
+ if DUMP:
+ outname= os.path.join(files.outdir, fname)
+ with open(pathof(outname), 'wb') as f:
+ f.write(data)
+ print("Extracting %s: %s from section %d" % (description, fname, i))
+ description = description + ", extracting as %s" % fname
+ sect.setsectiondescription(i, description)
+
+
+def process_all_mobi_headers(files, apnxfile, sect, mhlst, K8Boundary, k8only=False, epubver='2', use_hd=False):
+ global DUMP
+ global WRITE_RAW_DATA
+ rscnames = []
+ rsc_ptr = -1
+ k8resc = None
+ obfuscate_data = []
+ for mh in mhlst:
+ pagemapproc = None
+ if mh.isK8():
+ sect.setsectiondescription(mh.start,"KF8 Header")
+ mhname = os.path.join(files.outdir,"header_K8.dat")
+ print("Processing K8 section of book...")
+ elif mh.isPrintReplica():
+ sect.setsectiondescription(mh.start,"Print Replica Header")
+ mhname = os.path.join(files.outdir,"header_PR.dat")
+ print("Processing PrintReplica section of book...")
+ else:
+ if mh.version == 0:
+                sect.setsectiondescription(mh.start, "PalmDoc Header")
+ else:
+ sect.setsectiondescription(mh.start,"Mobipocket {0:d} Header".format(mh.version))
+ mhname = os.path.join(files.outdir,"header.dat")
+ print("Processing Mobipocket {0:d} section of book...".format(mh.version))
+
+ if DUMP:
+ # write out raw mobi header data
+ with open(pathof(mhname), 'wb') as f:
+ f.write(mh.header)
+
+ # process each mobi header
+ metadata = mh.getMetaData()
+ mh.describeHeader(DUMP)
+ if mh.isEncrypted():
+ raise unpackException('Book is encrypted')
+
+ pagemapproc = None
+
+ # first handle all of the different resource sections: images, resources, fonts, and etc
+ # build up a list of image names to use to postprocess the ebook
+
+ print("Unpacking images, resources, fonts, etc")
+ beg = mh.firstresource
+ end = sect.num_sections
+ if beg < K8Boundary:
+ # processing first part of a combination file
+ end = K8Boundary
+
+ # Not sure the try/except is necessary, but just in case
+ try:
+ thumb_offset = int(metadata.get('ThumbOffset', ['-1'])[0])
+ except:
+ thumb_offset = None
+
+ cover_offset = int(metadata.get('CoverOffset', ['-1'])[0])
+ if not CREATE_COVER_PAGE:
+ cover_offset = None
+
+ for i in range(beg, end):
+ data = sect.loadSection(i)
+ type = data[0:4]
+
+ # handle the basics first
+ if type in [b"FLIS", b"FCIS", b"FDST", b"DATP"]:
+ if DUMP:
+ fname = unicode_str(type) + "%05d" % i
+ if mh.isK8():
+ fname += "_K8"
+ fname += '.dat'
+ outname= os.path.join(files.outdir, fname)
+ with open(pathof(outname), 'wb') as f:
+ f.write(data)
+ print("Dumping section {0:d} type {1:s} to file {2:s} ".format(i,unicode_str(type),outname))
+ sect.setsectiondescription(i,"Type {0:s}".format(unicode_str(type)))
+ rscnames.append(None)
+ elif type == b"SRCS":
+ rscnames = processSRCS(i, files, rscnames, sect, data)
+ elif type == b"PAGE":
+ rscnames, pagemapproc = processPAGE(i, files, rscnames, sect, data, mh, pagemapproc)
+ elif type == b"CMET":
+ rscnames = processCMET(i, files, rscnames, sect, data)
+ elif type == b"FONT":
+ rscnames, obfuscate_data, rsc_ptr = processFONT(i, files, rscnames, sect, data, obfuscate_data, beg, rsc_ptr)
+ elif type == b"CRES":
+ rscnames, rsc_ptr = processCRES(i, files, rscnames, sect, data, beg, rsc_ptr, use_hd)
+ elif type == b"CONT":
+ rscnames = processCONT(i, files, rscnames, sect, data)
+ elif type == b"kind":
+ rscnames = processkind(i, files, rscnames, sect, data)
+ elif type == b'\xa0\xa0\xa0\xa0':
+ sect.setsectiondescription(i,"Empty_HD_Image/Resource_Placeholder")
+ rscnames.append(None)
+ rsc_ptr += 1
+ elif type == b"RESC":
+ rscnames, k8resc = processRESC(i, files, rscnames, sect, data, k8resc)
+ elif data == EOF_RECORD:
+ sect.setsectiondescription(i,"End Of File")
+ rscnames.append(None)
+ elif data[0:8] == b"BOUNDARY":
+ sect.setsectiondescription(i,"BOUNDARY Marker")
+ rscnames.append(None)
+ else:
+                # if we reached here it should be an image, otherwise treat it as unknown
+ rscnames, rsc_ptr = processImage(i, files, rscnames, sect, data, beg, rsc_ptr, cover_offset, thumb_offset)
+ # done unpacking resources
+
+ # Print Replica
+ if mh.isPrintReplica() and not k8only:
+ processPrintReplica(metadata, files, rscnames, mh)
+ continue
+
+ # KF8 (Mobi 8)
+ if mh.isK8():
+ processMobi8(mh, metadata, sect, files, rscnames, pagemapproc, k8resc, obfuscate_data, apnxfile, epubver)
+
+ # Old Mobi (Mobi 7)
+ elif not k8only:
+ processMobi7(mh, metadata, sect, files, rscnames)
+
+ # process any remaining unknown sections of the palm file
+ processUnknownSections(mh, sect, files, K8Boundary)
+
+ return
+
+
+def unpackBook(infile, outdir, apnxfile=None, epubver='2', use_hd=False, dodump=False, dowriteraw=False, dosplitcombos=False):
+ global DUMP
+ global WRITE_RAW_DATA
+ global SPLIT_COMBO_MOBIS
+ if DUMP or dodump:
+ DUMP = True
+ if WRITE_RAW_DATA or dowriteraw:
+ WRITE_RAW_DATA = True
+ if SPLIT_COMBO_MOBIS or dosplitcombos:
+ SPLIT_COMBO_MOBIS = True
+
+ infile = unicode_str(infile)
+ outdir = unicode_str(outdir)
+ if apnxfile is not None:
+ apnxfile = unicode_str(apnxfile)
+
+ files = fileNames(infile, outdir)
+
+ # process the PalmDoc database header and verify it is a mobi
+ sect = Sectionizer(infile)
+ if sect.ident != b'BOOKMOBI' and sect.ident != b'TEXtREAd':
+ raise unpackException('Invalid file format')
+ if DUMP:
+ sect.dumppalmheader()
+ else:
+ print("Palm DB type: %s, %d sections." % (sect.ident.decode('utf-8'),sect.num_sections))
+
+ # scan sections to see if this is a compound mobi file (K8 format)
+ # and build a list of all mobi headers to process.
+ mhlst = []
+ mh = MobiHeader(sect,0)
+ # if this is a mobi8-only file hasK8 here will be true
+ mhlst.append(mh)
+ K8Boundary = -1
+
+ if mh.isK8():
+ print("Unpacking a KF8 book...")
+ hasK8 = True
+ else:
+ # This is either a Mobipocket 7 or earlier, or a combi M7/KF8
+ # Find out which
+ hasK8 = False
+ for i in range(len(sect.sectionoffsets)-1):
+ before, after = sect.sectionoffsets[i:i+2]
+ if (after - before) == 8:
+ data = sect.loadSection(i)
+ if data == K8_BOUNDARY:
+ sect.setsectiondescription(i,"Mobi/KF8 Boundary Section")
+ mh = MobiHeader(sect,i+1)
+ hasK8 = True
+ mhlst.append(mh)
+ K8Boundary = i
+ break
+ if hasK8:
+ print("Unpacking a Combination M{0:d}/KF8 book...".format(mh.version))
+ if SPLIT_COMBO_MOBIS:
+ # if this is a combination mobi7-mobi8 file split them up
+ mobisplit = mobi_split(infile)
+ if mobisplit.combo:
+ outmobi7 = os.path.join(files.outdir, 'mobi7-'+files.getInputFileBasename() + '.mobi')
+ outmobi8 = os.path.join(files.outdir, 'mobi8-'+files.getInputFileBasename() + '.azw3')
+ with open(pathof(outmobi7), 'wb') as f:
+ f.write(mobisplit.getResult7())
+ with open(pathof(outmobi8), 'wb') as f:
+ f.write(mobisplit.getResult8())
+ else:
+ print("Unpacking a Mobipocket {0:d} book...".format(mh.version))
+
+ if hasK8:
+ files.makeK8Struct()
+
+ process_all_mobi_headers(files, apnxfile, sect, mhlst, K8Boundary, False, epubver, use_hd)
+
+ if DUMP:
+ sect.dumpsectionsinfo()
+ return
+
+
+def usage(progname):
+ print("")
+ print("Description:")
+ print(" Unpacks an unencrypted Kindle/MobiPocket ebook to html and images")
+ print(" or an unencrypted Kindle/Print Replica ebook to PDF and images")
+ print(" into the specified output folder.")
+ print("Usage:")
+ print(" %s -r -s -p apnxfile -d -h --epub_version= infile [outdir]" % progname)
+ print("Options:")
+ print(" -h print this help message")
+ print(" -i use HD Images, if present, to overwrite reduced resolution images")
+ print(" -s split combination mobis into mobi7 and mobi8 ebooks")
+ print(" -p APNXFILE path to an .apnx file associated with the azw3 input (optional)")
+ print(" --epub_version= specify epub version to unpack to: 2, 3, A (for automatic) or ")
+ print(" F (force to fit to epub2 definitions), default is 2")
+ print(" -d dump headers and other info to output and extra files")
+ print(" -r write raw data to the output folder")
+
+
+def main(argv=unicode_argv()):
+ global DUMP
+ global WRITE_RAW_DATA
+ global SPLIT_COMBO_MOBIS
+
+ print("KindleUnpack v0.83")
+ print(" Based on initial mobipocket version Copyright © 2009 Charles M. Hannum <root@ihack.net>")
+ print(" Extensive Extensions and Improvements Copyright © 2009-2020 ")
+ print(" by: P. Durrant, K. Hendricks, S. Siebert, fandrieu, DiapDealer, nickredding, tkeo.")
+ print(" This program is free software: you can redistribute it and/or modify")
+ print(" it under the terms of the GNU General Public License as published by")
+ print(" the Free Software Foundation, version 3.")
+
+ progname = os.path.basename(argv[0])
+ try:
+ opts, args = getopt.getopt(argv[1:], "dhirsp:", ['epub_version='])
+ except getopt.GetoptError as err:
+ print(str(err))
+ usage(progname)
+ sys.exit(2)
+
+ if len(args)<1:
+ usage(progname)
+ sys.exit(2)
+
+ apnxfile = None
+ epubver = '2'
+ use_hd = False
+
+ for o, a in opts:
+ if o == "-h":
+ usage(progname)
+ sys.exit(0)
+ if o == "-i":
+ use_hd = True
+ if o == "-d":
+ DUMP = True
+ if o == "-r":
+ WRITE_RAW_DATA = True
+ if o == "-s":
+ SPLIT_COMBO_MOBIS = True
+ if o == "-p":
+ apnxfile = a
+ if o == "--epub_version":
+ epubver = a
+
+ if len(args) > 1:
+ infile, outdir = args
+ else:
+ infile = args[0]
+ outdir = os.path.splitext(infile)[0]
+
+ infileext = os.path.splitext(infile)[1].upper()
+ if infileext not in ['.MOBI', '.PRC', '.AZW', '.AZW3', '.AZW4']:
+ print("Error: first parameter must be a Kindle/Mobipocket ebook or a Kindle/Print Replica ebook.")
+ return 1
+
+ try:
+ print('Unpacking Book...')
+ unpackBook(infile, outdir, apnxfile, epubver, use_hd)
+ print('Completed')
+
+ except ValueError as e:
+ print("Error: %s" % e)
+ print(traceback.format_exc())
+ return 1
+
+ return 0
+
+
+if __name__ == '__main__':
+ sys.exit(main())
diff --git a/src/epy_reader/tools/KindleUnpack/mobi_cover.py b/src/epy_reader/tools/KindleUnpack/mobi_cover.py
new file mode 100644
index 0000000..3078ac4
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/mobi_cover.py
@@ -0,0 +1,238 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+from .compatibility_utils import unicode_str
+
+from .unipath import pathof
+import os
+import imghdr
+
+import struct
+# note: struct pack, unpack, unpack_from all require bytestring format
+# data all the way up to at least python 2.7.5, python 3 okay with bytestring
+
+USE_SVG_WRAPPER = True
+""" Set to True to use svg wrapper for default. """
+
+FORCE_DEFAULT_TITLE = False
+""" Set to True to force to use the default title. """
+
+COVER_PAGE_FILENAME = 'cover_page.xhtml'
+""" The name for the cover page. """
+
+DEFAULT_TITLE = 'Cover'
+""" The default title for the cover page. """
+
+MAX_WIDTH = 4096
+""" The max width for the svg cover page. """
+
+MAX_HEIGHT = 4096
+""" The max height for the svg cover page. """
+
+
+def get_image_type(imgname, imgdata=None):
+ imgtype = unicode_str(imghdr.what(pathof(imgname), imgdata))
+
+ # imghdr only checks for JFIF or Exif JPEG files. Apparently, there are some
+ # with only the magic JPEG bytes out there...
+    # ImageMagick handles those, so do it here too.
+ if imgtype is None:
+ if imgdata is None:
+ with open(pathof(imgname), 'rb') as f:
+ imgdata = f.read()
+ if imgdata[0:2] == b'\xFF\xD8':
+ # Get last non-null bytes
+ last = len(imgdata)
+ while (imgdata[last-1:last] == b'\x00'):
+ last-=1
+ # Be extra safe, check the trailing bytes, too.
+ if imgdata[last-2:last] == b'\xFF\xD9':
+ imgtype = "jpeg"
+ return imgtype
+
+
+def get_image_size(imgname, imgdata=None):
+ '''Determine the image type of imgname (or imgdata) and return its size.
+
+    Originally "Determine the image type of fhandle and return its size."
+    (from draco)'''
+ if imgdata is None:
+ fhandle = open(pathof(imgname), 'rb')
+ head = fhandle.read(24)
+ else:
+ head = imgdata[0:24]
+ if len(head) != 24:
+ return
+
+ imgtype = get_image_type(imgname, imgdata)
+ if imgtype == 'png':
+ check = struct.unpack(b'>i', head[4:8])[0]
+ if check != 0x0d0a1a0a:
+ return
+ width, height = struct.unpack(b'>ii', head[16:24])
+ elif imgtype == 'gif':
+ width, height = struct.unpack(b'<HH', head[6:10])
+ elif imgtype == 'jpeg' and imgdata is None:
+ try:
+ fhandle.seek(0) # Read 0xff next
+ size = 2
+ ftype = 0
+ while not 0xc0 <= ftype <= 0xcf:
+ fhandle.seek(size, 1)
+ byte = fhandle.read(1)
+ while ord(byte) == 0xff:
+ byte = fhandle.read(1)
+ ftype = ord(byte)
+ size = struct.unpack(b'>H', fhandle.read(2))[0] - 2
+ # We are at a SOFn block
+ fhandle.seek(1, 1) # Skip `precision' byte.
+ height, width = struct.unpack(b'>HH', fhandle.read(4))
+ except Exception: # IGNORE:W0703
+ return
+ elif imgtype == 'jpeg' and imgdata is not None:
+ try:
+ pos = 0
+ size = 2
+ ftype = 0
+ while not 0xc0 <= ftype <= 0xcf:
+ pos += size
+ byte = imgdata[pos:pos+1]
+ pos += 1
+ while ord(byte) == 0xff:
+ byte = imgdata[pos:pos+1]
+ pos += 1
+ ftype = ord(byte)
+ size = struct.unpack(b'>H', imgdata[pos:pos+2])[0] - 2
+ pos += 2
+ # We are at a SOFn block
+ pos += 1 # Skip `precision' byte.
+ height, width = struct.unpack(b'>HH', imgdata[pos:pos+4])
+ pos += 4
+ except Exception: # IGNORE:W0703
+ return
+ else:
+ return
+ return width, height
+
+# XXX experimental
+class CoverProcessor(object):
+
+ """Create a cover page.
+
+ """
+ def __init__(self, files, metadata, rscnames, imgname=None, imgdata=None):
+ self.files = files
+ self.metadata = metadata
+ self.rscnames = rscnames
+        self.cover_page = COVER_PAGE_FILENAME
+ self.use_svg = USE_SVG_WRAPPER # Use svg wrapper.
+ self.lang = metadata.get('Language', ['en'])[0]
+        # This ensures that if the method used to find the cover image's
+        # dimensions fails for any reason, the SVG routine will not be used.
+ [self.width, self.height] = (-1,-1)
+ if FORCE_DEFAULT_TITLE:
+ self.title = DEFAULT_TITLE
+ else:
+ self.title = metadata.get('Title', [DEFAULT_TITLE])[0]
+
+ self.cover_image = None
+ if imgname is not None:
+ self.cover_image = imgname
+ elif 'CoverOffset' in metadata:
+ imageNumber = int(metadata['CoverOffset'][0])
+ cover_image = self.rscnames[imageNumber]
+ if cover_image is not None:
+ self.cover_image = cover_image
+ else:
+ print('Warning: Cannot identify the cover image.')
+ if self.use_svg:
+ try:
+ if imgdata is None:
+ fname = os.path.join(files.imgdir, self.cover_image)
+ [self.width, self.height] = get_image_size(fname)
+ else:
+ [self.width, self.height] = get_image_size(None, imgdata)
+            except Exception:
+ self.use_svg = False
+ width = self.width
+ height = self.height
+ if width < 0 or height < 0 or width > MAX_WIDTH or height > MAX_HEIGHT:
+ self.use_svg = False
+ return
+
+ def getImageName(self):
+ return self.cover_image
+
+ def getXHTMLName(self):
+ return self.cover_page
+
+ def buildXHTML(self):
+ print('Building a cover page.')
+ files = self.files
+ cover_image = self.cover_image
+ title = self.title
+ lang = self.lang
+
+ image_dir = os.path.normpath(os.path.relpath(files.k8images, files.k8text))
+ image_path = os.path.join(image_dir, cover_image).replace('\\', '/')
+
+ if not self.use_svg:
+ data = ''
+ data += '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html>'
+ data += '<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops"'
+ data += ' xml:lang="{:s}">\n'.format(lang)
+ data += '<head>\n<title>{:s}</title>\n'.format(title)
+ data += '<style type="text/css">\n'
+ data += 'body {\n margin: 0;\n padding: 0;\n text-align: center;\n}\n'
+ data += 'div {\n height: 100%;\n width: 100%;\n text-align: center;\n page-break-inside: avoid;\n}\n'
+ data += 'img {\n display: inline-block;\n height: 100%;\n margin: 0 auto;\n}\n'
+ data += '</style>\n</head>\n'
+ data += '<body><div>\n'
+ data += ' <img src="{:s}" alt=""/>\n'.format(image_path)
+ data += '</div></body>\n</html>'
+ else:
+ width = self.width
+ height = self.height
+ viewBox = "0 0 {0:d} {1:d}".format(width, height)
+
+ data = ''
+ data += '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html>'
+ data += '<html xmlns="http://www.w3.org/1999/xhtml"'
+ data += ' xml:lang="{:s}">\n'.format(lang)
+ data += '<head>\n <title>{:s}</title>\n'.format(title)
+ data += '<style type="text/css">\n'
+ data += 'svg {padding: 0pt; margin:0pt}\n'
+ data += 'body { text-align: center; padding:0pt; margin: 0pt; }\n'
+ data += '</style>\n</head>\n'
+ data += '<body>\n <div>\n'
+ data += ' <svg xmlns="http://www.w3.org/2000/svg" height="100%" preserveAspectRatio="xMidYMid meet"'
+ data += ' version="1.1" viewBox="{0:s}" width="100%" xmlns:xlink="http://www.w3.org/1999/xlink">\n'.format(viewBox)
+ data += ' <image height="{0}" width="{1}" xlink:href="{2}"/>\n'.format(height, width, image_path)
+ data += ' </svg>\n'
+ data += ' </div>\n</body>\n</html>'
+ return data
+
+ def writeXHTML(self):
+ files = self.files
+ cover_page = self.cover_page
+
+ data = self.buildXHTML()
+
+ outfile = os.path.join(files.k8text, cover_page)
+ if os.path.exists(pathof(outfile)):
+ print('Warning: {:s} already exists.'.format(cover_page))
+ os.remove(pathof(outfile))
+ with open(pathof(outfile), 'wb') as f:
+ f.write(data.encode('utf-8'))
+ return
+
+ def guide_toxml(self):
+ files = self.files
+ text_dir = os.path.relpath(files.k8text, files.k8oebps)
+ data = '<reference type="cover" title="Cover" href="{:s}/{:s}" />\n'.format(
+ text_dir, self.cover_page)
+ return data
diff --git a/src/epy_reader/tools/KindleUnpack/mobi_dict.py b/src/epy_reader/tools/KindleUnpack/mobi_dict.py
new file mode 100644
index 0000000..bfc2ea8
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/mobi_dict.py
@@ -0,0 +1,377 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+from .compatibility_utils import PY2, PY3, utf8_str, bstr, bchr
+
+if PY2:
+ range = xrange
+ array_format = b'B'
+if PY3:
+ unichr = chr
+ array_format = "B"
+
+import array
+
+import struct
+# note: struct pack, unpack, unpack_from all require bytestring format
+# data all the way up to at least python 2.7.5, python 3 okay with bytestring
+
+from .mobi_index import getVariableWidthValue, readTagSection, getTagMap
+from .mobi_utils import toHex
+
+DEBUG_DICT = False
+
+class InflectionData(object):
+
+ def __init__(self, infldatas):
+ self.infldatas = infldatas
+ self.starts = []
+ self.counts = []
+ for idata in self.infldatas:
+ start, = struct.unpack_from(b'>L', idata, 0x14)
+ count, = struct.unpack_from(b'>L', idata, 0x18)
+ self.starts.append(start)
+ self.counts.append(count)
+
+ def lookup(self, lookupvalue):
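+        # Worked example: with counts [100, 50], a lookupvalue of 120 walks
+        # past the first section (120 - 100 = 20) and resolves to rvalue 20
+        # in the second section.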
+ i = 0
+ rvalue = lookupvalue
+ while rvalue >= self.counts[i]:
+ rvalue = rvalue - self.counts[i]
+ i += 1
+ if i == len(self.counts):
+ print("Error: Problem with multiple inflections data sections")
+ return lookupvalue, self.starts[0], self.counts[0], self.infldatas[0]
+ return rvalue, self.starts[i], self.counts[i], self.infldatas[i]
+
+ def offsets(self, value):
+ rvalue, start, count, data = self.lookup(value)
+ offset, = struct.unpack_from(b'>H', data, start + 4 + (2 * rvalue))
+ if rvalue + 1 < count:
+ nextOffset, = struct.unpack_from(b'>H',data, start + 4 + (2 * (rvalue + 1)))
+ else:
+ nextOffset = None
+ return offset, nextOffset, data
+
+
+class dictSupport(object):
+
+ def __init__(self, mh, sect):
+ self.mh = mh
+ self.header = mh.header
+ self.sect = sect
+ self.metaOrthIndex = mh.metaOrthIndex
+ self.metaInflIndex = mh.metaInflIndex
+
+ def parseHeader(self, data):
+ "read INDX header"
+ if not data[:4] == b'INDX':
+ print("Warning: index section is not INDX")
+ return False
+ words = (
+ 'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
+ 'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc'
+ )
+ num = len(words)
+ values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)])
+ header = {}
+ for n in range(num):
+ header[words[n]] = values[n]
+
+ ordt1 = None
+ ordt2 = None
+
+ otype, oentries, op1, op2, otagx = struct.unpack_from(b'>LLLLL',data, 0xa4)
+ header['otype'] = otype
+ header['oentries'] = oentries
+
+ if DEBUG_DICT:
+ print("otype %d, oentries %d, op1 %d, op2 %d, otagx %d" % (otype, oentries, op1, op2, otagx))
+
+ if header['code'] == 0xfdea or oentries > 0:
+            # some dictionaries seem to be codepage 65002 (0xFDEA), which seems
+            # to be some sort of strange EBCDIC utf-8 or utf-16 encoded strings,
+            # so we need to look for them and store them away to process leading text.
+            # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries;
+            # we only ever seem to use the second, but ...
+            #
+            # if otype = 0, the ORDT table uses 16 bit values as offsets into the table
+            # if otype = 1, the ORDT table uses 8 bit values as offsets into the table
+
+ assert(data[op1:op1+4] == b'ORDT')
+ assert(data[op2:op2+4] == b'ORDT')
+ ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4)
+ ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4)
+
+ if DEBUG_DICT:
+ print("parsed INDX header:")
+ for key in header:
+ print(key, "%x" % header[key],)
+ print("\n")
+ return header, ordt1, ordt2
+
+ def getPositionMap(self):
+ sect = self.sect
+
+ positionMap = {}
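+        # positionMap maps a byte offset in the rawML text to the markup
+        # bytes that should be injected at that offset.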
+
+ metaOrthIndex = self.metaOrthIndex
+ metaInflIndex = self.metaInflIndex
+
+ decodeInflection = True
+ if metaOrthIndex != 0xFFFFFFFF:
+ print("Info: Document contains orthographic index, handle as dictionary")
+ if metaInflIndex == 0xFFFFFFFF:
+ decodeInflection = False
+ else:
+ metaInflIndexData = sect.loadSection(metaInflIndex)
+
+ print("\nParsing metaInflIndexData")
+ midxhdr, mhordt1, mhordt2 = self.parseHeader(metaInflIndexData)
+
+ metaIndexCount = midxhdr['count']
+ idatas = []
+ for j in range(metaIndexCount):
+ idatas.append(sect.loadSection(metaInflIndex + 1 + j))
+ dinfl = InflectionData(idatas)
+
+ inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount)
+ tagSectionStart = midxhdr['len']
+ inflectionControlByteCount, inflectionTagTable = readTagSection(tagSectionStart, metaInflIndexData)
+ if DEBUG_DICT:
+ print("inflectionTagTable: %s" % inflectionTagTable)
+ if self.hasTag(inflectionTagTable, 0x07):
+ print("Error: Dictionary uses obsolete inflection rule scheme which is not yet supported")
+ decodeInflection = False
+
+ data = sect.loadSection(metaOrthIndex)
+
+ print("\nParsing metaOrthIndex")
+ idxhdr, hordt1, hordt2 = self.parseHeader(data)
+
+ tagSectionStart = idxhdr['len']
+ controlByteCount, tagTable = readTagSection(tagSectionStart, data)
+ orthIndexCount = idxhdr['count']
+ print("orthIndexCount is", orthIndexCount)
+ if DEBUG_DICT:
+ print("orthTagTable: %s" % tagTable)
+ if hordt2 is not None:
+ print("orth entry uses ordt2 lookup table of type ", idxhdr['otype'])
+ hasEntryLength = self.hasTag(tagTable, 0x02)
+ if not hasEntryLength:
+ print("Info: Index doesn't contain entry length tags")
+
+ print("Read dictionary index data")
+ for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount):
+ data = sect.loadSection(i)
+ hdrinfo, ordt1, ordt2 = self.parseHeader(data)
+ idxtPos = hdrinfo['start']
+ entryCount = hdrinfo['count']
+ idxPositions = []
+ for j in range(entryCount):
+ pos, = struct.unpack_from(b'>H', data, idxtPos + 4 + (2 * j))
+ idxPositions.append(pos)
+ # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
+ idxPositions.append(idxtPos)
+ for j in range(entryCount):
+ startPos = idxPositions[j]
+ endPos = idxPositions[j+1]
+ textLength = ord(data[startPos:startPos+1])
+ text = data[startPos+1:startPos+1+textLength]
+ if hordt2 is not None:
+ utext = u""
+ if idxhdr['otype'] == 0:
+ pattern = b'>H'
+ inc = 2
+ else:
+ pattern = b'>B'
+ inc = 1
+ pos = 0
+ while pos < textLength:
+ off, = struct.unpack_from(pattern, text, pos)
+ if off < len(hordt2):
+ utext += unichr(hordt2[off])
+ else:
+ utext += unichr(off)
+ pos += inc
+ text = utext.encode('utf-8')
+
+ tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos)
+ if 0x01 in tagMap:
+ if decodeInflection and 0x2a in tagMap:
+ inflectionGroups = self.getInflectionGroups(text, inflectionControlByteCount, inflectionTagTable,
+ dinfl, inflNameData, tagMap[0x2a])
+ else:
+ inflectionGroups = b''
+ assert len(tagMap[0x01]) == 1
+ entryStartPosition = tagMap[0x01][0]
+ if hasEntryLength:
+ # The idx:entry attribute "scriptable" must be present to create entry length tags.
+ ml = b'<idx:entry scriptable="yes"><idx:orth value="' + text + b'">' + inflectionGroups + b'</idx:orth>'
+ if entryStartPosition in positionMap:
+ positionMap[entryStartPosition] = positionMap[entryStartPosition] + ml
+ else:
+ positionMap[entryStartPosition] = ml
+ assert len(tagMap[0x02]) == 1
+ entryEndPosition = entryStartPosition + tagMap[0x02][0]
+ if entryEndPosition in positionMap:
+ positionMap[entryEndPosition] = b"</idx:entry>" + positionMap[entryEndPosition]
+ else:
+ positionMap[entryEndPosition] = b"</idx:entry>"
+
+ else:
+ indexTags = b'<idx:entry>\n<idx:orth value="' + text + b'">\n' + inflectionGroups + b'</idx:entry>\n'
+ if entryStartPosition in positionMap:
+ positionMap[entryStartPosition] = positionMap[entryStartPosition] + indexTags
+ else:
+ positionMap[entryStartPosition] = indexTags
+ return positionMap
+
+ def hasTag(self, tagTable, tag):
+ '''
+ Test if tag table contains given tag.
+
+ @param tagTable: The tag table.
+ @param tag: The tag to search.
+ @return: True if tag table contains given tag; False otherwise.
+ '''
+ for currentTag, _, _, _ in tagTable:
+ if currentTag == tag:
+ return True
+ return False
+
+ def getInflectionGroups(self, mainEntry, controlByteCount, tagTable, dinfl, inflectionNames, groupList):
+ '''
+ Create string which contains the inflection groups with inflection rules as mobipocket tags.
+
+ @param mainEntry: The word to inflect.
+ @param controlByteCount: The number of control bytes.
+ @param tagTable: The tag table.
+ @param data: The Inflection data object to properly select the right inflection data section to use
+ @param inflectionNames: The inflection rule name data.
+ @param groupList: The list of inflection groups to process.
+ @return: String with inflection groups and rules or empty string if required tags are not available.
+ '''
+ result = b""
+ for value in groupList:
+ offset, nextOffset, data = dinfl.offsets(value)
+
+ # First byte seems to be always 0x00 and must be skipped.
+ assert ord(data[offset:offset+1]) == 0x00
+ tagMap = getTagMap(controlByteCount, tagTable, data, offset + 1, nextOffset)
+
+ # Make sure that the required tags are available.
+ if 0x05 not in tagMap:
+ print("Error: Required tag 0x05 not found in tagMap")
+ return ""
+ if 0x1a not in tagMap:
+ print("Error: Required tag 0x1a not found in tagMap")
+ return b''
+
+ result += b'<idx:infl>'
+
+ for i in range(len(tagMap[0x05])):
+
+ # Get name of inflection rule.
+ value = tagMap[0x05][i]
+ consumed, textLength = getVariableWidthValue(inflectionNames, value)
+ inflectionName = inflectionNames[value+consumed:value+consumed+textLength]
+
+ # Get and apply inflection rule across possibly multiple inflection data sections
+ value = tagMap[0x1a][i]
+ rvalue, start, count, data = dinfl.lookup(value)
+ offset, = struct.unpack_from(b'>H', data, start + 4 + (2 * rvalue))
+ textLength = ord(data[offset:offset+1])
+ inflection = self.applyInflectionRule(mainEntry, data, offset+1, offset+1+textLength)
+ if inflection is not None:
+ result += b' <idx:iform name="' + inflectionName + b'" value="' + inflection + b'"/>'
+
+ result += b'</idx:infl>'
+ return result
+
+ def applyInflectionRule(self, mainEntry, inflectionRuleData, start, end):
+ '''
+ Apply inflection rule.
+
+ @param mainEntry: The word to inflect.
+ @param inflectionRuleData: The inflection rules.
+ @param start: The start position of the inflection rule to use.
+ @param end: The end position of the inflection rule to use.
+ @return: The string with the inflected word or None if an error occurs.
+ '''
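+        # Illustrative example: with mainEntry b'walked' and rule bytes
+        # b'\x03de', mode 0x03 (delete at word end) pops b'd' then b'e',
+        # returning b'walk'.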
+ mode = -1
+ byteArray = array.array(array_format, mainEntry)
+ position = len(byteArray)
+ for charOffset in range(start, end):
+ char = inflectionRuleData[charOffset:charOffset+1]
+ abyte = ord(char)
+ if abyte >= 0x0a and abyte <= 0x13:
+ # Move cursor backwards
+ offset = abyte - 0x0a
+ if mode not in [0x02, 0x03]:
+ mode = 0x02
+ position = len(byteArray)
+ position -= offset
+ elif abyte > 0x13:
+ if mode == -1:
+ print("Error: Unexpected first byte %i of inflection rule" % abyte)
+ return None
+ elif position == -1:
+ print("Error: Unexpected first byte %i of inflection rule" % abyte)
+ return None
+ else:
+ if mode == 0x01:
+ # Insert at word start
+ byteArray.insert(position, abyte)
+ position += 1
+ elif mode == 0x02:
+ # Insert at word end
+ byteArray.insert(position, abyte)
+ elif mode == 0x03:
+ # Delete at word end
+ position -= 1
+ deleted = byteArray.pop(position)
+ if bchr(deleted) != char:
+ if DEBUG_DICT:
+ print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted)))
+ print("Error: Delete operation of inflection rule failed")
+ return None
+ elif mode == 0x04:
+ # Delete at word start
+ deleted = byteArray.pop(position)
+ if bchr(deleted) != char:
+ if DEBUG_DICT:
+ print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted)))
+ print("Error: Delete operation of inflection rule failed")
+ return None
+ else:
+ print("Error: Inflection rule mode %x is not implemented" % mode)
+ return None
+ elif abyte == 0x01:
+ # Insert at word start
+ if mode not in [0x01, 0x04]:
+ position = 0
+ mode = abyte
+ elif abyte == 0x02:
+ # Insert at word end
+ if mode not in [0x02, 0x03]:
+ position = len(byteArray)
+ mode = abyte
+ elif abyte == 0x03:
+ # Delete at word end
+ if mode not in [0x02, 0x03]:
+ position = len(byteArray)
+ mode = abyte
+ elif abyte == 0x04:
+ # Delete at word start
+ if mode not in [0x01, 0x04]:
+ position = 0
+ # Delete at word start
+ mode = abyte
+ else:
+ print("Error: Inflection rule mode %x is not implemented" % abyte)
+ return None
+        # array.tostring() was removed in Python 3.9; use tobytes() on PY3
+        return utf8_str(byteArray.tobytes() if PY3 else byteArray.tostring())
diff --git a/src/epy_reader/tools/KindleUnpack/mobi_header.py b/src/epy_reader/tools/KindleUnpack/mobi_header.py
new file mode 100644
index 0000000..a15f636
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/mobi_header.py
@@ -0,0 +1,936 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+DEBUG_USE_ORDERED_DICTIONARY = False  # OrderedDict is supported >= python 2.7.
+""" Set to True to use OrderedDict for MobiHeader.metadata. """
+
+if DEBUG_USE_ORDERED_DICTIONARY:
+ from collections import OrderedDict as dict_
+else:
+ dict_ = dict
+
+from .compatibility_utils import PY2, unicode_str, hexlify, bord
+
+if PY2:
+ range = xrange
+
+import struct
+import uuid
+
+# import the mobiunpack support libraries
+from .mobi_utils import getLanguage
+from .mobi_uncompress import HuffcdicReader, PalmdocReader, UncompressedReader
+
+class unpackException(Exception):
+ pass
+
+
+def sortedHeaderKeys(mheader):
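+    # order the header keys by their byte offset (the first tuple element)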
+ hdrkeys = sorted(list(mheader.keys()), key=lambda akey: mheader[akey][0])
+ return hdrkeys
+
+
+# HD Containers have their own headers and their own EXTH
+# this is just guesswork so far, making big assumption that
+# metavalue key numbers remain the same in the CONT EXTH
+
+# Note: The layout of the CONT Header is still unknown
+# so just deal with their EXTH sections for now
+
+def dump_contexth(cpage, extheader):
+ # determine text encoding
+ codec = 'windows-1252'
+ codec_map = {
+ 1252 : 'windows-1252',
+ 65001: 'utf-8',
+ }
+ if cpage in codec_map:
+ codec = codec_map[cpage]
+ if extheader == b'':
+ return
+ id_map_strings = {
+ 1 : 'Drm Server Id',
+ 2 : 'Drm Commerce Id',
+ 3 : 'Drm Ebookbase Book Id',
+ 4 : 'Drm Ebookbase Dep Id',
+ 100 : 'Creator',
+ 101 : 'Publisher',
+ 102 : 'Imprint',
+ 103 : 'Description',
+ 104 : 'ISBN',
+ 105 : 'Subject',
+ 106 : 'Published',
+ 107 : 'Review',
+ 108 : 'Contributor',
+ 109 : 'Rights',
+ 110 : 'SubjectCode',
+ 111 : 'Type',
+ 112 : 'Source',
+ 113 : 'ASIN',
+ # 114 : 'versionNumber',
+ 117 : 'Adult',
+ 118 : 'Retail-Price',
+ 119 : 'Retail-Currency',
+ 120 : 'TSC',
+ 122 : 'fixed-layout',
+ 123 : 'book-type',
+ 124 : 'orientation-lock',
+ 126 : 'original-resolution',
+ 127 : 'zero-gutter',
+ 128 : 'zero-margin',
+ 129 : 'MetadataResourceURI',
+ 132 : 'RegionMagnification',
+ 150 : 'LendingEnabled',
+ 200 : 'DictShortName',
+ 501 : 'cdeType',
+ 502 : 'last_update_time',
+ 503 : 'Updated_Title',
+ 504 : 'CDEContentKey',
+ 505 : 'AmazonContentReference',
+ 506 : 'Title-Language',
+ 507 : 'Title-Display-Direction',
+ 508 : 'Title-Pronunciation',
+ 509 : 'Title-Collation',
+ 510 : 'Secondary-Title',
+ 511 : 'Secondary-Title-Language',
+ 512 : 'Secondary-Title-Direction',
+ 513 : 'Secondary-Title-Pronunciation',
+ 514 : 'Secondary-Title-Collation',
+ 515 : 'Author-Language',
+ 516 : 'Author-Display-Direction',
+ 517 : 'Author-Pronunciation',
+ 518 : 'Author-Collation',
+ 519 : 'Author-Type',
+ 520 : 'Publisher-Language',
+ 521 : 'Publisher-Display-Direction',
+ 522 : 'Publisher-Pronunciation',
+ 523 : 'Publisher-Collation',
+ 524 : 'Content-Language-Tag',
+ 525 : 'primary-writing-mode',
+ 526 : 'NCX-Ingested-By-Software',
+ 527 : 'page-progression-direction',
+ 528 : 'override-kindle-fonts',
+ 529 : 'Compression-Upgraded',
+ 530 : 'Soft-Hyphens-In-Content',
+ 531 : 'Dictionary_In_Langague',
+ 532 : 'Dictionary_Out_Language',
+ 533 : 'Font_Converted',
+ 534 : 'Amazon_Creator_Info',
+ 535 : 'Creator-Build-Tag',
+ 536 : 'HD-Media-Containers-Info', # CONT_Header is 0, Ends with CONTAINER_BOUNDARY (or Asset_Type?)
+ 538 : 'Resource-Container-Fidelity',
+ 539 : 'HD-Container-Mimetype',
+ 540 : 'Sample-For_Special-Purpose',
+ 541 : 'Kindletool-Operation-Information',
+ 542 : 'Container_Id',
+ 543 : 'Asset-Type', # FONT_CONTAINER, BW_CONTAINER, HD_CONTAINER
+ 544 : 'Unknown_544',
+ }
+ id_map_values = {
+ 114 : 'versionNumber',
+ 115 : 'sample',
+ 116 : 'StartOffset',
+ 121 : 'Mobi8-Boundary-Section',
+ 125 : 'Embedded-Record-Count',
+ 130 : 'Offline-Sample',
+ 131 : 'Metadata-Record-Offset',
+ 201 : 'CoverOffset',
+ 202 : 'ThumbOffset',
+ 203 : 'HasFakeCover',
+ 204 : 'Creator-Software',
+ 205 : 'Creator-Major-Version',
+ 206 : 'Creator-Minor-Version',
+ 207 : 'Creator-Build-Number',
+ 401 : 'Clipping-Limit',
+ 402 : 'Publisher-Limit',
+ 404 : 'Text-to-Speech-Disabled',
+ 406 : 'Rental-Expiration-Time',
+ }
+ id_map_hexstrings = {
+ 208 : 'Watermark_(hex)',
+ 209 : 'Tamper-Proof-Keys_(hex)',
+ 300 : 'Font-Signature_(hex)',
+ 403 : 'Unknown_(403)_(hex)',
+ 405 : 'Ownership-Type_(hex)',
+ 407 : 'Unknown_(407)_(hex)',
+ 420 : 'Multimedia-Content-Reference_(hex)',
+ 450 : 'Locations_Match_(hex)',
+ 451 : 'Full-Story-Length_(hex)',
+ 452 : 'Sample-Start_Location_(hex)',
+ 453 : 'Sample-End-Location_(hex)',
+ }
+ _length, num_items = struct.unpack(b'>LL', extheader[4:12])
+ extheader = extheader[12:]
+ pos = 0
+ for _ in range(num_items):
+ id, size = struct.unpack(b'>LL', extheader[pos:pos+8])
+ content = extheader[pos + 8: pos + size]
+ if id in id_map_strings:
+ name = id_map_strings[id]
+ print('\n Key: "%s"\n Value: "%s"' % (name, content.decode(codec, errors='replace')))
+ elif id in id_map_values:
+ name = id_map_values[id]
+ if size == 9:
+ value, = struct.unpack(b'B',content)
+ print('\n Key: "%s"\n Value: 0x%01x' % (name, value))
+ elif size == 10:
+ value, = struct.unpack(b'>H',content)
+ print('\n Key: "%s"\n Value: 0x%02x' % (name, value))
+ elif size == 12:
+ value, = struct.unpack(b'>L',content)
+ print('\n Key: "%s"\n Value: 0x%04x' % (name, value))
+ else:
+ print("\nError: Value for %s has unexpected size of %s" % (name, size))
+ elif id in id_map_hexstrings:
+ name = id_map_hexstrings[id]
+ print('\n Key: "%s"\n Value: 0x%s' % (name, hexlify(content)))
+ else:
+ print("\nWarning: Unknown metadata with id %s found" % id)
+ name = str(id) + ' (hex)'
+ print(' Key: "%s"\n Value: 0x%s' % (name, hexlify(content)))
+ pos += size
+ return
+
+
+class MobiHeader:
+ # all values are packed in big endian format
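+    # each entry maps a field name to (byte offset, struct format, length)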
+ palmdoc_header = {
+ 'compression_type' : (0x00, b'>H', 2),
+ 'fill0' : (0x02, b'>H', 2),
+ 'text_length' : (0x04, b'>L', 4),
+ 'text_records' : (0x08, b'>H', 2),
+ 'max_section_size' : (0x0a, b'>H', 2),
+ 'read_pos ' : (0x0c, b'>L', 4),
+ }
+
+ mobi6_header = {
+ 'compression_type' : (0x00, b'>H', 2),
+ 'fill0' : (0x02, b'>H', 2),
+ 'text_length' : (0x04, b'>L', 4),
+ 'text_records' : (0x08, b'>H', 2),
+ 'max_section_size' : (0x0a, b'>H', 2),
+ 'crypto_type' : (0x0c, b'>H', 2),
+ 'fill1' : (0x0e, b'>H', 2),
+ 'magic' : (0x10, b'4s', 4),
+ 'header_length (from MOBI)' : (0x14, b'>L', 4),
+ 'type' : (0x18, b'>L', 4),
+ 'codepage' : (0x1c, b'>L', 4),
+ 'unique_id' : (0x20, b'>L', 4),
+ 'version' : (0x24, b'>L', 4),
+ 'metaorthindex' : (0x28, b'>L', 4),
+ 'metainflindex' : (0x2c, b'>L', 4),
+ 'index_names' : (0x30, b'>L', 4),
+ 'index_keys' : (0x34, b'>L', 4),
+ 'extra_index0' : (0x38, b'>L', 4),
+ 'extra_index1' : (0x3c, b'>L', 4),
+ 'extra_index2' : (0x40, b'>L', 4),
+ 'extra_index3' : (0x44, b'>L', 4),
+ 'extra_index4' : (0x48, b'>L', 4),
+ 'extra_index5' : (0x4c, b'>L', 4),
+ 'first_nontext' : (0x50, b'>L', 4),
+ 'title_offset' : (0x54, b'>L', 4),
+ 'title_length' : (0x58, b'>L', 4),
+ 'language_code' : (0x5c, b'>L', 4),
+ 'dict_in_lang' : (0x60, b'>L', 4),
+ 'dict_out_lang' : (0x64, b'>L', 4),
+ 'min_version' : (0x68, b'>L', 4),
+ 'first_resc_offset' : (0x6c, b'>L', 4),
+ 'huff_offset' : (0x70, b'>L', 4),
+ 'huff_num' : (0x74, b'>L', 4),
+ 'huff_tbl_offset' : (0x78, b'>L', 4),
+ 'huff_tbl_len' : (0x7c, b'>L', 4),
+ 'exth_flags' : (0x80, b'>L', 4),
+ 'fill3_a' : (0x84, b'>L', 4),
+ 'fill3_b' : (0x88, b'>L', 4),
+ 'fill3_c' : (0x8c, b'>L', 4),
+ 'fill3_d' : (0x90, b'>L', 4),
+ 'fill3_e' : (0x94, b'>L', 4),
+ 'fill3_f' : (0x98, b'>L', 4),
+ 'fill3_g' : (0x9c, b'>L', 4),
+ 'fill3_h' : (0xa0, b'>L', 4),
+ 'unknown0' : (0xa4, b'>L', 4),
+ 'drm_offset' : (0xa8, b'>L', 4),
+ 'drm_count' : (0xac, b'>L', 4),
+ 'drm_size' : (0xb0, b'>L', 4),
+ 'drm_flags' : (0xb4, b'>L', 4),
+ 'fill4_a' : (0xb8, b'>L', 4),
+ 'fill4_b' : (0xbc, b'>L', 4),
+ 'first_content' : (0xc0, b'>H', 2),
+ 'last_content' : (0xc2, b'>H', 2),
+ 'unknown0' : (0xc4, b'>L', 4),
+ 'fcis_offset' : (0xc8, b'>L', 4),
+ 'fcis_count' : (0xcc, b'>L', 4),
+ 'flis_offset' : (0xd0, b'>L', 4),
+ 'flis_count' : (0xd4, b'>L', 4),
+ 'unknown1' : (0xd8, b'>L', 4),
+ 'unknown2' : (0xdc, b'>L', 4),
+ 'srcs_offset' : (0xe0, b'>L', 4),
+ 'srcs_count' : (0xe4, b'>L', 4),
+ 'unknown3' : (0xe8, b'>L', 4),
+ 'unknown4' : (0xec, b'>L', 4),
+ 'fill5' : (0xf0, b'>H', 2),
+ 'traildata_flags' : (0xf2, b'>H', 2),
+ 'ncx_index' : (0xf4, b'>L', 4),
+ 'unknown5' : (0xf8, b'>L', 4),
+ 'unknown6' : (0xfc, b'>L', 4),
+ 'datp_offset' : (0x100, b'>L', 4),
+ 'unknown7' : (0x104, b'>L', 4),
+ 'Unknown ' : (0x108, b'>L', 4),
+ 'Unknown ' : (0x10C, b'>L', 4),
+ 'Unknown ' : (0x110, b'>L', 4),
+ 'Unknown ' : (0x114, b'>L', 4),
+ 'Unknown ' : (0x118, b'>L', 4),
+ 'Unknown ' : (0x11C, b'>L', 4),
+ 'Unknown ' : (0x120, b'>L', 4),
+ 'Unknown ' : (0x124, b'>L', 4),
+ 'Unknown ' : (0x128, b'>L', 4),
+ 'Unknown ' : (0x12C, b'>L', 4),
+ 'Unknown ' : (0x130, b'>L', 4),
+ 'Unknown ' : (0x134, b'>L', 4),
+ 'Unknown ' : (0x138, b'>L', 4),
+ 'Unknown ' : (0x11C, b'>L', 4),
+ }
+
+ mobi8_header = {
+ 'compression_type' : (0x00, b'>H', 2),
+ 'fill0' : (0x02, b'>H', 2),
+ 'text_length' : (0x04, b'>L', 4),
+ 'text_records' : (0x08, b'>H', 2),
+ 'max_section_size' : (0x0a, b'>H', 2),
+ 'crypto_type' : (0x0c, b'>H', 2),
+ 'fill1' : (0x0e, b'>H', 2),
+ 'magic' : (0x10, b'4s', 4),
+ 'header_length (from MOBI)' : (0x14, b'>L', 4),
+ 'type' : (0x18, b'>L', 4),
+ 'codepage' : (0x1c, b'>L', 4),
+ 'unique_id' : (0x20, b'>L', 4),
+ 'version' : (0x24, b'>L', 4),
+ 'metaorthindex' : (0x28, b'>L', 4),
+ 'metainflindex' : (0x2c, b'>L', 4),
+ 'index_names' : (0x30, b'>L', 4),
+ 'index_keys' : (0x34, b'>L', 4),
+ 'extra_index0' : (0x38, b'>L', 4),
+ 'extra_index1' : (0x3c, b'>L', 4),
+ 'extra_index2' : (0x40, b'>L', 4),
+ 'extra_index3' : (0x44, b'>L', 4),
+ 'extra_index4' : (0x48, b'>L', 4),
+ 'extra_index5' : (0x4c, b'>L', 4),
+ 'first_nontext' : (0x50, b'>L', 4),
+ 'title_offset' : (0x54, b'>L', 4),
+ 'title_length' : (0x58, b'>L', 4),
+ 'language_code' : (0x5c, b'>L', 4),
+ 'dict_in_lang' : (0x60, b'>L', 4),
+ 'dict_out_lang' : (0x64, b'>L', 4),
+ 'min_version' : (0x68, b'>L', 4),
+ 'first_resc_offset' : (0x6c, b'>L', 4),
+ 'huff_offset' : (0x70, b'>L', 4),
+ 'huff_num' : (0x74, b'>L', 4),
+ 'huff_tbl_offset' : (0x78, b'>L', 4),
+ 'huff_tbl_len' : (0x7c, b'>L', 4),
+ 'exth_flags' : (0x80, b'>L', 4),
+ 'fill3_a' : (0x84, b'>L', 4),
+ 'fill3_b' : (0x88, b'>L', 4),
+ 'fill3_c' : (0x8c, b'>L', 4),
+ 'fill3_d' : (0x90, b'>L', 4),
+ 'fill3_e' : (0x94, b'>L', 4),
+ 'fill3_f' : (0x98, b'>L', 4),
+ 'fill3_g' : (0x9c, b'>L', 4),
+ 'fill3_h' : (0xa0, b'>L', 4),
+ 'unknown0' : (0xa4, b'>L', 4),
+ 'drm_offset' : (0xa8, b'>L', 4),
+ 'drm_count' : (0xac, b'>L', 4),
+ 'drm_size' : (0xb0, b'>L', 4),
+ 'drm_flags' : (0xb4, b'>L', 4),
+ 'fill4_a' : (0xb8, b'>L', 4),
+ 'fill4_b' : (0xbc, b'>L', 4),
+ 'fdst_offset' : (0xc0, b'>L', 4),
+ 'fdst_flow_count' : (0xc4, b'>L', 4),
+ 'fcis_offset' : (0xc8, b'>L', 4),
+ 'fcis_count' : (0xcc, b'>L', 4),
+ 'flis_offset' : (0xd0, b'>L', 4),
+ 'flis_count' : (0xd4, b'>L', 4),
+ 'unknown1' : (0xd8, b'>L', 4),
+ 'unknown2' : (0xdc, b'>L', 4),
+ 'srcs_offset' : (0xe0, b'>L', 4),
+ 'srcs_count' : (0xe4, b'>L', 4),
+ 'unknown3' : (0xe8, b'>L', 4),
+ 'unknown4' : (0xec, b'>L', 4),
+ 'fill5' : (0xf0, b'>H', 2),
+ 'traildata_flags' : (0xf2, b'>H', 2),
+ 'ncx_index' : (0xf4, b'>L', 4),
+ 'fragment_index' : (0xf8, b'>L', 4),
+ 'skeleton_index' : (0xfc, b'>L', 4),
+ 'datp_offset' : (0x100, b'>L', 4),
+ 'guide_index' : (0x104, b'>L', 4),
+ 'Unknown ' : (0x108, b'>L', 4),
+ 'Unknown ' : (0x10C, b'>L', 4),
+ 'Unknown ' : (0x110, b'>L', 4),
+ 'Unknown ' : (0x114, b'>L', 4),
+ 'Unknown ' : (0x118, b'>L', 4),
+ 'Unknown ' : (0x11C, b'>L', 4),
+ 'Unknown ' : (0x120, b'>L', 4),
+ 'Unknown ' : (0x124, b'>L', 4),
+ 'Unknown ' : (0x128, b'>L', 4),
+ 'Unknown ' : (0x12C, b'>L', 4),
+ 'Unknown ' : (0x130, b'>L', 4),
+ 'Unknown ' : (0x134, b'>L', 4),
+ 'Unknown ' : (0x138, b'>L', 4),
+ 'Unknown ' : (0x11C, b'>L', 4),
+ }
+
+ palmdoc_header_sorted_keys = sortedHeaderKeys(palmdoc_header)
+ mobi6_header_sorted_keys = sortedHeaderKeys(mobi6_header)
+ mobi8_header_sorted_keys = sortedHeaderKeys(mobi8_header)
+
+ id_map_strings = {
+ 1 : 'Drm Server Id',
+ 2 : 'Drm Commerce Id',
+ 3 : 'Drm Ebookbase Book Id',
+ 4 : 'Drm Ebookbase Dep Id',
+ 100 : 'Creator',
+ 101 : 'Publisher',
+ 102 : 'Imprint',
+ 103 : 'Description',
+ 104 : 'ISBN',
+ 105 : 'Subject',
+ 106 : 'Published',
+ 107 : 'Review',
+ 108 : 'Contributor',
+ 109 : 'Rights',
+ 110 : 'SubjectCode',
+ 111 : 'Type',
+ 112 : 'Source',
+ 113 : 'ASIN',
+ # 114 : 'versionNumber',
+ 117 : 'Adult',
+ 118 : 'Retail-Price',
+ 119 : 'Retail-Currency',
+ 120 : 'TSC',
+ 122 : 'fixed-layout',
+ 123 : 'book-type',
+ 124 : 'orientation-lock',
+ 126 : 'original-resolution',
+ 127 : 'zero-gutter',
+ 128 : 'zero-margin',
+ 129 : 'MetadataResourceURI',
+ 132 : 'RegionMagnification',
+ 150 : 'LendingEnabled',
+ 200 : 'DictShortName',
+ 501 : 'cdeType',
+ 502 : 'last_update_time',
+ 503 : 'Updated_Title',
+ 504 : 'CDEContentKey',
+ 505 : 'AmazonContentReference',
+ 506 : 'Title-Language',
+ 507 : 'Title-Display-Direction',
+ 508 : 'Title-Pronunciation',
+ 509 : 'Title-Collation',
+ 510 : 'Secondary-Title',
+ 511 : 'Secondary-Title-Language',
+ 512 : 'Secondary-Title-Direction',
+ 513 : 'Secondary-Title-Pronunciation',
+ 514 : 'Secondary-Title-Collation',
+ 515 : 'Author-Language',
+ 516 : 'Author-Display-Direction',
+ 517 : 'Author-Pronunciation',
+ 518 : 'Author-Collation',
+ 519 : 'Author-Type',
+ 520 : 'Publisher-Language',
+ 521 : 'Publisher-Display-Direction',
+ 522 : 'Publisher-Pronunciation',
+ 523 : 'Publisher-Collation',
+ 524 : 'Content-Language-Tag',
+ 525 : 'primary-writing-mode',
+ 526 : 'NCX-Ingested-By-Software',
+ 527 : 'page-progression-direction',
+ 528 : 'override-kindle-fonts',
+ 529 : 'Compression-Upgraded',
+ 530 : 'Soft-Hyphens-In-Content',
+ 531 : 'Dictionary_In_Langague',
+ 532 : 'Dictionary_Out_Language',
+ 533 : 'Font_Converted',
+ 534 : 'Amazon_Creator_Info',
+ 535 : 'Creator-Build-Tag',
+ 536 : 'HD-Media-Containers-Info', # CONT_Header is 0, Ends with CONTAINER_BOUNDARY (or Asset_Type?)
+ 538 : 'Resource-Container-Fidelity',
+ 539 : 'HD-Container-Mimetype',
+ 540 : 'Sample-For_Special-Purpose',
+ 541 : 'Kindletool-Operation-Information',
+ 542 : 'Container_Id',
+ 543 : 'Asset-Type', # FONT_CONTAINER, BW_CONTAINER, HD_CONTAINER
+ 544 : 'Unknown_544',
+ }
+ id_map_values = {
+ 114 : 'versionNumber',
+ 115 : 'sample',
+ 116 : 'StartOffset',
+ 121 : 'Mobi8-Boundary-Section',
+ 125 : 'Embedded-Record-Count',
+ 130 : 'Offline-Sample',
+ 131 : 'Metadata-Record-Offset',
+ 201 : 'CoverOffset',
+ 202 : 'ThumbOffset',
+ 203 : 'HasFakeCover',
+ 204 : 'Creator-Software',
+ 205 : 'Creator-Major-Version',
+ 206 : 'Creator-Minor-Version',
+ 207 : 'Creator-Build-Number',
+ 401 : 'Clipping-Limit',
+ 402 : 'Publisher-Limit',
+ 404 : 'Text-to-Speech-Disabled',
+ 406 : 'Rental-Expiration-Time',
+ }
+ id_map_hexstrings = {
+ 208 : 'Watermark_(hex)',
+ 209 : 'Tamper-Proof-Keys_(hex)',
+ 300 : 'Font-Signature_(hex)',
+ 403 : 'Unknown_(403)_(hex)',
+ 405 : 'Ownership-Type_(hex)',
+ 407 : 'Unknown_(407)_(hex)',
+ 420 : 'Multimedia-Content-Reference_(hex)',
+ 450 : 'Locations_Match_(hex)',
+ 451 : 'Full-Story-Length_(hex)',
+ 452 : 'Sample-Start_Location_(hex)',
+ 453 : 'Sample-End-Location_(hex)',
+ }
+
+ def __init__(self, sect, sectNumber):
+ self.sect = sect
+ self.start = sectNumber
+ self.header = self.sect.loadSection(self.start)
+ if len(self.header)>20 and self.header[16:20] == b'MOBI':
+ self.sect.setsectiondescription(0,"Mobipocket Header")
+ self.palm = False
+ elif self.sect.ident == b'TEXtREAd':
+ self.sect.setsectiondescription(0, "PalmDOC Header")
+ self.palm = True
+ else:
+ raise unpackException('Unknown File Format')
+
+ self.records, = struct.unpack_from(b'>H', self.header, 0x8)
+
+ # set defaults in case this is a PalmDOC
+ self.title = self.sect.palmname.decode('latin-1', errors='replace')
+ self.length = len(self.header)-16
+ self.type = 3
+ self.codepage = 1252
+ self.codec = 'windows-1252'
+ self.unique_id = 0
+ self.version = 0
+ self.hasExth = False
+ self.exth = b''
+ self.exth_offset = self.length + 16
+ self.exth_length = 0
+ self.crypto_type = 0
+ self.firstnontext = self.start+self.records + 1
+ self.firstresource = self.start+self.records + 1
+ self.ncxidx = 0xffffffff
+ self.metaOrthIndex = 0xffffffff
+ self.metaInflIndex = 0xffffffff
+ self.skelidx = 0xffffffff
+ self.fragidx = 0xffffffff
+ self.guideidx = 0xffffffff
+ self.fdst = 0xffffffff
+ self.mlstart = self.sect.loadSection(self.start+1)[:4]
+ self.rawSize = 0
+ self.metadata = dict_()
+
+ # set up for decompression/unpacking
+ self.compression, = struct.unpack_from(b'>H', self.header, 0x0)
+ if self.compression == 0x4448:
+ reader = HuffcdicReader()
+ huffoff, huffnum = struct.unpack_from(b'>LL', self.header, 0x70)
+ huffoff = huffoff + self.start
+ self.sect.setsectiondescription(huffoff,"Huffman Compression Seed")
+ reader.loadHuff(self.sect.loadSection(huffoff))
+ for i in range(1, huffnum):
+ self.sect.setsectiondescription(huffoff+i,"Huffman CDIC Compression Seed %d" % i)
+ reader.loadCdic(self.sect.loadSection(huffoff+i))
+ self.unpack = reader.unpack
+ elif self.compression == 2:
+ self.unpack = PalmdocReader().unpack
+ elif self.compression == 1:
+ self.unpack = UncompressedReader().unpack
+ else:
+ raise unpackException('invalid compression type: 0x%4x' % self.compression)
+
+ if self.palm:
+ return
+
+ self.length, self.type, self.codepage, self.unique_id, self.version = struct.unpack(b'>LLLLL', self.header[20:40])
+ codec_map = {
+ 1252 : 'windows-1252',
+ 65001: 'utf-8',
+ }
+ if self.codepage in codec_map:
+ self.codec = codec_map[self.codepage]
+
+ # title
+ toff, tlen = struct.unpack(b'>II', self.header[0x54:0x5c])
+ tend = toff + tlen
+ self.title=self.header[toff:tend].decode(self.codec, errors='replace')
+
+ exth_flag, = struct.unpack(b'>L', self.header[0x80:0x84])
+ self.hasExth = exth_flag & 0x40
+ self.exth_offset = self.length + 16
+ self.exth_length = 0
+ if self.hasExth:
+ self.exth_length, = struct.unpack_from(b'>L', self.header, self.exth_offset+4)
+ self.exth_length = ((self.exth_length + 3)>>2)<<2 # round to next 4 byte boundary
+ self.exth = self.header[self.exth_offset:self.exth_offset+self.exth_length]
+
+ # parse the exth / metadata
+ self.parseMetaData()
+
+ # self.mlstart = self.sect.loadSection(self.start+1)
+ # self.mlstart = self.mlstart[0:4]
+ self.crypto_type, = struct.unpack_from(b'>H', self.header, 0xC)
+
+ # Start sector for additional files such as images, fonts, resources, etc
+ # Can be missing so fall back to default set previously
+ ofst, = struct.unpack_from(b'>L', self.header, 0x6C)
+ if ofst != 0xffffffff:
+ self.firstresource = ofst + self.start
+ ofst, = struct.unpack_from(b'>L', self.header, 0x50)
+ if ofst != 0xffffffff:
+ self.firstnontext = ofst + self.start
+
+ if self.isPrintReplica():
+ return
+
+ if self.version < 8:
+ # Dictionary metaOrthIndex
+ self.metaOrthIndex, = struct.unpack_from(b'>L', self.header, 0x28)
+ if self.metaOrthIndex != 0xffffffff:
+ self.metaOrthIndex += self.start
+
+ # Dictionary metaInflIndex
+ self.metaInflIndex, = struct.unpack_from(b'>L', self.header, 0x2C)
+ if self.metaInflIndex != 0xffffffff:
+ self.metaInflIndex += self.start
+
+ # handle older headers without any ncxindex info and later
+ # specifically 0xe4 headers
+ if self.length + 16 < 0xf8:
+ return
+
+ # NCX Index
+ self.ncxidx, = struct.unpack(b'>L', self.header[0xf4:0xf8])
+ if self.ncxidx != 0xffffffff:
+ self.ncxidx += self.start
+
+ # K8 specific Indexes
+ if self.start != 0 or self.version == 8:
+ # Index into <xml> file skeletons in RawML
+ self.skelidx, = struct.unpack_from(b'>L', self.header, 0xfc)
+ if self.skelidx != 0xffffffff:
+ self.skelidx += self.start
+
+ # Index into <div> sections in RawML
+ self.fragidx, = struct.unpack_from(b'>L', self.header, 0xf8)
+ if self.fragidx != 0xffffffff:
+ self.fragidx += self.start
+
+ # Index into Other files
+ self.guideidx, = struct.unpack_from(b'>L', self.header, 0x104)
+ if self.guideidx != 0xffffffff:
+ self.guideidx += self.start
+
+ # dictionaries do not seem to use the same approach in K8's
+ # so disable them
+ self.metaOrthIndex = 0xffffffff
+ self.metaInflIndex = 0xffffffff
+
+ # need to use the FDST record to find out how to properly unpack
+ # the rawML into pieces
+ # it is simply a table of start and end locations for each flow piece
+ self.fdst, = struct.unpack_from(b'>L', self.header, 0xc0)
+ self.fdstcnt, = struct.unpack_from(b'>L', self.header, 0xc4)
+        # if cnt is 1 or less, fdst section number can be garbage
+ if self.fdstcnt <= 1:
+ self.fdst = 0xffffffff
+ if self.fdst != 0xffffffff:
+ self.fdst += self.start
+ # setting of fdst section description properly handled in mobi_kf8proc
+
+ def dump_exth(self):
+ # determine text encoding
+ codec=self.codec
+ if (not self.hasExth) or (self.exth_length) == 0 or (self.exth == b''):
+ return
+ num_items, = struct.unpack(b'>L', self.exth[8:12])
+ pos = 12
+ print("Key Size Description Value")
+ for _ in range(num_items):
+ id, size = struct.unpack(b'>LL', self.exth[pos:pos+8])
+ contentsize = size-8
+ content = self.exth[pos + 8: pos + size]
+ if id in MobiHeader.id_map_strings:
+ exth_name = MobiHeader.id_map_strings[id]
+ print('{0: >3d} {1: >4d} {2: <30s} {3:s}'.format(id, contentsize, exth_name, content.decode(codec, errors='replace')))
+ elif id in MobiHeader.id_map_values:
+ exth_name = MobiHeader.id_map_values[id]
+ if size == 9:
+ value, = struct.unpack(b'B',content)
+ print('{0:3d} byte {1:<30s} {2:d}'.format(id, exth_name, value))
+ elif size == 10:
+ value, = struct.unpack(b'>H',content)
+ print('{0:3d} word {1:<30s} 0x{2:0>4X} ({2:d})'.format(id, exth_name, value))
+ elif size == 12:
+ value, = struct.unpack(b'>L',content)
+ print('{0:3d} long {1:<30s} 0x{2:0>8X} ({2:d})'.format(id, exth_name, value))
+ else:
+ print('{0: >3d} {1: >4d} {2: <30s} (0x{3:s})'.format(id, contentsize, "Bad size for "+exth_name, hexlify(content)))
+ elif id in MobiHeader.id_map_hexstrings:
+ exth_name = MobiHeader.id_map_hexstrings[id]
+ print('{0:3d} {1:4d} {2:<30s} 0x{3:s}'.format(id, contentsize, exth_name, hexlify(content)))
+ else:
+ exth_name = "Unknown EXTH ID {0:d}".format(id)
+ print("{0: >3d} {1: >4d} {2: <30s} 0x{3:s}".format(id, contentsize, exth_name, hexlify(content)))
+ pos += size
+ return
+
+ def dumpheader(self):
+        # the first 16 bytes are not part of the official mobiheader,
+        # but we will treat them as such, so section 0 is 16 (decimal) + self.length
+        # bytes in total == at least 0x108 bytes for Mobi 8 headers
+ print("Dumping section %d, Mobipocket Header version: %d, total length %d" % (self.start,self.version, self.length+16))
+ self.hdr = {}
+ # set it up for the proper header version
+ if self.version == 0:
+ self.mobi_header = MobiHeader.palmdoc_header
+ self.mobi_header_sorted_keys = MobiHeader.palmdoc_header_sorted_keys
+ elif self.version < 8:
+ self.mobi_header = MobiHeader.mobi6_header
+ self.mobi_header_sorted_keys = MobiHeader.mobi6_header_sorted_keys
+ else:
+ self.mobi_header = MobiHeader.mobi8_header
+ self.mobi_header_sorted_keys = MobiHeader.mobi8_header_sorted_keys
+
+ # parse the header information
+ for key in self.mobi_header_sorted_keys:
+ (pos, format, tot_len) = self.mobi_header[key]
+ if pos < (self.length + 16):
+ val, = struct.unpack_from(format, self.header, pos)
+ self.hdr[key] = val
+
+ if 'title_offset' in self.hdr:
+ title_offset = self.hdr['title_offset']
+ title_length = self.hdr['title_length']
+ else:
+ title_offset = 0
+ title_length = 0
+ if title_offset == 0:
+ title_offset = len(self.header)
+ title_length = 0
+ self.title = self.sect.palmname.decode('latin-1', errors='replace')
+ else:
+ self.title = self.header[title_offset:title_offset+title_length].decode(self.codec, errors='replace')
+ # title record always padded with two nul bytes and then padded with nuls to next 4 byte boundary
+ title_length = ((title_length+2+3)>>2)<<2
+
+ self.extra1 = self.header[self.exth_offset+self.exth_length:title_offset]
+ self.extra2 = self.header[title_offset+title_length:]
+
+ print("Mobipocket header from section %d" % self.start)
+ print(" Offset Value Hex Dec Description")
+ for key in self.mobi_header_sorted_keys:
+ (pos, format, tot_len) = self.mobi_header[key]
+ if pos < (self.length + 16):
+ if key != 'magic':
+ fmt_string = "0x{0:0>3X} ({0:3d}){1: >" + str(9-2*tot_len) +"s}0x{2:0>" + str(2*tot_len) + "X} {2:10d} {3:s}"
+ else:
+ self.hdr[key] = unicode_str(self.hdr[key])
+ fmt_string = "0x{0:0>3X} ({0:3d}){2:>11s} {3:s}"
+ print(fmt_string.format(pos, " ",self.hdr[key], key))
+ print("")
+
+ if self.exth_length > 0:
+ print("EXTH metadata, offset %d, padded length %d" % (self.exth_offset,self.exth_length))
+ self.dump_exth()
+ print("")
+
+ if len(self.extra1) > 0:
+ print("Extra data between EXTH and Title, length %d" % len(self.extra1))
+ print(hexlify(self.extra1))
+ print("")
+
+ if title_length > 0:
+ print("Title in header at offset %d, padded length %d: '%s'" %(title_offset,title_length,self.title))
+ print("")
+
+ if len(self.extra2) > 0:
+ print("Extra data between Title and end of header, length %d" % len(self.extra2))
+ print(hexlify(self.extra2))
+ print("")
+
+ def isPrintReplica(self):
+ return self.mlstart[0:4] == b"%MOP"
+
+ def isK8(self):
+ return self.start != 0 or self.version == 8
+
+ def isEncrypted(self):
+ return self.crypto_type != 0
+
+ def hasNCX(self):
+ return self.ncxidx != 0xffffffff
+
+ def isDictionary(self):
+ return self.metaOrthIndex != 0xffffffff
+
+ def getncxIndex(self):
+ return self.ncxidx
+
+ def decompress(self, data):
+ return self.unpack(data)
+
+ def Language(self):
+ langcode = struct.unpack(b'!L', self.header[0x5c:0x60])[0]
+ langid = langcode & 0xFF
+ sublangid = (langcode >> 8) & 0xFF
+ return getLanguage(langid, sublangid)
+
+ def DictInLanguage(self):
+ if self.isDictionary():
+ langcode = struct.unpack(b'!L', self.header[0x60:0x64])[0]
+ langid = langcode & 0xFF
+ sublangid = (langcode >> 10) & 0xFF
+ if langid != 0:
+ return getLanguage(langid, sublangid)
+ return False
+
+ def DictOutLanguage(self):
+ if self.isDictionary():
+ langcode = struct.unpack(b'!L', self.header[0x64:0x68])[0]
+ langid = langcode & 0xFF
+ sublangid = (langcode >> 10) & 0xFF
+ if langid != 0:
+ return getLanguage(langid, sublangid)
+ return False
+
+ def getRawML(self):
+ def getSizeOfTrailingDataEntry(data):
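+            # the size is encoded base-128, big-endian, in the last four
+            # bytes; a set high bit marks the start of the value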
+ num = 0
+ for v in data[-4:]:
+ if bord(v) & 0x80:
+ num = 0
+ num = (num << 7) | (bord(v) & 0x7f)
+ return num
+ def trimTrailingDataEntries(data):
+ for _ in range(trailers):
+ num = getSizeOfTrailingDataEntry(data)
+ data = data[:-num]
+ if multibyte:
+ num = (ord(data[-1:]) & 3) + 1
+ data = data[:-num]
+ return data
+ multibyte = 0
+ trailers = 0
+ if self.sect.ident == b'BOOKMOBI':
+ mobi_length, = struct.unpack_from(b'>L', self.header, 0x14)
+ mobi_version, = struct.unpack_from(b'>L', self.header, 0x68)
+ if (mobi_length >= 0xE4) and (mobi_version >= 5):
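+                # traildata flags: bit 0 marks multibyte overlap bytes; each
+                # additional set bit adds one trailing data entry per record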
+ flags, = struct.unpack_from(b'>H', self.header, 0xF2)
+ multibyte = flags & 1
+ while flags > 1:
+ if flags & 2:
+ trailers += 1
+ flags = flags >> 1
+        # get raw mobi markup language
+ print("Unpacking raw markup language")
+ dataList = []
+ # offset = 0
+ for i in range(1, self.records+1):
+ data = trimTrailingDataEntries(self.sect.loadSection(self.start + i))
+ dataList.append(self.unpack(data))
+ if self.isK8():
+ self.sect.setsectiondescription(self.start + i,"KF8 Text Section {0:d}".format(i))
+ elif self.version == 0:
+ self.sect.setsectiondescription(self.start + i,"PalmDOC Text Section {0:d}".format(i))
+ else:
+ self.sect.setsectiondescription(self.start + i,"Mobipocket Text Section {0:d}".format(i))
+ rawML = b''.join(dataList)
+ self.rawSize = len(rawML)
+ return rawML
+
+    # all metadata is stored in a dictionary keyed by name; each key maps to a *list* of values
+ # a list is used to allow for multiple creators, multiple contributors, etc
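+    # e.g. a book with two creators yields
+    # self.metadata['Creator'] == ['First Author', 'Second Author']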
+ def parseMetaData(self):
+ def addValue(name, value):
+ if name not in self.metadata:
+ self.metadata[name] = [value]
+ else:
+ self.metadata[name].append(value)
+
+ codec=self.codec
+ if self.hasExth:
+ extheader=self.exth
+ _length, num_items = struct.unpack(b'>LL', extheader[4:12])
+ extheader = extheader[12:]
+ pos = 0
+ for _ in range(num_items):
+ id, size = struct.unpack(b'>LL', extheader[pos:pos+8])
+ content = extheader[pos + 8: pos + size]
+ if id in MobiHeader.id_map_strings:
+ name = MobiHeader.id_map_strings[id]
+ addValue(name, content.decode(codec, errors='replace'))
+ elif id in MobiHeader.id_map_values:
+ name = MobiHeader.id_map_values[id]
+ if size == 9:
+ value, = struct.unpack(b'B',content)
+ addValue(name, unicode_str(str(value)))
+ elif size == 10:
+ value, = struct.unpack(b'>H',content)
+ addValue(name, unicode_str(str(value)))
+ elif size == 12:
+ value, = struct.unpack(b'>L',content)
+ # handle special case of missing CoverOffset or missing ThumbOffset
+ if id == 201 or id == 202:
+ if value != 0xffffffff:
+ addValue(name, unicode_str(str(value)))
+ else:
+ addValue(name, unicode_str(str(value)))
+ else:
+ print("Warning: Bad key, size, value combination detected in EXTH ", id, size, hexlify(content))
+ addValue(name, hexlify(content))
+ elif id in MobiHeader.id_map_hexstrings:
+ name = MobiHeader.id_map_hexstrings[id]
+ addValue(name, hexlify(content))
+ else:
+ name = unicode_str(str(id)) + ' (hex)'
+ addValue(name, hexlify(content))
+ pos += size
+
+ # add the basics to the metadata each as a list element
+ self.metadata['Language'] = [self.Language()]
+ self.metadata['Title'] = [unicode_str(self.title,self.codec)]
+ self.metadata['Codec'] = [self.codec]
+ self.metadata['UniqueID'] = [unicode_str(str(self.unique_id))]
+ # if no asin create one using a uuid
+ if 'ASIN' not in self.metadata:
+ self.metadata['ASIN'] = [unicode_str(str(uuid.uuid4()))]
+ # if no cdeType set it to "EBOK"
+ if 'cdeType' not in self.metadata:
+ self.metadata['cdeType'] = ['EBOK']
+
+ def getMetaData(self):
+ return self.metadata
+
+ def describeHeader(self, DUMP):
+ print("Mobi Version:", self.version)
+ print("Codec:", self.codec)
+ print("Title:", self.title)
+ if 'Updated_Title' in self.metadata:
+ print("EXTH Title:", self.metadata['Updated_Title'][0])
+ if self.compression == 0x4448:
+ print("Huffdic compression")
+ elif self.compression == 2:
+ print("Palmdoc compression")
+ elif self.compression == 1:
+ print("No compression")
+ if DUMP:
+ self.dumpheader()
diff --git a/src/epy_reader/tools/KindleUnpack/mobi_html.py b/src/epy_reader/tools/KindleUnpack/mobi_html.py
new file mode 100644
index 0000000..eda766c
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/mobi_html.py
@@ -0,0 +1,439 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+from .compatibility_utils import PY2, utf8_str
+
+if PY2:
+ range = xrange
+
+import re
+# note: in python3 re requires the pattern to be the exact same type as the data to be searched,
+# so the byte patterns below must be b"" literals, not u"" ones
+
+from .mobi_utils import fromBase32
+
+class HTMLProcessor:
+
+ def __init__(self, files, metadata, rscnames):
+ self.files = files
+ self.metadata = metadata
+ self.rscnames = rscnames
+ # for original style mobis, default to including all image files in the opf manifest
+ self.used = {}
+ for name in rscnames:
+ self.used[name] = 'used'
+
+ def findAnchors(self, rawtext, indx_data, positionMap):
+ # process the raw text
+ # find anchors...
+ print("Find link anchors")
+ link_pattern = re.compile(br'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''', re.IGNORECASE)
+ # TEST NCX: merge in filepos from indx
+ pos_links = [int(m.group(1)) for m in link_pattern.finditer(rawtext)]
+ if indx_data:
+ pos_indx = [e['pos'] for e in indx_data if e['pos']>0]
+ pos_links = list(set(pos_links + pos_indx))
+
+ for position in pos_links:
+ if position in positionMap:
+ positionMap[position] = positionMap[position] + utf8_str('<a id="filepos%d" />' % position)
+ else:
+ positionMap[position] = utf8_str('<a id="filepos%d" />' % position)
+
+ # apply dictionary metadata and anchors
+ print("Insert data into html")
+ pos = 0
+ lastPos = len(rawtext)
+ dataList = []
+ for end in sorted(positionMap.keys()):
+ if end == 0 or end > lastPos:
+                continue  # something's up - can't put a tag outside <html>...</html>
+ dataList.append(rawtext[pos:end])
+ dataList.append(positionMap[end])
+ pos = end
+ dataList.append(rawtext[pos:])
+ srctext = b"".join(dataList)
+ rawtext = None
+ dataList = None
+ self.srctext = srctext
+ self.indx_data = indx_data
+ return srctext
+
+ def insertHREFS(self):
+ srctext = self.srctext
+ rscnames = self.rscnames
+ metadata = self.metadata
+
+ # put in the hrefs
+ print("Insert hrefs into html")
+        # There doesn't seem to be a standard, so search as best we can
+
+ link_pattern = re.compile(br'''<a([^>]*?)filepos=['"]{0,1}0*(\d+)['"]{0,1}([^>]*?)>''', re.IGNORECASE)
+ srctext = link_pattern.sub(br'''<a\1href="#filepos\2"\3>''', srctext)
+
+ # remove empty anchors
+ print("Remove empty anchors from html")
+ srctext = re.sub(br"<a\s*/>",br"", srctext)
+ srctext = re.sub(br"<a\s*>\s*</a>",br"", srctext)
+
+ # convert image references
+ print("Insert image references into html")
+ # split string into image tag pieces and other pieces
+ image_pattern = re.compile(br'''(<img.*?>)''', re.IGNORECASE)
+ image_index_pattern = re.compile(br'''recindex=['"]{0,1}([0-9]+)['"]{0,1}''', re.IGNORECASE)
+ srcpieces = image_pattern.split(srctext)
+ srctext = self.srctext = None
+
+        # all odd pieces are image tags (even pieces are null strings when there is nothing between tags in srctext)
+ for i in range(1, len(srcpieces), 2):
+ tag = srcpieces[i]
+ for m in image_index_pattern.finditer(tag):
+ imageNumber = int(m.group(1))
+ imageName = rscnames[imageNumber-1]
+ if imageName is None:
+ print("Error: Referenced image %s was not recognized as a valid image" % imageNumber)
+ else:
+ replacement = b'src="Images/' + utf8_str(imageName) + b'"'
+ tag = image_index_pattern.sub(replacement, tag, 1)
+ srcpieces[i] = tag
+ srctext = b"".join(srcpieces)
+
+ # add in character set meta into the html header if needed
+ if 'Codec' in metadata:
+ srctext = srctext[0:12]+b'<meta http-equiv="content-type" content="text/html; charset='+utf8_str(metadata.get('Codec')[0])+b'" />'+srctext[12:]
+ return srctext, self.used
+
+
+class XHTMLK8Processor:
+
+ def __init__(self, rscnames, k8proc):
+ self.rscnames = rscnames
+ self.k8proc = k8proc
+ self.used = {}
+
+ def buildXHTML(self):
+
+        # first we need to update all internal links that are based on
+        # positions within the xhtml files **BEFORE** cutting and pasting
+        # any pieces into the xhtml text files
+
+ # kindle:pos:fid:XXXX:off:YYYYYYYYYY (used for internal link within xhtml)
+ # XXXX is the offset in records into divtbl
+ # YYYYYYYYYYYY is a base32 number you add to the divtbl insertpos to get final position
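+        # e.g. href="kindle:pos:fid:000M:off:000000003R" names divtbl entry
+        # 000M and base32 offset 000000003R within it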
+
+ # pos:fid pattern
+ posfid_pattern = re.compile(br'''(<a.*?href=.*?>)''', re.IGNORECASE)
+ posfid_index_pattern = re.compile(br'''['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']''')
+
+ parts = []
+ print("Building proper xhtml for each file")
+ for i in range(self.k8proc.getNumberOfParts()):
+ part = self.k8proc.getPart(i)
+ [partnum, dir, filename, beg, end, aidtext] = self.k8proc.getPartInfo(i)
+
+ # internal links
+ srcpieces = posfid_pattern.split(part)
+ for j in range(1, len(srcpieces),2):
+ tag = srcpieces[j]
+ if tag.startswith(b'<'):
+ for m in posfid_index_pattern.finditer(tag):
+ posfid = m.group(1)
+ offset = m.group(2)
+ filename, idtag = self.k8proc.getIDTagByPosFid(posfid, offset)
+ if idtag == b'':
+ replacement= b'"' + utf8_str(filename) + b'"'
+ else:
+ replacement = b'"' + utf8_str(filename) + b'#' + idtag + b'"'
+ tag = posfid_index_pattern.sub(replacement, tag, 1)
+ srcpieces[j] = tag
+ part = b"".join(srcpieces)
+ parts.append(part)
+
+ # we are free to cut and paste as we see fit
+ # we can safely remove all of the Kindlegen generated aid tags
+ # change aid ids that are in k8proc.linked_aids to xhtml ids
+ find_tag_with_aid_pattern = re.compile(br'''(<[^>]*\said\s*=[^>]*>)''', re.IGNORECASE)
+ within_tag_aid_position_pattern = re.compile(br'''\said\s*=['"]([^'"]*)['"]''')
+ for i in range(len(parts)):
+ part = parts[i]
+ srcpieces = find_tag_with_aid_pattern.split(part)
+ for j in range(len(srcpieces)):
+ tag = srcpieces[j]
+ if tag.startswith(b'<'):
+ for m in within_tag_aid_position_pattern.finditer(tag):
+ try:
+ aid = m.group(1)
+ except IndexError:
+ aid = None
+ replacement = b''
+ if aid in self.k8proc.linked_aids:
+ replacement = b' id="aid-' + aid + b'"'
+ tag = within_tag_aid_position_pattern.sub(replacement, tag, 1)
+ srcpieces[j] = tag
+ part = b"".join(srcpieces)
+ parts[i] = part
+
+ # we can safely replace all of the Kindlegen generated data-AmznPageBreak tags
+ # with page-break-after style patterns
+ find_tag_with_AmznPageBreak_pattern = re.compile(br'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''', re.IGNORECASE)
+ within_tag_AmznPageBreak_position_pattern = re.compile(br'''\sdata-AmznPageBreak=['"]([^'"]*)['"]''')
+ for i in range(len(parts)):
+ part = parts[i]
+ srcpieces = find_tag_with_AmznPageBreak_pattern.split(part)
+ for j in range(len(srcpieces)):
+ tag = srcpieces[j]
+ if tag.startswith(b'<'):
+ srcpieces[j] = within_tag_AmznPageBreak_position_pattern.sub(
+ lambda m:b' style="page-break-after:' + m.group(1) + b'"', tag)
+ part = b"".join(srcpieces)
+ parts[i] = part
+
+ # we have to handle substitutions for the flows pieces first as they may
+ # be inlined into the xhtml text
+ # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
+ # kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
+ # kindle:embed:XXXX (used for fonts)
+
+ flows = []
+ flows.append(None)
+ flowinfo = []
+ flowinfo.append([None, None, None, None])
+
+ # regular expression search patterns
+ img_pattern = re.compile(br'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
+ img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE)
+
+ tag_pattern = re.compile(br'''(<[^>]*>)''')
+ flow_pattern = re.compile(br'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)
+
+ url_pattern = re.compile(br'''(url\(.*?\))''', re.IGNORECASE)
+ url_img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)\?mime=image/[^\)]*["')]''', re.IGNORECASE)
+ font_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)["')]''', re.IGNORECASE)
+ url_css_index_pattern = re.compile(br'''kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*''', re.IGNORECASE)
+ url_svg_image_pattern = re.compile(br'''kindle:flow:([0-9|A-V]+)\?mime=image/svg\+xml[^\)]*''', re.IGNORECASE)
+
+ for i in range(1, self.k8proc.getNumberOfFlows()):
+ [ftype, format, dir, filename] = self.k8proc.getFlowInfo(i)
+ flowpart = self.k8proc.getFlow(i)
+
+ # links to raster image files from image tags
+ # image_pattern
+ srcpieces = img_pattern.split(flowpart)
+ for j in range(1, len(srcpieces),2):
+ tag = srcpieces[j]
+ if tag.startswith(b'<im'):
+ for m in img_index_pattern.finditer(tag):
+ imageNumber = fromBase32(m.group(1))
+ imageName = self.rscnames[imageNumber-1]
+ if imageName is not None:
+ replacement = b'"../Images/' + utf8_str(imageName) + b'"'
+ self.used[imageName] = 'used'
+ tag = img_index_pattern.sub(replacement, tag, 1)
+ else:
+ print("Error: Referenced image %s was not recognized as a valid image in %s" % (imageNumber, tag))
+ srcpieces[j] = tag
+ flowpart = b"".join(srcpieces)
+
+ # replacements inside css url():
+ srcpieces = url_pattern.split(flowpart)
+ for j in range(1, len(srcpieces),2):
+ tag = srcpieces[j]
+
+ # process links to raster image files
+ for m in url_img_index_pattern.finditer(tag):
+ imageNumber = fromBase32(m.group(1))
+ imageName = self.rscnames[imageNumber-1]
+ osep = m.group()[0:1]
+ csep = m.group()[-1:]
+ if imageName is not None:
+ replacement = osep + b'../Images/' + utf8_str(imageName) + csep
+ self.used[imageName] = 'used'
+ tag = url_img_index_pattern.sub(replacement, tag, 1)
+ else:
+ print("Error: Referenced image %s was not recognized as a valid image in %s" % (imageNumber, tag))
+
+ # process links to fonts
+ for m in font_index_pattern.finditer(tag):
+ fontNumber = fromBase32(m.group(1))
+ fontName = self.rscnames[fontNumber-1]
+ osep = m.group()[0:1]
+ csep = m.group()[-1:]
+ if fontName is None:
+ print("Error: Referenced font %s was not recognized as a valid font in %s" % (fontNumber, tag))
+ else:
+ replacement = osep + b'../Fonts/' + utf8_str(fontName) + csep
+ tag = font_index_pattern.sub(replacement, tag, 1)
+ self.used[fontName] = 'used'
+
+ # process links to other css pieces
+ for m in url_css_index_pattern.finditer(tag):
+ num = fromBase32(m.group(1))
+ [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
+ replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"'
+ tag = url_css_index_pattern.sub(replacement, tag, 1)
+ self.used[fnm] = 'used'
+
+ # process links to svg images
+ for m in url_svg_image_pattern.finditer(tag):
+ num = fromBase32(m.group(1))
+ [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
+ replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"'
+ tag = url_svg_image_pattern.sub(replacement, tag, 1)
+ self.used[fnm] = 'used'
+
+ srcpieces[j] = tag
+ flowpart = b"".join(srcpieces)
+
+ # store away in our own copy
+ flows.append(flowpart)
+
+ # I do not think this case exists, and even if it does, it needs to be done in a separate
+ # pass to prevent inlining a flow piece into another flow piece before the inserted one or the
+ # target one has been fully processed
+
+ # but keep it around if it ends up we do need it
+
+ # flow pattern not inside url()
+ # srcpieces = tag_pattern.split(flowpart)
+ # for j in range(1, len(srcpieces),2):
+ # tag = srcpieces[j]
+ # if tag.startswith(b'<'):
+ # for m in flow_pattern.finditer(tag):
+ # num = fromBase32(m.group(1))
+ # [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
+ # flowtext = self.k8proc.getFlow(num)
+ # if fmt == b'inline':
+ # tag = flowtext
+ # else:
+ # replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"'
+ # tag = flow_pattern.sub(replacement, tag, 1)
+ # self.used[fnm] = 'used'
+ # srcpieces[j] = tag
+ # flowpart = b"".join(srcpieces)
+
+ # now handle the main text xhtml parts
+
+ # Handle the flow items in the XHTML text pieces
+ # kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
+ tag_pattern = re.compile(br'''(<[^>]*>)''')
+ flow_pattern = re.compile(br'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)
+ for i in range(len(parts)):
+ part = parts[i]
+ [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
+ # flow pattern
+ srcpieces = tag_pattern.split(part)
+ for j in range(1, len(srcpieces),2):
+ tag = srcpieces[j]
+ if tag.startswith(b'<'):
+ for m in flow_pattern.finditer(tag):
+ num = fromBase32(m.group(1))
+ if num > 0 and num < len(self.k8proc.flowinfo):
+ [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
+ flowpart = flows[num]
+ if fmt == b'inline':
+ tag = flowpart
+ else:
+ replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"'
+ tag = flow_pattern.sub(replacement, tag, 1)
+ self.used[fnm] = 'used'
+ else:
+ print("warning: ignoring non-existent flow link", tag, " value 0x%x" % num)
+ srcpieces[j] = tag
+ part = b''.join(srcpieces)
+
+ # store away modified version
+ parts[i] = part
+
+ # Handle any embedded raster images links in style= attributes urls
+ style_pattern = re.compile(br'''(<[a-zA-Z0-9]+\s[^>]*style\s*=\s*[^>]*>)''', re.IGNORECASE)
+ img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE)
+
+ for i in range(len(parts)):
+ part = parts[i]
+ [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
+
+ # replace urls in style attributes
+ srcpieces = style_pattern.split(part)
+ for j in range(1, len(srcpieces),2):
+ tag = srcpieces[j]
+ if b'kindle:embed' in tag:
+ for m in img_index_pattern.finditer(tag):
+ imageNumber = fromBase32(m.group(1))
+ imageName = self.rscnames[imageNumber-1]
+ osep = m.group()[0:1]
+ csep = m.group()[-1:]
+ if imageName is not None:
+ replacement = osep + b'../Images/'+ utf8_str(imageName) + csep
+ self.used[imageName] = 'used'
+ tag = img_index_pattern.sub(replacement, tag, 1)
+ else:
+ print("Error: Referenced image %s in style url was not recognized in %s" % (imageNumber, tag))
+ srcpieces[j] = tag
+ part = b"".join(srcpieces)
+
+ # store away modified version
+ parts[i] = part
+
+ # Handle any embedded raster images links in the xhtml text
+ # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
+ img_pattern = re.compile(br'''(<(?:img|image)\s[^>]*>)''', re.IGNORECASE)
+ img_index_pattern = re.compile(br'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''')
+
+ for i in range(len(parts)):
+ part = parts[i]
+ [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
+
+ # links to raster image files
+ # image_pattern
+ srcpieces = img_pattern.split(part)
+ for j in range(1, len(srcpieces),2):
+ tag = srcpieces[j]
+ if tag.startswith(b'<im'):
+ for m in img_index_pattern.finditer(tag):
+ imageNumber = fromBase32(m.group(1))
+ imageName = self.rscnames[imageNumber-1]
+ if imageName is not None:
+ replacement = b'"../Images/' + utf8_str(imageName) + b'"'
+ self.used[imageName] = 'used'
+ tag = img_index_pattern.sub(replacement, tag, 1)
+ else:
+ print("Error: Referenced image %s was not recognized as a valid image in %s" % (imageNumber, tag))
+ srcpieces[j] = tag
+ part = b"".join(srcpieces)
+ # store away modified version
+ parts[i] = part
+
+ # finally perform any general cleanups needed to make valid XHTML
+ # these include:
+ # in svg tags replace "perserveaspectratio" attributes with "perserveAspectRatio"
+ # in svg tags replace "viewbox" attributes with "viewBox"
+ # in <li> remove value="XX" attributes since these are illegal
+ tag_pattern = re.compile(br'''(<[^>]*>)''')
+ li_value_pattern = re.compile(br'''\svalue\s*=\s*['"][^'"]*['"]''', re.IGNORECASE)
+
+ for i in range(len(parts)):
+ part = parts[i]
+ [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
+
+ # tag pattern
+ srcpieces = tag_pattern.split(part)
+ for j in range(1, len(srcpieces),2):
+ tag = srcpieces[j]
+ if tag.startswith(b'<svg') or tag.startswith(b'<SVG'):
+ tag = tag.replace(b'preserveaspectratio',b'preserveAspectRatio')
+ tag = tag.replace(b'viewbox',b'viewBox')
+ elif tag.startswith(b'<li ') or tag.startswith(b'<LI '):
+ tagpieces = li_value_pattern.split(tag)
+ tag = b"".join(tagpieces)
+ srcpieces[j] = tag
+ part = b"".join(srcpieces)
+ # store away modified version
+ parts[i] = part
+
+ self.k8proc.setFlows(flows)
+ self.k8proc.setParts(parts)
+
+ return self.used
diff --git a/src/epy_reader/tools/KindleUnpack/mobi_index.py b/src/epy_reader/tools/KindleUnpack/mobi_index.py
new file mode 100644
index 0000000..397aaf8
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/mobi_index.py
@@ -0,0 +1,276 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+from .compatibility_utils import PY2, bchr, bstr, bord
+if PY2:
+ range = xrange
+
+import struct
+# note: struct pack, unpack, and unpack_from all require bytestring format strings
+# up to at least python 2.7.5; python 3 is okay with bytestrings
+
+from .mobi_utils import toHex
+
+class MobiIndex:
+
+ def __init__(self, sect, DEBUG=False):
+ self.sect = sect
+ self.DEBUG = DEBUG
+
+ def getIndexData(self, idx, label="Unknown"):
+ sect = self.sect
+ outtbl = []
+ ctoc_text = {}
+ if idx != 0xffffffff:
+ sect.setsectiondescription(idx,"{0} Main INDX section".format(label))
+ data = sect.loadSection(idx)
+ idxhdr, hordt1, hordt2 = self.parseINDXHeader(data)
+ IndexCount = idxhdr['count']
+ # handle the case of multiple sections used for CTOC
+ rec_off = 0
+ off = idx + IndexCount + 1
+ for j in range(idxhdr['nctoc']):
+ cdata = sect.loadSection(off + j)
+ sect.setsectiondescription(off+j, label + ' CTOC Data ' + str(j))
+ ctocdict = self.readCTOC(cdata)
+ for k in ctocdict:
+ ctoc_text[k + rec_off] = ctocdict[k]
+ rec_off += 0x10000
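+ # offsets from each additional CTOC section are rebased by 0x10000
+ # so keys from different sections cannot collide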
+ tagSectionStart = idxhdr['len']
+ controlByteCount, tagTable = readTagSection(tagSectionStart, data)
+ if self.DEBUG:
+ print("ControlByteCount is", controlByteCount)
+ print("IndexCount is", IndexCount)
+ print("TagTable: %s" % tagTable)
+ for i in range(idx + 1, idx + 1 + IndexCount):
+ sect.setsectiondescription(i,"{0} Extra {1:d} INDX section".format(label,i-idx))
+ data = sect.loadSection(i)
+ hdrinfo, ordt1, ordt2 = self.parseINDXHeader(data)
+ idxtPos = hdrinfo['start']
+ entryCount = hdrinfo['count']
+ if self.DEBUG:
+ print(idxtPos, entryCount)
+ # loop through to build up the IDXT position starts
+ idxPositions = []
+ for j in range(entryCount):
+ pos, = struct.unpack_from(b'>H', data, idxtPos + 4 + (2 * j))
+ idxPositions.append(pos)
+ # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
+ idxPositions.append(idxtPos)
+ # for each entry in the IDXT build up the tagMap and any associated text
+ for j in range(entryCount):
+ startPos = idxPositions[j]
+ endPos = idxPositions[j+1]
+ textLength = ord(data[startPos:startPos+1])
+ text = data[startPos+1:startPos+1+textLength]
+ if hordt2 is not None:
+ text = b''.join(bchr(hordt2[bord(x)]) for x in text)
+ tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos)
+ outtbl.append([text, tagMap])
+ if self.DEBUG:
+ print(tagMap)
+ print(text)
+ return outtbl, ctoc_text
+
+ def parseINDXHeader(self, data):
+ "read INDX header"
+ if data[:4] != b'INDX':
+ print("Warning: index section is not INDX")
+ # return a shape consistent with the normal (header, ordt1, ordt2) result
+ return None, None, None
+ words = (
+ 'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
+ 'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc'
+ )
+ num = len(words)
+ values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)])
+ header = {}
+ for n in range(num):
+ header[words[n]] = values[n]
+
+ ordt1 = None
+ ordt2 = None
+
+ ocnt, oentries, op1, op2, otagx = struct.unpack_from(b'>LLLLL',data, 0xa4)
+ if header['code'] == 0xfdea or ocnt != 0 or oentries > 0:
+ # horribly hacked up ESP (sample) mobi books use two ORDT sections but never specify
+ # them in the proper place in the header. They seem to use codepage 65002, which
+ # appears to be some sort of strange EBCDIC-like utf-8 or utf-16 string encoding
+
+ # so we need to look for them and store them away to process leading text
+ # ORDT1 has 1-byte entries, ORDT2 has 2-byte entries
+ # we only ever seem to use the second but ...
+ assert(ocnt == 1)
+ assert(data[op1:op1+4] == b'ORDT')
+ assert(data[op2:op2+4] == b'ORDT')
+ ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4)
+ ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4)
+
+ if self.DEBUG:
+ print("parsed INDX header:")
+ for n in words:
+ print(n, "%X" % header[n],)
+ print("")
+ return header, ordt1, ordt2
+
+ def readCTOC(self, txtdata):
+ # read all blocks from CTOC
+ ctoc_data = {}
+ offset = 0
+ while offset<len(txtdata):
+ if PY2:
+ if txtdata[offset] == b'\0':
+ break
+ else:
+ if txtdata[offset] == 0:
+ break
+ idx_offs = offset
+ # first n bytes: name len as vwi
+ pos, ilen = getVariableWidthValue(txtdata, offset)
+ offset += pos
+ # <len> next bytes: name
+ name = txtdata[offset:offset+ilen]
+ offset += ilen
+ if self.DEBUG:
+ print("name length is ", ilen)
+ print(idx_offs, name)
+ ctoc_data[idx_offs] = name
+ return ctoc_data
+
+
+def getVariableWidthValue(data, offset):
+ '''
+ Decode variable width value from given bytes.
+
+ @param data: The bytes to decode.
+ @param offset: The start offset into data.
+ @return: Tuple of consumed bytes count and decoded value.
+ '''
+ value = 0
+ consumed = 0
+ finished = False
+ while not finished:
+ v = data[offset + consumed: offset + consumed + 1]
+ consumed += 1
+ if ord(v) & 0x80:
+ finished = True
+ value = (value << 7) | (ord(v) & 0x7f)
+ return consumed, value
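+# Worked example (illustrative): each byte contributes 7 bits and the final
+# byte of a value has its high bit set, so
+#   getVariableWidthValue(b'\x0b\x8f', 0) == (2, (0x0b << 7) | 0x0f) == (2, 1423)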
+
+
+def readTagSection(start, data):
+ '''
+ Read tag section from given data.
+
+ @param start: The start position in the data.
+ @param data: The data to process.
+ @return: Tuple of control byte count and list of tag tuples.
+ '''
+ controlByteCount = 0
+ tags = []
+ if data[start:start+4] == b"TAGX":
+ firstEntryOffset, = struct.unpack_from(b'>L', data, start + 0x04)
+ controlByteCount, = struct.unpack_from(b'>L', data, start + 0x08)
+
+ # Skip the first 12 bytes already read above.
+ for i in range(12, firstEntryOffset, 4):
+ pos = start + i
+ tags.append((ord(data[pos:pos+1]), ord(data[pos+1:pos+2]), ord(data[pos+2:pos+3]), ord(data[pos+3:pos+4])))
+ return controlByteCount, tags
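+# Illustrative example: each TAGX entry is four bytes (tag, valuesPerEntry,
+# mask, endFlag), so an entry b'\x01\x01\x01\x00' describes tag 1 with one
+# value per entry, control byte mask 0x01, and endFlag 0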
+
+
+def countSetBits(value, bits=8):
+ '''
+ Count the set bits in the given value.
+
+ @param value: Integer value.
+ @param bits: The number of bits of the input value (defaults to 8).
+ @return: Number of set bits.
+ '''
+ count = 0
+ for _ in range(bits):
+ if value & 0x01 == 0x01:
+ count += 1
+ value = value >> 1
+ return count
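+# e.g. (illustrative): countSetBits(0xb3) == 5, since 0xb3 == 0b10110011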
+
+
+def getTagMap(controlByteCount, tagTable, entryData, startPos, endPos):
+ '''
+ Create a map of tags and values from the given byte section.
+
+ @param controlByteCount: The number of control bytes.
+ @param tagTable: The tag table.
+ @param entryData: The data to process.
+ @param startPos: The starting position in entryData.
+ @param endPos: The end position in entryData or None if it is unknown.
+ @return: Hashmap of tag and list of values.
+ '''
+ tags = []
+ tagHashMap = {}
+ controlByteIndex = 0
+ dataStart = startPos + controlByteCount
+
+ for tag, valuesPerEntry, mask, endFlag in tagTable:
+ if endFlag == 0x01:
+ controlByteIndex += 1
+ continue
+ cbyte = ord(entryData[startPos + controlByteIndex:startPos + controlByteIndex+1])
+ if 0:
+ print("Control Byte Index %0x , Control Byte Value %0x" % (controlByteIndex, cbyte))
+
+ value = ord(entryData[startPos + controlByteIndex:startPos + controlByteIndex+1]) & mask
+ if value != 0:
+ if value == mask:
+ if countSetBits(mask) > 1:
+ # If all bits of masked value are set and the mask has more than one bit, a variable width value
+ # will follow after the control bytes which defines the length of bytes (NOT the value count!)
+ # which will contain the corresponding variable width values.
+ consumed, value = getVariableWidthValue(entryData, dataStart)
+ dataStart += consumed
+ tags.append((tag, None, value, valuesPerEntry))
+ else:
+ tags.append((tag, 1, None, valuesPerEntry))
+ else:
+ # Shift bits to get the masked value.
+ while mask & 0x01 == 0:
+ mask = mask >> 1
+ value = value >> 1
+ tags.append((tag, value, None, valuesPerEntry))
+ for tag, valueCount, valueBytes, valuesPerEntry in tags:
+ values = []
+ if valueCount is not None:
+ # Read valueCount * valuesPerEntry variable width values.
+ for _ in range(valueCount):
+ for _ in range(valuesPerEntry):
+ consumed, data = getVariableWidthValue(entryData, dataStart)
+ dataStart += consumed
+ values.append(data)
+ else:
+ # Convert valueBytes to variable width values.
+ totalConsumed = 0
+ while totalConsumed < valueBytes:
+ # Does this work for valuesPerEntry != 1?
+ consumed, data = getVariableWidthValue(entryData, dataStart)
+ dataStart += consumed
+ totalConsumed += consumed
+ values.append(data)
+ if totalConsumed != valueBytes:
+ print("Error: Should consume %s bytes, but consumed %s" % (valueBytes, totalConsumed))
+ tagHashMap[tag] = values
+ # Test that all bytes have been processed if endPos is given.
+ if endPos is not None and dataStart != endPos:
+ # The last entry might have some zero padding bytes, so complain only if non zero bytes are left.
+ for char in entryData[dataStart:endPos]:
+ if bord(char) != 0:
+ print("Warning: There are unprocessed index bytes left: %s" % toHex(entryData[dataStart:endPos]))
+ if 0:
+ print("controlByteCount: %s" % controlByteCount)
+ print("tagTable: %s" % tagTable)
+ print("data: %s" % toHex(entryData[startPos:endPos]))
+ print("tagHashMap: %s" % tagHashMap)
+ break
+
+ return tagHashMap
diff --git a/src/epy_reader/tools/KindleUnpack/mobi_k8proc.py b/src/epy_reader/tools/KindleUnpack/mobi_k8proc.py
new file mode 100644
index 0000000..5b8274e
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/mobi_k8proc.py
@@ -0,0 +1,496 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+from .compatibility_utils import PY2, bstr, utf8_str
+
+if PY2:
+ range = xrange
+
+import os
+
+import struct
+# note: struct pack, unpack, unpack_from all require bytestring format
+# data all the way up to at least python 2.7.5, python 3 okay with bytestring
+
+import re
+# note: re requires the pattern to be the exact same type as the data to be searched in python3,
+# but u"" is not allowed for the pattern itself, only b""
+
+from .mobi_index import MobiIndex
+from .mobi_utils import fromBase32
+from .unipath import pathof
+
+_guide_types = [b'cover',b'title-page',b'toc',b'index',b'glossary',b'acknowledgements',
+ b'bibliography',b'colophon',b'copyright-page',b'dedication',
+ b'epigraph',b'foreward',b'loi',b'lot',b'notes',b'preface',b'text']
+
+# locate beginning and ending positions of tag with specific aid attribute
+def locate_beg_end_of_tag(ml, aid):
+ pattern = utf8_str(r'''<[^>]*\said\s*=\s*['"]%s['"][^>]*>''' % aid)
+ aid_pattern = re.compile(pattern,re.IGNORECASE)
+ for m in re.finditer(aid_pattern, ml):
+ plt = m.start()
+ pgt = ml.find(b'>',plt+1)
+ return plt, pgt
+ return 0, 0
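+# Illustrative example: locate_beg_end_of_tag(b'<p aid="A7">x</p>', 'A7')
+# returns (0, 11), the offsets of the '<' and '>' of the matching opening tag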
+
+
+# iterate over all tags in block in reverse order, i.e. last tag to first tag
+def reverse_tag_iter(block):
+ end = len(block)
+ while True:
+ pgt = block.rfind(b'>', 0, end)
+ if pgt == -1:
+ break
+ plt = block.rfind(b'<', 0, pgt)
+ if plt == -1:
+ break
+ yield block[plt:pgt+1]
+ end = plt
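+# Illustrative example: list(reverse_tag_iter(b'<p><b>hi</b></p>')) yields the
+# tags in reverse document order, [b'</p>', b'</b>', b'<b>', b'<p>'];
+# text nodes such as b'hi' are skipped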
+
+
+class K8Processor:
+
+ def __init__(self, mh, sect, files, debug=False):
+ self.sect = sect
+ self.files = files
+ self.mi = MobiIndex(sect)
+ self.mh = mh
+ self.skelidx = mh.skelidx
+ self.fragidx = mh.fragidx
+ self.guideidx = mh.guideidx
+ self.fdst = mh.fdst
+ self.flowmap = {}
+ self.flows = None
+ self.flowinfo = []
+ self.parts = None
+ self.partinfo = []
+ self.linked_aids = set()
+ self.fdsttbl= [0,0xffffffff]
+ self.DEBUG = debug
+
+ # read in and parse the FDST info which is very similar in format to the Palm DB section
+ # parsing except it provides offsets into rawML file and not the Palm DB file
+ # this is needed to split up the final css, svg, etc flow section
+ # that can exist at the end of the rawML file
+ if self.fdst != 0xffffffff:
+ header = self.sect.loadSection(self.fdst)
+ if header[0:4] == b"FDST":
+ num_sections, = struct.unpack_from(b'>L', header, 0x08)
+ self.fdsttbl = struct.unpack_from(bstr('>%dL' % (num_sections*2)), header, 12)[::2] + (mh.rawSize, )
+ sect.setsectiondescription(self.fdst,"KF8 FDST INDX")
+ if self.DEBUG:
+ print("\nFDST Section Map: %d sections" % num_sections)
+ for j in range(num_sections):
+ print("Section %d: 0x%08X - 0x%08X" % (j, self.fdsttbl[j],self.fdsttbl[j+1]))
+ else:
+ print("\nError: K8 Mobi with Missing FDST info")
+
+ # read/process skeleton index info to create the skeleton table
+ skeltbl = []
+ if self.skelidx != 0xffffffff:
+ # for i in range(2):
+ # fname = 'skel%04d.dat' % i
+ # data = self.sect.loadSection(self.skelidx + i)
+ # with open(pathof(fname), 'wb') as f:
+ # f.write(data)
+ outtbl, ctoc_text = self.mi.getIndexData(self.skelidx, "KF8 Skeleton")
+ fileptr = 0
+ for [text, tagMap] in outtbl:
+ # file number, skeleton name, fragtbl record count, start position, length
+ skeltbl.append([fileptr, text, tagMap[1][0], tagMap[6][0], tagMap[6][1]])
+ fileptr += 1
+ self.skeltbl = skeltbl
+ if self.DEBUG:
+ print("\nSkel Table: %d entries" % len(self.skeltbl))
+ print("table: filenum, skeleton name, frag tbl record count, start position, length")
+ for j in range(len(self.skeltbl)):
+ print(self.skeltbl[j])
+
+ # read/process the fragment index to create the fragment table
+ fragtbl = []
+ if self.fragidx != 0xffffffff:
+ # for i in range(3):
+ # fname = 'frag%04d.dat' % i
+ # data = self.sect.loadSection(self.fragidx + i)
+ # with open(pathof(fname), 'wb') as f:
+ # f.write(data)
+ outtbl, ctoc_text = self.mi.getIndexData(self.fragidx, "KF8 Fragment")
+ for [text, tagMap] in outtbl:
+ # insert position, ctoc offset (aidtext), file number, sequence number, start position, length
+ ctocoffset = tagMap[2][0]
+ ctocdata = ctoc_text[ctocoffset]
+ fragtbl.append([int(text), ctocdata, tagMap[3][0], tagMap[4][0], tagMap[6][0], tagMap[6][1]])
+ self.fragtbl = fragtbl
+ if self.DEBUG:
+ print("\nFragment Table: %d entries" % len(self.fragtbl))
+ print("table: file position, link id text, file num, sequence number, start position, length")
+ for j in range(len(self.fragtbl)):
+ print(self.fragtbl[j])
+
+ # read / process guide index for guide elements of opf
+ guidetbl = []
+ if self.guideidx != 0xffffffff:
+ # for i in range(3):
+ # fname = 'guide%04d.dat' % i
+ # data = self.sect.loadSection(self.guideidx + i)
+ # with open(pathof(fname), 'wb') as f:
+ # f.write(data)
+ outtbl, ctoc_text = self.mi.getIndexData(self.guideidx, "KF8 Guide elements")
+ for [text, tagMap] in outtbl:
+ # ref_type, ref_title, frag number
+ ctocoffset = tagMap[1][0]
+ ref_title = ctoc_text[ctocoffset]
+ ref_type = text
+ fileno = None
+ if 3 in tagMap:
+ fileno = tagMap[3][0]
+ if 6 in tagMap:
+ fileno = tagMap[6][0]
+ guidetbl.append([ref_type, ref_title, fileno])
+ self.guidetbl = guidetbl
+ if self.DEBUG:
+ print("\nGuide Table: %d entries" % len(self.guidetbl))
+ print("table: ref_type, ref_title, fragtbl entry number")
+ for j in range(len(self.guidetbl)):
+ print(self.guidetbl[j])
+
+ def buildParts(self, rawML):
+ # now split the rawML into its flow pieces
+ self.flows = []
+ for j in range(0, len(self.fdsttbl)-1):
+ start = self.fdsttbl[j]
+ end = self.fdsttbl[j+1]
+ self.flows.append(rawML[start:end])
+
+ # the first piece represents the xhtml text
+ text = self.flows[0]
+ self.flows[0] = b''
+
+ # walk the <skeleton> and fragment tables to build original source xhtml files
+ # *without* destroying any file position information needed for later href processing
+ # and create the final list of file separation start/stop points etc. in partinfo
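+ # e.g. (illustrative): a skeleton b'<html><head/><body></body></html>' with one
+ # fragment b'<p>text</p>' whose insert position points just inside <body>
+ # reassembles to b'<html><head/><body><p>text</p></body></html>'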
+ if self.DEBUG:
+ print("\nRebuilding flow piece 0: the main body of the ebook")
+ self.parts = []
+ self.partinfo = []
+ fragptr = 0
+ baseptr = 0
+ cnt = 0
+ filename = 'part%04d.xhtml' % cnt
+ for [skelnum, skelname, fragcnt, skelpos, skellen] in self.skeltbl:
+ baseptr = skelpos + skellen
+ skeleton = text[skelpos: baseptr]
+ aidtext = "0"
+ for i in range(fragcnt):
+ [insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[fragptr]
+ aidtext = idtext[12:-2]
+ if i == 0:
+ filename = 'part%04d.xhtml' % filenum
+ slice = text[baseptr: baseptr + length]
+ insertpos = insertpos - skelpos
+ head = skeleton[:insertpos]
+ tail = skeleton[insertpos:]
+ actual_inspos = insertpos
+ if (tail.find(b'>') < tail.find(b'<') or head.rfind(b'>') < head.rfind(b'<')):
+ # There is an incomplete tag in either the head or tail.
+ # This can happen for some badly formed KF8 files
+ print('The fragment table for %s has incorrect insert position. Calculating manually.' % skelname)
+ bp, ep = locate_beg_end_of_tag(skeleton, aidtext)
+ if bp != ep:
+ actual_inspos = ep + 1 + startpos
+ if insertpos != actual_inspos:
+ print("fixed corrupt fragment table insert position", insertpos+skelpos, actual_inspos+skelpos)
+ insertpos = actual_inspos
+ self.fragtbl[fragptr][0] = actual_inspos + skelpos
+ skeleton = skeleton[0:insertpos] + slice + skeleton[insertpos:]
+ baseptr = baseptr + length
+ fragptr += 1
+ cnt += 1
+ self.parts.append(skeleton)
+ self.partinfo.append([skelnum, 'Text', filename, skelpos, baseptr, aidtext])
+
+ assembled_text = b''.join(self.parts)
+ if self.DEBUG:
+ outassembled = os.path.join(self.files.k8dir, 'assembled_text.dat')
+ with open(pathof(outassembled),'wb') as f:
+ f.write(assembled_text)
+
+ # The primary css style sheet is typically stored next, followed by any
+ # snippets of code that were previously inlined in the
+ # original xhtml but have been stripped out and placed here.
+ # This can include local CDATA snippets and svg sections.
+
+ # The problem is that for most browsers and ereaders, you cannot
+ # use <img src="imageXXXX.svg" /> to import an svg image that itself
+ # properly uses an <image/> tag to import some raster image - it
+ # should work according to the spec but does not for almost all browsers
+ # and ereaders, and it causes epub validation issues because those raster
+ # images are in the manifest but not in the xhtml text, since they are only
+ # referenced from an svg image
+
+ # So we need to check the remaining flow pieces to see if they are css
+ # or svg images. if svg images, we must check if they have an <image />
+ # and if so inline them into the xhtml text pieces.
+
+ # there may be other sorts of pieces stored here but until we see one
+ # in the wild to reverse engineer we won't be able to tell
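+ # in short (illustrative): svg containing <image> -> inlined into the xhtml;
+ # other svg -> Images/svgimgNNNN.svg; CDATA -> inlined <style> block;
+ # anything else is assumed to be css -> Styles/styleNNNN.css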
+ self.flowinfo.append([None, None, None, None])
+ svg_tag_pattern = re.compile(br'''(<svg[^>]*>)''', re.IGNORECASE)
+ image_tag_pattern = re.compile(br'''(<image[^>]*>)''', re.IGNORECASE)
+ for j in range(1,len(self.flows)):
+ flowpart = self.flows[j]
+ nstr = '%04d' % j
+ m = re.search(svg_tag_pattern, flowpart)
+ if m is not None:
+ # svg
+ ptype = b'svg'
+ start = m.start()
+ m2 = re.search(image_tag_pattern, flowpart)
+ if m2 is not None:
+ pformat = b'inline'
+ pdir = None
+ fname = None
+ # strip off anything before <svg if inlining
+ flowpart = flowpart[start:]
+ else:
+ pformat = b'file'
+ pdir = "Images"
+ fname = 'svgimg' + nstr + '.svg'
+ else:
+ # search for CDATA and if exists inline it
+ if flowpart.find(b'[CDATA[') >= 0:
+ ptype = b'css'
+ flowpart = b'<style type="text/css">\n' + flowpart + b'\n</style>\n'
+ pformat = b'inline'
+ pdir = None
+ fname = None
+ else:
+ # css - assume as standalone css file
+ ptype = b'css'
+ pformat = b'file'
+ pdir = "Styles"
+ fname = 'style' + nstr + '.css'
+
+ self.flows[j] = flowpart
+ self.flowinfo.append([ptype, pformat, pdir, fname])
+
+ if self.DEBUG:
+ print("\nFlow Map: %d entries" % len(self.flowinfo))
+ for fi in self.flowinfo:
+ print(fi)
+ print("\n")
+
+ print("\nXHTML File Part Position Information: %d entries" % len(self.partinfo))
+ for pi in self.partinfo:
+ print(pi)
+
+ if False: # self.DEBUG:
+ # dump all of the locations of the aid tags used in TEXT
+ # find id links only inside of tags
+ # inside any < > pair find all "aid=' and return whatever is inside the quotes
+ # [^>]* means match any amount of chars except for '>' char
+ # [^'"] match any amount of chars except for the quote character
+ # \s* means match any amount of whitespace
+ print("\npositions of all aid= pieces")
+ id_pattern = re.compile(br'''<[^>]*\said\s*=\s*['"]([^'"]*)['"][^>]*>''',re.IGNORECASE)
+ for m in re.finditer(id_pattern, rawML):
+ [filename, partnum, start, end] = self.getFileInfo(m.start())
+ [seqnum, idtext] = self.getFragTblInfo(m.start())
+ value = fromBase32(m.group(1))
+ print(" aid: %s value: %d at: %d -> part: %d, start: %d, end: %d" % (m.group(1), value, m.start(), partnum, start, end))
+ print(" %s fragtbl entry %d" % (idtext, seqnum))
+
+ return
+
+ # get information fragment table entry by pos
+ def getFragTblInfo(self, pos):
+ for j in range(len(self.fragtbl)):
+ [insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[j]
+ if pos >= insertpos and pos < (insertpos + length):
+ # why are these "in: and before: added here
+ return seqnum, b'in: ' + idtext
+ if pos < insertpos:
+ return seqnum, b'before: ' + idtext
+ return None, None
+
+ # get information about the part (file) that exists at pos in original rawML
+ def getFileInfo(self, pos):
+ for [partnum, pdir, filename, start, end, aidtext] in self.partinfo:
+ if pos >= start and pos < end:
+ return filename, partnum, start, end
+ return None, None, None, None
+
+ # accessor functions to properly protect the internal structure
+ def getNumberOfParts(self):
+ return len(self.parts)
+
+ def getPart(self,i):
+ if i >= 0 and i < len(self.parts):
+ return self.parts[i]
+ return None
+
+ def getPartInfo(self, i):
+ if i >= 0 and i < len(self.partinfo):
+ return self.partinfo[i]
+ return None
+
+ def getNumberOfFlows(self):
+ return len(self.flows)
+
+ def getFlow(self,i):
+ # note flows[0] is empty - it was all of the original text
+ if i > 0 and i < len(self.flows):
+ return self.flows[i]
+ return None
+
+ def getFlowInfo(self,i):
+ # note flowinfo[0] is empty - it was all of the original text
+ if i > 0 and i < len(self.flowinfo):
+ return self.flowinfo[i]
+ return None
+
+ def getIDTagByPosFid(self, posfid, offset):
+ # first convert kindle:pos:fid and offset info to position in file
+ # (fromBase32 can handle both string types on input)
+ row = fromBase32(posfid)
+ off = fromBase32(offset)
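+ # e.g. (illustrative): posfid '0001' selects fragtbl row 1 and offset
+ # '0004' points 4 bytes past that fragment's insert position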
+ [insertpos, idtext, filenum, seqnm, startpos, length] = self.fragtbl[row]
+ pos = insertpos + off
+ fname, pn, skelpos, skelend = self.getFileInfo(pos)
+ if fname is None:
+ # pos does not exist
+ # default to skeleton pos instead
+ print("Link To Position", pos, "does not exist, retargeting to top of target")
+ pos = self.skeltbl[filenum][3]
+ fname, pn, skelpos, skelend = self.getFileInfo(pos)
+ # an existing "id=" or "name=" attribute must exist in original xhtml otherwise it would not have worked for linking.
+ # Amazon seems to have added its own additional "aid=" inside tags whose contents seem to represent
+ # some position information encoded into Base32 name.
+ # so find the closest "id=" before position the file by actually searching in that file
+ idtext = self.getIDTag(pos)
+ return fname, idtext
+
+ def getIDTag(self, pos):
+ # find the first tag with a named anchor (name or id attribute) before pos
+ fname, pn, skelpos, skelend = self.getFileInfo(pos)
+ if pn is None and skelpos is None:
+ print("Error: getIDTag - no file contains ", pos)
+ return b''
+ textblock = self.parts[pn]
+ npos = pos - skelpos
+ # if npos is inside a tag then search all text before its end-of-tag marker
+ pgt = textblock.find(b'>',npos)
+ plt = textblock.find(b'<',npos)
+ if plt == npos or pgt < plt:
+ npos = pgt + 1
+ # find id and name attributes only inside of tags
+ # use a reverse tag search since that is faster
+ # inside any < > pair find "id=" and "name=" attributes return it
+ # [^>]* means match any amount of chars except for '>' char
+ # [^'"] match any amount of chars except for the quote character
+ # \s* means match any amount of whitespace
+ textblock = textblock[0:npos]
+ id_pattern = re.compile(br'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE)
+ name_pattern = re.compile(br'''<[^>]*\sname\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE)
+ aid_pattern = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''')
+ for tag in reverse_tag_iter(textblock):
+ # any ids in the body should default to top of file
+ if tag[0:6] == b'<body ':
+ return b''
+ if tag[0:6] != b'<meta ':
+ m = id_pattern.match(tag) or name_pattern.match(tag)
+ if m is not None:
+ return m.group(1)
+ m = aid_pattern.match(tag)
+ if m is not None:
+ self.linked_aids.add(m.group(1))
+ return b'aid-' + m.group(1)
+ return b''
+
+ # do we need to do deep copying
+ def setParts(self, parts):
+ assert(len(parts) == len(self.parts))
+ for i in range(len(parts)):
+ self.parts[i] = parts[i]
+
+ # do we need to do deep copying
+ def setFlows(self, flows):
+ assert(len(flows) == len(self.flows))
+ for i in range(len(flows)):
+ self.flows[i] = flows[i]
+
+ # get information about the part (file) that exists at pos in original rawML
+ def getSkelInfo(self, pos):
+ for [partnum, pdir, filename, start, end, aidtext] in self.partinfo:
+ if pos >= start and pos < end:
+ return [partnum, pdir, filename, start, end, aidtext]
+ return [None, None, None, None, None, None]
+
+ # fileno is actually a reference into fragtbl (a fragment)
+ def getGuideText(self):
+ guidetext = b''
+ for [ref_type, ref_title, fileno] in self.guidetbl:
+ if ref_type == b'thumbimagestandard':
+ continue
+ if ref_type not in _guide_types and not ref_type.startswith(b'other.'):
+ if ref_type == b'start':
+ ref_type = b'text'
+ else:
+ ref_type = b'other.' + ref_type
+ [pos, idtext, filenum, seqnm, startpos, length] = self.fragtbl[fileno]
+ [pn, pdir, filename, skelpos, skelend, aidtext] = self.getSkelInfo(pos)
+ idtext = self.getIDTag(pos)
+ linktgt = filename.encode('utf-8')
+ if idtext != b'':
+ linktgt += b'#' + idtext
+ guidetext += b'<reference type="'+ref_type+b'" title="'+ref_title+b'" href="'+utf8_str(pdir)+b'/'+linktgt+b'" />\n'
+ # opf is encoded utf-8 so must convert any titles properly
+ guidetext = (guidetext.decode(self.mh.codec)).encode("utf-8")
+ return guidetext
+
+ def getPageIDTag(self, pos):
+ # find the first tag with a named anchor (name or id attribute) before pos
+ # but page map offsets need a little more leeway, so if the offset points
+ # into a tag, look for the next ending tag "/>" or "</" and start the search from there.
+ fname, pn, skelpos, skelend = self.getFileInfo(pos)
+ if pn is None and skelpos is None:
+ print("Error: getPageIDTag - no file contains ", pos)
+ return b''
+ textblock = self.parts[pn]
+ npos = pos - skelpos
+ # if npos inside a tag then search all text before next ending tag
+ pgt = textblock.find(b'>',npos)
+ plt = textblock.find(b'<',npos)
+ if plt == npos or pgt < plt:
+ # we are in a tag
+ # so find first ending tag
+ pend1 = textblock.find(b'/>', npos)
+ pend2 = textblock.find(b'</', npos)
+ if pend1 != -1 and pend2 != -1:
+ pend = min(pend1, pend2)
+ else:
+ pend = max(pend1, pend2)
+ if pend != -1:
+ npos = pend
+ else:
+ npos = pgt + 1
+ # find id and name attributes only inside of tags
+ # use a reverse tag search since that is faster
+ # inside any < > pair find "id=" and "name=" attributes return it
+ # [^>]* means match any amount of chars except for '>' char
+ # [^'"] match any amount of chars except for the quote character
+ # \s* means match any amount of whitespace
+ textblock = textblock[0:npos]
+ id_pattern = re.compile(br'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE)
+ name_pattern = re.compile(br'''<[^>]*\sname\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE)
+ for tag in reverse_tag_iter(textblock):
+ # any ids in the body should default to top of file
+ if tag[0:6] == b'<body ':
+ return b''
+ if tag[0:6] != b'<meta ':
+ m = id_pattern.match(tag) or name_pattern.match(tag)
+ if m is not None:
+ return m.group(1)
+ return b''
diff --git a/src/epy_reader/tools/KindleUnpack/mobi_k8resc.py b/src/epy_reader/tools/KindleUnpack/mobi_k8resc.py
new file mode 100644
index 0000000..1e58e84
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/mobi_k8resc.py
@@ -0,0 +1,271 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+DEBUG_USE_ORDERED_DICTIONARY = False  # OrderedDict is supported in python >= 2.7.
+""" set to True to use OrderedDict for K8RESCProcessor.parsetag.tattr."""
+
+if DEBUG_USE_ORDERED_DICTIONARY:
+ from collections import OrderedDict as dict_
+else:
+ dict_ = dict
+
+from .compatibility_utils import unicode_str
+
+from .mobi_utils import fromBase32
+
+_OPF_PARENT_TAGS = ['xml', 'package', 'metadata', 'dc-metadata',
+ 'x-metadata', 'manifest', 'spine', 'tours', 'guide']
+
+class K8RESCProcessor(object):
+
+ def __init__(self, data, debug=False):
+ self._debug = debug
+ self.resc = None
+ self.opos = 0
+ self.extrameta = []
+ self.cover_name = None
+ self.spine_idrefs = {}
+ self.spine_order = []
+ self.spine_pageattributes = {}
+ self.spine_ppd = None
+ # need3 indicates the book has fields which require epub3,
+ # but estimating the source epub version from those fields is difficult.
+ self.need3 = False
+ self.package_ver = None
+ self.extra_metadata = []
+ self.refines_metadata = []
+ self.extra_attributes = []
+ # get header
+ start_pos = data.find(b'<')
+ self.resc_header = data[:start_pos]
+ # get resc data length
+ start = self.resc_header.find(b'=') + 1
+ end = self.resc_header.find(b'&', start)
+ resc_size = 0
+ if end > 0:
+ resc_size = fromBase32(self.resc_header[start:end])
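+ # e.g. with a hypothetical header b'...=1A&', fromBase32(b'1A') == 42,
+ # so a 42-byte RESC payload is expected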
+ resc_rawbytes = len(data) - start_pos
+ if resc_rawbytes == resc_size:
+ self.resc_length = resc_size
+ else:
+ # Most RESC sections have a nul-terminated string at the tail, but some do not.
+ end_pos = data.find(b'\x00', start_pos)
+ if end_pos < 0:
+ self.resc_length = resc_rawbytes
+ else:
+ self.resc_length = end_pos - start_pos
+ if self.resc_length != resc_size:
+ print("Warning: RESC section length({:d}bytes) does not match its size({:d}bytes).".format(self.resc_length, resc_size))
+ # now parse RESC after converting it to unicode from utf-8
+ try:
+ self.resc = unicode_str(data[start_pos:start_pos+self.resc_length])
+ except UnicodeDecodeError:
+ self.resc = unicode_str(data[start_pos:start_pos+self.resc_length], enc='latin-1')
+ self.parseData()
+
+ def prepend_to_spine(self, key, idref, linear, properties):
+ self.spine_order = [key] + self.spine_order
+ self.spine_idrefs[key] = idref
+ attributes = {}
+ if linear is not None:
+ attributes['linear'] = linear
+ if properties is not None:
+ attributes['properties'] = properties
+ self.spine_pageattributes[key] = attributes
+
+ # RESC tag iterator
+ def resc_tag_iter(self):
+ tcontent = last_tattr = None
+ prefix = ['']
+ while True:
+ text, tag = self.parseresc()
+ if text is None and tag is None:
+ break
+ if text is not None:
+ tcontent = text.rstrip(' \r\n')
+ else: # we have a tag
+ ttype, tname, tattr = self.parsetag(tag)
+ if ttype == 'begin':
+ tcontent = None
+ prefix.append(tname + '.')
+ if tname in _OPF_PARENT_TAGS:
+ yield ''.join(prefix), tname, tattr, tcontent
+ else:
+ last_tattr = tattr
+ else: # single or end
+ if ttype == 'end':
+ prefix.pop()
+ tattr = last_tattr
+ last_tattr = None
+ if tname in _OPF_PARENT_TAGS:
+ tname += '-end'
+ yield ''.join(prefix), tname, tattr, tcontent
+ tcontent = None
+
+ # now parse the RESC to extract spine and extra metadata info
+ def parseData(self):
+ for prefix, tname, tattr, tcontent in self.resc_tag_iter():
+ if self._debug:
+ print(" Parsing RESC: ", prefix, tname, tattr, tcontent)
+ if tname == 'package':
+ self.package_ver = tattr.get('version', '2.0')
+ package_prefix = tattr.get('prefix','')
+ if self.package_ver.startswith('3') or package_prefix.startswith('rendition'):
+ self.need3 = True
+ if tname == 'spine':
+ # accept the correct OPF spelling, keeping the original misspelled key as a
+ # fallback in case kindlegen RESC data actually uses it
+ self.spine_ppd = tattr.get('page-progression-direction', tattr.get('page-progession-direction', None))
+ if self.spine_ppd == 'rtl':
+ self.need3 = True
+ if tname == 'itemref':
+ skelid = tattr.pop('skelid', None)
+ if skelid is None and len(self.spine_order) == 0:
+ # assume it was removed initial coverpage
+ skelid = 'coverpage'
+ tattr['linear'] = 'no'
+ self.spine_order.append(skelid)
+ idref = tattr.pop('idref', None)
+ if idref is not None:
+ idref = 'x_' + idref
+ self.spine_idrefs[skelid] = idref
+ if 'id' in tattr:
+ del tattr['id']
+ # tattr["id"] = 'x_' + tattr["id"]
+ if 'properties' in tattr:
+ self.need3 = True
+ self.spine_pageattributes[skelid] = tattr
+ if tname == 'meta' or tname.startswith('dc:'):
+ if 'refines' in tattr or 'property' in tattr:
+ self.need3 = True
+ if tattr.get('name','') == 'cover':
+ cover_name = tattr.get('content',None)
+ if cover_name is not None:
+ cover_name = 'x_' + cover_name
+ self.cover_name = cover_name
+ else:
+ self.extrameta.append([tname, tattr, tcontent])
+
+ # parse and return either leading text or the next tag
+ def parseresc(self):
+ p = self.opos
+ if p >= len(self.resc):
+ return None, None
+ if self.resc[p] != '<':
+ res = self.resc.find('<',p)
+ if res == -1 :
+ res = len(self.resc)
+ self.opos = res
+ return self.resc[p:res], None
+ # handle comment as a special case
+ if self.resc[p:p+4] == '<!--':
+ te = self.resc.find('-->',p+1)
+ if te != -1:
+ te = te+2
+ else:
+ te = self.resc.find('>',p+1)
+ ntb = self.resc.find('<',p+1)
+ if ntb != -1 and ntb < te:
+ self.opos = ntb
+ return self.resc[p:ntb], None
+ self.opos = te + 1
+ return None, self.resc[p:te+1]
+
+ # parses tag to identify: [tname, ttype, tattr]
+ # tname: tag name
+ # ttype: tag type ('begin', 'end' or 'single');
+ # tattr: dictionary of tag attributes
+ def parsetag(self, s):
+ p = 1
+ tname = None
+ ttype = None
+ tattr = dict_()
+ while s[p:p+1] == ' ' :
+ p += 1
+ if s[p:p+1] == '/':
+ ttype = 'end'
+ p += 1
+ while s[p:p+1] == ' ' :
+ p += 1
+ b = p
+ while s[p:p+1] not in ('>', '/', ' ', '"', "'",'\r','\n') :
+ p += 1
+ tname=s[b:p].lower()
+ # some special cases
+ if tname == '?xml':
+ tname = 'xml'
+ if tname == '!--':
+ ttype = 'single'
+ comment = s[p:-3].strip()
+ tattr['comment'] = comment
+ if ttype is None:
+ # parse any attributes of begin or single tags
+ while s.find('=',p) != -1 :
+ while s[p:p+1] == ' ' :
+ p += 1
+ b = p
+ while s[p:p+1] != '=' :
+ p += 1
+ aname = s[b:p].lower()
+ aname = aname.rstrip(' ')
+ p += 1
+ while s[p:p+1] == ' ' :
+ p += 1
+ if s[p:p+1] in ('"', "'") :
+ p = p + 1
+ b = p
+ while s[p:p+1] not in ('"', "'"):
+ p += 1
+ val = s[b:p]
+ p += 1
+ else :
+ b = p
+ while s[p:p+1] not in ('>', '/', ' ') :
+ p += 1
+ val = s[b:p]
+ tattr[aname] = val
+ if ttype is None:
+ ttype = 'begin'
+ if s.find('/',p) >= 0:
+ ttype = 'single'
+ return ttype, tname, tattr
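+ # Illustrative example: parsetag('<itemref idref="x" skelid="3"/>') returns
+ # ('single', 'itemref', {'idref': 'x', 'skelid': '3'})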
+
+ def taginfo_toxml(self, taginfo):
+ res = []
+ tname, tattr, tcontent = taginfo
+ res.append('<' + tname)
+ if tattr is not None:
+ for key in tattr:
+ res.append(' ' + key + '="'+tattr[key]+'"')
+ if tcontent is not None:
+ res.append('>' + tcontent + '</' + tname + '>\n')
+ else:
+ res.append('/>\n')
+ return "".join(res)
+
+ def hasSpine(self):
+ return len(self.spine_order) > 0
+
+ def needEPUB3(self):
+ return self.need3
+
+ def hasRefines(self):
+ for [tname, tattr, tcontent] in self.extrameta:
+ if 'refines' in tattr:
+ return True
+ return False
+
+ def createMetadata(self, epubver):
+ for taginfo in self.extrameta:
+ tname, tattr, tcontent = taginfo
+ if 'refines' in tattr:
+ if epubver == 'F' and 'property' in tattr:
+ attr = ' id="%s" opf:%s="%s"\n' % (tattr['refines'], tattr['property'], tcontent)
+ self.extra_attributes.append(attr)
+ else:
+ tag = self.taginfo_toxml(taginfo)
+ self.refines_metadata.append(tag)
+ else:
+ tag = self.taginfo_toxml(taginfo)
+ self.extra_metadata.append(tag)
diff --git a/src/epy_reader/tools/KindleUnpack/mobi_nav.py b/src/epy_reader/tools/KindleUnpack/mobi_nav.py
new file mode 100644
index 0000000..16fb0be
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/mobi_nav.py
@@ -0,0 +1,187 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+from .compatibility_utils import unicode_str
+import os
+from .unipath import pathof
+
+import re
+# note: re requires the pattern to be the exact same type as the data to be searched in python3,
+# but u"" is not allowed for the pattern itself, only b""
+
+DEBUG_NAV = False
+
+FORCE_DEFAULT_TITLE = False
+""" Set to True to force to use the default title. """
+
+NAVIGATION_FILENAME = 'nav.xhtml'
+""" The name for the navigation document. """
+
+DEFAULT_TITLE = 'Navigation'
+""" The default title for the navigation document. """
+
+class NAVProcessor(object):
+
+ def __init__(self, files):
+ self.files = files
+ self.navname = NAVIGATION_FILENAME
+
+ def buildLandmarks(self, guidetext):
+ header = ''
+ header += ' <nav epub:type="landmarks" id="landmarks" hidden="">\n'
+ header += ' <h2>Guide</h2>\n'
+ header += ' <ol>\n'
+ element = ' <li><a epub:type="{:s}" href="{:s}">{:s}</a></li>\n'
+ footer = ''
+ footer += ' </ol>\n'
+ footer += ' </nav>\n'
+
+ type_map = {
+ 'cover' : 'cover',
+ 'title-page' : 'title-page',
+ # ?: 'frontmatter',
+ 'text' : 'bodymatter',
+ # ?: 'backmatter',
+ 'toc' : 'toc',
+ 'loi' : 'loi',
+ 'lot' : 'lot',
+ 'preface' : 'preface',
+ 'bibliography' : 'bibliography',
+ 'index' : 'index',
+ 'glossary' : 'glossary',
+ 'acknowledgements' : 'acknowledgements',
+ 'colophon' : None,
+ 'copyright-page' : None,
+ 'dedication' : None,
+ 'epigraph' : None,
+ 'foreword' : None,
+ 'notes' : None
+ }
+
+ re_type = re.compile(r'\s+type\s*=\s*"(.*?)"', re.I)
+ re_title = re.compile(r'\s+title\s*=\s*"(.*?)"', re.I)
+ re_link = re.compile(r'\s+href\s*=\s*"(.*?)"', re.I)
+ dir_ = os.path.relpath(self.files.k8text, self.files.k8oebps).replace('\\', '/')
+
+ data = ''
+ references = re.findall(r'<reference\s+.*?>', unicode_str(guidetext), re.I)
+ for reference in references:
+ mo_type = re_type.search(reference)
+ mo_title = re_title.search(reference)
+ mo_link = re_link.search(reference)
+ if mo_type is not None:
+ type_ = type_map.get(mo_type.group(1), None)
+ else:
+ type_ = None
+ if mo_title is not None:
+ title = mo_title.group(1)
+ else:
+ title = None
+ if mo_link is not None:
+ link = mo_link.group(1)
+ else:
+ link = None
+
+ if type_ is not None and title is not None and link is not None:
+ link = os.path.relpath(link, dir_).replace('\\', '/')
+ data += element.format(type_, link, title)
+ if len(data) > 0:
+ return header + data + footer
+ else:
+ return ''
+
+ def buildTOC(self, indx_data):
+ header = ''
+ header += ' <nav epub:type="toc" id="toc">\n'
+ header += ' <h1>Table of contents</h1>\n'
+ footer = ' </nav>\n'
+
+ # recursive part
+ def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1):
+ if start>len(indx_data) or end>len(indx_data):
+ print("Warning (in buildTOC): missing INDX child entries", start, end, len(indx_data))
+ return '', max_lvl, num
+ if DEBUG_NAV:
+ print("recursINDX (in buildTOC) lvl %d from %d to %d" % (lvl, start, end))
+ xhtml = ''
+ if start <= 0:
+ start = 0
+ if end <= 0:
+ end = len(indx_data)
+ if lvl > max_lvl:
+ max_lvl = lvl
+
+ indent1 = ' ' * (2 + lvl * 2)
+ indent2 = ' ' * (3 + lvl * 2)
+ xhtml += indent1 + '<ol>\n'
+ for i in range(start, end):
+ e = indx_data[i]
+ htmlfile = e['filename']
+ desttag = e['idtag']
+ text = e['text']
+ if not e['hlvl'] == lvl:
+ continue
+ num += 1
+ if desttag == '':
+ link = htmlfile
+ else:
+ link = '{:s}#{:s}'.format(htmlfile, desttag)
+ xhtml += indent2 + '<li>'
+ entry = '<a href="{:}">{:s}</a>'.format(link, text)
+ xhtml += entry
+ # recurs
+ if e['child1'] >= 0:
+ xhtml += '\n'
+ xhtmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1,
+ e['child1'], e['childn'] + 1)
+ xhtml += xhtmlrec
+ xhtml += indent2
+ # close entry
+ xhtml += '</li>\n'
+ xhtml += indent1 + '</ol>\n'
+ return xhtml, max_lvl, num
+
+ data, max_lvl, num = recursINDX()
+ if not len(indx_data) == num:
+ print("Warning (in buildTOC): different number of entries in NCX", len(indx_data), num)
+ return header + data + footer
+
+ def buildNAV(self, ncx_data, guidetext, title, lang):
+ print("Building Navigation Document.")
+ if FORCE_DEFAULT_TITLE:
+ title = DEFAULT_TITLE
+ nav_header = ''
+ nav_header += '<?xml version="1.0" encoding="utf-8"?>\n<!DOCTYPE html>'
+ nav_header += '<html xmlns="http://www.w3.org/1999/xhtml"'
+ nav_header += ' xmlns:epub="http://www.idpf.org/2007/ops"'
+ nav_header += ' lang="{0:s}" xml:lang="{0:s}">\n'.format(lang)
+ nav_header += '<head>\n<title>{:s}</title>\n'.format(title)
+ nav_header += '<meta charset="UTF-8" />\n'
+ nav_header += '<style type="text/css">\n'
+ nav_header += 'nav#landmarks { display:none; }\n'
+ nav_header += 'ol { list-style-type: none; }'
+ nav_header += '</style>\n</head>\n<body>\n'
+ nav_footer = '</body>\n</html>\n'
+
+ landmarks = self.buildLandmarks(guidetext)
+ toc = self.buildTOC(ncx_data)
+
+ data = nav_header
+ data += landmarks
+ data += toc
+ data += nav_footer
+ return data
+
+ def getNAVName(self):
+ return self.navname
+
+ def writeNAV(self, ncx_data, guidetext, metadata):
+ # build the xhtml
+ # print("Write Navigation Document.")
+ xhtml = self.buildNAV(ncx_data, guidetext, metadata.get('Title')[0], metadata.get('Language')[0])
+ fname = os.path.join(self.files.k8text, self.navname)
+ with open(pathof(fname), 'wb') as f:
+ f.write(xhtml.encode('utf-8'))
diff --git a/src/epy_reader/tools/KindleUnpack/mobi_ncx.py b/src/epy_reader/tools/KindleUnpack/mobi_ncx.py
new file mode 100644
index 0000000..60ef9a0
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/mobi_ncx.py
@@ -0,0 +1,275 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+import os
+from .unipath import pathof
+from .compatibility_utils import unescapeit
+
+
+import re
+# note: re requires the pattern to be the exact same type as the data to be searched in python3,
+# but u"" is not allowed for the pattern itself, only b""
+
+from xml.sax.saxutils import escape as xmlescape
+
+from .mobi_utils import toBase32
+from .mobi_index import MobiIndex
+
+DEBUG_NCX = False
+
+class ncxExtract:
+
+ def __init__(self, mh, files):
+ self.mh = mh
+ self.sect = self.mh.sect
+ self.files = files
+ self.isNCX = False
+ self.mi = MobiIndex(self.sect)
+ self.ncxidx = self.mh.ncxidx
+ self.indx_data = None
+
+ def parseNCX(self):
+ indx_data = []
+ tag_fieldname_map = {
+ 1: ['pos',0],
+ 2: ['len',0],
+ 3: ['noffs',0],
+ 4: ['hlvl',0],
+ 5: ['koffs',0],
+ 6: ['pos_fid',0],
+ 21: ['parent',0],
+ 22: ['child1',0],
+ 23: ['childn',0]
+ }
+ if self.ncxidx != 0xffffffff:
+ outtbl, ctoc_text = self.mi.getIndexData(self.ncxidx, "NCX")
+ if DEBUG_NCX:
+ print(ctoc_text)
+ print(outtbl)
+ num = 0
+ for [text, tagMap] in outtbl:
+ tmp = {
+ 'name': text.decode('utf-8'),
+ 'pos': -1,
+ 'len': 0,
+ 'noffs': -1,
+ 'text' : "Unknown Text",
+ 'hlvl' : -1,
+ 'kind' : "Unknown Kind",
+ 'pos_fid' : None,
+ 'parent' : -1,
+ 'child1' : -1,
+ 'childn' : -1,
+ 'num' : num
+ }
+ for tag in tag_fieldname_map:
+ [fieldname, i] = tag_fieldname_map[tag]
+ if tag in tagMap:
+ fieldvalue = tagMap[tag][i]
+ if tag == 6:
+ pos_fid = toBase32(fieldvalue,4).decode('utf-8')
+ fieldvalue2 = tagMap[tag][i+1]
+ pos_off = toBase32(fieldvalue2,10).decode('utf-8')
+ fieldvalue = 'kindle:pos:fid:%s:off:%s' % (pos_fid, pos_off)
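+ # e.g. (illustrative): fieldvalue 1 with offset 17 yields
+ # 'kindle:pos:fid:0001:off:000000000H' (base32 digits are 0-9 then A-V)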
+ tmp[fieldname] = fieldvalue
+ if tag == 3:
+ toctext = ctoc_text.get(fieldvalue, 'Unknown Text')
+ toctext = toctext.decode(self.mh.codec)
+ tmp['text'] = toctext
+ if tag == 5:
+ kindtext = ctoc_text.get(fieldvalue, 'Unknown Kind')
+ kindtext = kindtext.decode(self.mh.codec)
+ tmp['kind'] = kindtext
+ indx_data.append(tmp)
+ if DEBUG_NCX:
+ print("record number: ", num)
+ print("name: ", tmp['name'],)
+ print("position", tmp['pos']," length: ", tmp['len'])
+ print("text: ", tmp['text'])
+ print("kind: ", tmp['kind'])
+ print("heading level: ", tmp['hlvl'])
+ print("parent:", tmp['parent'])
+ print("first child: ",tmp['child1']," last child: ", tmp['childn'])
+ print("pos_fid is ", tmp['pos_fid'])
+ print("\n\n")
+ num += 1
+ self.indx_data = indx_data
+ return indx_data
+
+ def buildNCX(self, htmlfile, title, ident, lang):
+ indx_data = self.indx_data
+
+ ncx_header = \
+'''<?xml version='1.0' encoding='utf-8'?>
+<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="%s">
+<head>
+<meta content="%s" name="dtb:uid"/>
+<meta content="%d" name="dtb:depth"/>
+<meta content="mobiunpack.py" name="dtb:generator"/>
+<meta content="0" name="dtb:totalPageCount"/>
+<meta content="0" name="dtb:maxPageNumber"/>
+</head>
+<docTitle>
+<text>%s</text>
+</docTitle>
+<navMap>
+'''
+
+ ncx_footer = \
+''' </navMap>
+</ncx>
+'''
+
+ ncx_entry = \
+'''<navPoint id="%s" playOrder="%d">
+<navLabel>
+<text>%s</text>
+</navLabel>
+<content src="%s"/>'''
+
+ # recursive part
+ def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1):
+ if start>len(indx_data) or end>len(indx_data):
+ print("Warning: missing INDX child entries", start, end, len(indx_data))
+ return '', max_lvl, num
+ if DEBUG_NCX:
+ print("recursINDX lvl %d from %d to %d" % (lvl, start, end))
+ xml = ''
+ if start <= 0:
+ start = 0
+ if end <= 0:
+ end = len(indx_data)
+ if lvl > max_lvl:
+ max_lvl = lvl
+ indent = ' ' * (2 + lvl)
+
+ for i in range(start, end):
+ e = indx_data[i]
+ if not e['hlvl'] == lvl:
+ continue
+ # open entry
+ num += 1
+ link = '%s#filepos%d' % (htmlfile, e['pos'])
+ tagid = 'np_%d' % num
+ entry = ncx_entry % (tagid, num, xmlescape(unescapeit(e['text'])), link)
+ entry = re.sub(re.compile('^', re.M), indent, entry, 0)
+ xml += entry + '\n'
+ # recurs
+ if e['child1']>=0:
+ xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1,
+ e['child1'], e['childn'] + 1)
+ xml += xmlrec
+ # close entry
+ xml += indent + '</navPoint>\n'
+ return xml, max_lvl, num
+
+ body, max_lvl, num = recursINDX()
+ header = ncx_header % (lang, ident, max_lvl + 1, title)
+ ncx = header + body + ncx_footer
+ if not len(indx_data) == num:
+ print("Warning: different number of entries in NCX", len(indx_data), num)
+ return ncx
+
+ def writeNCX(self, metadata):
+ # build the xml
+ self.isNCX = True
+ print("Write ncx")
+ # htmlname = os.path.basename(self.files.outbase)
+ # htmlname += '.html'
+ htmlname = 'book.html'
+ xml = self.buildNCX(htmlname, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language')[0])
+ # write the ncx file
+ # ncxname = os.path.join(self.files.mobi7dir, self.files.getInputFileBasename() + '.ncx')
+ ncxname = os.path.join(self.files.mobi7dir, 'toc.ncx')
+ with open(pathof(ncxname), 'wb') as f:
+ f.write(xml.encode('utf-8'))
+
+ def buildK8NCX(self, indx_data, title, ident, lang):
+ ncx_header = \
+'''<?xml version='1.0' encoding='utf-8'?>
+<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="%s">
+<head>
+<meta content="%s" name="dtb:uid"/>
+<meta content="%d" name="dtb:depth"/>
+<meta content="mobiunpack.py" name="dtb:generator"/>
+<meta content="0" name="dtb:totalPageCount"/>
+<meta content="0" name="dtb:maxPageNumber"/>
+</head>
+<docTitle>
+<text>%s</text>
+</docTitle>
+<navMap>
+'''
+
+ ncx_footer = \
+''' </navMap>
+</ncx>
+'''
+
+ ncx_entry = \
+'''<navPoint id="%s" playOrder="%d">
+<navLabel>
+<text>%s</text>
+</navLabel>
+<content src="%s"/>'''
+
+ # recursive part
+ def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1):
+ if start>len(indx_data) or end>len(indx_data):
+ print("Warning: missing INDX child entries", start, end, len(indx_data))
+ return '', max_lvl, num
+ if DEBUG_NCX:
+ print("recursINDX lvl %d from %d to %d" % (lvl, start, end))
+ xml = ''
+ if start <= 0:
+ start = 0
+ if end <= 0:
+ end = len(indx_data)
+ if lvl > max_lvl:
+ max_lvl = lvl
+ indent = ' ' * (2 + lvl)
+
+ for i in range(start, end):
+ e = indx_data[i]
+ htmlfile = e['filename']
+ desttag = e['idtag']
+ if not e['hlvl'] == lvl:
+ continue
+ # open entry
+ num += 1
+ if desttag == '':
+ link = 'Text/%s' % htmlfile
+ else:
+ link = 'Text/%s#%s' % (htmlfile, desttag)
+ tagid = 'np_%d' % num
+ entry = ncx_entry % (tagid, num, xmlescape(unescapeit(e['text'])), link)
+ entry = re.sub(re.compile('^', re.M), indent, entry, 0)
+ xml += entry + '\n'
+ # recurs
+ if e['child1']>=0:
+ xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1,
+ e['child1'], e['childn'] + 1)
+ xml += xmlrec
+ # close entry
+ xml += indent + '</navPoint>\n'
+ return xml, max_lvl, num
+
+ body, max_lvl, num = recursINDX()
+ header = ncx_header % (lang, ident, max_lvl + 1, title)
+ ncx = header + body + ncx_footer
+ if not len(indx_data) == num:
+ print("Warning: different number of entries in NCX", len(indx_data), num)
+ return ncx
+
+ def writeK8NCX(self, ncx_data, metadata):
+ # build the xml
+ self.isNCX = True
+ print("Write K8 ncx")
+ xml = self.buildK8NCX(ncx_data, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language')[0])
+ bname = 'toc.ncx'
+ ncxname = os.path.join(self.files.k8oebps,bname)
+ with open(pathof(ncxname), 'wb') as f:
+ f.write(xml.encode('utf-8'))
diff --git a/src/epy_reader/tools/KindleUnpack/mobi_opf.py b/src/epy_reader/tools/KindleUnpack/mobi_opf.py
new file mode 100644
index 0000000..742d776
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/mobi_opf.py
@@ -0,0 +1,686 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+from .compatibility_utils import unicode_str, unescapeit
+from .compatibility_utils import lzip
+
+from .unipath import pathof
+
+from xml.sax.saxutils import escape as xmlescape
+
+import os
+import uuid
+from datetime import datetime
+
+# In EPUB3, NCX and <guide> MAY exist in OPF, although the NCX is superseded
+# by the Navigation Document and the <guide> is deprecated. Currently, EPUB3_WITH_NCX
+# and EPUB3_WITH_GUIDE are set to True for compatibility with epub2 reading systems.
+# They may be changed to False in the future.
+
+EPUB3_WITH_NCX = True # Do not set to False except for debug.
+""" Set to True to create a toc.ncx when converting to epub3. """
+
+EPUB3_WITH_GUIDE = True # Do not set to False except for debug.
+""" Set to True to create a guide element in an opf when converting to epub3. """
+
+EPUB_OPF = 'content.opf'
+""" The name for the OPF of EPUB. """
+
+TOC_NCX = 'toc.ncx'
+""" The name for the TOC of EPUB2. """
+
+NAVIGATION_DOCUMENT = 'nav.xhtml'
+""" The name for the navigation document of EPUB3. """
+
+BEGIN_INFO_ONLY = '<!-- BEGIN INFORMATION ONLY '
+""" The comment to indicate the beginning of metadata which will be ignored by kindlegen. """
+
+END_INFO_ONLY = 'END INFORMATION ONLY -->'
+""" The comment to indicate the end of metadata which will be ignored by kindlegen. """
+
+EXTH_TITLE_FURIGANA = 'Title-Pronunciation'
+""" The name for Title Furigana(similar to file-as) set by KDP. """
+
+EXTH_CREATOR_FURIGANA = 'Author-Pronunciation'
+""" The name for Creator Furigana(similar to file-as) set by KDP. """
+
+EXTH_PUBLISHER_FURIGANA = 'Publisher-Pronunciation'
+""" The name for Publisher Furigana(similar to file-as) set by KDP. """
+
+EXTRA_ENTITIES = {'"': '&quot;', "'": "&apos;"}
+
+class OPFProcessor(object):
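+    # Builds the OPF package document for mobi7/azw4 and epub2/epub3 output
+    # from the EXTH metadata, the manifest file info and optional RESC data.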
+
+ def __init__(self, files, metadata, fileinfo, rscnames, hasNCX, mh, usedmap, pagemapxml='', guidetext='', k8resc=None, epubver='2'):
+ self.files = files
+ self.metadata = metadata
+ self.fileinfo = fileinfo
+ self.rscnames = rscnames
+ self.has_ncx = hasNCX
+ self.codec = mh.codec
+ self.isK8 = mh.isK8()
+ self.printReplica = mh.isPrintReplica()
+ self.guidetext = unicode_str(guidetext)
+ self.used = usedmap
+ self.k8resc = k8resc
+ self.covername = None
+ self.cover_id = 'cover_img'
+ if self.k8resc is not None and self.k8resc.cover_name is not None:
+ # update cover id info from RESC if available
+ self.cover_id = self.k8resc.cover_name
+ # Create a unique urn uuid
+ self.BookId = unicode_str(str(uuid.uuid4()))
+ self.pagemap = pagemapxml
+
+ self.ncxname = None
+ self.navname = None
+
+ # page-progression-direction is only set in spine
+ self.page_progression_direction = metadata.pop('page-progression-direction', [None])[0]
+ if 'rl' in metadata.get('primary-writing-mode', [''])[0]:
+ self.page_progression_direction = 'rtl'
+        self.epubver = epubver  # the epub version set by the user
+        self.target_epubver = epubver  # the epub version set by the user or detected automatically
+ if self.epubver == 'A':
+ self.target_epubver = self.autodetectEPUBVersion()
+ elif self.epubver == 'F':
+ self.target_epubver = '2'
+ elif self.epubver != '2' and self.epubver != '3':
+ self.target_epubver = '2'
+
+        # ids for refines attributes
+ self.title_id = {}
+ self.creator_id = {}
+ self.publisher_id = {}
+ # extra attributes
+ self.title_attrib = {}
+ self.creator_attrib = {}
+ self.publisher_attrib = {}
+ self.extra_attributes = [] # for force epub2 option
+ # Create epub3 metadata from EXTH.
+ self.exth_solved_refines_metadata = []
+ self.exth_refines_metadata = []
+ self.exth_fixedlayout_metadata = []
+
+ self.defineRefinesID()
+ self.processRefinesMetadata()
+ if self.k8resc is not None:
+ # Create metadata in RESC section.
+ self.k8resc.createMetadata(epubver)
+ if self.target_epubver == "3":
+ self.createMetadataForFixedlayout()
+
+ def escapeit(self, sval, EXTRAS=None):
+ # note, xmlescape and unescape do not work with utf-8 bytestrings
+ sval = unicode_str(sval)
+ if EXTRAS:
+ res = xmlescape(unescapeit(sval), EXTRAS)
+ else:
+ res = xmlescape(unescapeit(sval))
+ return res
+
+ def createMetaTag(self, data, property, content, refid=''):
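+        # Illustrative: createMetaTag(data, 'file-as', 'Foo', 'creator01')
+        # appends '<meta property="file-as" refines="#creator01">Foo</meta>\n'.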
+ refines = ''
+ if refid:
+ refines = ' refines="#%s"' % refid
+ data.append('<meta property="%s"%s>%s</meta>\n' % (property, refines, content))
+
+ def buildOPFMetadata(self, start_tag, has_obfuscated_fonts=False):
+ # convert from EXTH metadata format to target epub version metadata
+ # epub 3 will ignore <meta name="xxxx" content="yyyy" /> style metatags
+ # but allows them to be present for backwards compatibility
+ # instead the new format is
+ # <meta property="xxxx" id="iiii" ... > property_value</meta>
+ # and DCMES elements such as:
+ # <dc:blah id="iiii">value</dc:blah>
+
+ metadata = self.metadata
+ k8resc = self.k8resc
+
+ META_TAGS = ['Drm Server Id', 'Drm Commerce Id', 'Drm Ebookbase Book Id', 'ASIN', 'ThumbOffset', 'Fake Cover',
+ 'Creator Software', 'Creator Major Version', 'Creator Minor Version', 'Creator Build Number',
+ 'Watermark', 'Clipping Limit', 'Publisher Limit', 'Text to Speech Disabled', 'CDE Type',
+ 'Updated Title', 'Font Signature (hex)', 'Tamper Proof Keys (hex)',]
+
+ # def handleTag(data, metadata, key, tag, ids={}):
+ def handleTag(data, metadata, key, tag, attrib={}):
+ '''Format metadata values.
+
+ @param data: List of formatted metadata entries.
+ @param metadata: The metadata dictionary.
+ @param key: The key of the metadata value to handle.
+            @param tag: The opf tag corresponding to the metadata value.
+            ###@param ids: The ids in tags for refines property of epub3.
+            @param attrib: The extra attributes for refines or opf prefixes.
+ '''
+ if key in metadata:
+ for i, value in enumerate(metadata[key]):
+ closingTag = tag.split(" ")[0]
+ res = '<%s%s>%s</%s>\n' % (tag, attrib.get(i, ''), self.escapeit(value), closingTag)
+ data.append(res)
+ del metadata[key]
+
+ # these are allowed but ignored by epub3
+ def handleMetaPairs(data, metadata, key, name):
+ if key in metadata:
+ for value in metadata[key]:
+ res = '<meta name="%s" content="%s" />\n' % (name, self.escapeit(value, EXTRA_ENTITIES))
+ data.append(res)
+ del metadata[key]
+
+ data = []
+ data.append(start_tag + '\n')
+ # Handle standard metadata
+ if 'Title' in metadata:
+ handleTag(data, metadata, 'Title', 'dc:title', self.title_attrib)
+ else:
+ data.append('<dc:title>Untitled</dc:title>\n')
+ handleTag(data, metadata, 'Language', 'dc:language')
+ if 'UniqueID' in metadata:
+ handleTag(data, metadata, 'UniqueID', 'dc:identifier id="uid"')
+ else:
+ # No unique ID in original, give it a generic one.
+ data.append('<dc:identifier id="uid">0</dc:identifier>\n')
+
+ if self.target_epubver == '3':
+            # epub version 3 minimal metadata requires a dcterms:modified date tag
+ self.createMetaTag(data, 'dcterms:modified', datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"))
+
+ if self.isK8 and has_obfuscated_fonts:
+            # Use the randomly generated urn:uuid so obfuscated fonts work.
+            # It doesn't need to be _THE_ unique identifier to work as a key
+            # for obfuscated fonts in Sigil, ADE and calibre. It just has
+ # to use the opf:scheme="UUID" and have the urn:uuid: prefix.
+ if self.target_epubver == '3':
+ data.append('<dc:identifier>urn:uuid:'+self.BookId+'</dc:identifier>\n')
+ else:
+ data.append('<dc:identifier opf:scheme="UUID">urn:uuid:'+self.BookId+'</dc:identifier>\n')
+
+ handleTag(data, metadata, 'Creator', 'dc:creator', self.creator_attrib)
+ handleTag(data, metadata, 'Contributor', 'dc:contributor')
+ handleTag(data, metadata, 'Publisher', 'dc:publisher', self.publisher_attrib)
+ handleTag(data, metadata, 'Source', 'dc:source')
+ handleTag(data, metadata, 'Type', 'dc:type')
+ if self.target_epubver == '3':
+ if 'ISBN' in metadata:
+ for i, value in enumerate(metadata['ISBN']):
+ res = '<dc:identifier>urn:isbn:%s</dc:identifier>\n' % self.escapeit(value)
+ data.append(res)
+ else:
+ handleTag(data, metadata, 'ISBN', 'dc:identifier opf:scheme="ISBN"')
+ if 'Subject' in metadata:
+ if 'SubjectCode' in metadata:
+ codeList = metadata['SubjectCode']
+ del metadata['SubjectCode']
+ else:
+ codeList = None
+ for i in range(len(metadata['Subject'])):
+ if codeList and i < len(codeList):
+ data.append('<dc:subject BASICCode="'+codeList[i]+'">')
+ else:
+ data.append('<dc:subject>')
+ data.append(self.escapeit(metadata['Subject'][i])+'</dc:subject>\n')
+ del metadata['Subject']
+ handleTag(data, metadata, 'Description', 'dc:description')
+ if self.target_epubver == '3':
+ if 'Published' in metadata:
+ for i, value in enumerate(metadata['Published']):
+ res = '<dc:date>%s</dc:date>\n' % self.escapeit(value)
+ data.append(res)
+ else:
+ handleTag(data, metadata, 'Published', 'dc:date opf:event="publication"')
+ handleTag(data, metadata, 'Rights', 'dc:rights')
+
+ if self.epubver == 'F':
+ if self.extra_attributes or k8resc is not None and k8resc.extra_attributes:
+ data.append('<!-- THE FOLLOWINGS ARE REQUIRED TO INSERT INTO <dc:xxx> MANUALLY\n')
+ if self.extra_attributes:
+ data += self.extra_attributes
+ if k8resc is not None and k8resc.extra_attributes:
+ data += k8resc.extra_attributes
+ data.append('-->\n')
+ else:
+ # Append refines metadata.
+ if self.exth_solved_refines_metadata:
+ data.append('<!-- Refines MetaData from EXTH -->\n')
+ data += self.exth_solved_refines_metadata
+ if self.exth_refines_metadata or k8resc is not None and k8resc.refines_metadata:
+ data.append('<!-- THE FOLLOWINGS ARE REQUIRED TO EDIT IDS MANUALLY\n')
+ if self.exth_refines_metadata:
+ data += self.exth_refines_metadata
+ if k8resc is not None and k8resc.refines_metadata:
+ data += k8resc.refines_metadata
+ data.append('-->\n')
+
+ # Append metadata in RESC section.
+ if k8resc is not None and k8resc.extra_metadata:
+ data.append('<!-- Extra MetaData from RESC\n')
+ data += k8resc.extra_metadata
+ data.append('-->\n')
+
+ if 'CoverOffset' in metadata:
+ imageNumber = int(metadata['CoverOffset'][0])
+ self.covername = self.rscnames[imageNumber]
+ if self.covername is None:
+ print("Error: Cover image %s was not recognized as a valid image" % imageNumber)
+ else:
+ # <meta name="cover"> is obsoleted in EPUB3, but kindlegen v2.9 requires it.
+ data.append('<meta name="cover" content="' + self.cover_id + '" />\n')
+ self.used[self.covername] = 'used'
+ del metadata['CoverOffset']
+
+ handleMetaPairs(data, metadata, 'Codec', 'output encoding')
+        # handle kindlegen specific tags
+ handleTag(data, metadata, 'DictInLanguage', 'DictionaryInLanguage')
+ handleTag(data, metadata, 'DictOutLanguage', 'DictionaryOutLanguage')
+ handleMetaPairs(data, metadata, 'RegionMagnification', 'RegionMagnification')
+ handleMetaPairs(data, metadata, 'book-type', 'book-type')
+ handleMetaPairs(data, metadata, 'zero-gutter', 'zero-gutter')
+ handleMetaPairs(data, metadata, 'zero-margin', 'zero-margin')
+ handleMetaPairs(data, metadata, 'primary-writing-mode', 'primary-writing-mode')
+ handleMetaPairs(data, metadata, 'fixed-layout', 'fixed-layout')
+ handleMetaPairs(data, metadata, 'orientation-lock', 'orientation-lock')
+ handleMetaPairs(data, metadata, 'original-resolution', 'original-resolution')
+
+ # these are not allowed in epub2 or 3 so convert them to meta name content pairs
+ # perhaps these could better be mapped into the dcterms namespace instead
+ handleMetaPairs(data, metadata, 'Review', 'review')
+ handleMetaPairs(data, metadata, 'Imprint', 'imprint')
+ handleMetaPairs(data, metadata, 'Adult', 'adult')
+ handleMetaPairs(data, metadata, 'DictShortName', 'DictionaryVeryShortName')
+
+        # these are needed by kobo books upon submission, but it is unclear whether they are legal metadata in epub2 or epub3
+ if 'Price' in metadata and 'Currency' in metadata:
+ priceList = metadata['Price']
+ currencyList = metadata['Currency']
+ if len(priceList) != len(currencyList):
+                print("Error: found %s price entries, but %s currency entries." % (len(priceList), len(currencyList)))
+ else:
+ for i in range(len(priceList)):
+ data.append('<SRP Currency="'+currencyList[i]+'">'+priceList[i]+'</SRP>\n')
+ del metadata['Price']
+ del metadata['Currency']
+
+ if self.target_epubver == '3':
+ # Append metadata for EPUB3.
+ if self.exth_fixedlayout_metadata:
+                data.append('<!-- EPUB3 MetaData converted from EXTH -->\n')
+ data += self.exth_fixedlayout_metadata
+
+ # all that remains is extra EXTH info we will store inside a comment inside meta name/content pairs
+        # so it cannot impact anything and will be automatically stripped out if found again in a RESC section
+ data.append(BEGIN_INFO_ONLY + '\n')
+ if 'ThumbOffset' in metadata:
+ imageNumber = int(metadata['ThumbOffset'][0])
+ # Some bad books give image indexes that are 'out of range'
+ try:
+ imageName = self.rscnames[imageNumber]
+            except IndexError:
+ print('Number given for Cover Thumbnail is out of range: %s' % imageNumber)
+ imageName = None
+ if imageName is None:
+ print("Error: Cover Thumbnail image %s was not recognized as a valid image" % imageNumber)
+ else:
+ data.append('<meta name="Cover ThumbNail Image" content="'+ 'Images/'+imageName+'" />\n')
+ # self.used[imageName] = 'used' # thumbnail image is always generated by Kindlegen, so don't include in manifest
+ self.used[imageName] = 'not used'
+ del metadata['ThumbOffset']
+ for metaName in META_TAGS:
+ if metaName in metadata:
+ for value in metadata[metaName]:
+ data.append('<meta name="'+metaName+'" content="'+self.escapeit(value, EXTRA_ENTITIES)+'" />\n')
+ del metadata[metaName]
+ for key in list(metadata.keys()):
+ for value in metadata[key]:
+ data.append('<meta name="'+key+'" content="'+self.escapeit(value, EXTRA_ENTITIES)+'" />\n')
+ del metadata[key]
+ data.append(END_INFO_ONLY + '\n')
+ data.append('</metadata>\n')
+ return data
+
+ def buildOPFManifest(self, ncxname, navname=None):
+ # buildManifest for mobi7, azw4, epub2 and epub3.
+ k8resc = self.k8resc
+ cover_id = self.cover_id
+ hasK8RescSpine = k8resc is not None and k8resc.hasSpine()
+ self.ncxname = ncxname
+ self.navname = navname
+
+ data = []
+ data.append('<manifest>\n')
+ media_map = {
+ '.jpg' : 'image/jpeg',
+ '.jpeg' : 'image/jpeg',
+ '.png' : 'image/png',
+ '.gif' : 'image/gif',
+ '.svg' : 'image/svg+xml',
+ '.xhtml': 'application/xhtml+xml',
+ '.html' : 'text/html', # for mobi7
+ '.pdf' : 'application/pdf', # for azw4(print replica textbook)
+ '.ttf' : 'application/x-font-ttf',
+ '.otf' : 'application/x-font-opentype', # replaced?
+ '.css' : 'text/css',
+ # '.html' : 'text/x-oeb1-document', # for mobi7
+ # '.otf' : 'application/vnd.ms-opentype', # [OpenType] OpenType fonts
+ # '.woff' : 'application/font-woff', # [WOFF] WOFF fonts
+ # '.smil' : 'application/smil+xml', # [MediaOverlays301] EPUB Media Overlay documents
+ # '.pls' : 'application/pls+xml', # [PLS] Text-to-Speech (TTS) Pronunciation lexicons
+ # '.mp3' : 'audio/mpeg',
+ # '.mp4' : 'video/mp4',
+ # '.js' : 'text/javascript', # not supported in K8
+ }
+ spinerefs = []
+
+ idcnt = 0
+ for [key,dir,fname] in self.fileinfo:
+ name, ext = os.path.splitext(fname)
+ ext = ext.lower()
+ media = media_map.get(ext)
+ ref = "item%d" % idcnt
+ if hasK8RescSpine:
+ if key is not None and key in k8resc.spine_idrefs:
+ ref = k8resc.spine_idrefs[key]
+ properties = ''
+ if dir != '':
+ fpath = dir + '/' + fname
+ else:
+ fpath = fname
+ data.append('<item id="{0:}" media-type="{1:}" href="{2:}" {3:}/>\n'.format(ref, media, fpath, properties))
+
+ if ext in ['.xhtml', '.html']:
+ spinerefs.append(ref)
+ idcnt += 1
+
+ for fname in self.rscnames:
+ if fname is not None:
+ if self.used.get(fname,'not used') == 'not used':
+ continue
+ name, ext = os.path.splitext(fname)
+ ext = ext.lower()
+ media = media_map.get(ext,ext[1:])
+ properties = ''
+ if fname == self.covername:
+ ref = cover_id
+ if self.target_epubver == '3':
+ properties = 'properties="cover-image"'
+ else:
+ ref = "item%d" % idcnt
+ if ext == '.ttf' or ext == '.otf':
+ if self.isK8: # fonts are only used in Mobi 8
+ fpath = 'Fonts/' + fname
+ data.append('<item id="{0:}" media-type="{1:}" href="{2:}" {3:}/>\n'.format(ref, media, fpath, properties))
+ else:
+ fpath = 'Images/' + fname
+ data.append('<item id="{0:}" media-type="{1:}" href="{2:}" {3:}/>\n'.format(ref, media, fpath, properties))
+ idcnt += 1
+
+ if self.target_epubver == '3' and navname is not None:
+ data.append('<item id="nav" media-type="application/xhtml+xml" href="Text/' + navname + '" properties="nav"/>\n')
+ if self.has_ncx and ncxname is not None:
+ data.append('<item id="ncx" media-type="application/x-dtbncx+xml" href="' + ncxname +'" />\n')
+ if self.pagemap != '':
+ data.append('<item id="map" media-type="application/oebs-page-map+xml" href="page-map.xml" />\n')
+ data.append('</manifest>\n')
+ return [data, spinerefs]
+
+ def buildOPFSpine(self, spinerefs, isNCX):
+ # build spine
+ k8resc = self.k8resc
+ hasK8RescSpine = k8resc is not None and k8resc.hasSpine()
+ data = []
+ ppd = ''
+ if self.isK8 and self.page_progression_direction is not None:
+ ppd = ' page-progression-direction="{:s}"'.format(self.page_progression_direction)
+ ncx = ''
+ if isNCX:
+ ncx = ' toc="ncx"'
+ map=''
+ if self.pagemap != '':
+ map = ' page-map="map"'
+ if self.epubver == 'F':
+ if ppd:
+ ppd = '<!--' + ppd + ' -->'
+ spine_start_tag = '<spine{1:s}{2:s}>{0:s}\n'.format(ppd, map, ncx)
+ else:
+ spine_start_tag = '<spine{0:s}{1:s}{2:s}>\n'.format(ppd, map, ncx)
+ data.append(spine_start_tag)
+
+ if hasK8RescSpine:
+ for key in k8resc.spine_order:
+ idref = k8resc.spine_idrefs[key]
+ attribs = k8resc.spine_pageattributes[key]
+ tag = '<itemref idref="%s"' % idref
+ for aname, val in list(attribs.items()):
+ if self.epubver == 'F' and aname == 'properties':
+ continue
+ if val is not None:
+ tag += ' %s="%s"' % (aname, val)
+ tag += '/>'
+ if self.epubver == 'F' and 'properties' in attribs:
+ val = attribs['properties']
+ if val is not None:
+ tag += '<!-- properties="%s" -->' % val
+ tag += '\n'
+ data.append(tag)
+ else:
+ start = 0
+ # special case the created coverpage if need be
+ [key, dir, fname] = self.fileinfo[0]
+ if key is not None and key == "coverpage":
+ entry = spinerefs[start]
+ data.append('<itemref idref="%s" linear="no"/>\n' % entry)
+ start += 1
+ for entry in spinerefs[start:]:
+ data.append('<itemref idref="' + entry + '"/>\n')
+ data.append('</spine>\n')
+ return data
+
+ def buildMobi7OPF(self):
+ # Build an OPF for mobi7 and azw4.
+ print("Building an opf for mobi7/azw4.")
+ data = []
+ data.append('<?xml version="1.0" encoding="utf-8"?>\n')
+ data.append('<package version="2.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="uid">\n')
+ metadata_tag = '<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">'
+ opf_metadata = self.buildOPFMetadata(metadata_tag)
+ data += opf_metadata
+ if self.has_ncx:
+ # ncxname = self.files.getInputFileBasename() + '.ncx'
+ ncxname = 'toc.ncx'
+ else:
+ ncxname = None
+ [opf_manifest, spinerefs] = self.buildOPFManifest(ncxname)
+ data += opf_manifest
+ opf_spine = self.buildOPFSpine(spinerefs, self.has_ncx)
+ data += opf_spine
+ data.append('<tours>\n</tours>\n')
+ if not self.printReplica:
+ guide ='<guide>\n' + self.guidetext + '</guide>\n'
+ data.append(guide)
+ data.append('</package>\n')
+ return ''.join(data)
+
+ def buildEPUBOPF(self, has_obfuscated_fonts=False):
+ print("Building an opf for mobi8 using epub version: ", self.target_epubver)
+ if self.target_epubver == '2':
+ has_ncx = self.has_ncx
+ has_guide = True
+            ncxname = TOC_NCX
+ navname = None
+ package = '<package version="2.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="uid">\n'
+ tours = '<tours>\n</tours>\n'
+ metadata_tag = '<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">'
+ else:
+ has_ncx = EPUB3_WITH_NCX
+ has_guide = EPUB3_WITH_GUIDE
+ ncxname = None
+ if has_ncx:
+ ncxname = TOC_NCX
+ navname = NAVIGATION_DOCUMENT
+ package = '<package version="3.0" xmlns="http://www.idpf.org/2007/opf" prefix="rendition: http://www.idpf.org/vocab/rendition/#" unique-identifier="uid">\n'
+ tours = ''
+ metadata_tag = '<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">'
+
+ data = []
+ data.append('<?xml version="1.0" encoding="utf-8"?>\n')
+ data.append(package)
+ opf_metadata = self.buildOPFMetadata(metadata_tag, has_obfuscated_fonts)
+ data += opf_metadata
+ [opf_manifest, spinerefs] = self.buildOPFManifest(ncxname, navname)
+ data += opf_manifest
+ opf_spine = self.buildOPFSpine(spinerefs, has_ncx)
+ data += opf_spine
+ data.append(tours)
+ if has_guide:
+ guide ='<guide>\n' + self.guidetext + '</guide>\n'
+ data.append(guide)
+ data.append('</package>\n')
+ return ''.join(data)
+
+ def writeOPF(self, has_obfuscated_fonts=False):
+ if self.isK8:
+ data = self.buildEPUBOPF(has_obfuscated_fonts)
+ outopf = os.path.join(self.files.k8oebps, EPUB_OPF)
+ with open(pathof(outopf), 'wb') as f:
+ f.write(data.encode('utf-8'))
+ return self.BookId
+ else:
+ data = self.buildMobi7OPF()
+ outopf = os.path.join(self.files.mobi7dir, 'content.opf')
+ with open(pathof(outopf), 'wb') as f:
+ f.write(data.encode('utf-8'))
+ return 0
+
+ def getBookId(self):
+ return self.BookId
+
+ def getNCXName(self):
+ return self.ncxname
+
+ def getNAVName(self):
+ return self.navname
+
+ def getEPUBVersion(self):
+ return self.target_epubver
+
+ def hasNCX(self):
+ return self.ncxname is not None and self.has_ncx
+
+ def hasNAV(self):
+ return self.navname is not None
+
+ def autodetectEPUBVersion(self):
+ # Determine EPUB version from metadata and RESC.
+ metadata = self.metadata
+ k8resc = self.k8resc
+ epubver = '2'
+ if 'true' == metadata.get('fixed-layout', [''])[0].lower():
+ epubver = '3'
+ elif metadata.get('orientation-lock', [''])[0].lower() in ['portrait', 'landscape']:
+ epubver = '3'
+ elif self.page_progression_direction == 'rtl':
+ epubver = '3'
+ elif EXTH_TITLE_FURIGANA in metadata:
+ epubver = '3'
+ elif EXTH_CREATOR_FURIGANA in metadata:
+ epubver = '3'
+ elif EXTH_PUBLISHER_FURIGANA in metadata:
+ epubver = '3'
+ elif k8resc is not None and k8resc.needEPUB3():
+ epubver = '3'
+ return epubver
+
+ def defineRefinesID(self):
+ # the following EXTH are set by KDP.
+ # 'Title_Furigana_(508)'
+ # 'Creator_Furigana_(517)',
+ # 'Publisher_Furigana_(522)'
+        # It is difficult to find the correspondence between Title, Creator, Publisher
+        # and EXTH 508, 517, 522 if they have more than two values, since KDP does not seem to preserve the order of EXTH 508, 517 and 522.
+        # It is also difficult to find the correspondence between them and tags which have refines attributes in RESC.
+        # So editing manually is required.
+ metadata = self.metadata
+
+ needRefinesId = False
+ if self.k8resc is not None:
+ needRefinesId = self.k8resc.hasRefines()
+        # Create ids for refines attributes
+ if (needRefinesId or EXTH_TITLE_FURIGANA in metadata) and 'Title' in metadata:
+ for i in range(len(metadata.get('Title'))):
+ self.title_id[i] = 'title%02d' % (i+1)
+
+ if (needRefinesId or EXTH_CREATOR_FURIGANA in metadata) and 'Creator' in metadata:
+ for i in range(len(metadata.get('Creator'))):
+ self.creator_id[i] = 'creator%02d' % (i+1)
+
+ if (needRefinesId or EXTH_PUBLISHER_FURIGANA in metadata) and 'Publisher' in metadata:
+ for i in range(len(metadata.get('Publisher'))):
+ self.publisher_id[i] = 'publisher%02d' % (i+1)
+
+ def processRefinesMetadata(self):
+        # create refines metadata defined in epub3 or convert the refines property to opf: attributes for epub2.
+ metadata = self.metadata
+
+ refines_list = [
+ [EXTH_TITLE_FURIGANA, self.title_id, self.title_attrib, 'title00'],
+ [EXTH_CREATOR_FURIGANA, self.creator_id, self.creator_attrib, 'creator00'],
+ [EXTH_PUBLISHER_FURIGANA, self.publisher_id, self.publisher_attrib, 'publisher00']
+ ]
+
+ create_refines_metadata = False
+ for EXTH in lzip(*refines_list)[0]:
+ if EXTH in metadata:
+ create_refines_metadata = True
+ break
+ if create_refines_metadata:
+ for [EXTH, id, attrib, defaultid] in refines_list:
+ if self.target_epubver == '3':
+ for i, value in list(id.items()):
+ attrib[i] = ' id="%s"' % value
+
+ if EXTH in metadata:
+ if len(metadata[EXTH]) == 1 and len(id) == 1:
+ self.createMetaTag(self.exth_solved_refines_metadata, 'file-as', metadata[EXTH][0], id[0])
+ else:
+ for i, value in enumerate(metadata[EXTH]):
+ self.createMetaTag(self.exth_refines_metadata, 'file-as', value, id.get(i, defaultid))
+ else:
+ if EXTH in metadata:
+ if len(metadata[EXTH]) == 1 and len(id) == 1:
+ attr = ' opf:file-as="%s"' % metadata[EXTH][0]
+ attrib[0] = attr
+ else:
+ for i, value in enumerate(metadata[EXTH]):
+ attr = ' id="#%s" opf:file-as="%s"\n' % (id.get(i, defaultid), value)
+ self.extra_attributes.append(attr)
+
+ def createMetadataForFixedlayout(self):
+ # convert fixed layout to epub3 format if needed.
+ metadata = self.metadata
+
+ if 'fixed-layout' in metadata:
+ fixedlayout = metadata['fixed-layout'][0]
+ content = {'true' : 'pre-paginated'}.get(fixedlayout.lower(), 'reflowable')
+ self.createMetaTag(self.exth_fixedlayout_metadata, 'rendition:layout', content)
+
+ if 'orientation-lock' in metadata:
+ content = metadata['orientation-lock'][0].lower()
+ if content == 'portrait' or content == 'landscape':
+ self.createMetaTag(self.exth_fixedlayout_metadata, 'rendition:orientation', content)
+
+        # according to the epub3 spec's correspondence with Amazon's format,
+        # if 'original-resolution' is provided it needs to be converted to a
+        # meta viewport property tag stored in the <head></head> of **each**
+        # xhtml page - so this tag would need to be handled by editing each
+        # part before reaching this routine; support for this still needs to
+        # be added to the k8html routine
+ # if 'original-resolution' in metadata.keys():
+ # resolution = metadata['original-resolution'][0].lower()
+ # width, height = resolution.split('x')
+ # if width.isdigit() and int(width) > 0 and height.isdigit() and int(height) > 0:
+ # viewport = 'width=%s, height=%s' % (width, height)
+ # self.createMetaTag(self.exth_fixedlayout_metadata, 'rendition:viewport', viewport)
diff --git a/src/epy_reader/tools/KindleUnpack/mobi_pagemap.py b/src/epy_reader/tools/KindleUnpack/mobi_pagemap.py
new file mode 100644
index 0000000..5228d4e
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/mobi_pagemap.py
@@ -0,0 +1,158 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+from .compatibility_utils import PY2, unicode_str
+
+if PY2:
+ range = xrange
+
+import struct
+# note: struct pack, unpack, unpack_from all require bytestring format
+# data all the way up to at least python 2.7.5, python 3 okay with bytestring
+
+import re
+# note: re requires the pattern to be the exact same type as the data to be searched in python3
+# but u"" is not allowed for the pattern itself only b""
+
+
+_TABLE = [('m', 1000), ('cm', 900), ('d', 500), ('cd', 400), ('c', 100), ('xc', 90), ('l', 50), ('xl', 40), ('x', 10), ('ix', 9), ('v', 5), ('iv', 4), ('i', 1)]
+
+def int_to_roman(i):
+ parts = []
+ num = i
+ for letter, value in _TABLE:
+ while value <= num:
+ num -= value
+ parts.append(letter)
+ return ''.join(parts)
+
+def roman_to_int(s):
+ result = 0
+ rnstr = s
+ for letter, value in _TABLE:
+ while rnstr.startswith(letter):
+ result += value
+ rnstr = rnstr[len(letter):]
+ return result
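+
+# Illustrative round trip: int_to_roman(1974) == 'mcmlxxiv' and
+# roman_to_int('mcmlxxiv') == 1974.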
+
+_pattern = r'''\(([^\)]*)\)'''
+_tup_pattern = re.compile(_pattern,re.IGNORECASE)
+
+
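+# The page map name string is a comma-separated list of "(pos,type,value)"
+# tuples, e.g. "(1,r,1),(9,a,1)" (illustrative): starting page position,
+# numbering type ('r' roman, 'a' arabic, 'c' custom labels separated by '|'),
+# and the first value or label.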
+def _parseNames(numpages, data):
+ data = unicode_str(data)
+ pagenames = []
+ pageMap = ''
+ for i in range(numpages):
+ pagenames.append(None)
+ for m in re.finditer(_tup_pattern, data):
+ tup = m.group(1)
+ if pageMap != '':
+ pageMap += ','
+ pageMap += '(' + tup + ')'
+ spos, nametype, svalue = tup.split(",")
+ # print(spos, nametype, svalue)
+ if nametype == 'a' or nametype == 'r':
+ svalue = int(svalue)
+ spos = int(spos)
+ for i in range(spos - 1, numpages):
+ if nametype == 'r':
+ pname = int_to_roman(svalue)
+ svalue += 1
+ elif nametype == 'a':
+ pname = "%s" % svalue
+ svalue += 1
+ elif nametype == 'c':
+ sp = svalue.find('|')
+ if sp == -1:
+ pname = svalue
+ else:
+ pname = svalue[0:sp]
+ svalue = svalue[sp+1:]
+            else:
+                print("Error: unknown page numbering type", nametype)
+                break
+            pagenames[i] = pname
+ return pagenames, pageMap
+
+
+class PageMapProcessor:
+
+ def __init__(self, mh, data):
+ self.mh = mh
+ self.data = data
+ self.pagenames = []
+ self.pageoffsets = []
+ self.pageMap = ''
+ self.pm_len = 0
+ self.pm_nn = 0
+        self.pm_bits = 0
+ self.pmoff = None
+ self.pmstr = ''
+ print("Extracting Page Map Information")
+ rev_len, = struct.unpack_from(b'>L', self.data, 0x10)
+ # skip over header, revision string length data, and revision string
+ ptr = 0x14 + rev_len
+ pm_1, self.pm_len, self.pm_nn, self.pm_bits = struct.unpack_from(b'>4H', self.data, ptr)
+ # print(pm_1, self.pm_len, self.pm_nn, self.pm_bits)
+ self.pmstr = self.data[ptr+8:ptr+8+self.pm_len]
+ self.pmoff = self.data[ptr+8+self.pm_len:]
+ offsize = b">L"
+ offwidth = 4
+ if self.pm_bits == 16:
+ offsize = b">H"
+ offwidth = 2
+ ptr = 0
+ for i in range(self.pm_nn):
+ od, = struct.unpack_from(offsize, self.pmoff, ptr)
+ ptr += offwidth
+ self.pageoffsets.append(od)
+ self.pagenames, self.pageMap = _parseNames(self.pm_nn, self.pmstr)
+
+ def getPageMap(self):
+ return self.pageMap
+
+ def getNames(self):
+ return self.pagenames
+
+ def getOffsets(self):
+ return self.pageoffsets
+
+ # page-map.xml will be unicode but encoded to utf-8 immediately before being written to a file
+ def generateKF8PageMapXML(self, k8proc):
+ pagemapxml = '<page-map xmlns="http://www.idpf.org/2007/opf">\n'
+ for i in range(len(self.pagenames)):
+ pos = self.pageoffsets[i]
+ name = self.pagenames[i]
+ if name is not None and name != "":
+ [pn, dir, filename, skelpos, skelend, aidtext] = k8proc.getSkelInfo(pos)
+ idtext = unicode_str(k8proc.getPageIDTag(pos))
+ linktgt = unicode_str(filename)
+ if idtext != '':
+ linktgt += '#' + idtext
+ pagemapxml += '<page name="%s" href="%s/%s" />\n' % (name, dir, linktgt)
+ pagemapxml += "</page-map>\n"
+ return pagemapxml
+
+ def generateAPNX(self, apnx_meta):
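+        # APNX layout as built here: two 16-bit version words, a 32-bit
+        # offset to the page section (12 + content-header length), the 32-bit
+        # content-header length, the JSON content header, four 16-bit words
+        # (a flag, the page-header length, the page count, and 32), the JSON
+        # page header, then one 32-bit file offset per page.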
+ if apnx_meta['format'] == 'MOBI_8':
+            content_header = '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","format":"%(format)s","fileRevisionId":"1","acr":"%(acr)s"}' % apnx_meta
+ else:
+ content_header = '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","fileRevisionId":"1"}' % apnx_meta
+ content_header = content_header.encode('utf-8')
+ page_header = '{"asin":"%(asin)s","pageMap":"%(pageMap)s"}' % apnx_meta
+ page_header = page_header.encode('utf-8')
+ apnx = struct.pack(b'>H',1) + struct.pack(b'>H',1)
+ apnx += struct.pack(b'>I', 12 + len(content_header))
+ apnx += struct.pack(b'>I', len(content_header))
+ apnx += content_header
+ apnx += struct.pack(b'>H', 1)
+ apnx += struct.pack(b'>H', len(page_header))
+ apnx += struct.pack(b'>H', self.pm_nn)
+ apnx += struct.pack(b'>H', 32)
+ apnx += page_header
+ for page in self.pageoffsets:
+ apnx += struct.pack(b'>L', page)
+ return apnx
diff --git a/src/epy_reader/tools/KindleUnpack/mobi_sectioner.py b/src/epy_reader/tools/KindleUnpack/mobi_sectioner.py
new file mode 100644
index 0000000..81f62bb
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/mobi_sectioner.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+from .compatibility_utils import PY2, hexlify, bstr, bord, bchar
+
+import datetime
+
+if PY2:
+ range = xrange
+
+# note: struct pack, unpack, unpack_from all require bytestring format
+# data all the way up to at least python 2.7.5, python 3 okay with bytestring
+import struct
+
+from .unipath import pathof
+
+DUMP = False
+""" Set to True to dump all possible information. """
+
+class unpackException(Exception):
+ pass
+
+
+def describe(data):
+ txtans = ''
+ hexans = hexlify(data)
+ for i in data:
+ if bord(i) < 32 or bord(i) > 127:
+ txtans += '?'
+ else:
+ txtans += bchar(i).decode('latin-1')
+ return '"' + txtans + '"' + ' 0x'+ hexans
+
+def datetimefrompalmtime(palmtime):
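+    # Timestamps with the high bit set (> 0x7FFFFFFF) count seconds from the
+    # Mac epoch (1904-01-01); others count from the Unix epoch (1970-01-01).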
+ if palmtime > 0x7FFFFFFF:
+ pythondatetime = datetime.datetime(year=1904,month=1,day=1)+datetime.timedelta(seconds=palmtime)
+ else:
+ pythondatetime = datetime.datetime(year=1970,month=1,day=1)+datetime.timedelta(seconds=palmtime)
+ return pythondatetime
+
+
+class Sectionizer:
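+    # Wraps a PalmDB container: a 78-byte header, a record (section) offset
+    # table, then the section data; loadSection() returns a section's raw bytes.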
+
+ def __init__(self, filename):
+ self.data = b''
+ with open(pathof(filename), 'rb') as f:
+ self.data = f.read()
+ self.palmheader = self.data[:78]
+ self.palmname = self.data[:32]
+ self.ident = self.palmheader[0x3C:0x3C+8]
+ self.num_sections, = struct.unpack_from(b'>H', self.palmheader, 76)
+ self.filelength = len(self.data)
+ sectionsdata = struct.unpack_from(bstr('>%dL' % (self.num_sections*2)), self.data, 78) + (self.filelength, 0)
+ self.sectionoffsets = sectionsdata[::2]
+ self.sectionattributes = sectionsdata[1::2]
+ self.sectiondescriptions = ["" for x in range(self.num_sections+1)]
+ self.sectiondescriptions[-1] = "File Length Only"
+ return
+
+ def dumpsectionsinfo(self):
+ print("Section Offset Length UID Attribs Description")
+ for i in range(self.num_sections):
+ print("%3d %3X 0x%07X 0x%05X % 8d % 7d %s" % (i,i, self.sectionoffsets[i], self.sectionoffsets[
+ i+1] - self.sectionoffsets[i], self.sectionattributes[i]&0xFFFFFF, (self.sectionattributes[i]>>24)&0xFF, self.sectiondescriptions[i]))
+ print("%3d %3X 0x%07X %s" %
+ (self.num_sections,self.num_sections, self.sectionoffsets[self.num_sections], self.sectiondescriptions[self.num_sections]))
+
+ def setsectiondescription(self, section, description):
+ if section < len(self.sectiondescriptions):
+ self.sectiondescriptions[section] = description
+ else:
+ print("Section out of range: %d, description %s" % (section,description))
+
+ def dumppalmheader(self):
+ print("Palm Database Header")
+ print("Database name: " + repr(self.palmheader[:32]))
+ dbattributes, = struct.unpack_from(b'>H', self.palmheader, 32)
+        print("Bitfield attributes: 0x%0X" % dbattributes, end='')
+        if dbattributes != 0:
+            print(" (", end='')
+            if (dbattributes & 2):
+                print("Read-only; ", end='')
+            if (dbattributes & 4):
+                print("Dirty AppInfoArea; ", end='')
+            if (dbattributes & 8):
+                print("Needs to be backed up; ", end='')
+            if (dbattributes & 16):
+                print("OK to install over newer; ", end='')
+            if (dbattributes & 32):
+                print("Reset after installation; ", end='')
+            if (dbattributes & 64):
+                print("No copying by PalmPilot beaming; ", end='')
+            print(")")
+        else:
+            print("")
+ print("File version: %d" % struct.unpack_from(b'>H', self.palmheader, 34)[0])
+ dbcreation, = struct.unpack_from(b'>L', self.palmheader, 36)
+ print("Creation Date: " + str(datetimefrompalmtime(dbcreation))+ (" (0x%0X)" % dbcreation))
+ dbmodification, = struct.unpack_from(b'>L', self.palmheader, 40)
+ print("Modification Date: " + str(datetimefrompalmtime(dbmodification))+ (" (0x%0X)" % dbmodification))
+ dbbackup, = struct.unpack_from(b'>L', self.palmheader, 44)
+ if dbbackup != 0:
+ print("Backup Date: " + str(datetimefrompalmtime(dbbackup))+ (" (0x%0X)" % dbbackup))
+ print("Modification No.: %d" % struct.unpack_from(b'>L', self.palmheader, 48)[0])
+ print("App Info offset: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 52)[0])
+ print("Sort Info offset: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 56)[0])
+ print("Type/Creator: %s/%s" % (repr(self.palmheader[60:64]), repr(self.palmheader[64:68])))
+ print("Unique seed: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 68)[0])
+ expectedzero, = struct.unpack_from(b'>L', self.palmheader, 72)
+ if expectedzero != 0:
+ print("Should be zero but isn't: %d" % struct.unpack_from(b'>L', self.palmheader, 72)[0])
+ print("Number of sections: %d" % struct.unpack_from(b'>H', self.palmheader, 76)[0])
+ return
+
+ def loadSection(self, section):
+ before, after = self.sectionoffsets[section:section+2]
+ return self.data[before:after]
diff --git a/src/epy_reader/tools/KindleUnpack/mobi_split.py b/src/epy_reader/tools/KindleUnpack/mobi_split.py
new file mode 100755
index 0000000..3535029
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/mobi_split.py
@@ -0,0 +1,438 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+import struct
+# note: struct pack, unpack, unpack_from all require bytestring format
+# data all the way up to at least python 2.7.5, python 3 okay with bytestring
+
+from .unipath import pathof
+
+
+# important pdb header offsets
+unique_id_seed = 68
+number_of_pdb_records = 76
+
+# important palmdoc header offsets
+book_length = 4
+book_record_count = 8
+first_pdb_record = 78
+
+# important rec0 offsets
+length_of_book = 4
+mobi_header_base = 16
+mobi_header_length = 20
+mobi_type = 24
+mobi_version = 36
+first_non_text = 80
+title_offset = 84
+first_resc_record = 108
+first_content_index = 192
+last_content_index = 194
+kf8_fdst_index = 192 # for KF8 mobi headers
+fcis_index = 200
+flis_index = 208
+srcs_index = 224
+srcs_count = 228
+primary_index = 244
+datp_index = 256
+huffoff = 112
+hufftbloff = 120
+
+def getint(datain,ofs,sz=b'L'):
+ i, = struct.unpack_from(b'>'+sz,datain,ofs)
+ return i
+
+def writeint(datain,ofs,n,len=b'L'):
+ if len==b'L':
+ return datain[:ofs]+struct.pack(b'>L',n)+datain[ofs+4:]
+ else:
+ return datain[:ofs]+struct.pack(b'>H',n)+datain[ofs+2:]
+
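+# Each PDB record-table entry is 8 bytes (a 4-byte data offset followed by a
+# 4-byte attributes/unique-id word); the table starts at offset 78.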
+def getsecaddr(datain,secno):
+ nsec = getint(datain,number_of_pdb_records,b'H')
+    assert 0 <= secno < nsec, 'secno %d out of range (nsec=%d)' % (secno, nsec)
+ secstart = getint(datain,first_pdb_record+secno*8)
+ if secno == nsec-1:
+ secend = len(datain)
+ else:
+ secend = getint(datain,first_pdb_record+(secno+1)*8)
+ return secstart,secend
+
+def readsection(datain,secno):
+ secstart, secend = getsecaddr(datain,secno)
+ return datain[secstart:secend]
+
+def writesection(datain,secno,secdata): # overwrite, accounting for different length
+ # dataout = deletesectionrange(datain,secno, secno)
+ # return insertsection(dataout, secno, secdata)
+ datalst = []
+ nsec = getint(datain,number_of_pdb_records,b'H')
+ zerosecstart,zerosecend = getsecaddr(datain,0)
+ secstart,secend = getsecaddr(datain,secno)
+ dif = len(secdata) - (secend - secstart)
+ datalst.append(datain[:unique_id_seed])
+ datalst.append(struct.pack(b'>L',2*nsec+1))
+ datalst.append(datain[unique_id_seed+4:number_of_pdb_records])
+ datalst.append(struct.pack(b'>H',nsec))
+ newstart = zerosecstart
+ for i in range(0,secno):
+ ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
+ datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
+ datalst.append(struct.pack(b'>L', secstart) + struct.pack(b'>L', (2*secno)))
+ for i in range(secno+1,nsec):
+ ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
+ ofs = ofs + dif
+ datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
+ lpad = newstart - (first_pdb_record + 8*nsec)
+ if lpad > 0:
+ datalst.append(b'\0' * lpad)
+ datalst.append(datain[zerosecstart:secstart])
+ datalst.append(secdata)
+ datalst.append(datain[secend:])
+ dataout = b''.join(datalst)
+ return dataout
+
+def nullsection(datain,secno): # make it zero-length without deleting it
+ datalst = []
+ nsec = getint(datain,number_of_pdb_records,b'H')
+ secstart, secend = getsecaddr(datain,secno)
+ zerosecstart, zerosecend = getsecaddr(datain, 0)
+ dif = secend-secstart
+ datalst.append(datain[:first_pdb_record])
+ for i in range(0,secno+1):
+ ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
+ datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
+ for i in range(secno+1, nsec):
+ ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
+ ofs = ofs - dif
+ datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
+ lpad = zerosecstart - (first_pdb_record + 8*nsec)
+ if lpad > 0:
+ datalst.append(b'\0' * lpad)
+ datalst.append(datain[zerosecstart: secstart])
+ datalst.append(datain[secend:])
+ dataout = b''.join(datalst)
+ return dataout
+
+def deletesectionrange(datain,firstsec,lastsec): # delete a range of sections
+ datalst = []
+ firstsecstart,firstsecend = getsecaddr(datain,firstsec)
+ lastsecstart,lastsecend = getsecaddr(datain,lastsec)
+ zerosecstart, zerosecend = getsecaddr(datain, 0)
+ dif = lastsecend - firstsecstart + 8*(lastsec-firstsec+1)
+ nsec = getint(datain,number_of_pdb_records,b'H')
+ datalst.append(datain[:unique_id_seed])
+ datalst.append(struct.pack(b'>L',2*(nsec-(lastsec-firstsec+1))+1))
+ datalst.append(datain[unique_id_seed+4:number_of_pdb_records])
+ datalst.append(struct.pack(b'>H',nsec-(lastsec-firstsec+1)))
+ newstart = zerosecstart - 8*(lastsec-firstsec+1)
+ for i in range(0,firstsec):
+ ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
+ ofs = ofs-8*(lastsec-firstsec+1)
+ datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
+ for i in range(lastsec+1,nsec):
+ ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
+ ofs = ofs - dif
+ flgval = 2*(i-(lastsec-firstsec+1))
+ datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
+ lpad = newstart - (first_pdb_record + 8*(nsec - (lastsec - firstsec + 1)))
+ if lpad > 0:
+ datalst.append(b'\0' * lpad)
+ datalst.append(datain[zerosecstart:firstsecstart])
+ datalst.append(datain[lastsecend:])
+ dataout = b''.join(datalst)
+ return dataout
+
+def insertsection(datain,secno,secdata): # insert a new section
+ datalst = []
+ nsec = getint(datain,number_of_pdb_records,b'H')
+ # print("inserting secno" , secno, "into" ,nsec, "sections")
+ secstart,secend = getsecaddr(datain,secno)
+ zerosecstart,zerosecend = getsecaddr(datain,0)
+ dif = len(secdata)
+ datalst.append(datain[:unique_id_seed])
+ datalst.append(struct.pack(b'>L',2*(nsec+1)+1))
+ datalst.append(datain[unique_id_seed+4:number_of_pdb_records])
+ datalst.append(struct.pack(b'>H',nsec+1))
+ newstart = zerosecstart + 8
+ for i in range(0,secno):
+ ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
+ ofs += 8
+ datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
+ datalst.append(struct.pack(b'>L', secstart + 8) + struct.pack(b'>L', (2*secno)))
+ for i in range(secno,nsec):
+ ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
+ ofs = ofs + dif + 8
+ flgval = 2*(i+1)
+ datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
+ lpad = newstart - (first_pdb_record + 8*(nsec + 1))
+ if lpad > 0:
+ datalst.append(b'\0' * lpad)
+ datalst.append(datain[zerosecstart:secstart])
+ datalst.append(secdata)
+ datalst.append(datain[secstart:])
+ dataout = b''.join(datalst)
+ return dataout
+
+
+def insertsectionrange(sectionsource,firstsec,lastsec,sectiontarget,targetsec): # insert a range of sections
+ # print("inserting secno" , firstsec, "to", lastsec, "into" ,targetsec, "sections")
+ # dataout = sectiontarget
+ # for idx in range(lastsec,firstsec-1,-1):
+ # dataout = insertsection(dataout,targetsec,readsection(sectionsource,idx))
+ # return dataout
+ datalst = []
+ nsec = getint(sectiontarget,number_of_pdb_records,b'H')
+ zerosecstart, zerosecend = getsecaddr(sectiontarget,0)
+ insstart, nul = getsecaddr(sectiontarget,targetsec)
+ nins = lastsec - firstsec + 1
+ srcstart, nul = getsecaddr(sectionsource,firstsec)
+ nul, srcend = getsecaddr(sectionsource,lastsec)
+ newstart = zerosecstart + 8*nins
+
+ datalst.append(sectiontarget[:unique_id_seed])
+ datalst.append(struct.pack(b'>L',2*(nsec+nins)+1))
+ datalst.append(sectiontarget[unique_id_seed+4:number_of_pdb_records])
+ datalst.append(struct.pack(b'>H',nsec+nins))
+ for i in range(0,targetsec):
+ ofs, flgval = struct.unpack_from(b'>2L',sectiontarget,first_pdb_record+i*8)
+ ofsnew = ofs + 8*nins
+ flgvalnew = flgval
+ datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L', flgvalnew))
+ # print(ofsnew, flgvalnew, ofs, flgval)
+ srcstart0, nul = getsecaddr(sectionsource,firstsec)
+ for i in range(nins):
+ isrcstart, nul = getsecaddr(sectionsource,firstsec+i)
+ ofsnew = insstart + (isrcstart-srcstart0) + 8*nins
+ flgvalnew = 2*(targetsec+i)
+ datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L', flgvalnew))
+ # print(ofsnew, flgvalnew)
+ dif = srcend - srcstart
+ for i in range(targetsec,nsec):
+ ofs, flgval = struct.unpack_from(b'>2L',sectiontarget,first_pdb_record+i*8)
+ ofsnew = ofs + dif + 8*nins
+ flgvalnew = 2*(i+nins)
+ datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L',flgvalnew))
+ # print(ofsnew, flgvalnew, ofs, flgval)
+ lpad = newstart - (first_pdb_record + 8*(nsec + nins))
+ if lpad > 0:
+ datalst.append(b'\0' * lpad)
+ datalst.append(sectiontarget[zerosecstart:insstart])
+ datalst.append(sectionsource[srcstart:srcend])
+ datalst.append(sectiontarget[insstart:])
+ dataout = b''.join(datalst)
+ return dataout
+
+def get_exth_params(rec0):
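+    # EXTH layout: the block follows the MOBI header and holds a 4-byte
+    # 'EXTH' magic, a 4-byte total length, a 4-byte record count, then
+    # records of (4-byte id, 4-byte size, data).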
+ ebase = mobi_header_base + getint(rec0,mobi_header_length)
+ elen = getint(rec0,ebase+4)
+ enum = getint(rec0,ebase+8)
+ return ebase,elen,enum
+
+def add_exth(rec0,exth_num,exth_bytes):
+ ebase,elen,enum = get_exth_params(rec0)
+ newrecsize = 8+len(exth_bytes)
+ newrec0 = rec0[0:ebase+4]+struct.pack(b'>L',elen+newrecsize)+struct.pack(b'>L',enum+1)+\
+ struct.pack(b'>L',exth_num)+struct.pack(b'>L',newrecsize)+exth_bytes+rec0[ebase+12:]
+ newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)+newrecsize)
+ return newrec0
+
+def read_exth(rec0,exth_num):
+ exth_values = []
+ ebase,elen,enum = get_exth_params(rec0)
+ ebase = ebase+12
+ while enum>0:
+ exth_id = getint(rec0,ebase)
+ if exth_id == exth_num:
+ # We might have multiple exths, so build a list.
+ exth_values.append(rec0[ebase+8:ebase+getint(rec0,ebase+4)])
+ enum = enum-1
+ ebase = ebase+getint(rec0,ebase+4)
+ return exth_values
+
+def write_exth(rec0,exth_num,exth_bytes):
+ ebase,elen,enum = get_exth_params(rec0)
+ ebase_idx = ebase+12
+ enum_idx = enum
+ while enum_idx>0:
+ exth_id = getint(rec0,ebase_idx)
+ if exth_id == exth_num:
+ dif = len(exth_bytes)+8-getint(rec0,ebase_idx+4)
+ newrec0 = rec0
+ if dif != 0:
+ newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)+dif)
+ return newrec0[:ebase+4]+struct.pack(b'>L',elen+len(exth_bytes)+8-getint(rec0,ebase_idx+4))+\
+ struct.pack(b'>L',enum)+rec0[ebase+12:ebase_idx+4]+\
+ struct.pack(b'>L',len(exth_bytes)+8)+exth_bytes+\
+ rec0[ebase_idx+getint(rec0,ebase_idx+4):]
+ enum_idx = enum_idx-1
+ ebase_idx = ebase_idx+getint(rec0,ebase_idx+4)
+ return rec0
+
+def del_exth(rec0,exth_num):
+ ebase,elen,enum = get_exth_params(rec0)
+ ebase_idx = ebase+12
+ enum_idx = 0
+ while enum_idx < enum:
+ exth_id = getint(rec0,ebase_idx)
+ exth_size = getint(rec0,ebase_idx+4)
+ if exth_id == exth_num:
+ newrec0 = rec0
+ newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)-exth_size)
+ newrec0 = newrec0[:ebase_idx]+newrec0[ebase_idx+exth_size:]
+ newrec0 = newrec0[0:ebase+4]+struct.pack(b'>L',elen-exth_size)+struct.pack(b'>L',enum-1)+newrec0[ebase+12:]
+ return newrec0
+ enum_idx += 1
+ ebase_idx = ebase_idx+exth_size
+ return rec0
+
+
+class mobi_split:
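+    # Splits a combo KF8 file (mobi7 + mobi8 joined at the EXTH 121 KF8
+    # boundary) into standalone mobi7 and mobi8 files, duplicating the shared
+    # image/resource sections into the mobi8 part.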
+
+ def __init__(self, infile):
+ datain = b''
+ with open(pathof(infile), 'rb') as f:
+ datain = f.read()
+ datain_rec0 = readsection(datain,0)
+ ver = getint(datain_rec0,mobi_version)
+ self.combo = (ver!=8)
+ if not self.combo:
+ return
+ exth121 = read_exth(datain_rec0,121)
+ if len(exth121) == 0:
+ self.combo = False
+ return
+ else:
+ # only pay attention to first exth121
+ # (there should only be one)
+ datain_kf8, = struct.unpack_from(b'>L',exth121[0],0)
+ if datain_kf8 == 0xffffffff:
+ self.combo = False
+ return
+            datain_kfrec0 = readsection(datain,datain_kf8)
+
+ # create the standalone mobi7
+ num_sec = getint(datain,number_of_pdb_records,b'H')
+ # remove BOUNDARY up to but not including ELF record
+ self.result_file7 = deletesectionrange(datain,datain_kf8-1,num_sec-2)
+ # check if there are SRCS records and delete them
+ srcs = getint(datain_rec0,srcs_index)
+ num_srcs = getint(datain_rec0,srcs_count)
+ if srcs != 0xffffffff and num_srcs > 0:
+ self.result_file7 = deletesectionrange(self.result_file7,srcs,srcs+num_srcs-1)
+ datain_rec0 = writeint(datain_rec0,srcs_index,0xffffffff)
+ datain_rec0 = writeint(datain_rec0,srcs_count,0)
+ # reset the EXTH 121 KF8 Boundary meta data to 0xffffffff
+ datain_rec0 = write_exth(datain_rec0,121, struct.pack(b'>L', 0xffffffff))
+ # datain_rec0 = del_exth(datain_rec0,121)
+ # datain_rec0 = del_exth(datain_rec0,534)
+ # don't remove the EXTH 125 KF8 Count of Resources, seems to be present in mobi6 files as well
+ # set the EXTH 129 KF8 Masthead / Cover Image string to the null string
+ datain_rec0 = write_exth(datain_rec0,129, b'')
+ # don't remove the EXTH 131 KF8 Unidentified Count, seems to be present in mobi6 files as well
+
+ # need to reset flags stored in 0x80-0x83
+ # old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050
+ # Bit Flags
+ # 0x1000 = Bit 12 indicates if embedded fonts are used or not
+ # 0x0800 = means this Header points to *shared* images/resource/fonts ??
+ # 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8?
+ # 0x0040 = exth exists
+ # 0x0010 = Not sure but this is always set so far
+ fval, = struct.unpack_from(b'>L',datain_rec0, 0x80)
+ # need to remove flag 0x0800 for KindlePreviewer 2.8 and unset Bit 12 for embedded fonts
+ fval = fval & 0x07FF
+ datain_rec0 = datain_rec0[:0x80] + struct.pack(b'>L',fval) + datain_rec0[0x84:]
+
+ self.result_file7 = writesection(self.result_file7,0,datain_rec0)
+
+ # no need to replace kf8 style fcis with mobi 7 one
+ # fcis_secnum, = struct.unpack_from(b'>L',datain_rec0, 0xc8)
+ # if fcis_secnum != 0xffffffff:
+ # fcis_info = readsection(datain, fcis_secnum)
+ # text_len, = struct.unpack_from(b'>L', fcis_info, 0x14)
+ # new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
+ # new_fcis += struct.pack(b'>L',text_len)
+ # new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
+ # self.result_file7 = writesection(self.result_file7, fcis_secnum, new_fcis)
+
+ firstimage = getint(datain_rec0,first_resc_record)
+ lastimage = getint(datain_rec0,last_content_index,b'H')
+ # print("Old First Image, last Image", firstimage,lastimage)
+ if lastimage == 0xffff:
+ # find the lowest of the next sections and copy up to that.
+ ofs_list = [(fcis_index,b'L'),(flis_index,b'L'),(datp_index,b'L'),(hufftbloff, b'L')]
+ for ofs,sz in ofs_list:
+ n = getint(datain_rec0,ofs,sz)
+ # print("n",n)
+ if n > 0 and n < lastimage:
+ lastimage = n-1
+ print("First Image, last Image", firstimage,lastimage)
+
+        # Try to null out FONT and RESC, but leave the (empty) PDB record so image refs remain valid
+ for i in range(firstimage,lastimage):
+ imgsec = readsection(self.result_file7,i)
+ if imgsec[0:4] in [b'RESC',b'FONT']:
+ self.result_file7 = nullsection(self.result_file7,i)
+
+ # mobi7 finished
+
+ # create standalone mobi8
+ self.result_file8 = deletesectionrange(datain,0,datain_kf8-1)
+ target = getint(datain_kfrec0,first_resc_record)
+ self.result_file8 = insertsectionrange(datain,firstimage,lastimage,self.result_file8,target)
+        datain_kfrec0 = readsection(self.result_file8,0)
+
+ # Only keep the correct EXTH 116 StartOffset, KG 2.5 carries over the one from the mobi7 part, which then points at garbage in the mobi8 part, and confuses FW 3.4
+ kf8starts = read_exth(datain_kfrec0,116)
+ # If we have multiple StartOffset, keep only the last one
+ kf8start_count = len(kf8starts)
+ while kf8start_count > 1:
+ kf8start_count -= 1
+ datain_kfrec0 = del_exth(datain_kfrec0,116)
+
+ # update the EXTH 125 KF8 Count of Images/Fonts/Resources
+ datain_kfrec0 = write_exth(datain_kfrec0,125,struct.pack(b'>L',lastimage-firstimage+1))
+
+ # need to reset flags stored in 0x80-0x83
+ # old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050
+ # standalone mobi8 with exth: 0x0050
+ # Bit Flags
+ # 0x1000 = Bit 12 indicates if embedded fonts are used or not
+ # 0x0800 = means this Header points to *shared* images/resource/fonts ??
+ # 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8?
+ # 0x0040 = exth exists
+ # 0x0010 = Not sure but this is always set so far
+        fval, = struct.unpack_from(b'>L',datain_kfrec0, 0x80)
+ fval = fval & 0x1FFF
+ fval |= 0x0800
+ datain_kfrec0 = datain_kfrec0[:0x80] + struct.pack(b'>L',fval) + datain_kfrec0[0x84:]
+
+ # properly update other index pointers that have been shifted by the insertion of images
+ ofs_list = [(kf8_fdst_index,b'L'),(fcis_index,b'L'),(flis_index,b'L'),(datp_index,b'L'),(hufftbloff, b'L')]
+ for ofs,sz in ofs_list:
+ n = getint(datain_kfrec0,ofs,sz)
+ if n != 0xffffffff:
+ datain_kfrec0 = writeint(datain_kfrec0,ofs,n+lastimage-firstimage+1,sz)
+ self.result_file8 = writesection(self.result_file8,0,datain_kfrec0)
+
+ # no need to replace kf8 style fcis with mobi 7 one
+ # fcis_secnum, = struct.unpack_from(b'>L',datain_kfrec0, 0xc8)
+ # if fcis_secnum != 0xffffffff:
+ # fcis_info = readsection(self.result_file8, fcis_secnum)
+ # text_len, = struct.unpack_from(b'>L', fcis_info, 0x14)
+ # new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
+ # new_fcis += struct.pack(b'>L',text_len)
+ # new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
+ # self.result_file8 = writesection(self.result_file8, fcis_secnum, new_fcis)
+
+ # mobi8 finished
+
+ def getResult8(self):
+ return self.result_file8
+
+ def getResult7(self):
+ return self.result_file7
diff --git a/src/epy_reader/tools/KindleUnpack/mobi_uncompress.py b/src/epy_reader/tools/KindleUnpack/mobi_uncompress.py
new file mode 100644
index 0000000..c5fad85
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/mobi_uncompress.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+from .compatibility_utils import PY2, bchr, lmap, bstr
+
+if PY2:
+ range = xrange
+
+import struct
+# note: struct pack, unpack, unpack_from all require bytestring format
+# data all the way up to at least python 2.7.5, python 3 okay with bytestring
+
+
+class unpackException(Exception):
+ pass
+
+class UncompressedReader:
+
+ def unpack(self, data):
+ return data
+
+class PalmdocReader:
+
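+    # PalmDOC (LZ77-style) byte codes: 0x01-0x08 copy that many literal
+    # bytes; other values below 0x80 are literal bytes; 0xc0-0xff expand to
+    # a space plus (byte ^ 0x80); 0x80-0xbf start a two-byte back-reference
+    # with an 11-bit distance and a 3-bit length (+3).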
+ def unpack(self, i):
+ o, p = b'', 0
+ while p < len(i):
+ # for python 3 must use slice since i[p] returns int while slice returns character
+ c = ord(i[p:p+1])
+ p += 1
+ if (c >= 1 and c <= 8):
+ o += i[p:p+c]
+ p += c
+ elif (c < 128):
+ o += bchr(c)
+ elif (c >= 192):
+ o += b' ' + bchr(c ^ 128)
+ else:
+ if p < len(i):
+ c = (c << 8) | ord(i[p:p+1])
+ p += 1
+ m = (c >> 3) & 0x07ff
+ n = (c & 7) + 3
+ if (m > n):
+ o += o[-m:n-m]
+ else:
+ for _ in range(n):
+                            # because of completely ass-backwards decision by python maintainers for python 3
+ # we must use slice for bytes as i[p] returns int while slice returns character
+ if m == 1:
+ o += o[-m:]
+ else:
+ o += o[-m:-m+1]
+ return o
+
+class HuffcdicReader:
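+    # MOBI HUFF/CDIC decompression: loadHuff() reads the canonical Huffman
+    # tables from the HUFF record, loadCdic() collects the phrase dictionary
+    # from CDIC records, and unpack() walks the bitstream, recursively
+    # expanding dictionary entries on first use.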
+ q = struct.Struct(b'>Q').unpack_from
+
+ def loadHuff(self, huff):
+ if huff[0:8] != b'HUFF\x00\x00\x00\x18':
+ raise unpackException('invalid huff header')
+ off1, off2 = struct.unpack_from(b'>LL', huff, 8)
+
+ def dict1_unpack(v):
+ codelen, term, maxcode = v&0x1f, v&0x80, v>>8
+ assert codelen != 0
+ if codelen <= 8:
+ assert term
+ maxcode = ((maxcode + 1) << (32 - codelen)) - 1
+ return (codelen, term, maxcode)
+ self.dict1 = lmap(dict1_unpack, struct.unpack_from(b'>256L', huff, off1))
+
+ dict2 = struct.unpack_from(b'>64L', huff, off2)
+ self.mincode, self.maxcode = (), ()
+ for codelen, mincode in enumerate((0,) + dict2[0::2]):
+ self.mincode += (mincode << (32 - codelen), )
+ for codelen, maxcode in enumerate((0,) + dict2[1::2]):
+ self.maxcode += (((maxcode + 1) << (32 - codelen)) - 1, )
+
+ self.dictionary = []
+
+ def loadCdic(self, cdic):
+ if cdic[0:8] != b'CDIC\x00\x00\x00\x10':
+ raise unpackException('invalid cdic header')
+ phrases, bits = struct.unpack_from(b'>LL', cdic, 8)
+ n = min(1<<bits, phrases-len(self.dictionary))
+ h = struct.Struct(b'>H').unpack_from
+ def getslice(off):
+ blen, = h(cdic, 16+off)
+ slice = cdic[18+off:18+off+(blen&0x7fff)]
+ return (slice, blen&0x8000)
+ self.dictionary += lmap(getslice, struct.unpack_from(bstr('>%dH' % n), cdic, 16))
+
+ def unpack(self, data):
+ q = HuffcdicReader.q
+
+ bitsleft = len(data) * 8
+ data += b"\x00\x00\x00\x00\x00\x00\x00\x00"
+ pos = 0
+ x, = q(data, pos)
+ n = 32
+
+ s = b''
+ while True:
+ if n <= 0:
+ pos += 4
+ x, = q(data, pos)
+ n += 32
+ code = (x >> n) & ((1 << 32) - 1)
+
+ codelen, term, maxcode = self.dict1[code >> 24]
+ if not term:
+ while code < self.mincode[codelen]:
+ codelen += 1
+ maxcode = self.maxcode[codelen]
+
+ n -= codelen
+ bitsleft -= codelen
+ if bitsleft < 0:
+ break
+
+ r = (maxcode - code) >> (32 - codelen)
+ slice, flag = self.dictionary[r]
+ if not flag:
+ self.dictionary[r] = None
+ slice = self.unpack(slice)
+ self.dictionary[r] = (slice, 1)
+ s += slice
+ return s
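+# Illustrative call sequence (a sketch, not from the original sources): the
+# HUFF record must be loaded first, then every CDIC record in order, before
+# any record can be unpacked. `huff_rec`, `cdic_recs` and `compressed` are
+# hypothetical names for the relevant PDB sections.
+#
+#   reader = HuffcdicReader()
+#   reader.loadHuff(huff_rec)          # section starting with b'HUFF'
+#   for cdic in cdic_recs:             # sections starting with b'CDIC'
+#       reader.loadCdic(cdic)
+#   text = reader.unpack(compressed)   # decompressed bytestring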
diff --git a/src/epy_reader/tools/KindleUnpack/mobi_utils.py b/src/epy_reader/tools/KindleUnpack/mobi_utils.py
new file mode 100644
index 0000000..6791e0d
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/mobi_utils.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+# flake8: noqa
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+from .compatibility_utils import PY2, text_type, bchr, bord
+
+import binascii
+
+if PY2:
+ range = xrange
+
+from itertools import cycle
+
+def getLanguage(langID, sublangID):
+ mobilangdict = {
+ 54 : {0 : 'af'}, # Afrikaans
+ 28 : {0 : 'sq'}, # Albanian
+ 1 : {0 : 'ar' , 5 : 'ar-dz' , 15 : 'ar-bh' , 3 : 'ar-eg' , 2 : 'ar-iq', 11 : 'ar-jo' , 13 : 'ar-kw' , 12 : 'ar-lb' , 4: 'ar-ly',
+ 6 : 'ar-ma' , 8 : 'ar-om' , 16 : 'ar-qa' , 1 : 'ar-sa' , 10 : 'ar-sy' , 7 : 'ar-tn' , 14 : 'ar-ae' , 9 : 'ar-ye'},
+ # Arabic, Arabic (Algeria), Arabic (Bahrain), Arabic (Egypt), Arabic
+ # (Iraq), Arabic (Jordan), Arabic (Kuwait), Arabic (Lebanon), Arabic
+ # (Libya), Arabic (Morocco), Arabic (Oman), Arabic (Qatar), Arabic
+ # (Saudi Arabia), Arabic (Syria), Arabic (Tunisia), Arabic (United Arab
+ # Emirates), Arabic (Yemen)
+ 43 : {0 : 'hy'}, # Armenian
+ 77 : {0 : 'as'}, # Assamese
+        44 : {0 : 'az'}, # "Azeri" (IANA: Azerbaijani)
+ 45 : {0 : 'eu'}, # Basque
+ 35 : {0 : 'be'}, # Belarusian
+ 69 : {0 : 'bn'}, # Bengali
+ 2 : {0 : 'bg'}, # Bulgarian
+ 3 : {0 : 'ca'}, # Catalan
+ 4 : {0 : 'zh' , 3 : 'zh-hk' , 2 : 'zh-cn' , 4 : 'zh-sg' , 1 : 'zh-tw'},
+ # Chinese, Chinese (Hong Kong), Chinese (PRC), Chinese (Singapore), Chinese (Taiwan)
+ 26 : {0 : 'hr', 3 : 'sr'}, # Croatian, Serbian
+ 5 : {0 : 'cs'}, # Czech
+ 6 : {0 : 'da'}, # Danish
+ 19 : {0: 'nl', 1 : 'nl' , 2 : 'nl-be'}, # Dutch / Flemish, Dutch (Belgium)
+        9 : {0: 'en', 3 : 'en-au' , 40 : 'en-bz' , 4 : 'en-ca' , 6 : 'en-ie' , 8 : 'en-jm' , 5 : 'en-nz' , 13 : 'en-ph' ,
+ 7 : 'en-za' , 11 : 'en-tt' , 2 : 'en-gb', 1 : 'en-us' , 12 : 'en-zw'},
+ # English, English (Australia), English (Belize), English (Canada),
+ # English (Ireland), English (Jamaica), English (New Zealand), English
+ # (Philippines), English (South Africa), English (Trinidad), English
+ # (United Kingdom), English (United States), English (Zimbabwe)
+ 37 : {0 : 'et'}, # Estonian
+ 56 : {0 : 'fo'}, # Faroese
+ 41 : {0 : 'fa'}, # Farsi / Persian
+ 11 : {0 : 'fi'}, # Finnish
+ 12 : {0 : 'fr', 1 : 'fr' , 2 : 'fr-be' , 3 : 'fr-ca' , 5 : 'fr-lu' , 6 : 'fr-mc' , 4 : 'fr-ch'},
+ # French, French (Belgium), French (Canada), French (Luxembourg), French (Monaco), French (Switzerland)
+ 55 : {0 : 'ka'}, # Georgian
+ 7 : {0 : 'de', 1 : 'de' , 3 : 'de-at' , 5 : 'de-li' , 4 : 'de-lu' , 2 : 'de-ch'},
+ # German, German (Austria), German (Liechtenstein), German (Luxembourg), German (Switzerland)
+ 8 : {0 : 'el'}, # Greek, Modern (1453-)
+ 71 : {0 : 'gu'}, # Gujarati
+ 13 : {0 : 'he'}, # Hebrew (also code 'iw'?)
+ 57 : {0 : 'hi'}, # Hindi
+ 14 : {0 : 'hu'}, # Hungarian
+ 15 : {0 : 'is'}, # Icelandic
+ 33 : {0 : 'id'}, # Indonesian
+ 16 : {0 : 'it', 1 : 'it' , 2 : 'it-ch'}, # Italian, Italian (Switzerland)
+ 17 : {0 : 'ja'}, # Japanese
+ 75 : {0 : 'kn'}, # Kannada
+ 63 : {0 : 'kk'}, # Kazakh
+ 87 : {0 : 'x-kok'}, # Konkani (real language code is 'kok'?)
+ 18 : {0 : 'ko'}, # Korean
+ 38 : {0 : 'lv'}, # Latvian
+ 39 : {0 : 'lt'}, # Lithuanian
+ 47 : {0 : 'mk'}, # Macedonian
+ 62 : {0 : 'ms'}, # Malay
+ 76 : {0 : 'ml'}, # Malayalam
+ 58 : {0 : 'mt'}, # Maltese
+ 78 : {0 : 'mr'}, # Marathi
+ 97 : {0 : 'ne'}, # Nepali
+ 20 : {0 : 'no'}, # Norwegian
+ 72 : {0 : 'or'}, # Oriya
+ 21 : {0 : 'pl'}, # Polish
+ 22 : {0 : 'pt', 2 : 'pt' , 1 : 'pt-br'}, # Portuguese, Portuguese (Brazil)
+ 70 : {0 : 'pa'}, # Punjabi
+ 23 : {0 : 'rm'}, # "Rhaeto-Romanic" (IANA: Romansh)
+ 24 : {0 : 'ro'}, # Romanian
+ 25 : {0 : 'ru'}, # Russian
+ 59 : {0 : 'sz'}, # "Sami (Lappish)" (not an IANA language code)
+ # IANA code for "Northern Sami" is 'se'
+ # 'SZ' is the IANA region code for Swaziland
+ 79 : {0 : 'sa'}, # Sanskrit
+ 27 : {0 : 'sk'}, # Slovak
+ 36 : {0 : 'sl'}, # Slovenian
+ 46 : {0 : 'sb'}, # "Sorbian" (not an IANA language code)
+ # 'SB' is IANA region code for 'Solomon Islands'
+ # Lower Sorbian = 'dsb'
+ # Upper Sorbian = 'hsb'
+ # Sorbian Languages = 'wen'
+ 10 : {0 : 'es' , 4 : 'es' , 44 : 'es-ar' , 64 : 'es-bo' , 52 : 'es-cl' , 36 : 'es-co' , 20 : 'es-cr' , 28 : 'es-do' ,
+ 48 : 'es-ec' , 68 : 'es-sv' , 16 : 'es-gt' , 72 : 'es-hn' , 8 : 'es-mx' , 76 : 'es-ni' , 24 : 'es-pa' ,
+ 60 : 'es-py' , 40 : 'es-pe' , 80 : 'es-pr' , 56 : 'es-uy' , 32 : 'es-ve'},
+ # Spanish, Spanish (Mobipocket bug?), Spanish (Argentina), Spanish
+ # (Bolivia), Spanish (Chile), Spanish (Colombia), Spanish (Costa Rica),
+ # Spanish (Dominican Republic), Spanish (Ecuador), Spanish (El
+ # Salvador), Spanish (Guatemala), Spanish (Honduras), Spanish (Mexico),
+ # Spanish (Nicaragua), Spanish (Panama), Spanish (Paraguay), Spanish
+ # (Peru), Spanish (Puerto Rico), Spanish (Uruguay), Spanish (Venezuela)
+ 48 : {0 : 'sx'}, # "Sutu" (not an IANA language code)
+ # "Sutu" is another name for "Southern Sotho"?
+ # IANA code for "Southern Sotho" is 'st'
+ 65 : {0 : 'sw'}, # Swahili
+ 29 : {0 : 'sv' , 1 : 'sv' , 8 : 'sv-fi'}, # Swedish, Swedish (Finland)
+ 73 : {0 : 'ta'}, # Tamil
+ 68 : {0 : 'tt'}, # Tatar
+ 74 : {0 : 'te'}, # Telugu
+ 30 : {0 : 'th'}, # Thai
+ 49 : {0 : 'ts'}, # Tsonga
+ 50 : {0 : 'tn'}, # Tswana
+ 31 : {0 : 'tr'}, # Turkish
+ 34 : {0 : 'uk'}, # Ukrainian
+ 32 : {0 : 'ur'}, # Urdu
+ 67 : {0 : 'uz', 2 : 'uz'}, # Uzbek
+ 42 : {0 : 'vi'}, # Vietnamese
+ 52 : {0 : 'xh'}, # Xhosa
+ 53 : {0 : 'zu'}, # Zulu
+ }
+ lang = "en"
+ if langID in mobilangdict:
+ subdict = mobilangdict[langID]
+ lang = subdict[0]
+ if sublangID in subdict:
+ lang = subdict[sublangID]
+ return lang
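+# For example (values taken from the table above):
+#
+#   getLanguage(9, 2)    # -> 'en-gb'
+#   getLanguage(12, 3)   # -> 'fr-ca'
+#   getLanguage(999, 0)  # -> 'en' (unknown IDs fall back to English)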
+
+
+def toHex(byteList):
+ return binascii.hexlify(byteList)
+
+# returns base32 bytestring
+def toBase32(value, npad=4):
+ digits = b'0123456789ABCDEFGHIJKLMNOPQRSTUV'
+ num_string=b''
+ current = value
+    while current != 0:
+        quotient, remainder = divmod(current, 32)  # renamed from 'next' to avoid shadowing the builtin
+        rem_string = digits[remainder:remainder+1]
+        num_string = rem_string + num_string
+        current = quotient
+ if num_string == b'':
+ num_string = b'0'
+ pad = npad - len(num_string)
+ if pad > 0:
+ num_string = b'0' * pad + num_string
+ return num_string
+
+
+# converts base32 string to value
+def fromBase32(str_num):
+ if isinstance(str_num, text_type):
+ str_num = str_num.encode('latin-1')
+ scalelst = [1,32,1024,32768,1048576,33554432,1073741824,34359738368]
+ value = 0
+ j = 0
+ n = len(str_num)
+ scale = 0
+ for i in range(n):
+ c = str_num[n-i-1:n-i]
+ if c in b'0123456789':
+ v = ord(c) - ord(b'0')
+ else:
+ v = ord(c) - ord(b'A') + 10
+ if j < len(scalelst):
+ scale = scalelst[j]
+ else:
+ scale = scale * 32
+ j += 1
+ if v != 0:
+ value = value + (v * scale)
+ return value
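+# Round-trip sketch for the two base32 helpers above (illustrative only):
+#
+#   toBase32(32)         # -> b'0010' (padded to npad=4 digits)
+#   fromBase32(b'0010')  # -> 32
+#   fromBase32('V')      # -> 31 (text input is encoded to bytes first)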
+
+
+# note: decoding a bytestring with 'latin-1' (or any other 0-255 encoding)
+# in place of ascii gives a one-to-one mapping of the byte values 0-255
+# onto the corresponding half-word/integer code points
+
+def mangle_fonts(encryption_key, data):
+ if isinstance(encryption_key, text_type):
+ encryption_key = encryption_key.encode('latin-1')
+ crypt = data[:1024]
+ key = cycle(iter(map(bord, encryption_key)))
+ # encrypt = ''.join([chr(ord(x)^key.next()) for x in crypt])
+ encrypt = b''.join([bchr(bord(x)^next(key)) for x in crypt])
+ return encrypt + data[1024:]
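+# Usage sketch (illustrative): Adobe font obfuscation XORs only the first
+# 1024 bytes against the cycled key, so applying mangle_fonts twice with the
+# same key restores the original bytes. `key` and `fontdata` are hypothetical.
+#
+#   mangled = mangle_fonts(key, fontdata)
+#   assert mangle_fonts(key, mangled) == fontdata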
diff --git a/src/epy_reader/tools/KindleUnpack/mobiml2xhtml.py b/src/epy_reader/tools/KindleUnpack/mobiml2xhtml.py
new file mode 100755
index 0000000..94fc671
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/mobiml2xhtml.py
@@ -0,0 +1,527 @@
+#! /usr/bin/python
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+
+# this program works in concert with the output from KindleUnpack
+
+'''
+Convert from Mobi ML to XHTML
+'''
+
+from __future__ import division, absolute_import, print_function
+
+import os
+import sys
+import re
+
+SPECIAL_HANDLING_TAGS = {
+ '?xml' : ('xmlheader', -1),
+ '!--' : ('comment', -3),
+ '!DOCTYPE' : ('doctype', -1),
+}
+
+SPECIAL_HANDLING_TYPES = ['xmlheader', 'doctype', 'comment']
+
+SELF_CLOSING_TAGS = ['br' , 'hr', 'input', 'img', 'image', 'meta', 'spacer', 'link', 'frame', 'base', 'col', 'reference']
+
+class MobiMLConverter(object):
+
+ PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
+ IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
+
+ def __init__(self, filename):
+ self.base_css_rules = 'blockquote { margin: 0em 0em 0em 1.25em }\n'
+ self.base_css_rules += 'p { margin: 0em }\n'
+ self.base_css_rules += '.bold { font-weight: bold }\n'
+ self.base_css_rules += '.italic { font-style: italic }\n'
+ self.base_css_rules += '.mbp_pagebreak { page-break-after: always; margin: 0; display: block }\n'
+ self.tag_css_rules = {}
+ self.tag_css_rule_cnt = 0
+ self.path = []
+ self.filename = filename
+        with open(self.filename, 'r') as f:
+            self.wipml = f.read()
+ self.pos = 0
+ self.opfname = self.filename.rsplit('.',1)[0] + '.opf'
+ self.opos = 0
+ self.meta = ''
+ self.cssname = os.path.join(os.path.dirname(self.filename),'styles.css')
+ self.current_font_size = 3
+ self.font_history = []
+
+ def cleanup_html(self):
+ self.wipml = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.wipml)
+ self.wipml = self.wipml.replace('\r\n', '\n')
+ self.wipml = self.wipml.replace('> <', '>\n<')
+ self.wipml = self.wipml.replace('<mbp: ', '<mbp:')
+ # self.wipml = re.sub(r'<?xml[^>]*>', '', self.wipml)
+ self.wipml = self.wipml.replace('<br></br>','<br/>')
+
+ def replace_page_breaks(self):
+ self.wipml = self.PAGE_BREAK_PAT.sub(
+ '<div class="mbp_pagebreak" />',
+ self.wipml)
+
+ # parse leading text of ml and tag
+ def parseml(self):
+ p = self.pos
+ if p >= len(self.wipml):
+ return None
+ if self.wipml[p] != '<':
+ res = self.wipml.find('<',p)
+ if res == -1 :
+ res = len(self.wipml)
+ self.pos = res
+ return self.wipml[p:res], None
+ # handle comment as a special case to deal with multi-line comments
+ if self.wipml[p:p+4] == '<!--':
+ te = self.wipml.find('-->',p+1)
+ if te != -1:
+ te = te+2
+ else :
+ te = self.wipml.find('>',p+1)
+ ntb = self.wipml.find('<',p+1)
+ if ntb != -1 and ntb < te:
+ self.pos = ntb
+ return self.wipml[p:ntb], None
+ self.pos = te + 1
+ return None, self.wipml[p:te+1]
+
+ # parses string version of tag to identify its name,
+ # its type 'begin', 'end' or 'single',
+ # plus build a hashtable of its attributes
+    # code is written to handle the possibility of very poor formatting
+ def parsetag(self, s):
+ p = 1
+ # get the tag name
+ tname = None
+ ttype = None
+ tattr = {}
+ while s[p:p+1] == ' ' :
+ p += 1
+ if s[p:p+1] == '/':
+ ttype = 'end'
+ p += 1
+ while s[p:p+1] == ' ' :
+ p += 1
+ b = p
+ while s[p:p+1] not in ('>', '/', ' ', '"', "'", "\r", "\n") :
+ p += 1
+ tname=s[b:p].lower()
+ if tname == '!doctype':
+ tname = '!DOCTYPE'
+ # special cases
+ if tname in SPECIAL_HANDLING_TAGS:
+ ttype, backstep = SPECIAL_HANDLING_TAGS[tname]
+ tattr['special'] = s[p:backstep]
+ if ttype is None:
+ # parse any attributes
+ while s.find('=',p) != -1 :
+ while s[p:p+1] == ' ' :
+ p += 1
+ b = p
+ while s[p:p+1] != '=' :
+ p += 1
+ aname = s[b:p].lower()
+ aname = aname.rstrip(' ')
+ p += 1
+ while s[p:p+1] == ' ' :
+ p += 1
+ if s[p:p+1] in ('"', "'") :
+ p = p + 1
+ b = p
+ while s[p:p+1] not in ('"', "'") :
+ p += 1
+ val = s[b:p]
+ p += 1
+ else :
+ b = p
+ while s[p:p+1] not in ('>', '/', ' ') :
+ p += 1
+ val = s[b:p]
+ tattr[aname] = val
+ # label beginning and single tags
+ if ttype is None:
+ ttype = 'begin'
+ if s.find(' /',p) >= 0:
+ ttype = 'single_ext'
+ elif s.find('/',p) >= 0:
+ ttype = 'single'
+ return ttype, tname, tattr
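+    # A quick sketch (not from the original sources) of what parsetag
+    # returns for a few simple inputs:
+    #
+    #   parsetag('<a filepos=00012345>')  # -> ('begin', 'a', {'filepos': '00012345'})
+    #   parsetag('</p>')                  # -> ('end', 'p', {})
+    #   parsetag('<br/>')                 # -> ('single', 'br', {})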
+
+ # main routine to convert from mobi markup language to html
+ def processml(self):
+
+ # are these really needed
+ html_done = False
+ head_done = False
+ body_done = False
+
+ skip = False
+
+ htmlstr = ''
+ self.replace_page_breaks()
+ self.cleanup_html()
+
+ # now parse the cleaned up ml into standard xhtml
+ while True:
+
+ r = self.parseml()
+ if not r:
+ break
+
+ text, tag = r
+
+ if text:
+ if not skip:
+ htmlstr += text
+
+ if tag:
+ ttype, tname, tattr = self.parsetag(tag)
+
+ # If we run into a DTD or xml declarations inside the body ... bail.
+ if tname in SPECIAL_HANDLING_TAGS and tname != 'comment' and body_done:
+ htmlstr += '\n</body></html>'
+ break
+
+ # make sure self-closing tags actually self-close
+ if ttype == 'begin' and tname in SELF_CLOSING_TAGS:
+ ttype = 'single'
+
+ # make sure any end tags of self-closing tags are discarded
+ if ttype == 'end' and tname in SELF_CLOSING_TAGS:
+ continue
+
+                # remove embedded guide and references from old mobis
+ if tname in ('guide', 'ncx', 'reference') and ttype in ('begin', 'single', 'single_ext'):
+ tname = 'removeme:{0}'.format(tname)
+ tattr = None
+ if tname in ('guide', 'ncx', 'reference', 'font', 'span') and ttype == 'end':
+ if self.path[-1] == 'removeme:{0}'.format(tname):
+ tname = 'removeme:{0}'.format(tname)
+ tattr = None
+
+ # Get rid of font tags that only have a color attribute.
+ if tname == 'font' and ttype in ('begin', 'single', 'single_ext'):
+ if 'color' in tattr and len(tattr) == 1:
+ tname = 'removeme:{0}'.format(tname)
+ tattr = None
+
+ # Get rid of empty spans in the markup.
+ if tname == 'span' and ttype in ('begin', 'single', 'single_ext') and not len(tattr):
+ tname = 'removeme:{0}'.format(tname)
+
+ # need to handle fonts outside of the normal methods
+                # so font tags won't be added to self.path since we keep track
+ # of font tags separately with self.font_history
+ if tname == 'font' and ttype == 'begin':
+ # check for nested font start tags
+ if len(self.font_history) > 0 :
+ # inject a font end tag
+ taginfo = ('end', 'font', None)
+ htmlstr += self.processtag(taginfo)
+ self.font_history.append((ttype, tname, tattr))
+ # handle the current font start tag
+ taginfo = (ttype, tname, tattr)
+ htmlstr += self.processtag(taginfo)
+ continue
+
+ # check for nested font tags and unnest them
+ if tname == 'font' and ttype == 'end':
+ self.font_history.pop()
+ # handle this font end tag
+ taginfo = ('end', 'font', None)
+ htmlstr += self.processtag(taginfo)
+ # check if we were nested
+ if len(self.font_history) > 0:
+ # inject a copy of the most recent font start tag from history
+ taginfo = self.font_history[-1]
+ htmlstr += self.processtag(taginfo)
+ continue
+
+ # keep track of nesting path
+ if ttype == 'begin':
+ self.path.append(tname)
+ elif ttype == 'end':
+ if tname != self.path[-1]:
+ print('improper nesting: ', self.path, tname, ttype)
+ if tname not in self.path:
+ # handle case of end tag with no beginning by injecting empty begin tag
+ taginfo = ('begin', tname, None)
+ htmlstr += self.processtag(taginfo)
+ print(" - fixed by injecting empty start tag ", tname)
+ self.path.append(tname)
+ elif len(self.path) > 1 and tname == self.path[-2]:
+ # handle case of dangling missing end
+ taginfo = ('end', self.path[-1], None)
+ htmlstr += self.processtag(taginfo)
+ print(" - fixed by injecting end tag ", self.path[-1])
+ self.path.pop()
+ self.path.pop()
+
+                if tname.startswith('removeme:'):
+ if ttype in ('begin', 'single', 'single_ext'):
+ skip = True
+ else:
+ skip = False
+ else:
+ taginfo = (ttype, tname, tattr)
+ htmlstr += self.processtag(taginfo)
+
+ # handle potential issue of multiple html, head, and body sections
+ if tname == 'html' and ttype == 'begin' and not html_done:
+ htmlstr += '\n'
+ html_done = True
+
+ if tname == 'head' and ttype == 'begin' and not head_done:
+ htmlstr += '\n'
+ # also add in metadata and style link tags
+ htmlstr += self.meta
+ htmlstr += '<link href="styles.css" rel="stylesheet" type="text/css" />\n'
+ head_done = True
+
+ if tname == 'body' and ttype == 'begin' and not body_done:
+ htmlstr += '\n'
+ body_done = True
+
+ # handle issue of possibly missing html, head, and body tags
+ # I have not seen this but the original did something like this so ...
+ if not body_done:
+ htmlstr = '<body>\n' + htmlstr + '</body>\n'
+ if not head_done:
+ headstr = '<head>\n'
+ headstr += self.meta
+ headstr += '<link href="styles.css" rel="stylesheet" type="text/css" />\n'
+ headstr += '</head>\n'
+ htmlstr = headstr + htmlstr
+ if not html_done:
+ htmlstr = '<html>\n' + htmlstr + '</html>\n'
+
+ # finally add DOCTYPE info
+ htmlstr = '<?xml version="1.0"?>\n<!DOCTYPE HTML PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n' + htmlstr
+
+ css = self.base_css_rules
+ for cls, rule in self.tag_css_rules.items():
+ css += '.%s { %s }\n' % (cls, rule)
+
+ return (htmlstr, css, self.cssname)
+
+ def ensure_unit(self, raw, unit='px'):
+ if re.search(r'\d+$', raw) is not None:
+ raw += unit
+ return raw
+
+ # flatten possibly modified tag back to string
+ def taginfo_tostring(self, taginfo):
+ (ttype, tname, tattr) = taginfo
+ if ttype is None or tname is None:
+ return ''
+ if ttype == 'end':
+ return '</%s>' % tname
+ if ttype in SPECIAL_HANDLING_TYPES and tattr is not None and 'special' in tattr:
+ info = tattr['special']
+ if ttype == 'comment':
+ return '<%s %s-->' % (tname, info)
+ else:
+ return '<%s %s>' % (tname, info)
+ res = []
+ res.append('<%s' % tname)
+ if tattr is not None:
+ for key in tattr:
+ res.append(' %s="%s"' % (key, tattr[key]))
+ if ttype == 'single':
+ res.append('/>')
+ elif ttype == 'single_ext':
+ res.append(' />')
+ else :
+ res.append('>')
+ return "".join(res)
+
+    # routine to convert mobi ml tag attributes into xhtml attributes and styles
+ def processtag(self, taginfo):
+ # Converting mobi font sizes to numerics
+ size_map = {
+ 'xx-small': '1',
+ 'x-small': '2',
+ 'small': '3',
+ 'medium': '4',
+ 'large': '5',
+ 'x-large': '6',
+ 'xx-large': '7',
+ }
+
+ size_to_em_map = {
+ '1': '.65em',
+ '2': '.75em',
+ '3': '1em',
+ '4': '1.125em',
+ '5': '1.25em',
+ '6': '1.5em',
+ '7': '2em',
+ }
+
+ # current tag to work on
+ (ttype, tname, tattr) = taginfo
+ if not tattr:
+ tattr = {}
+
+ styles = []
+
+ if tname is None or tname.startswith('removeme'):
+ return ''
+
+ # have not seen an example of this yet so keep it here to be safe
+ # until this is better understood
+ if tname in ('country-region', 'place', 'placetype', 'placename',
+ 'state', 'city', 'street', 'address', 'content'):
+ tname = 'div' if tname == 'content' else 'span'
+            tattr.clear()  # must not mutate the dict while iterating over it
+
+ # handle general case of style, height, width, bgcolor in any tag
+ if 'style' in tattr:
+ style = tattr.pop('style').strip()
+ if style:
+ styles.append(style)
+
+ if 'align' in tattr:
+ align = tattr.pop('align').strip()
+ if align:
+ if tname in ('table', 'td', 'tr'):
+ pass
+ else:
+ styles.append('text-align: %s' % align)
+
+ if 'height' in tattr:
+ height = tattr.pop('height').strip()
+ if height and '<' not in height and '>' not in height and re.search(r'\d+', height):
+ if tname in ('table', 'td', 'tr'):
+ pass
+ elif tname == 'img':
+ tattr['height'] = height
+ else:
+ styles.append('margin-top: %s' % self.ensure_unit(height))
+
+ if 'width' in tattr:
+ width = tattr.pop('width').strip()
+ if width and re.search(r'\d+', width):
+ if tname in ('table', 'td', 'tr'):
+ pass
+ elif tname == 'img':
+ tattr['width'] = width
+ else:
+ styles.append('text-indent: %s' % self.ensure_unit(width))
+ if width.startswith('-'):
+ styles.append('margin-left: %s' % self.ensure_unit(width[1:]))
+
+ if 'bgcolor' in tattr:
+ # no proprietary html allowed
+ if tname == 'div':
+ del tattr['bgcolor']
+
+ elif tname == 'font':
+ # Change font tags to span tags
+ tname = 'span'
+ if ttype in ('begin', 'single', 'single_ext'):
+ # move the face attribute to css font-family
+ if 'face' in tattr:
+ face = tattr.pop('face').strip()
+ styles.append('font-family: "%s"' % face)
+
+ # Monitor the constantly changing font sizes, change them to ems and move
+ # them to css. The following will work for 'flat' font tags, but nested font tags
+ # will cause things to go wonky. Need to revert to the parent font tag's size
+ # when a closing tag is encountered.
+ if 'size' in tattr:
+ sz = tattr.pop('size').strip().lower()
+ try:
+ float(sz)
+ except ValueError:
+ if sz in size_map:
+ sz = size_map[sz]
+ else:
+ if sz.startswith('-') or sz.startswith('+'):
+ sz = self.current_font_size + float(sz)
+ if sz > 7:
+ sz = 7
+ elif sz < 1:
+ sz = 1
+ sz = str(int(sz))
+ styles.append('font-size: %s' % size_to_em_map[sz])
+ self.current_font_size = int(sz)
+
+ elif tname == 'img':
+ for attr in ('width', 'height'):
+ if attr in tattr:
+ val = tattr[attr]
+ if val.lower().endswith('em'):
+ try:
+ nval = float(val[:-2])
+ nval *= 16 * (168.451/72) # Assume this was set using the Kindle profile
+ tattr[attr] = "%dpx"%int(nval)
+                            except ValueError:
+ del tattr[attr]
+ elif val.lower().endswith('%'):
+ del tattr[attr]
+
+ # convert the anchor tags
+ if 'filepos-id' in tattr:
+ tattr['id'] = tattr.pop('filepos-id')
+ if 'name' in tattr and tattr['name'] != tattr['id']:
+ tattr['name'] = tattr['id']
+
+ if 'filepos' in tattr:
+ filepos = tattr.pop('filepos')
+ try:
+ tattr['href'] = "#filepos%d" % int(filepos)
+ except ValueError:
+ pass
+
+ if styles:
+ ncls = None
+ rule = '; '.join(styles)
+ for sel, srule in self.tag_css_rules.items():
+ if srule == rule:
+ ncls = sel
+ break
+ if ncls is None:
+ self.tag_css_rule_cnt += 1
+ ncls = 'rule_%d' % self.tag_css_rule_cnt
+ self.tag_css_rules[ncls] = rule
+ cls = tattr.get('class', '')
+ cls = cls + (' ' if cls else '') + ncls
+ tattr['class'] = cls
+
+ # convert updated tag back to string representation
+ if len(tattr) == 0:
+ tattr = None
+ taginfo = (ttype, tname, tattr)
+ return self.taginfo_tostring(taginfo)
+
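+# Sketch of the font-to-span conversion performed by processtag (illustrative
+# only; `mlc` is a hypothetical converter instance and the generated class
+# name depends on converter state):
+#
+#   mlc.processtag(('begin', 'font', {'size': '5', 'face': 'serif'}))
+#   # -> '<span class="rule_1">', with the CSS rule
+#   #    .rule_1 { font-family: "serif"; font-size: 1.25em }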
+''' main() is only left in for testing outside of the plugin '''
+
+def main(argv=sys.argv):
+ if len(argv) != 2:
+ return 1
+ else:
+ infile = argv[1]
+
+ try:
+ print('Converting Mobi Markup Language to XHTML')
+ mlc = MobiMLConverter(infile)
+ print('Processing ...')
+ htmlstr, css, cssname = mlc.processml()
+ outname = infile.rsplit('.',1)[0] + '_converted.html'
+        with open(outname, 'w') as f:
+            f.write(htmlstr)
+        with open(cssname, 'w') as f:
+            f.write(css)
+ print('Completed')
+ print('XHTML version of book can be found at: ' + outname)
+
+ except ValueError as e:
+ print("Error: %s" % e)
+ return 1
+
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/src/epy_reader/tools/KindleUnpack/unipath.py b/src/epy_reader/tools/KindleUnpack/unipath.py
new file mode 100755
index 0000000..2416279
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/unipath.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+# Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without modification,
+# are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this list of
+# conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice, this list
+# of conditions and the following disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
+# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+from .compatibility_utils import PY2, text_type, binary_type
+
+import sys
+import os
+
+# utility routines to convert all paths to be full unicode
+
+# Under Python 2, if given a bytestring, try to decode it to unicode using sys.getfilesystemencoding()
+# Under Python 3, if given bytes, likewise try to decode them to unicode using sys.getfilesystemencoding()
+
+# Mac OS X and Windows will happily support full unicode paths
+# Linux can support full unicode paths but allows arbitrary byte paths which may be inconsistent with unicode
+
+fsencoding = sys.getfilesystemencoding()
+
+def pathof(s, enc=fsencoding):
+ if s is None:
+ return None
+ if isinstance(s, text_type):
+ return s
+ if isinstance(s, binary_type):
+ try:
+ return s.decode(enc)
+        except UnicodeDecodeError:
+ pass
+ return s
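+# Illustrative sketch: byte paths are decoded to unicode with the filesystem
+# encoding, while unicode paths pass through unchanged.
+#
+#   pathof(b'book.epub')  # -> u'book.epub' (decoded via fsencoding)
+#   pathof(u'book.epub')  # -> u'book.epub' (returned as-is)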
+
+def exists(s):
+ return os.path.exists(pathof(s))
+
+def isfile(s):
+ return os.path.isfile(pathof(s))
+
+def isdir(s):
+ return os.path.isdir(pathof(s))
+
+def mkdir(s):
+ return os.mkdir(pathof(s))
+
+def listdir(s):
+ rv = []
+ for file in os.listdir(pathof(s)):
+ rv.append(pathof(file))
+ return rv
+
+def getcwd():
+ if PY2:
+ return os.getcwdu()
+ return os.getcwd()
+
+def walk(top):
+ top = pathof(top)
+ rv = []
+ for base, dnames, names in os.walk(top):
+ base = pathof(base)
+ for name in names:
+ name = pathof(name)
+ rv.append(relpath(os.path.join(base, name), top))
+ return rv
+
+def relpath(path, start=None):
+ return os.path.relpath(pathof(path) , pathof(start))
+
+def abspath(path):
+ return os.path.abspath(pathof(path))
diff --git a/src/epy_reader/tools/KindleUnpack/unpack_structure.py b/src/epy_reader/tools/KindleUnpack/unpack_structure.py
new file mode 100644
index 0000000..2e66eb8
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/unpack_structure.py
@@ -0,0 +1,167 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+from .compatibility_utils import text_type
+
+from . import unipath
+from .unipath import pathof
+
+DUMP = False
+""" Set to True to dump all possible information. """
+
+import os
+
+import re
+# note: in python 3, re requires the pattern to be the exact same type as the data
+# being searched, so byte patterns (b'') are used below to match against byte data
+
+import zipfile
+import binascii
+from .mobi_utils import mangle_fonts
+
+class unpackException(Exception):
+ pass
+
+class ZipInfo(zipfile.ZipInfo):
+
+    def __init__(self, *args, **kwargs):
+        # pop compress_type before calling the base constructor (which does not
+        # accept it) and guard against it being absent to avoid an UnboundLocalError
+        compress_type = kwargs.pop('compress_type', None)
+        super(ZipInfo, self).__init__(*args, **kwargs)
+        if compress_type is not None:
+            self.compress_type = compress_type
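+# Note: the stock zipfile.ZipInfo constructor does not accept a compress_type
+# keyword, which is why this subclass exists; it enables calls like:
+#
+#   nzinfo = ZipInfo('mimetype', compress_type=zipfile.ZIP_STORED)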
+
+class fileNames:
+
+ def __init__(self, infile, outdir):
+ self.infile = infile
+ self.outdir = outdir
+ if not unipath.exists(self.outdir):
+ unipath.mkdir(self.outdir)
+ self.mobi7dir = os.path.join(self.outdir,'mobi7')
+ if not unipath.exists(self.mobi7dir):
+ unipath.mkdir(self.mobi7dir)
+ self.imgdir = os.path.join(self.mobi7dir, 'Images')
+ if not unipath.exists(self.imgdir):
+ unipath.mkdir(self.imgdir)
+ self.hdimgdir = os.path.join(self.outdir,'HDImages')
+ if not unipath.exists(self.hdimgdir):
+ unipath.mkdir(self.hdimgdir)
+ self.outbase = os.path.join(self.outdir, os.path.splitext(os.path.split(infile)[1])[0])
+
+ def getInputFileBasename(self):
+ return os.path.splitext(os.path.basename(self.infile))[0]
+
+ def makeK8Struct(self):
+ self.k8dir = os.path.join(self.outdir,'mobi8')
+ if not unipath.exists(self.k8dir):
+ unipath.mkdir(self.k8dir)
+ self.k8metainf = os.path.join(self.k8dir,'META-INF')
+ if not unipath.exists(self.k8metainf):
+ unipath.mkdir(self.k8metainf)
+ self.k8oebps = os.path.join(self.k8dir,'OEBPS')
+ if not unipath.exists(self.k8oebps):
+ unipath.mkdir(self.k8oebps)
+ self.k8images = os.path.join(self.k8oebps,'Images')
+ if not unipath.exists(self.k8images):
+ unipath.mkdir(self.k8images)
+ self.k8fonts = os.path.join(self.k8oebps,'Fonts')
+ if not unipath.exists(self.k8fonts):
+ unipath.mkdir(self.k8fonts)
+ self.k8styles = os.path.join(self.k8oebps,'Styles')
+ if not unipath.exists(self.k8styles):
+ unipath.mkdir(self.k8styles)
+ self.k8text = os.path.join(self.k8oebps,'Text')
+ if not unipath.exists(self.k8text):
+ unipath.mkdir(self.k8text)
+
+ # recursive zip creation support routine
+ def zipUpDir(self, myzip, tdir, localname):
+ currentdir = tdir
+ if localname != "":
+ currentdir = os.path.join(currentdir,localname)
+        names = unipath.listdir(currentdir)  # renamed from 'list' to avoid shadowing the builtin
+        for file in names:
+ afilename = file
+ localfilePath = os.path.join(localname, afilename)
+ realfilePath = os.path.join(currentdir,file)
+ if unipath.isfile(realfilePath):
+ myzip.write(pathof(realfilePath), pathof(localfilePath), zipfile.ZIP_DEFLATED)
+ elif unipath.isdir(realfilePath):
+ self.zipUpDir(myzip, tdir, localfilePath)
+
+ def makeEPUB(self, usedmap, obfuscate_data, uid):
+ bname = os.path.join(self.k8dir, self.getInputFileBasename() + '.epub')
+ # Create an encryption key for Adobe font obfuscation
+ # based on the epub's uid
+ if isinstance(uid,text_type):
+ uid = uid.encode('ascii')
+ if obfuscate_data:
+ key = re.sub(br'[^a-fA-F0-9]', b'', uid)
+ key = binascii.unhexlify((key + key)[:32])
+
+ # copy over all images and fonts that are actually used in the ebook
+ # and remove all font files from mobi7 since not supported
+ imgnames = unipath.listdir(self.imgdir)
+ for name in imgnames:
+ if usedmap.get(name,'not used') == 'used':
+ filein = os.path.join(self.imgdir,name)
+ if name.endswith(".ttf"):
+ fileout = os.path.join(self.k8fonts,name)
+ elif name.endswith(".otf"):
+ fileout = os.path.join(self.k8fonts,name)
+ elif name.endswith(".failed"):
+ fileout = os.path.join(self.k8fonts,name)
+ else:
+ fileout = os.path.join(self.k8images,name)
+ data = b''
+ with open(pathof(filein),'rb') as f:
+ data = f.read()
+ if obfuscate_data:
+ if name in obfuscate_data:
+ data = mangle_fonts(key, data)
+                with open(pathof(fileout),'wb') as f:
+                    f.write(data)
+ if name.endswith(".ttf") or name.endswith(".otf"):
+ os.remove(pathof(filein))
+
+ # opf file name hard coded to "content.opf"
+ container = '<?xml version="1.0" encoding="UTF-8"?>\n'
+ container += '<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">\n'
+ container += ' <rootfiles>\n'
+        container += '    <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>\n'
+ container += ' </rootfiles>\n</container>\n'
+ fileout = os.path.join(self.k8metainf,'container.xml')
+ with open(pathof(fileout),'wb') as f:
+ f.write(container.encode('utf-8'))
+
+ if obfuscate_data:
+ encryption = '<encryption xmlns="urn:oasis:names:tc:opendocument:xmlns:container" \
+xmlns:enc="http://www.w3.org/2001/04/xmlenc#" xmlns:deenc="http://ns.adobe.com/digitaleditions/enc">\n'
+ for font in obfuscate_data:
+ encryption += ' <enc:EncryptedData>\n'
+ encryption += ' <enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>\n'
+ encryption += ' <enc:CipherData>\n'
+ encryption += ' <enc:CipherReference URI="OEBPS/Fonts/' + font + '"/>\n'
+ encryption += ' </enc:CipherData>\n'
+ encryption += ' </enc:EncryptedData>\n'
+ encryption += '</encryption>\n'
+ fileout = os.path.join(self.k8metainf,'encryption.xml')
+ with open(pathof(fileout),'wb') as f:
+ f.write(encryption.encode('utf-8'))
+
+ # ready to build epub
+ self.outzip = zipfile.ZipFile(pathof(bname), 'w')
+
+ # add the mimetype file uncompressed
+ mimetype = b'application/epub+zip'
+ fileout = os.path.join(self.k8dir,'mimetype')
+ with open(pathof(fileout),'wb') as f:
+ f.write(mimetype)
+ nzinfo = ZipInfo('mimetype', compress_type=zipfile.ZIP_STORED)
+ nzinfo.external_attr = 0o600 << 16 # make this a normal file
+ self.outzip.writestr(nzinfo, mimetype)
+ self.zipUpDir(self.outzip,self.k8dir,'META-INF')
+ self.zipUpDir(self.outzip,self.k8dir,'OEBPS')
+ self.outzip.close()
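+
+# Hedged sketch of the overall flow (illustrative; usedmap, obfuscate_data and
+# uid are supplied by the calling code, e.g. kindleunpack.py):
+#
+#   files = fileNames('book.azw3', 'outdir')      # creates outdir, mobi7/Images and HDImages
+#   files.makeK8Struct()                          # creates mobi8/ with META-INF/ and OEBPS/ subtree
+#   files.makeEPUB(usedmap, obfuscate_data, uid)  # writes outdir/mobi8/book.epub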