author | Benawi Adha <benawiadha@gmail.com> | 2022-10-02 21:22:38 +0700
committer | Benawi Adha <benawiadha@gmail.com> | 2022-10-02 21:22:38 +0700
commit | 258c30d2e088cd4ab091a53794da3f93af79915d (patch)
tree | f49340bf565deb20c730358af74a01bcc231de53 /src/epy_reader/tools/KindleUnpack
parent | d43533f01d9d5baf5f78b71f832641382bd5962a (diff)
download | epy-258c30d2e088cd4ab091a53794da3f93af79915d.tar.gz
Major refactor: break down the epy.py script into a package project structure for easier development
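For context, the refactor turns the single epy.py script into an installable package under src/epy_reader/. A minimal sketch of the entry-point wiring such a layout implies (hypothetical: the module and function names below are illustrative, not taken from this commit):

# src/epy_reader/__main__.py -- hypothetical sketch of the script-to-package wiring
from epy_reader import main  # assumed: the package exposes a main() entry function

if __name__ == "__main__":
    main()  # lets the package run via `python -m epy_reader` alongside the pyproject console script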
Squashed commit of the following:
commit 01309b961a4ab32394bff0d90949b57435dfda47
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sun Oct 2 21:15:04 2022 +0700
Fix missing objects
commit aab2e773c30b255c81b1250b3b20967d5da40338
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sun Oct 2 21:09:31 2022 +0700
Update README.md
commit d4e98926bcd9b00ce0410ad71249d24e6315abc5
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sun Oct 2 21:07:28 2022 +0700
Add keywords in pyproject.toml
commit 432055af8245560a3ff2e046aef0b4e87da44930
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sun Oct 2 21:04:34 2022 +0700
Bump version and deprecate setup.py
commit 51dd15aab8f8ff5996f822f8378e813f0b9fb80d
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sun Oct 2 20:56:38 2022 +0700
Formatting
commit 81fb35e3b6fa0e27d79ef1da77202ed81eb99500
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sun Oct 2 20:55:08 2022 +0700
Fix speakers module
commit 3b852e7c59b38d5a28520038e35f50a95270d2f1
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 20:52:46 2022 +0700
Fix circular import
commit 061e8a2649dabacd28a9e2f972559475316c654c
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 20:39:27 2022 +0700
Run formatting
commit abc2d0ab156992c63dc04745d14a69679a60accb
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 20:39:00 2022 +0700
Update isort and black config in pyproject
commit 5dc2e41bab5b997bd719bdc1561eb51ba0c17a83
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 20:31:00 2022 +0700
Add app Config
commit ed485a2ea8281585bf86dc5772f0c6dd9c803cc4
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 20:23:02 2022 +0700
Update debugpy script
commit 68b0553dd4d63eb4b847132c68ea4018587fa8ec
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 20:14:11 2022 +0700
Connect reader to main script
commit 63c3dd176f18a784a4ed2e88aa72b13d1c2b0990
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 20:11:17 2022 +0700
Implement reader
commit ce5eec8fb4e1db3870a16a07541365cd777d6c4c
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 19:29:49 2022 +0700
Fix script in pyproject.toml
commit 941e8e49f1593731fb582d92084206772b3f0442
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 19:28:39 2022 +0700
Rename modules
commit 5a3e7f766aee774c09b3b5336f3a2968e9cb1d0c
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 19:28:20 2022 +0700
Rename tool method
commit 3c0503ff475cb7eff8b12d3be0bda7a38efe1072
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 19:27:03 2022 +0700
Add ebooks lib
commit b5f71c3296a7d6f36454f6e1cbe84e15a45092ee
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 17:25:11 2022 +0700
Initial reorganization
Diffstat (limited to 'src/epy_reader/tools/KindleUnpack')
21 files changed, 7315 insertions, 0 deletions
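The diff below vendors KindleUnpack into src/epy_reader/tools/. For orientation before the full listing, here is a minimal usage sketch of its top-level API; unpackBook() and its signature appear verbatim in kindleunpack.py below, while the dotted import path is an assumption derived from the file locations in this diff:

# Sketch only, not part of the commit: driving the vendored unpacker.
# unpackBook(infile, outdir, apnxfile=None, epubver='2', use_hd=False, ...)
# is defined in kindleunpack.py in this diff.
from epy_reader.tools.KindleUnpack.kindleunpack import unpackBook

# Unpack an unencrypted Kindle/Mobipocket book into an epub-like folder tree.
unpackBook("book.azw3", "book_unpacked", epubver="2")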
diff --git a/src/epy_reader/tools/KindleUnpack/__init__.py b/src/epy_reader/tools/KindleUnpack/__init__.py new file mode 100644 index 0000000..0077258 --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/__init__.py @@ -0,0 +1,2 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai diff --git a/src/epy_reader/tools/KindleUnpack/compatibility_utils.py b/src/epy_reader/tools/KindleUnpack/compatibility_utils.py new file mode 100755 index 0000000..c46c0bb --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/compatibility_utils.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +# Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this list of +# conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, this list +# of conditions and the following disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from __future__ import unicode_literals, division, absolute_import, print_function + +import sys +import codecs + +PY2 = sys.version_info[0] == 2 +PY3 = sys.version_info[0] == 3 + +iswindows = sys.platform.startswith('win') + +try: + from urllib.parse import unquote +except ImportError: + from urllib import unquote + +if PY2: + from HTMLParser import HTMLParser + _h = HTMLParser() +elif sys.version_info[1] < 4: + import html.parser + _h = html.parser.HTMLParser() +else: + import html as _h + +if PY3: + text_type = str + binary_type = bytes + # if will be printing arbitraty binary data to stdout on python 3 + # sys.stdin = sys.stdin.detach() + # sys.stdout = sys.stdout.detach() + # sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach()) +else: + range = xrange + text_type = unicode + binary_type = str + # if will be printing unicode under python 2 need to protect + # against sys.stdout.encoding being None stupidly forcing forcing ascii encoding of unicode + # sys.stdout = codecs.getwriter("utf-8")(sys.stdout) + # alternatively set environment variable as follows **before** launching python: export PYTHONIOENCODING=UTF-8 + +# NOTE: Python 3 is completely broken when accessing single bytes in bytes strings +# (and they amazingly claim by design and no bug!) 
+ +# To illustrate: this works for unicode in Python 3 and for all Python 2.X for both bytestrings and unicode +# >>> o = '123456789' +# >>> o[-3] +# '7' +# >>> type(o[-3]) +# <class 'str'> +# >>> type(o) +# <class 'str'> + +# Unfortunately, this is what Python 3 does for no sane reason and only for bytestrings +# >>> o = b'123456789' +# >>> o[-3] +# 55 +# >>> type(o[-3]) +# <class 'int'> +# >>> type(o) +# <class 'bytes'> + +# This mind boggling behaviour also happens when indexing a bytestring and/or +# iteratoring over a bytestring. In other words it will return an int but not +# the byte itself!!!!!!! + +# The only way to access a single byte as a byte in bytestring and get the byte in both +# Python 2 and Python 3 is to use a slice + +# This problem is so common there are horrible hacks floating around the net to **try** +# to work around it, so that code that works on both Python 2 and Python 3 is possible. + +# So in order to write code that works on both Python 2 and Python 3 +# if you index or access a single byte and want its ord() then use the bord() function. +# If instead you want it as a single character byte use the bchar() function +# both of which are defined below. + +if PY3: + # Also Note: if decode a bytestring using 'latin-1' (or any other full range 0-255 encoding) + # in place of ascii you will get a byte value to half-word or integer value + # one-to-one mapping (in the 0 - 255 range) + + def bchr(s): + return bytes([s]) + + def bstr(s): + if isinstance(s, str): + return bytes(s, 'latin-1') + else: + return bytes(s) + + def bord(s): + return s + + def bchar(s): + return bytes([s]) + +else: + def bchr(s): + return chr(s) + + def bstr(s): + return str(s) + + def bord(s): + return ord(s) + + def bchar(s): + return s + +if PY3: + # list-producing versions of the major Python iterating functions + def lrange(*args, **kwargs): + return list(range(*args, **kwargs)) + + def lzip(*args, **kwargs): + return list(zip(*args, **kwargs)) + + def lmap(*args, **kwargs): + return list(map(*args, **kwargs)) + + def lfilter(*args, **kwargs): + return list(filter(*args, **kwargs)) +else: + import __builtin__ + # Python 2-builtin ranges produce lists + lrange = __builtin__.range + lzip = __builtin__.zip + lmap = __builtin__.map + lfilter = __builtin__.filter + +# In Python 3 you can no longer use .encode('hex') on a bytestring +# instead use the following on both platforms +import binascii +def hexlify(bdata): + return (binascii.hexlify(bdata)).decode('ascii') + +# If you: import struct +# Note: struct pack, unpack, unpack_from all *require* bytestring format +# data all the way up to at least Python 2.7.5, Python 3 is okay with either + +# If you: import re +# note: Python 3 "re" requires the pattern to be the exact same type as the data to be +# searched ... 
but u"" is not allowed for the pattern itself only b"" +# Python 2.X allows the pattern to be any type and converts it to match the data +# and returns the same type as the data + +# convert string to be utf-8 encoded +def utf8_str(p, enc='utf-8'): + if p is None: + return None + if isinstance(p, text_type): + return p.encode('utf-8') + if enc != 'utf-8': + return p.decode(enc).encode('utf-8') + return p + +# convert string to be unicode encoded +def unicode_str(p, enc='utf-8'): + if p is None: + return None + if isinstance(p, text_type): + return p + return p.decode(enc) + +ASCII_CHARS = set(chr(x) for x in range(128)) +URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ' + 'abcdefghijklmnopqrstuvwxyz' + '0123456789' '#' '_.-/~') +IRI_UNSAFE = ASCII_CHARS - URL_SAFE + +# returns a quoted IRI (not a URI) +def quoteurl(href): + if isinstance(href,binary_type): + href = href.decode('utf-8') + result = [] + for char in href: + if char in IRI_UNSAFE: + char = "%%%02x" % ord(char) + result.append(char) + return ''.join(result) + +# unquotes url/iri +def unquoteurl(href): + if isinstance(href,binary_type): + href = href.decode('utf-8') + href = unquote(href) + return href + +# unescape html +def unescapeit(sval): + return _h.unescape(sval) + +# Python 2.X commandline parsing under Windows has been horribly broken for years! +# Use the following code to emulate full unicode commandline parsing on Python 2 +# ie. To get sys.argv arguments and properly encode them as unicode + +def unicode_argv(): + global iswindows + global PY3 + if PY3: + return sys.argv + if iswindows: + # Versions 2.x of Python don't support Unicode in sys.argv on + # Windows, with the underlying Windows API instead replacing multi-byte + # characters with '?'. So use shell32.GetCommandLineArgvW to get sys.argv + # as a list of Unicode strings + from ctypes import POINTER, byref, cdll, c_int, windll + from ctypes.wintypes import LPCWSTR, LPWSTR + + GetCommandLineW = cdll.kernel32.GetCommandLineW + GetCommandLineW.argtypes = [] + GetCommandLineW.restype = LPCWSTR + + CommandLineToArgvW = windll.shell32.CommandLineToArgvW + CommandLineToArgvW.argtypes = [LPCWSTR, POINTER(c_int)] + CommandLineToArgvW.restype = POINTER(LPWSTR) + + cmd = GetCommandLineW() + argc = c_int(0) + argv = CommandLineToArgvW(cmd, byref(argc)) + if argc.value > 0: + # Remove Python executable and commands if present + start = argc.value - len(sys.argv) + return [argv[i] for i in + range(start, argc.value)] + # this should never happen + return None + else: + argv = [] + argvencoding = sys.stdin.encoding + if argvencoding is None: + argvencoding = sys.getfilesystemencoding() + if argvencoding is None: + argvencoding = 'utf-8' + for arg in sys.argv: + if isinstance(arg, text_type): + argv.append(arg) + else: + argv.append(arg.decode(argvencoding)) + return argv + + +# Python 2.X is broken in that it does not recognize CP65001 as UTF-8 +def add_cp65001_codec(): + if PY2: + try: + codecs.lookup('cp65001') + except LookupError: + codecs.register( + lambda name: name == 'cp65001' and codecs.lookup('utf-8') or None) + return diff --git a/src/epy_reader/tools/KindleUnpack/kindleunpack.py b/src/epy_reader/tools/KindleUnpack/kindleunpack.py new file mode 100644 index 0000000..317941a --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/kindleunpack.py @@ -0,0 +1,1029 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +import os + +__path__ 
= ["lib", os.path.dirname(os.path.realpath(__file__)), "kindleunpack"] + +import sys +import codecs +import traceback + +from .compatibility_utils import PY2, binary_type, utf8_str, unicode_str +from .compatibility_utils import unicode_argv, add_cp65001_codec +from .compatibility_utils import hexlify + +add_cp65001_codec() + +from .unipath import pathof + +if PY2: + range = xrange + # since will be printing unicode under python 2 need to protect + # against sys.stdout.encoding being None stupidly forcing forcing ascii encoding + if sys.stdout.encoding is None: + sys.stdout = codecs.getwriter("utf-8")(sys.stdout) + else: + encoding = sys.stdout.encoding + sys.stdout = codecs.getwriter(encoding)(sys.stdout) + +# Changelog +# 0.11 - Version by adamselene +# 0.11pd - Tweaked version by pdurrant +# 0.12 - extracts pictures too, and all into a folder. +# 0.13 - added back in optional output dir for those who don't want it based on infile +# 0.14 - auto flush stdout and wrapped in main, added proper return codes +# 0.15 - added support for metadata +# 0.16 - metadata now starting to be output as an opf file (PD) +# 0.17 - Also created tweaked text as source for Mobipocket Creator +# 0.18 - removed raw mobi file completely but kept _meta.html file for ease of conversion +# 0.19 - added in metadata for ASIN, Updated Title and Rights to the opf +# 0.20 - remove _meta.html since no longer needed +# 0.21 - Fixed some typos in the opf output, and also updated handling +# of test for trailing data/multibyte characters +# 0.22 - Fixed problem with > 9 images +# 0.23 - Now output Start guide item +# 0.24 - Set firstaddl value for 'TEXtREAd' +# 0.25 - Now added character set metadata to html file for utf-8 files. +# 0.26 - Dictionary support added. Image handling speed improved. +# For huge files create temp files to speed up decoding. +# Language decoding fixed. Metadata is now converted to utf-8 when written to opf file. +# 0.27 - Add idx:entry attribute "scriptable" if dictionary contains entry length tags. +# Don't save non-image sections as images. Extract and save source zip file +# included by kindlegen as kindlegensrc.zip. +# 0.28 - Added back correct image file name extensions, created FastConcat class to simplify and clean up +# 0.29 - Metadata handling reworked, multiple entries of the same type are now supported. +# Several missing types added. +# FastConcat class has been removed as in-memory handling with lists is faster, even for huge files. +# 0.30 - Add support for outputting **all** metadata values - encode content with hex if of unknown type +# 0.31 - Now supports Print Replica ebooks, outputting PDF and mysterious data sections +# 0.32 - Now supports NCX file extraction/building. +# Overhauled the structure of mobiunpack to be more class oriented. 
+# 0.33 - Split Classes ito separate files and added prelim support for KF8 format eBooks +# 0.34 - Improved KF8 support, guide support, bug fixes +# 0.35 - Added splitting combo mobi7/mobi8 into standalone mobi7 and mobi8 files +# Also handle mobi8-only file properly +# 0.36 - very minor changes to support KF8 mobis with no flow items, no ncx, etc +# 0.37 - separate output, add command line switches to control, interface to Mobi_Unpack.pyw +# 0.38 - improve split function by resetting flags properly, fix bug in Thumbnail Images +# 0.39 - improve split function so that ToC info is not lost for standalone mobi8s +# 0.40 - make mobi7 split match official versions, add support for graphic novel metadata, +# improve debug for KF8 +# 0.41 - fix when StartOffset set to 0xffffffff, fix to work with older mobi versions, +# fix other minor metadata issues +# 0.42 - add new class interface to allow it to integrate more easily with internal calibre routines +# 0.43 - bug fixes for new class interface +# 0.44 - more bug fixes and fix for potnetial bug caused by not properly closing created zip archive +# 0.45 - sync to version in the new Mobi_Unpack plugin +# 0.46 - fixes for: obfuscated fonts, improper toc links and ncx, add support for opentype fonts +# 0.47 - minor opf improvements +# 0.48 - ncx link fixes +# 0.49 - use azw3 when splitting mobis +# 0.50 - unknown change +# 0.51 - fix for converting filepos links to hrefs, Added GPL3 notice, made KF8 extension just '.azw3' +# 0.52 - fix for cover metadata (no support for Mobipocket Creator) +# 0.53 - fix for proper identification of embedded fonts, added new metadata items +# 0.54 - Added error-handling so wonky embedded fonts don't bomb the whole unpack process, +# entity escape KF8 metadata to ensure valid OPF. +# 0.55 Strip extra StartOffset EXTH from the mobi8 header when splitting, keeping only the relevant one +# For mobi8 files, don't generate duplicate guide entries from the metadata if we could extract one +# from the OTH table. +# 0.56 - Added further entity escaping of OPF text. +# Allow unicode string file paths to be passed as arguments to the unpackBook method without blowing up later +# when the attempt to "re"-unicode a portion of that filename occurs in the process_all_mobi_headers method. +# 0.57 - Fixed eror when splitting Preview files downloaded from KDP website +# 0.58 - Output original kindlegen build log ('CMET' record) if included in the package. +# 0.58 - Include and extend functionality of DumpMobiHeader, replacing DEBUG with DUMP +# 0.59 - Much added DUMP functionality, including full dumping and descriptions of sections +# 0.60 - Bug fixes in opf, div tables, bad links, page breaks, section descriptions +# - plus a number of other bug fixed that were found by Sergey Dubinets +# - fixs for file/paths that require full unicode to work properly +# - replace subprocess with multiprocessing to remove need for unbuffered stdout +# 0.61 - renamed to be KindleUnpack and more unicode/utf-8 path bug fixes and other minor fixes +# 0.62 - fix for multiprocessing on Windows, split fixes, opf improvements +# 0.63 - Modified to process right to left page progression books properly. +# - Added some id_map_strings and RESC section processing; metadata and +# - spine in the RESC are integrated partly to content.opf. +# 0.63a- Separated K8 RESC processor to an individual file. Bug fixes. Added cover page creation. 
+# 0.64 - minor bug fixes to more properly handle unicode command lines, and support for more jpeg types +# 0.64a- Modifed to handle something irregular mobi and azw3 files. +# 0.64b- Modifed to create k8resc.spine for no RECS files. +# 0.65 - Bug fixes to shorten title and remove epub3 "properties" to make the output epub2 compliant +# 0.65a- Bug fixes to extract RESC section correctly, to prevent item id confliction +# - and to process multiline comments in RESC. +# 0.66 - Bug fix to deal with missing first resource information sometimes generated by calibre +# 0.66a- Fixed minor bugs, which probably do not affect the output anything +# 0.67 - Fixed Mobi Split functionality bug with azw3 images not being properly copied +# 0.68 - preliminary support for handling PAGE sections to create page-map.xml +# 0.69 - preliminary support for CONT and CRES for HD Images +# 0.70 - preliminary support for decoding apnx files when used with azw3 ebooks +# 0.71 - extensive refactoring of kindleunpack.py to make it more manageable +# 0.72 - many bug fixes from tkeo: fix pageProcessing, fix print replica, fix resc usage, fix font mangling, etc. +# 0.72a- fix for still broken PrintReplica support +# 0.72b- preview for primary epub3 support. A parameter epubver(default='2') is added to process_all_mobi_headers(), unpackBook(). +# 0.72c- preview for apnx page support +# 0.72d- more bugs fixed in preview features, much improved GUI with ability to dynaically grow the Log Window with preference support +# 0.72e- more bug fixes, Tk GUI adds support for epub version and HDImage use +# 0.72f- more bug fixes, implement use hd images if present +# 0.72g- minor bug fixes and cleanups from tkeo +# 0.72h- updated mobi_header and mobi_k8proc to use the correct fragment and guide terms in place of div and other +# to better match the terms that both Calibre and Amazon use internally to their own software +# 0.72x- very experimental conversion to use new mobi_k8resc.py and some of its associated changes +# 0.72y- more changes to simplify and integrate in epub3 support in a simpler manner +# 0.72z- remove redundancy in mobi_opf.py and bug fixes for mobi_k8resc.py +# 0.73 faster mobi split, numerous bug fixes in mobi_k8proc, mobi_header, mobi_opf, mobi_k8resc, etc +# 0.74 added refines metadata, fixed language code in ncx and title in nav, added support for opf: from refines +# 0.75 much improved dictioanry support including support for multiple inflection sections, minor mobi_opf fixes +# 0.76 pre-release version only fix name related issues in opf by not using original file name in mobi7 +# 0.77 bug fix for unpacking HDImages with included Fonts +# 0.80 converted to work with both python 2.7 and Python 3.3 and later +# 0.81 various fixes +# 0.82 Handle calibre-generated mobis that can have skeletons with no fragments +# 0.83 Fix header item 114 being mistakenly treated as a string instead of a value + +DUMP = False +""" Set to True to dump all possible information. """ + +WRITE_RAW_DATA = False +""" Set to True to create additional files with raw data for debugging/reverse engineering. """ + +SPLIT_COMBO_MOBIS = False +""" Set to True to split combination mobis into mobi7 and mobi8 pieces. """ + +CREATE_COVER_PAGE = True # XXX experimental +""" Create and insert a cover xhtml page. """ + +EOF_RECORD = b'\xe9\x8e' + b'\r\n' +""" The EOF record content. 
""" + +TERMINATION_INDICATOR1 = b'\x00' +TERMINATION_INDICATOR2 = b'\x00\x00' +TERMINATION_INDICATOR3 = b'\x00\x00\x00' + +KINDLEGENSRC_FILENAME = "kindlegensrc.zip" +""" The name for the kindlegen source archive. """ + +KINDLEGENLOG_FILENAME = "kindlegenbuild.log" +""" The name for the kindlegen build log. """ + +K8_BOUNDARY = b'BOUNDARY' +""" The section data that divides K8 mobi ebooks. """ + +import os +import struct +import re +import zlib +import getopt + +class unpackException(Exception): + pass + + +# import the kindleunpack support libraries +from .unpack_structure import fileNames +from .mobi_sectioner import Sectionizer, describe +from .mobi_header import MobiHeader, dump_contexth +from .mobi_utils import toBase32 +from .mobi_opf import OPFProcessor +from .mobi_html import HTMLProcessor, XHTMLK8Processor +from .mobi_ncx import ncxExtract +from .mobi_k8proc import K8Processor +from .mobi_split import mobi_split +from .mobi_k8resc import K8RESCProcessor +from .mobi_nav import NAVProcessor +from .mobi_cover import CoverProcessor, get_image_type +from .mobi_pagemap import PageMapProcessor +from .mobi_dict import dictSupport + + +def processSRCS(i, files, rscnames, sect, data): + # extract the source zip archive and save it. + print("File contains kindlegen source archive, extracting as %s" % KINDLEGENSRC_FILENAME) + srcname = os.path.join(files.outdir, KINDLEGENSRC_FILENAME) + with open(pathof(srcname), 'wb') as f: + f.write(data[16:]) + rscnames.append(None) + sect.setsectiondescription(i,"Zipped Source Files") + return rscnames + + +def processPAGE(i, files, rscnames, sect, data, mh, pagemapproc): + # process any page map information and create an apnx file + pagemapproc = PageMapProcessor(mh, data) + rscnames.append(None) + sect.setsectiondescription(i,"PageMap") + apnx_meta = {} + acr = sect.palmname.decode('latin-1').rstrip('\x00') + apnx_meta['acr'] = acr + apnx_meta['cdeType'] = mh.metadata['cdeType'][0] + apnx_meta['contentGuid'] = hex(int(mh.metadata['UniqueID'][0]))[2:] + apnx_meta['asin'] = mh.metadata['ASIN'][0] + apnx_meta['pageMap'] = pagemapproc.getPageMap() + if mh.version == 8: + apnx_meta['format'] = 'MOBI_8' + else: + apnx_meta['format'] = 'MOBI_7' + apnx_data = pagemapproc.generateAPNX(apnx_meta) + if mh.isK8(): + outname = os.path.join(files.outdir, 'mobi8-'+files.getInputFileBasename() + '.apnx') + else: + outname = os.path.join(files.outdir, 'mobi7-'+files.getInputFileBasename() + '.apnx') + with open(pathof(outname), 'wb') as f: + f.write(apnx_data) + return rscnames, pagemapproc + + +def processCMET(i, files, rscnames, sect, data): + # extract the build log + print("File contains kindlegen build log, extracting as %s" % KINDLEGENLOG_FILENAME) + srcname = os.path.join(files.outdir, KINDLEGENLOG_FILENAME) + with open(pathof(srcname), 'wb') as f: + f.write(data[10:]) + rscnames.append(None) + sect.setsectiondescription(i,"Kindlegen log") + return rscnames + + +# fonts only exist in KF8 ebooks +# Format: bytes 0 - 3: 'FONT' +# bytes 4 - 7: uncompressed size +# bytes 8 - 11: flags +# flag bit 0x0001 - zlib compression +# flag bit 0x0002 - obfuscated with xor string +# bytes 12 - 15: offset to start of compressed font data +# bytes 16 - 19: length of xor string stored before the start of the comnpress font data +# bytes 20 - 23: start of xor string +def processFONT(i, files, rscnames, sect, data, obfuscate_data, beg, rsc_ptr): + fontname = "font%05d" % i + ext = '.dat' + font_error = False + font_data = data + try: + usize, fflags, dstart, xor_len, xor_start = 
struct.unpack_from(b'>LLLLL',data,4) + except: + print("Failed to extract font: {0:s} from section {1:d}".format(fontname,i)) + font_error = True + ext = '.failed' + pass + if not font_error: + print("Extracting font:", fontname) + font_data = data[dstart:] + extent = len(font_data) + extent = min(extent, 1040) + if fflags & 0x0002: + # obfuscated so need to de-obfuscate the first 1040 bytes + key = bytearray(data[xor_start: xor_start+ xor_len]) + buf = bytearray(font_data) + for n in range(extent): + buf[n] ^= key[n%xor_len] + font_data = bytes(buf) + if fflags & 0x0001: + # ZLIB compressed data + font_data = zlib.decompress(font_data) + hdr = font_data[0:4] + if hdr == b'\0\1\0\0' or hdr == b'true' or hdr == b'ttcf': + ext = '.ttf' + elif hdr == b'OTTO': + ext = '.otf' + else: + print("Warning: unknown font header %s" % hexlify(hdr)) + if (ext == '.ttf' or ext == '.otf') and (fflags & 0x0002): + obfuscate_data.append(fontname + ext) + fontname += ext + outfnt = os.path.join(files.imgdir, fontname) + with open(pathof(outfnt), 'wb') as f: + f.write(font_data) + rscnames.append(fontname) + sect.setsectiondescription(i,"Font {0:s}".format(fontname)) + if rsc_ptr == -1: + rsc_ptr = i - beg + return rscnames, obfuscate_data, rsc_ptr + + +def processCRES(i, files, rscnames, sect, data, beg, rsc_ptr, use_hd): + # extract an HDImage + global DUMP + data = data[12:] + imgtype = get_image_type(None, data) + + if imgtype is None: + print("Warning: CRES Section %s does not contain a recognised resource" % i) + rscnames.append(None) + sect.setsectiondescription(i,"Mysterious CRES data, first four bytes %s" % describe(data[0:4])) + if DUMP: + fname = "unknown%05d.dat" % i + outname= os.path.join(files.outdir, fname) + with open(pathof(outname), 'wb') as f: + f.write(data) + sect.setsectiondescription(i,"Mysterious CRES data, first four bytes %s extracting as %s" % (describe(data[0:4]), fname)) + rsc_ptr += 1 + return rscnames, rsc_ptr + + if use_hd: + # overwrite corresponding lower res image with hd version + imgname = rscnames[rsc_ptr] + imgdest = files.imgdir + else: + imgname = "HDimage%05d.%s" % (i, imgtype) + imgdest = files.hdimgdir + print("Extracting HD image: {0:s} from section {1:d}".format(imgname,i)) + outimg = os.path.join(imgdest, imgname) + with open(pathof(outimg), 'wb') as f: + f.write(data) + rscnames.append(None) + sect.setsectiondescription(i,"Optional HD Image {0:s}".format(imgname)) + rsc_ptr += 1 + return rscnames, rsc_ptr + + +def processCONT(i, files, rscnames, sect, data): + global DUMP + # process a container header, most of this is unknown + # right now only extract its EXTH + dt = data[0:12] + if dt == b"CONTBOUNDARY": + rscnames.append(None) + sect.setsectiondescription(i,"CONTAINER BOUNDARY") + else: + sect.setsectiondescription(i,"CONT Header") + rscnames.append(None) + if DUMP: + cpage, = struct.unpack_from(b'>L', data, 12) + contexth = data[48:] + print("\n\nContainer EXTH Dump") + dump_contexth(cpage, contexth) + fname = "CONT_Header%05d.dat" % i + outname= os.path.join(files.outdir, fname) + with open(pathof(outname), 'wb') as f: + f.write(data) + return rscnames + + +def processkind(i, files, rscnames, sect, data): + global DUMP + dt = data[0:12] + if dt == b"kindle:embed": + if DUMP: + print("\n\nHD Image Container Description String") + print(data) + sect.setsectiondescription(i,"HD Image Container Description String") + rscnames.append(None) + return rscnames + + +# spine information from the original content.opf +def processRESC(i, files, rscnames, sect, data, 
k8resc): + global DUMP + if DUMP: + rescname = "RESC%05d.dat" % i + print("Extracting Resource: ", rescname) + outrsc = os.path.join(files.outdir, rescname) + with open(pathof(outrsc), 'wb') as f: + f.write(data) + if True: # try: + # parse the spine and metadata from RESC + k8resc = K8RESCProcessor(data[16:], DUMP) + else: # except: + print("Warning: cannot extract information from RESC.") + k8resc = None + rscnames.append(None) + sect.setsectiondescription(i,"K8 RESC section") + return rscnames, k8resc + + +def processImage(i, files, rscnames, sect, data, beg, rsc_ptr, cover_offset, thumb_offset): + global DUMP + # Extract an Image + imgtype = get_image_type(None, data) + if imgtype is None: + print("Warning: Section %s does not contain a recognised resource" % i) + rscnames.append(None) + sect.setsectiondescription(i,"Mysterious Section, first four bytes %s" % describe(data[0:4])) + if DUMP: + fname = "unknown%05d.dat" % i + outname= os.path.join(files.outdir, fname) + with open(pathof(outname), 'wb') as f: + f.write(data) + sect.setsectiondescription(i,"Mysterious Section, first four bytes %s extracting as %s" % (describe(data[0:4]), fname)) + return rscnames, rsc_ptr + + imgname = "image%05d.%s" % (i, imgtype) + if cover_offset is not None and i == beg + cover_offset: + imgname = "cover%05d.%s" % (i, imgtype) + if thumb_offset is not None and i == beg + thumb_offset: + imgname = "thumb%05d.%s" % (i, imgtype) + print("Extracting image: {0:s} from section {1:d}".format(imgname,i)) + outimg = os.path.join(files.imgdir, imgname) + with open(pathof(outimg), 'wb') as f: + f.write(data) + rscnames.append(imgname) + sect.setsectiondescription(i,"Image {0:s}".format(imgname)) + if rsc_ptr == -1: + rsc_ptr = i - beg + return rscnames, rsc_ptr + + +def processPrintReplica(metadata, files, rscnames, mh): + global DUMP + global WRITE_RAW_DATA + rawML = mh.getRawML() + if DUMP or WRITE_RAW_DATA: + outraw = os.path.join(files.outdir,files.getInputFileBasename() + '.rawpr') + with open(pathof(outraw),'wb') as f: + f.write(rawML) + + fileinfo = [] + print("Print Replica ebook detected") + try: + numTables, = struct.unpack_from(b'>L', rawML, 0x04) + tableIndexOffset = 8 + 4*numTables + # for each table, read in count of sections, assume first section is a PDF + # and output other sections as binary files + for i in range(numTables): + sectionCount, = struct.unpack_from(b'>L', rawML, 0x08 + 4*i) + for j in range(sectionCount): + sectionOffset, sectionLength, = struct.unpack_from(b'>LL', rawML, tableIndexOffset) + tableIndexOffset += 8 + if j == 0: + entryName = os.path.join(files.outdir, files.getInputFileBasename() + ('.%03d.pdf' % (i+1))) + else: + entryName = os.path.join(files.outdir, files.getInputFileBasename() + ('.%03d.%03d.data' % ((i+1),j))) + with open(pathof(entryName), 'wb') as f: + f.write(rawML[sectionOffset:(sectionOffset+sectionLength)]) + except Exception as e: + print('Error processing Print Replica: ' + str(e)) + + fileinfo.append([None,'', files.getInputFileBasename() + '.pdf']) + usedmap = {} + for name in rscnames: + if name is not None: + usedmap[name] = 'used' + opf = OPFProcessor(files, metadata, fileinfo, rscnames, False, mh, usedmap) + opf.writeOPF() + + +def processMobi8(mh, metadata, sect, files, rscnames, pagemapproc, k8resc, obfuscate_data, apnxfile=None, epubver='2'): + global DUMP + global WRITE_RAW_DATA + + # extract raw markup langauge + rawML = mh.getRawML() + if DUMP or WRITE_RAW_DATA: + outraw = os.path.join(files.k8dir,files.getInputFileBasename() + '.rawml') + 
with open(pathof(outraw),'wb') as f: + f.write(rawML) + + # KF8 require other indexes which contain parsing information and the FDST info + # to process the rawml back into the xhtml files, css files, svg image files, etc + k8proc = K8Processor(mh, sect, files, DUMP) + k8proc.buildParts(rawML) + + # collect information for the guide first + guidetext = unicode_str(k8proc.getGuideText()) + + # if the guide was empty, add in any guide info from metadata, such as StartOffset + if not guidetext and 'StartOffset' in metadata: + # Apparently, KG 2.5 carries over the StartOffset from the mobi7 part... + # Taking that into account, we only care about the *last* StartOffset, which + # should always be the correct one in these cases (the one actually pointing + # to the right place in the mobi8 part). + starts = metadata['StartOffset'] + last_start = starts[-1] + last_start = int(last_start) + if last_start == 0xffffffff: + last_start = 0 + seq, idtext = k8proc.getFragTblInfo(last_start) + filename, idtext = k8proc.getIDTagByPosFid(toBase32(seq), b'0000000000') + linktgt = filename + idtext = unicode_str(idtext, mh.codec) + if idtext != '': + linktgt += '#' + idtext + guidetext += '<reference type="text" href="Text/%s" />\n' % linktgt + + # if apnxfile is passed in use it for page map information + if apnxfile is not None and pagemapproc is None: + with open(apnxfile, 'rb') as f: + apnxdata = b"00000000" + f.read() + pagemapproc = PageMapProcessor(mh, apnxdata) + + # generate the page map + pagemapxml = '' + if pagemapproc is not None: + pagemapxml = pagemapproc.generateKF8PageMapXML(k8proc) + outpm = os.path.join(files.k8oebps,'page-map.xml') + with open(pathof(outpm),'wb') as f: + f.write(pagemapxml.encode('utf-8')) + if DUMP: + print(pagemapproc.getNames()) + print(pagemapproc.getOffsets()) + print("\n\nPage Map") + print(pagemapxml) + + # process the toc ncx + # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num + print("Processing ncx / toc") + ncx = ncxExtract(mh, files) + ncx_data = ncx.parseNCX() + # extend the ncx data with filenames and proper internal idtags + for i in range(len(ncx_data)): + ncxmap = ncx_data[i] + [junk1, junk2, junk3, fid, junk4, off] = ncxmap['pos_fid'].split(':') + filename, idtag = k8proc.getIDTagByPosFid(fid, off) + ncxmap['filename'] = filename + ncxmap['idtag'] = unicode_str(idtag) + ncx_data[i] = ncxmap + + # convert the rawML to a set of xhtml files + print("Building an epub-like structure") + htmlproc = XHTMLK8Processor(rscnames, k8proc) + usedmap = htmlproc.buildXHTML() + + # write out the xhtml svg, and css files + # fileinfo = [skelid|coverpage, dir, name] + fileinfo = [] + # first create a cover page if none exists + if CREATE_COVER_PAGE: + cover = CoverProcessor(files, metadata, rscnames) + cover_img = utf8_str(cover.getImageName()) + need_to_create_cover_page = False + if cover_img is not None: + if k8resc is None or not k8resc.hasSpine(): + part = k8proc.getPart(0) + if part.find(cover_img) == -1: + need_to_create_cover_page = True + else: + if "coverpage" not in k8resc.spine_idrefs: + part = k8proc.getPart(int(k8resc.spine_order[0])) + if part.find(cover_img) == -1: + k8resc.prepend_to_spine("coverpage", "inserted", "no", None) + if k8resc.spine_order[0] == "coverpage": + need_to_create_cover_page = True + if need_to_create_cover_page: + filename = cover.getXHTMLName() + fileinfo.append(["coverpage", 'Text', filename]) + guidetext += cover.guide_toxml() + cover.writeXHTML() + + n = k8proc.getNumberOfParts() + for 
i in range(n): + part = k8proc.getPart(i) + [skelnum, dir, filename, beg, end, aidtext] = k8proc.getPartInfo(i) + fileinfo.append([str(skelnum), dir, filename]) + fname = os.path.join(files.k8oebps,dir,filename) + with open(pathof(fname),'wb') as f: + f.write(part) + n = k8proc.getNumberOfFlows() + for i in range(1, n): + [ptype, pformat, pdir, filename] = k8proc.getFlowInfo(i) + flowpart = k8proc.getFlow(i) + if pformat == b'file': + fileinfo.append([None, pdir, filename]) + fname = os.path.join(files.k8oebps,pdir,filename) + with open(pathof(fname),'wb') as f: + f.write(flowpart) + + # create the opf + opf = OPFProcessor(files, metadata.copy(), fileinfo, rscnames, True, mh, usedmap, + pagemapxml=pagemapxml, guidetext=guidetext, k8resc=k8resc, epubver=epubver) + uuid = opf.writeOPF(bool(obfuscate_data)) + + if opf.hasNCX(): + # Create a toc.ncx. + ncx.writeK8NCX(ncx_data, metadata) + if opf.hasNAV(): + # Create a navigation document. + nav = NAVProcessor(files) + nav.writeNAV(ncx_data, guidetext, metadata) + + # make an epub-like structure of it all + print("Creating an epub-like file") + files.makeEPUB(usedmap, obfuscate_data, uuid) + + +def processMobi7(mh, metadata, sect, files, rscnames): + global DUMP + global WRITE_RAW_DATA + # An original Mobi + rawML = mh.getRawML() + if DUMP or WRITE_RAW_DATA: + outraw = os.path.join(files.mobi7dir,files.getInputFileBasename() + '.rawml') + with open(pathof(outraw),'wb') as f: + f.write(rawML) + + # process the toc ncx + # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num + ncx = ncxExtract(mh, files) + ncx_data = ncx.parseNCX() + ncx.writeNCX(metadata) + + positionMap = {} + + # if Dictionary build up the positionMap + if mh.isDictionary(): + if mh.DictInLanguage(): + metadata['DictInLanguage'] = [mh.DictInLanguage()] + if mh.DictOutLanguage(): + metadata['DictOutLanguage'] = [mh.DictOutLanguage()] + positionMap = dictSupport(mh, sect).getPositionMap() + + # convert the rawml back to Mobi ml + proc = HTMLProcessor(files, metadata, rscnames) + srctext = proc.findAnchors(rawML, ncx_data, positionMap) + srctext, usedmap = proc.insertHREFS() + + # write the proper mobi html + fileinfo=[] + # fname = files.getInputFileBasename() + '.html' + fname = 'book.html' + fileinfo.append([None,'', fname]) + outhtml = os.path.join(files.mobi7dir, fname) + with open(pathof(outhtml), 'wb') as f: + f.write(srctext) + + # extract guidetext from srctext + guidetext =b'' + # no pagemap support for older mobis + # pagemapxml = None + guidematch = re.search(br'''<guide>(.*)</guide>''',srctext,re.IGNORECASE+re.DOTALL) + if guidematch: + guidetext = guidematch.group(1) + # sometimes old mobi guide from srctext horribly written so need to clean up + guidetext = guidetext.replace(b"\r", b"") + guidetext = guidetext.replace(b'<REFERENCE', b'<reference') + guidetext = guidetext.replace(b' HREF=', b' href=') + guidetext = guidetext.replace(b' TITLE=', b' title=') + guidetext = guidetext.replace(b' TYPE=', b' type=') + # reference must be a self-closing tag + # and any href must be replaced with filepos information + ref_tag_pattern = re.compile(br'''(<reference [^>]*>)''', re.IGNORECASE) + guidepieces = ref_tag_pattern.split(guidetext) + for i in range(1,len(guidepieces), 2): + reftag = guidepieces[i] + # remove any href there now to replace with filepos + reftag = re.sub(br'''href\s*=[^'"]*['"][^'"]*['"]''',b'', reftag) + # make sure the reference tag ends properly + if not reftag.endswith(b"/>"): + reftag = reftag[0:-1] + b"/>" + 
guidepieces[i] = reftag + guidetext = b''.join(guidepieces) + replacetext = br'''href="'''+utf8_str(fileinfo[0][2])+ br'''#filepos\1"''' + guidetext = re.sub(br'''filepos=['"]{0,1}0*(\d+)['"]{0,1}''', replacetext, guidetext) + guidetext += b'\n' + + if 'StartOffset' in metadata: + for value in metadata['StartOffset']: + if int(value) == 0xffffffff: + value = '0' + starting_offset = value + # get guide items from metadata + metaguidetext = b'<reference type="text" href="'+utf8_str(fileinfo[0][2])+b'#filepos'+utf8_str(starting_offset)+b'" />\n' + guidetext += metaguidetext + + if isinstance(guidetext, binary_type): + guidetext = guidetext.decode(mh.codec) + + # create an OPF + opf = OPFProcessor(files, metadata, fileinfo, rscnames, ncx.isNCX, mh, usedmap, guidetext=guidetext) + opf.writeOPF() + + +def processUnknownSections(mh, sect, files, K8Boundary): + global DUMP + global TERMINATION_INDICATOR1 + global TERMINATION_INDICATOR2 + global TERMINATION_INDICATOR3 + if DUMP: + print("Unpacking any remaining unknown records") + beg = mh.start + end = sect.num_sections + if beg < K8Boundary: + # then we're processing the first part of a combination file + end = K8Boundary + for i in range(beg, end): + if sect.sectiondescriptions[i] == "": + data = sect.loadSection(i) + type = data[0:4] + if type == TERMINATION_INDICATOR3: + description = "Termination Marker 3 Nulls" + elif type == TERMINATION_INDICATOR2: + description = "Termination Marker 2 Nulls" + elif type == TERMINATION_INDICATOR1: + description = "Termination Marker 1 Null" + elif type == "INDX": + fname = "Unknown%05d_INDX.dat" % i + description = "Unknown INDX section" + if DUMP: + outname= os.path.join(files.outdir, fname) + with open(pathof(outname), 'wb') as f: + f.write(data) + print("Extracting %s: %s from section %d" % (description, fname, i)) + description = description + ", extracting as %s" % fname + else: + fname = "unknown%05d.dat" % i + description = "Mysterious Section, first four bytes %s" % describe(data[0:4]) + if DUMP: + outname= os.path.join(files.outdir, fname) + with open(pathof(outname), 'wb') as f: + f.write(data) + print("Extracting %s: %s from section %d" % (description, fname, i)) + description = description + ", extracting as %s" % fname + sect.setsectiondescription(i, description) + + +def process_all_mobi_headers(files, apnxfile, sect, mhlst, K8Boundary, k8only=False, epubver='2', use_hd=False): + global DUMP + global WRITE_RAW_DATA + rscnames = [] + rsc_ptr = -1 + k8resc = None + obfuscate_data = [] + for mh in mhlst: + pagemapproc = None + if mh.isK8(): + sect.setsectiondescription(mh.start,"KF8 Header") + mhname = os.path.join(files.outdir,"header_K8.dat") + print("Processing K8 section of book...") + elif mh.isPrintReplica(): + sect.setsectiondescription(mh.start,"Print Replica Header") + mhname = os.path.join(files.outdir,"header_PR.dat") + print("Processing PrintReplica section of book...") + else: + if mh.version == 0: + sect.setsectiondescription(mh.start, "PalmDoc Header".format(mh.version)) + else: + sect.setsectiondescription(mh.start,"Mobipocket {0:d} Header".format(mh.version)) + mhname = os.path.join(files.outdir,"header.dat") + print("Processing Mobipocket {0:d} section of book...".format(mh.version)) + + if DUMP: + # write out raw mobi header data + with open(pathof(mhname), 'wb') as f: + f.write(mh.header) + + # process each mobi header + metadata = mh.getMetaData() + mh.describeHeader(DUMP) + if mh.isEncrypted(): + raise unpackException('Book is encrypted') + + pagemapproc = None + + # first 
handle all of the different resource sections: images, resources, fonts, and etc + # build up a list of image names to use to postprocess the ebook + + print("Unpacking images, resources, fonts, etc") + beg = mh.firstresource + end = sect.num_sections + if beg < K8Boundary: + # processing first part of a combination file + end = K8Boundary + + # Not sure the try/except is necessary, but just in case + try: + thumb_offset = int(metadata.get('ThumbOffset', ['-1'])[0]) + except: + thumb_offset = None + + cover_offset = int(metadata.get('CoverOffset', ['-1'])[0]) + if not CREATE_COVER_PAGE: + cover_offset = None + + for i in range(beg, end): + data = sect.loadSection(i) + type = data[0:4] + + # handle the basics first + if type in [b"FLIS", b"FCIS", b"FDST", b"DATP"]: + if DUMP: + fname = unicode_str(type) + "%05d" % i + if mh.isK8(): + fname += "_K8" + fname += '.dat' + outname= os.path.join(files.outdir, fname) + with open(pathof(outname), 'wb') as f: + f.write(data) + print("Dumping section {0:d} type {1:s} to file {2:s} ".format(i,unicode_str(type),outname)) + sect.setsectiondescription(i,"Type {0:s}".format(unicode_str(type))) + rscnames.append(None) + elif type == b"SRCS": + rscnames = processSRCS(i, files, rscnames, sect, data) + elif type == b"PAGE": + rscnames, pagemapproc = processPAGE(i, files, rscnames, sect, data, mh, pagemapproc) + elif type == b"CMET": + rscnames = processCMET(i, files, rscnames, sect, data) + elif type == b"FONT": + rscnames, obfuscate_data, rsc_ptr = processFONT(i, files, rscnames, sect, data, obfuscate_data, beg, rsc_ptr) + elif type == b"CRES": + rscnames, rsc_ptr = processCRES(i, files, rscnames, sect, data, beg, rsc_ptr, use_hd) + elif type == b"CONT": + rscnames = processCONT(i, files, rscnames, sect, data) + elif type == b"kind": + rscnames = processkind(i, files, rscnames, sect, data) + elif type == b'\xa0\xa0\xa0\xa0': + sect.setsectiondescription(i,"Empty_HD_Image/Resource_Placeholder") + rscnames.append(None) + rsc_ptr += 1 + elif type == b"RESC": + rscnames, k8resc = processRESC(i, files, rscnames, sect, data, k8resc) + elif data == EOF_RECORD: + sect.setsectiondescription(i,"End Of File") + rscnames.append(None) + elif data[0:8] == b"BOUNDARY": + sect.setsectiondescription(i,"BOUNDARY Marker") + rscnames.append(None) + else: + # if reached here should be an image ow treat as unknown + rscnames, rsc_ptr = processImage(i, files, rscnames, sect, data, beg, rsc_ptr, cover_offset, thumb_offset) + # done unpacking resources + + # Print Replica + if mh.isPrintReplica() and not k8only: + processPrintReplica(metadata, files, rscnames, mh) + continue + + # KF8 (Mobi 8) + if mh.isK8(): + processMobi8(mh, metadata, sect, files, rscnames, pagemapproc, k8resc, obfuscate_data, apnxfile, epubver) + + # Old Mobi (Mobi 7) + elif not k8only: + processMobi7(mh, metadata, sect, files, rscnames) + + # process any remaining unknown sections of the palm file + processUnknownSections(mh, sect, files, K8Boundary) + + return + + +def unpackBook(infile, outdir, apnxfile=None, epubver='2', use_hd=False, dodump=False, dowriteraw=False, dosplitcombos=False): + global DUMP + global WRITE_RAW_DATA + global SPLIT_COMBO_MOBIS + if DUMP or dodump: + DUMP = True + if WRITE_RAW_DATA or dowriteraw: + WRITE_RAW_DATA = True + if SPLIT_COMBO_MOBIS or dosplitcombos: + SPLIT_COMBO_MOBIS = True + + infile = unicode_str(infile) + outdir = unicode_str(outdir) + if apnxfile is not None: + apnxfile = unicode_str(apnxfile) + + files = fileNames(infile, outdir) + + # process the PalmDoc database 
header and verify it is a mobi + sect = Sectionizer(infile) + if sect.ident != b'BOOKMOBI' and sect.ident != b'TEXtREAd': + raise unpackException('Invalid file format') + if DUMP: + sect.dumppalmheader() + else: + print("Palm DB type: %s, %d sections." % (sect.ident.decode('utf-8'),sect.num_sections)) + + # scan sections to see if this is a compound mobi file (K8 format) + # and build a list of all mobi headers to process. + mhlst = [] + mh = MobiHeader(sect,0) + # if this is a mobi8-only file hasK8 here will be true + mhlst.append(mh) + K8Boundary = -1 + + if mh.isK8(): + print("Unpacking a KF8 book...") + hasK8 = True + else: + # This is either a Mobipocket 7 or earlier, or a combi M7/KF8 + # Find out which + hasK8 = False + for i in range(len(sect.sectionoffsets)-1): + before, after = sect.sectionoffsets[i:i+2] + if (after - before) == 8: + data = sect.loadSection(i) + if data == K8_BOUNDARY: + sect.setsectiondescription(i,"Mobi/KF8 Boundary Section") + mh = MobiHeader(sect,i+1) + hasK8 = True + mhlst.append(mh) + K8Boundary = i + break + if hasK8: + print("Unpacking a Combination M{0:d}/KF8 book...".format(mh.version)) + if SPLIT_COMBO_MOBIS: + # if this is a combination mobi7-mobi8 file split them up + mobisplit = mobi_split(infile) + if mobisplit.combo: + outmobi7 = os.path.join(files.outdir, 'mobi7-'+files.getInputFileBasename() + '.mobi') + outmobi8 = os.path.join(files.outdir, 'mobi8-'+files.getInputFileBasename() + '.azw3') + with open(pathof(outmobi7), 'wb') as f: + f.write(mobisplit.getResult7()) + with open(pathof(outmobi8), 'wb') as f: + f.write(mobisplit.getResult8()) + else: + print("Unpacking a Mobipocket {0:d} book...".format(mh.version)) + + if hasK8: + files.makeK8Struct() + + process_all_mobi_headers(files, apnxfile, sect, mhlst, K8Boundary, False, epubver, use_hd) + + if DUMP: + sect.dumpsectionsinfo() + return + + +def usage(progname): + print("") + print("Description:") + print(" Unpacks an unencrypted Kindle/MobiPocket ebook to html and images") + print(" or an unencrypted Kindle/Print Replica ebook to PDF and images") + print(" into the specified output folder.") + print("Usage:") + print(" %s -r -s -p apnxfile -d -h --epub_version= infile [outdir]" % progname) + print("Options:") + print(" -h print this help message") + print(" -i use HD Images, if present, to overwrite reduced resolution images") + print(" -s split combination mobis into mobi7 and mobi8 ebooks") + print(" -p APNXFILE path to an .apnx file associated with the azw3 input (optional)") + print(" --epub_version= specify epub version to unpack to: 2, 3, A (for automatic) or ") + print(" F (force to fit to epub2 definitions), default is 2") + print(" -d dump headers and other info to output and extra files") + print(" -r write raw data to the output folder") + + +def main(argv=unicode_argv()): + global DUMP + global WRITE_RAW_DATA + global SPLIT_COMBO_MOBIS + + print("KindleUnpack v0.83") + print(" Based on initial mobipocket version Copyright © 2009 Charles M. Hannum <root@ihack.net>") + print(" Extensive Extensions and Improvements Copyright © 2009-2020 ") + print(" by: P. Durrant, K. Hendricks, S. 
Siebert, fandrieu, DiapDealer, nickredding, tkeo.") + print(" This program is free software: you can redistribute it and/or modify") + print(" it under the terms of the GNU General Public License as published by") + print(" the Free Software Foundation, version 3.") + + progname = os.path.basename(argv[0]) + try: + opts, args = getopt.getopt(argv[1:], "dhirsp:", ['epub_version=']) + except getopt.GetoptError as err: + print(str(err)) + usage(progname) + sys.exit(2) + + if len(args)<1: + usage(progname) + sys.exit(2) + + apnxfile = None + epubver = '2' + use_hd = False + + for o, a in opts: + if o == "-h": + usage(progname) + sys.exit(0) + if o == "-i": + use_hd = True + if o == "-d": + DUMP = True + if o == "-r": + WRITE_RAW_DATA = True + if o == "-s": + SPLIT_COMBO_MOBIS = True + if o == "-p": + apnxfile = a + if o == "--epub_version": + epubver = a + + if len(args) > 1: + infile, outdir = args + else: + infile = args[0] + outdir = os.path.splitext(infile)[0] + + infileext = os.path.splitext(infile)[1].upper() + if infileext not in ['.MOBI', '.PRC', '.AZW', '.AZW3', '.AZW4']: + print("Error: first parameter must be a Kindle/Mobipocket ebook or a Kindle/Print Replica ebook.") + return 1 + + try: + print('Unpacking Book...') + unpackBook(infile, outdir, apnxfile, epubver, use_hd) + print('Completed') + + except ValueError as e: + print("Error: %s" % e) + print(traceback.format_exc()) + return 1 + + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/epy_reader/tools/KindleUnpack/mobi_cover.py b/src/epy_reader/tools/KindleUnpack/mobi_cover.py new file mode 100644 index 0000000..3078ac4 --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_cover.py @@ -0,0 +1,238 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import unicode_str + +from .unipath import pathof +import os +import imghdr + +import struct +# note: struct pack, unpack, unpack_from all require bytestring format +# data all the way up to at least python 2.7.5, python 3 okay with bytestring + +USE_SVG_WRAPPER = True +""" Set to True to use svg wrapper for default. """ + +FORCE_DEFAULT_TITLE = False +""" Set to True to force to use the default title. """ + +COVER_PAGE_FINENAME = 'cover_page.xhtml' +""" The name for the cover page. """ + +DEFAULT_TITLE = 'Cover' +""" The default title for the cover page. """ + +MAX_WIDTH = 4096 +""" The max width for the svg cover page. """ + +MAX_HEIGHT = 4096 +""" The max height for the svg cover page. """ + + +def get_image_type(imgname, imgdata=None): + imgtype = unicode_str(imghdr.what(pathof(imgname), imgdata)) + + # imghdr only checks for JFIF or Exif JPEG files. Apparently, there are some + # with only the magic JPEG bytes out there... + # ImageMagick handles those, so, do it too. + if imgtype is None: + if imgdata is None: + with open(pathof(imgname), 'rb') as f: + imgdata = f.read() + if imgdata[0:2] == b'\xFF\xD8': + # Get last non-null bytes + last = len(imgdata) + while (imgdata[last-1:last] == b'\x00'): + last-=1 + # Be extra safe, check the trailing bytes, too. + if imgdata[last-2:last] == b'\xFF\xD9': + imgtype = "jpeg" + return imgtype + + +def get_image_size(imgname, imgdata=None): + '''Determine the image type of imgname (or imgdata) and return its size. + + Originally, + Determine the image type of fhandle and return its size. 
+ from draco''' + if imgdata is None: + fhandle = open(pathof(imgname), 'rb') + head = fhandle.read(24) + else: + head = imgdata[0:24] + if len(head) != 24: + return + + imgtype = get_image_type(imgname, imgdata) + if imgtype == 'png': + check = struct.unpack(b'>i', head[4:8])[0] + if check != 0x0d0a1a0a: + return + width, height = struct.unpack(b'>ii', head[16:24]) + elif imgtype == 'gif': + width, height = struct.unpack(b'<HH', head[6:10]) + elif imgtype == 'jpeg' and imgdata is None: + try: + fhandle.seek(0) # Read 0xff next + size = 2 + ftype = 0 + while not 0xc0 <= ftype <= 0xcf: + fhandle.seek(size, 1) + byte = fhandle.read(1) + while ord(byte) == 0xff: + byte = fhandle.read(1) + ftype = ord(byte) + size = struct.unpack(b'>H', fhandle.read(2))[0] - 2 + # We are at a SOFn block + fhandle.seek(1, 1) # Skip `precision' byte. + height, width = struct.unpack(b'>HH', fhandle.read(4)) + except Exception: # IGNORE:W0703 + return + elif imgtype == 'jpeg' and imgdata is not None: + try: + pos = 0 + size = 2 + ftype = 0 + while not 0xc0 <= ftype <= 0xcf: + pos += size + byte = imgdata[pos:pos+1] + pos += 1 + while ord(byte) == 0xff: + byte = imgdata[pos:pos+1] + pos += 1 + ftype = ord(byte) + size = struct.unpack(b'>H', imgdata[pos:pos+2])[0] - 2 + pos += 2 + # We are at a SOFn block + pos += 1 # Skip `precision' byte. + height, width = struct.unpack(b'>HH', imgdata[pos:pos+4]) + pos += 4 + except Exception: # IGNORE:W0703 + return + else: + return + return width, height + +# XXX experimental +class CoverProcessor(object): + + """Create a cover page. + + """ + def __init__(self, files, metadata, rscnames, imgname=None, imgdata=None): + self.files = files + self.metadata = metadata + self.rscnames = rscnames + self.cover_page = COVER_PAGE_FINENAME + self.use_svg = USE_SVG_WRAPPER # Use svg wrapper. + self.lang = metadata.get('Language', ['en'])[0] + # This should ensure that if the methods to find the cover image's + # dimensions should fail for any reason, the SVG routine will not be used. 
+ [self.width, self.height] = (-1,-1) + if FORCE_DEFAULT_TITLE: + self.title = DEFAULT_TITLE + else: + self.title = metadata.get('Title', [DEFAULT_TITLE])[0] + + self.cover_image = None + if imgname is not None: + self.cover_image = imgname + elif 'CoverOffset' in metadata: + imageNumber = int(metadata['CoverOffset'][0]) + cover_image = self.rscnames[imageNumber] + if cover_image is not None: + self.cover_image = cover_image + else: + print('Warning: Cannot identify the cover image.') + if self.use_svg: + try: + if imgdata is None: + fname = os.path.join(files.imgdir, self.cover_image) + [self.width, self.height] = get_image_size(fname) + else: + [self.width, self.height] = get_image_size(None, imgdata) + except: + self.use_svg = False + width = self.width + height = self.height + if width < 0 or height < 0 or width > MAX_WIDTH or height > MAX_HEIGHT: + self.use_svg = False + return + + def getImageName(self): + return self.cover_image + + def getXHTMLName(self): + return self.cover_page + + def buildXHTML(self): + print('Building a cover page.') + files = self.files + cover_image = self.cover_image + title = self.title + lang = self.lang + + image_dir = os.path.normpath(os.path.relpath(files.k8images, files.k8text)) + image_path = os.path.join(image_dir, cover_image).replace('\\', '/') + + if not self.use_svg: + data = '' + data += '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html>' + data += '<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops"' + data += ' xml:lang="{:s}">\n'.format(lang) + data += '<head>\n<title>{:s}</title>\n'.format(title) + data += '<style type="text/css">\n' + data += 'body {\n margin: 0;\n padding: 0;\n text-align: center;\n}\n' + data += 'div {\n height: 100%;\n width: 100%;\n text-align: center;\n page-break-inside: avoid;\n}\n' + data += 'img {\n display: inline-block;\n height: 100%;\n margin: 0 auto;\n}\n' + data += '</style>\n</head>\n' + data += '<body><div>\n' + data += ' <img src="{:s}" alt=""/>\n'.format(image_path) + data += '</div></body>\n</html>' + else: + width = self.width + height = self.height + viewBox = "0 0 {0:d} {1:d}".format(width, height) + + data = '' + data += '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html>' + data += '<html xmlns="http://www.w3.org/1999/xhtml"' + data += ' xml:lang="{:s}">\n'.format(lang) + data += '<head>\n <title>{:s}</title>\n'.format(title) + data += '<style type="text/css">\n' + data += 'svg {padding: 0pt; margin:0pt}\n' + data += 'body { text-align: center; padding:0pt; margin: 0pt; }\n' + data += '</style>\n</head>\n' + data += '<body>\n <div>\n' + data += ' <svg xmlns="http://www.w3.org/2000/svg" height="100%" preserveAspectRatio="xMidYMid meet"' + data += ' version="1.1" viewBox="{0:s}" width="100%" xmlns:xlink="http://www.w3.org/1999/xlink">\n'.format(viewBox) + data += ' <image height="{0}" width="{1}" xlink:href="{2}"/>\n'.format(height, width, image_path) + data += ' </svg>\n' + data += ' </div>\n</body>\n</html>' + return data + + def writeXHTML(self): + files = self.files + cover_page = self.cover_page + + data = self.buildXHTML() + + outfile = os.path.join(files.k8text, cover_page) + if os.path.exists(pathof(outfile)): + print('Warning: {:s} already exists.'.format(cover_page)) + os.remove(pathof(outfile)) + with open(pathof(outfile), 'wb') as f: + f.write(data.encode('utf-8')) + return + + def guide_toxml(self): + files = self.files + text_dir = os.path.relpath(files.k8text, files.k8oebps) + data = '<reference type="cover" title="Cover" href="{:s}/{:s}" 
/>\n'.format( + text_dir, self.cover_page) + return data diff --git a/src/epy_reader/tools/KindleUnpack/mobi_dict.py b/src/epy_reader/tools/KindleUnpack/mobi_dict.py new file mode 100644 index 0000000..bfc2ea8 --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_dict.py @@ -0,0 +1,377 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import PY2, PY3, utf8_str, bstr, bchr + +if PY2: + range = xrange + array_format = b'B' +if PY3: + unichr = chr + array_format = "B" + +import array + +import struct +# note: struct pack, unpack, unpack_from all require bytestring format +# data all the way up to at least python 2.7.5, python 3 okay with bytestring + +from .mobi_index import getVariableWidthValue, readTagSection, getTagMap +from .mobi_utils import toHex + +DEBUG_DICT = False + +class InflectionData(object): + + def __init__(self, infldatas): + self.infldatas = infldatas + self.starts = [] + self.counts = [] + for idata in self.infldatas: + start, = struct.unpack_from(b'>L', idata, 0x14) + count, = struct.unpack_from(b'>L', idata, 0x18) + self.starts.append(start) + self.counts.append(count) + + def lookup(self, lookupvalue): + i = 0 + rvalue = lookupvalue + while rvalue >= self.counts[i]: + rvalue = rvalue - self.counts[i] + i += 1 + if i == len(self.counts): + print("Error: Problem with multiple inflections data sections") + return lookupvalue, self.starts[0], self.counts[0], self.infldatas[0] + return rvalue, self.starts[i], self.counts[i], self.infldatas[i] + + def offsets(self, value): + rvalue, start, count, data = self.lookup(value) + offset, = struct.unpack_from(b'>H', data, start + 4 + (2 * rvalue)) + if rvalue + 1 < count: + nextOffset, = struct.unpack_from(b'>H',data, start + 4 + (2 * (rvalue + 1))) + else: + nextOffset = None + return offset, nextOffset, data + + +class dictSupport(object): + + def __init__(self, mh, sect): + self.mh = mh + self.header = mh.header + self.sect = sect + self.metaOrthIndex = mh.metaOrthIndex + self.metaInflIndex = mh.metaInflIndex + + def parseHeader(self, data): + "read INDX header" + if not data[:4] == b'INDX': + print("Warning: index section is not INDX") + return False + words = ( + 'len', 'nul1', 'type', 'gen', 'start', 'count', 'code', + 'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc' + ) + num = len(words) + values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)]) + header = {} + for n in range(num): + header[words[n]] = values[n] + + ordt1 = None + ordt2 = None + + otype, oentries, op1, op2, otagx = struct.unpack_from(b'>LLLLL',data, 0xa4) + header['otype'] = otype + header['oentries'] = oentries + + if DEBUG_DICT: + print("otype %d, oentries %d, op1 %d, op2 %d, otagx %d" % (otype, oentries, op1, op2, otagx)) + + if header['code'] == 0xfdea or oentries > 0: + # some dictionaries seem to be codepage 65002 (0xFDEA) which seems + # to be some sort of strange EBCDIC utf-8 or 16 encoded strings + # So we need to look for them and store them away to process leading text + # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries + # we only ever seem to use the second but ... 
+            #
+            # if otype = 0, ORDT table uses 16 bit values as offsets into the table
+            # if otype = 1, ORDT table uses 8 bit values as offsets into the table
+
+            assert(data[op1:op1+4] == b'ORDT')
+            assert(data[op2:op2+4] == b'ORDT')
+            ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4)
+            ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4)
+
+        if DEBUG_DICT:
+            print("parsed INDX header:")
+            for key in header:
+                print(key, "%x" % header[key],)
+            print("\n")
+        return header, ordt1, ordt2
+
+    def getPositionMap(self):
+        sect = self.sect
+
+        positionMap = {}
+
+        metaOrthIndex = self.metaOrthIndex
+        metaInflIndex = self.metaInflIndex
+
+        decodeInflection = True
+        if metaOrthIndex != 0xFFFFFFFF:
+            print("Info: Document contains orthographic index, handle as dictionary")
+            if metaInflIndex == 0xFFFFFFFF:
+                decodeInflection = False
+            else:
+                metaInflIndexData = sect.loadSection(metaInflIndex)
+
+                print("\nParsing metaInflIndexData")
+                midxhdr, mhordt1, mhordt2 = self.parseHeader(metaInflIndexData)
+
+                metaIndexCount = midxhdr['count']
+                idatas = []
+                for j in range(metaIndexCount):
+                    idatas.append(sect.loadSection(metaInflIndex + 1 + j))
+                dinfl = InflectionData(idatas)
+
+                inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount)
+                tagSectionStart = midxhdr['len']
+                inflectionControlByteCount, inflectionTagTable = readTagSection(tagSectionStart, metaInflIndexData)
+                if DEBUG_DICT:
+                    print("inflectionTagTable: %s" % inflectionTagTable)
+                if self.hasTag(inflectionTagTable, 0x07):
+                    print("Error: Dictionary uses obsolete inflection rule scheme which is not yet supported")
+                    decodeInflection = False
+
+            data = sect.loadSection(metaOrthIndex)
+
+            print("\nParsing metaOrthIndex")
+            idxhdr, hordt1, hordt2 = self.parseHeader(data)
+
+            tagSectionStart = idxhdr['len']
+            controlByteCount, tagTable = readTagSection(tagSectionStart, data)
+            orthIndexCount = idxhdr['count']
+            print("orthIndexCount is", orthIndexCount)
+            if DEBUG_DICT:
+                print("orthTagTable: %s" % tagTable)
+            if hordt2 is not None:
+                print("orth entry uses ordt2 lookup table of type ", idxhdr['otype'])
+            hasEntryLength = self.hasTag(tagTable, 0x02)
+            if not hasEntryLength:
+                print("Info: Index doesn't contain entry length tags")
+
+            print("Read dictionary index data")
+            for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount):
+                data = sect.loadSection(i)
+                hdrinfo, ordt1, ordt2 = self.parseHeader(data)
+                idxtPos = hdrinfo['start']
+                entryCount = hdrinfo['count']
+                idxPositions = []
+                for j in range(entryCount):
+                    pos, = struct.unpack_from(b'>H', data, idxtPos + 4 + (2 * j))
+                    idxPositions.append(pos)
+                # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
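
# A toy reconstruction (synthetic data, not from this file) of the IDXT walk
# above and the sentinel appended just below: entry starts are big-endian
# 16-bit offsets stored 4 bytes past the b'IDXT' tag, and idxtPos itself
# marks where the last entry ends.
import struct

entries = bytes(0x40)                   # stand-in for the packed entry data
section = entries + b'IDXT' + struct.pack(b'>3H', 0x00, 0x10, 0x28)
idxtPos, entryCount = len(entries), 3   # normally taken from the INDX header

positions = [struct.unpack_from(b'>H', section, idxtPos + 4 + 2 * j)[0]
             for j in range(entryCount)]
positions.append(idxtPos)               # sentinel: last entry ends at IDXT
print(positions)                        # -> [0, 16, 40, 64]
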
+ idxPositions.append(idxtPos) + for j in range(entryCount): + startPos = idxPositions[j] + endPos = idxPositions[j+1] + textLength = ord(data[startPos:startPos+1]) + text = data[startPos+1:startPos+1+textLength] + if hordt2 is not None: + utext = u"" + if idxhdr['otype'] == 0: + pattern = b'>H' + inc = 2 + else: + pattern = b'>B' + inc = 1 + pos = 0 + while pos < textLength: + off, = struct.unpack_from(pattern, text, pos) + if off < len(hordt2): + utext += unichr(hordt2[off]) + else: + utext += unichr(off) + pos += inc + text = utext.encode('utf-8') + + tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos) + if 0x01 in tagMap: + if decodeInflection and 0x2a in tagMap: + inflectionGroups = self.getInflectionGroups(text, inflectionControlByteCount, inflectionTagTable, + dinfl, inflNameData, tagMap[0x2a]) + else: + inflectionGroups = b'' + assert len(tagMap[0x01]) == 1 + entryStartPosition = tagMap[0x01][0] + if hasEntryLength: + # The idx:entry attribute "scriptable" must be present to create entry length tags. + ml = b'<idx:entry scriptable="yes"><idx:orth value="' + text + b'">' + inflectionGroups + b'</idx:orth>' + if entryStartPosition in positionMap: + positionMap[entryStartPosition] = positionMap[entryStartPosition] + ml + else: + positionMap[entryStartPosition] = ml + assert len(tagMap[0x02]) == 1 + entryEndPosition = entryStartPosition + tagMap[0x02][0] + if entryEndPosition in positionMap: + positionMap[entryEndPosition] = b"</idx:entry>" + positionMap[entryEndPosition] + else: + positionMap[entryEndPosition] = b"</idx:entry>" + + else: + indexTags = b'<idx:entry>\n<idx:orth value="' + text + b'">\n' + inflectionGroups + b'</idx:entry>\n' + if entryStartPosition in positionMap: + positionMap[entryStartPosition] = positionMap[entryStartPosition] + indexTags + else: + positionMap[entryStartPosition] = indexTags + return positionMap + + def hasTag(self, tagTable, tag): + ''' + Test if tag table contains given tag. + + @param tagTable: The tag table. + @param tag: The tag to search. + @return: True if tag table contains given tag; False otherwise. + ''' + for currentTag, _, _, _ in tagTable: + if currentTag == tag: + return True + return False + + def getInflectionGroups(self, mainEntry, controlByteCount, tagTable, dinfl, inflectionNames, groupList): + ''' + Create string which contains the inflection groups with inflection rules as mobipocket tags. + + @param mainEntry: The word to inflect. + @param controlByteCount: The number of control bytes. + @param tagTable: The tag table. + @param data: The Inflection data object to properly select the right inflection data section to use + @param inflectionNames: The inflection rule name data. + @param groupList: The list of inflection groups to process. + @return: String with inflection groups and rules or empty string if required tags are not available. + ''' + result = b"" + for value in groupList: + offset, nextOffset, data = dinfl.offsets(value) + + # First byte seems to be always 0x00 and must be skipped. + assert ord(data[offset:offset+1]) == 0x00 + tagMap = getTagMap(controlByteCount, tagTable, data, offset + 1, nextOffset) + + # Make sure that the required tags are available. + if 0x05 not in tagMap: + print("Error: Required tag 0x05 not found in tagMap") + return "" + if 0x1a not in tagMap: + print("Error: Required tag 0x1a not found in tagMap") + return b'' + + result += b'<idx:infl>' + + for i in range(len(tagMap[0x05])): + + # Get name of inflection rule. 
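
# A minimal sketch of the lookup done in the next few lines: tag 0x05 holds
# an offset into the inflection-name blob, where a variable-width integer
# (decoded by getVariableWidthValue() from mobi_index.py, shown further
# below) gives the length of the rule name that follows. Synthetic
# single-byte case; names_blob is illustrative, not real data.
names_blob = b'\x86plural'          # 0x86 = vwi value 6 with the stop bit set
offset = 0
length = names_blob[offset] & 0x7f  # low 7 bits carry the value
name = names_blob[offset + 1:offset + 1 + length]
print(name)                         # -> b'plural'
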
+ value = tagMap[0x05][i] + consumed, textLength = getVariableWidthValue(inflectionNames, value) + inflectionName = inflectionNames[value+consumed:value+consumed+textLength] + + # Get and apply inflection rule across possibly multiple inflection data sections + value = tagMap[0x1a][i] + rvalue, start, count, data = dinfl.lookup(value) + offset, = struct.unpack_from(b'>H', data, start + 4 + (2 * rvalue)) + textLength = ord(data[offset:offset+1]) + inflection = self.applyInflectionRule(mainEntry, data, offset+1, offset+1+textLength) + if inflection is not None: + result += b' <idx:iform name="' + inflectionName + b'" value="' + inflection + b'"/>' + + result += b'</idx:infl>' + return result + + def applyInflectionRule(self, mainEntry, inflectionRuleData, start, end): + ''' + Apply inflection rule. + + @param mainEntry: The word to inflect. + @param inflectionRuleData: The inflection rules. + @param start: The start position of the inflection rule to use. + @param end: The end position of the inflection rule to use. + @return: The string with the inflected word or None if an error occurs. + ''' + mode = -1 + byteArray = array.array(array_format, mainEntry) + position = len(byteArray) + for charOffset in range(start, end): + char = inflectionRuleData[charOffset:charOffset+1] + abyte = ord(char) + if abyte >= 0x0a and abyte <= 0x13: + # Move cursor backwards + offset = abyte - 0x0a + if mode not in [0x02, 0x03]: + mode = 0x02 + position = len(byteArray) + position -= offset + elif abyte > 0x13: + if mode == -1: + print("Error: Unexpected first byte %i of inflection rule" % abyte) + return None + elif position == -1: + print("Error: Unexpected first byte %i of inflection rule" % abyte) + return None + else: + if mode == 0x01: + # Insert at word start + byteArray.insert(position, abyte) + position += 1 + elif mode == 0x02: + # Insert at word end + byteArray.insert(position, abyte) + elif mode == 0x03: + # Delete at word end + position -= 1 + deleted = byteArray.pop(position) + if bchr(deleted) != char: + if DEBUG_DICT: + print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted))) + print("Error: Delete operation of inflection rule failed") + return None + elif mode == 0x04: + # Delete at word start + deleted = byteArray.pop(position) + if bchr(deleted) != char: + if DEBUG_DICT: + print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted))) + print("Error: Delete operation of inflection rule failed") + return None + else: + print("Error: Inflection rule mode %x is not implemented" % mode) + return None + elif abyte == 0x01: + # Insert at word start + if mode not in [0x01, 0x04]: + position = 0 + mode = abyte + elif abyte == 0x02: + # Insert at word end + if mode not in [0x02, 0x03]: + position = len(byteArray) + mode = abyte + elif abyte == 0x03: + # Delete at word end + if mode not in [0x02, 0x03]: + position = len(byteArray) + mode = abyte + elif abyte == 0x04: + # Delete at word start + if mode not in [0x01, 0x04]: + position = 0 + # Delete at word start + mode = abyte + else: + print("Error: Inflection rule mode %x is not implemented" % abyte) + return None + return utf8_str(byteArray.tostring()) diff --git a/src/epy_reader/tools/KindleUnpack/mobi_header.py b/src/epy_reader/tools/KindleUnpack/mobi_header.py new file mode 100644 index 0000000..a15f636 --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_header.py @@ -0,0 +1,936 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# 
vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +DEBUG_USE_ORDERED_DICTIONARY = False # OrderedDict is supoorted >= python 2.7. +""" set to True to use OrderedDict for MobiHeader.metadata.""" + +if DEBUG_USE_ORDERED_DICTIONARY: + from collections import OrderedDict as dict_ +else: + dict_ = dict + +from .compatibility_utils import PY2, unicode_str, hexlify, bord + +if PY2: + range = xrange + +import struct +import uuid + +# import the mobiunpack support libraries +from .mobi_utils import getLanguage +from .mobi_uncompress import HuffcdicReader, PalmdocReader, UncompressedReader + +class unpackException(Exception): + pass + + +def sortedHeaderKeys(mheader): + hdrkeys = sorted(list(mheader.keys()), key=lambda akey: mheader[akey][0]) + return hdrkeys + + +# HD Containers have their own headers and their own EXTH +# this is just guesswork so far, making big assumption that +# metavalue key numbers remain the same in the CONT EXTH + +# Note: The layout of the CONT Header is still unknown +# so just deal with their EXTH sections for now + +def dump_contexth(cpage, extheader): + # determine text encoding + codec = 'windows-1252' + codec_map = { + 1252 : 'windows-1252', + 65001: 'utf-8', + } + if cpage in codec_map: + codec = codec_map[cpage] + if extheader == b'': + return + id_map_strings = { + 1 : 'Drm Server Id', + 2 : 'Drm Commerce Id', + 3 : 'Drm Ebookbase Book Id', + 4 : 'Drm Ebookbase Dep Id', + 100 : 'Creator', + 101 : 'Publisher', + 102 : 'Imprint', + 103 : 'Description', + 104 : 'ISBN', + 105 : 'Subject', + 106 : 'Published', + 107 : 'Review', + 108 : 'Contributor', + 109 : 'Rights', + 110 : 'SubjectCode', + 111 : 'Type', + 112 : 'Source', + 113 : 'ASIN', + # 114 : 'versionNumber', + 117 : 'Adult', + 118 : 'Retail-Price', + 119 : 'Retail-Currency', + 120 : 'TSC', + 122 : 'fixed-layout', + 123 : 'book-type', + 124 : 'orientation-lock', + 126 : 'original-resolution', + 127 : 'zero-gutter', + 128 : 'zero-margin', + 129 : 'MetadataResourceURI', + 132 : 'RegionMagnification', + 150 : 'LendingEnabled', + 200 : 'DictShortName', + 501 : 'cdeType', + 502 : 'last_update_time', + 503 : 'Updated_Title', + 504 : 'CDEContentKey', + 505 : 'AmazonContentReference', + 506 : 'Title-Language', + 507 : 'Title-Display-Direction', + 508 : 'Title-Pronunciation', + 509 : 'Title-Collation', + 510 : 'Secondary-Title', + 511 : 'Secondary-Title-Language', + 512 : 'Secondary-Title-Direction', + 513 : 'Secondary-Title-Pronunciation', + 514 : 'Secondary-Title-Collation', + 515 : 'Author-Language', + 516 : 'Author-Display-Direction', + 517 : 'Author-Pronunciation', + 518 : 'Author-Collation', + 519 : 'Author-Type', + 520 : 'Publisher-Language', + 521 : 'Publisher-Display-Direction', + 522 : 'Publisher-Pronunciation', + 523 : 'Publisher-Collation', + 524 : 'Content-Language-Tag', + 525 : 'primary-writing-mode', + 526 : 'NCX-Ingested-By-Software', + 527 : 'page-progression-direction', + 528 : 'override-kindle-fonts', + 529 : 'Compression-Upgraded', + 530 : 'Soft-Hyphens-In-Content', + 531 : 'Dictionary_In_Langague', + 532 : 'Dictionary_Out_Language', + 533 : 'Font_Converted', + 534 : 'Amazon_Creator_Info', + 535 : 'Creator-Build-Tag', + 536 : 'HD-Media-Containers-Info', # CONT_Header is 0, Ends with CONTAINER_BOUNDARY (or Asset_Type?) 
+ 538 : 'Resource-Container-Fidelity', + 539 : 'HD-Container-Mimetype', + 540 : 'Sample-For_Special-Purpose', + 541 : 'Kindletool-Operation-Information', + 542 : 'Container_Id', + 543 : 'Asset-Type', # FONT_CONTAINER, BW_CONTAINER, HD_CONTAINER + 544 : 'Unknown_544', + } + id_map_values = { + 114 : 'versionNumber', + 115 : 'sample', + 116 : 'StartOffset', + 121 : 'Mobi8-Boundary-Section', + 125 : 'Embedded-Record-Count', + 130 : 'Offline-Sample', + 131 : 'Metadata-Record-Offset', + 201 : 'CoverOffset', + 202 : 'ThumbOffset', + 203 : 'HasFakeCover', + 204 : 'Creator-Software', + 205 : 'Creator-Major-Version', + 206 : 'Creator-Minor-Version', + 207 : 'Creator-Build-Number', + 401 : 'Clipping-Limit', + 402 : 'Publisher-Limit', + 404 : 'Text-to-Speech-Disabled', + 406 : 'Rental-Expiration-Time', + } + id_map_hexstrings = { + 208 : 'Watermark_(hex)', + 209 : 'Tamper-Proof-Keys_(hex)', + 300 : 'Font-Signature_(hex)', + 403 : 'Unknown_(403)_(hex)', + 405 : 'Ownership-Type_(hex)', + 407 : 'Unknown_(407)_(hex)', + 420 : 'Multimedia-Content-Reference_(hex)', + 450 : 'Locations_Match_(hex)', + 451 : 'Full-Story-Length_(hex)', + 452 : 'Sample-Start_Location_(hex)', + 453 : 'Sample-End-Location_(hex)', + } + _length, num_items = struct.unpack(b'>LL', extheader[4:12]) + extheader = extheader[12:] + pos = 0 + for _ in range(num_items): + id, size = struct.unpack(b'>LL', extheader[pos:pos+8]) + content = extheader[pos + 8: pos + size] + if id in id_map_strings: + name = id_map_strings[id] + print('\n Key: "%s"\n Value: "%s"' % (name, content.decode(codec, errors='replace'))) + elif id in id_map_values: + name = id_map_values[id] + if size == 9: + value, = struct.unpack(b'B',content) + print('\n Key: "%s"\n Value: 0x%01x' % (name, value)) + elif size == 10: + value, = struct.unpack(b'>H',content) + print('\n Key: "%s"\n Value: 0x%02x' % (name, value)) + elif size == 12: + value, = struct.unpack(b'>L',content) + print('\n Key: "%s"\n Value: 0x%04x' % (name, value)) + else: + print("\nError: Value for %s has unexpected size of %s" % (name, size)) + elif id in id_map_hexstrings: + name = id_map_hexstrings[id] + print('\n Key: "%s"\n Value: 0x%s' % (name, hexlify(content))) + else: + print("\nWarning: Unknown metadata with id %s found" % id) + name = str(id) + ' (hex)' + print(' Key: "%s"\n Value: 0x%s' % (name, hexlify(content))) + pos += size + return + + +class MobiHeader: + # all values are packed in big endian format + palmdoc_header = { + 'compression_type' : (0x00, b'>H', 2), + 'fill0' : (0x02, b'>H', 2), + 'text_length' : (0x04, b'>L', 4), + 'text_records' : (0x08, b'>H', 2), + 'max_section_size' : (0x0a, b'>H', 2), + 'read_pos ' : (0x0c, b'>L', 4), + } + + mobi6_header = { + 'compression_type' : (0x00, b'>H', 2), + 'fill0' : (0x02, b'>H', 2), + 'text_length' : (0x04, b'>L', 4), + 'text_records' : (0x08, b'>H', 2), + 'max_section_size' : (0x0a, b'>H', 2), + 'crypto_type' : (0x0c, b'>H', 2), + 'fill1' : (0x0e, b'>H', 2), + 'magic' : (0x10, b'4s', 4), + 'header_length (from MOBI)' : (0x14, b'>L', 4), + 'type' : (0x18, b'>L', 4), + 'codepage' : (0x1c, b'>L', 4), + 'unique_id' : (0x20, b'>L', 4), + 'version' : (0x24, b'>L', 4), + 'metaorthindex' : (0x28, b'>L', 4), + 'metainflindex' : (0x2c, b'>L', 4), + 'index_names' : (0x30, b'>L', 4), + 'index_keys' : (0x34, b'>L', 4), + 'extra_index0' : (0x38, b'>L', 4), + 'extra_index1' : (0x3c, b'>L', 4), + 'extra_index2' : (0x40, b'>L', 4), + 'extra_index3' : (0x44, b'>L', 4), + 'extra_index4' : (0x48, b'>L', 4), + 'extra_index5' : (0x4c, b'>L', 4), + 
'first_nontext' : (0x50, b'>L', 4), + 'title_offset' : (0x54, b'>L', 4), + 'title_length' : (0x58, b'>L', 4), + 'language_code' : (0x5c, b'>L', 4), + 'dict_in_lang' : (0x60, b'>L', 4), + 'dict_out_lang' : (0x64, b'>L', 4), + 'min_version' : (0x68, b'>L', 4), + 'first_resc_offset' : (0x6c, b'>L', 4), + 'huff_offset' : (0x70, b'>L', 4), + 'huff_num' : (0x74, b'>L', 4), + 'huff_tbl_offset' : (0x78, b'>L', 4), + 'huff_tbl_len' : (0x7c, b'>L', 4), + 'exth_flags' : (0x80, b'>L', 4), + 'fill3_a' : (0x84, b'>L', 4), + 'fill3_b' : (0x88, b'>L', 4), + 'fill3_c' : (0x8c, b'>L', 4), + 'fill3_d' : (0x90, b'>L', 4), + 'fill3_e' : (0x94, b'>L', 4), + 'fill3_f' : (0x98, b'>L', 4), + 'fill3_g' : (0x9c, b'>L', 4), + 'fill3_h' : (0xa0, b'>L', 4), + 'unknown0' : (0xa4, b'>L', 4), + 'drm_offset' : (0xa8, b'>L', 4), + 'drm_count' : (0xac, b'>L', 4), + 'drm_size' : (0xb0, b'>L', 4), + 'drm_flags' : (0xb4, b'>L', 4), + 'fill4_a' : (0xb8, b'>L', 4), + 'fill4_b' : (0xbc, b'>L', 4), + 'first_content' : (0xc0, b'>H', 2), + 'last_content' : (0xc2, b'>H', 2), + 'unknown0' : (0xc4, b'>L', 4), + 'fcis_offset' : (0xc8, b'>L', 4), + 'fcis_count' : (0xcc, b'>L', 4), + 'flis_offset' : (0xd0, b'>L', 4), + 'flis_count' : (0xd4, b'>L', 4), + 'unknown1' : (0xd8, b'>L', 4), + 'unknown2' : (0xdc, b'>L', 4), + 'srcs_offset' : (0xe0, b'>L', 4), + 'srcs_count' : (0xe4, b'>L', 4), + 'unknown3' : (0xe8, b'>L', 4), + 'unknown4' : (0xec, b'>L', 4), + 'fill5' : (0xf0, b'>H', 2), + 'traildata_flags' : (0xf2, b'>H', 2), + 'ncx_index' : (0xf4, b'>L', 4), + 'unknown5' : (0xf8, b'>L', 4), + 'unknown6' : (0xfc, b'>L', 4), + 'datp_offset' : (0x100, b'>L', 4), + 'unknown7' : (0x104, b'>L', 4), + 'Unknown ' : (0x108, b'>L', 4), + 'Unknown ' : (0x10C, b'>L', 4), + 'Unknown ' : (0x110, b'>L', 4), + 'Unknown ' : (0x114, b'>L', 4), + 'Unknown ' : (0x118, b'>L', 4), + 'Unknown ' : (0x11C, b'>L', 4), + 'Unknown ' : (0x120, b'>L', 4), + 'Unknown ' : (0x124, b'>L', 4), + 'Unknown ' : (0x128, b'>L', 4), + 'Unknown ' : (0x12C, b'>L', 4), + 'Unknown ' : (0x130, b'>L', 4), + 'Unknown ' : (0x134, b'>L', 4), + 'Unknown ' : (0x138, b'>L', 4), + 'Unknown ' : (0x11C, b'>L', 4), + } + + mobi8_header = { + 'compression_type' : (0x00, b'>H', 2), + 'fill0' : (0x02, b'>H', 2), + 'text_length' : (0x04, b'>L', 4), + 'text_records' : (0x08, b'>H', 2), + 'max_section_size' : (0x0a, b'>H', 2), + 'crypto_type' : (0x0c, b'>H', 2), + 'fill1' : (0x0e, b'>H', 2), + 'magic' : (0x10, b'4s', 4), + 'header_length (from MOBI)' : (0x14, b'>L', 4), + 'type' : (0x18, b'>L', 4), + 'codepage' : (0x1c, b'>L', 4), + 'unique_id' : (0x20, b'>L', 4), + 'version' : (0x24, b'>L', 4), + 'metaorthindex' : (0x28, b'>L', 4), + 'metainflindex' : (0x2c, b'>L', 4), + 'index_names' : (0x30, b'>L', 4), + 'index_keys' : (0x34, b'>L', 4), + 'extra_index0' : (0x38, b'>L', 4), + 'extra_index1' : (0x3c, b'>L', 4), + 'extra_index2' : (0x40, b'>L', 4), + 'extra_index3' : (0x44, b'>L', 4), + 'extra_index4' : (0x48, b'>L', 4), + 'extra_index5' : (0x4c, b'>L', 4), + 'first_nontext' : (0x50, b'>L', 4), + 'title_offset' : (0x54, b'>L', 4), + 'title_length' : (0x58, b'>L', 4), + 'language_code' : (0x5c, b'>L', 4), + 'dict_in_lang' : (0x60, b'>L', 4), + 'dict_out_lang' : (0x64, b'>L', 4), + 'min_version' : (0x68, b'>L', 4), + 'first_resc_offset' : (0x6c, b'>L', 4), + 'huff_offset' : (0x70, b'>L', 4), + 'huff_num' : (0x74, b'>L', 4), + 'huff_tbl_offset' : (0x78, b'>L', 4), + 'huff_tbl_len' : (0x7c, b'>L', 4), + 'exth_flags' : (0x80, b'>L', 4), + 'fill3_a' : (0x84, b'>L', 4), + 'fill3_b' : (0x88, b'>L', 4), + 'fill3_c' 
: (0x8c, b'>L', 4), + 'fill3_d' : (0x90, b'>L', 4), + 'fill3_e' : (0x94, b'>L', 4), + 'fill3_f' : (0x98, b'>L', 4), + 'fill3_g' : (0x9c, b'>L', 4), + 'fill3_h' : (0xa0, b'>L', 4), + 'unknown0' : (0xa4, b'>L', 4), + 'drm_offset' : (0xa8, b'>L', 4), + 'drm_count' : (0xac, b'>L', 4), + 'drm_size' : (0xb0, b'>L', 4), + 'drm_flags' : (0xb4, b'>L', 4), + 'fill4_a' : (0xb8, b'>L', 4), + 'fill4_b' : (0xbc, b'>L', 4), + 'fdst_offset' : (0xc0, b'>L', 4), + 'fdst_flow_count' : (0xc4, b'>L', 4), + 'fcis_offset' : (0xc8, b'>L', 4), + 'fcis_count' : (0xcc, b'>L', 4), + 'flis_offset' : (0xd0, b'>L', 4), + 'flis_count' : (0xd4, b'>L', 4), + 'unknown1' : (0xd8, b'>L', 4), + 'unknown2' : (0xdc, b'>L', 4), + 'srcs_offset' : (0xe0, b'>L', 4), + 'srcs_count' : (0xe4, b'>L', 4), + 'unknown3' : (0xe8, b'>L', 4), + 'unknown4' : (0xec, b'>L', 4), + 'fill5' : (0xf0, b'>H', 2), + 'traildata_flags' : (0xf2, b'>H', 2), + 'ncx_index' : (0xf4, b'>L', 4), + 'fragment_index' : (0xf8, b'>L', 4), + 'skeleton_index' : (0xfc, b'>L', 4), + 'datp_offset' : (0x100, b'>L', 4), + 'guide_index' : (0x104, b'>L', 4), + 'Unknown ' : (0x108, b'>L', 4), + 'Unknown ' : (0x10C, b'>L', 4), + 'Unknown ' : (0x110, b'>L', 4), + 'Unknown ' : (0x114, b'>L', 4), + 'Unknown ' : (0x118, b'>L', 4), + 'Unknown ' : (0x11C, b'>L', 4), + 'Unknown ' : (0x120, b'>L', 4), + 'Unknown ' : (0x124, b'>L', 4), + 'Unknown ' : (0x128, b'>L', 4), + 'Unknown ' : (0x12C, b'>L', 4), + 'Unknown ' : (0x130, b'>L', 4), + 'Unknown ' : (0x134, b'>L', 4), + 'Unknown ' : (0x138, b'>L', 4), + 'Unknown ' : (0x11C, b'>L', 4), + } + + palmdoc_header_sorted_keys = sortedHeaderKeys(palmdoc_header) + mobi6_header_sorted_keys = sortedHeaderKeys(mobi6_header) + mobi8_header_sorted_keys = sortedHeaderKeys(mobi8_header) + + id_map_strings = { + 1 : 'Drm Server Id', + 2 : 'Drm Commerce Id', + 3 : 'Drm Ebookbase Book Id', + 4 : 'Drm Ebookbase Dep Id', + 100 : 'Creator', + 101 : 'Publisher', + 102 : 'Imprint', + 103 : 'Description', + 104 : 'ISBN', + 105 : 'Subject', + 106 : 'Published', + 107 : 'Review', + 108 : 'Contributor', + 109 : 'Rights', + 110 : 'SubjectCode', + 111 : 'Type', + 112 : 'Source', + 113 : 'ASIN', + # 114 : 'versionNumber', + 117 : 'Adult', + 118 : 'Retail-Price', + 119 : 'Retail-Currency', + 120 : 'TSC', + 122 : 'fixed-layout', + 123 : 'book-type', + 124 : 'orientation-lock', + 126 : 'original-resolution', + 127 : 'zero-gutter', + 128 : 'zero-margin', + 129 : 'MetadataResourceURI', + 132 : 'RegionMagnification', + 150 : 'LendingEnabled', + 200 : 'DictShortName', + 501 : 'cdeType', + 502 : 'last_update_time', + 503 : 'Updated_Title', + 504 : 'CDEContentKey', + 505 : 'AmazonContentReference', + 506 : 'Title-Language', + 507 : 'Title-Display-Direction', + 508 : 'Title-Pronunciation', + 509 : 'Title-Collation', + 510 : 'Secondary-Title', + 511 : 'Secondary-Title-Language', + 512 : 'Secondary-Title-Direction', + 513 : 'Secondary-Title-Pronunciation', + 514 : 'Secondary-Title-Collation', + 515 : 'Author-Language', + 516 : 'Author-Display-Direction', + 517 : 'Author-Pronunciation', + 518 : 'Author-Collation', + 519 : 'Author-Type', + 520 : 'Publisher-Language', + 521 : 'Publisher-Display-Direction', + 522 : 'Publisher-Pronunciation', + 523 : 'Publisher-Collation', + 524 : 'Content-Language-Tag', + 525 : 'primary-writing-mode', + 526 : 'NCX-Ingested-By-Software', + 527 : 'page-progression-direction', + 528 : 'override-kindle-fonts', + 529 : 'Compression-Upgraded', + 530 : 'Soft-Hyphens-In-Content', + 531 : 'Dictionary_In_Langague', + 532 : 'Dictionary_Out_Language', + 533 : 
'Font_Converted', + 534 : 'Amazon_Creator_Info', + 535 : 'Creator-Build-Tag', + 536 : 'HD-Media-Containers-Info', # CONT_Header is 0, Ends with CONTAINER_BOUNDARY (or Asset_Type?) + 538 : 'Resource-Container-Fidelity', + 539 : 'HD-Container-Mimetype', + 540 : 'Sample-For_Special-Purpose', + 541 : 'Kindletool-Operation-Information', + 542 : 'Container_Id', + 543 : 'Asset-Type', # FONT_CONTAINER, BW_CONTAINER, HD_CONTAINER + 544 : 'Unknown_544', + } + id_map_values = { + 114 : 'versionNumber', + 115 : 'sample', + 116 : 'StartOffset', + 121 : 'Mobi8-Boundary-Section', + 125 : 'Embedded-Record-Count', + 130 : 'Offline-Sample', + 131 : 'Metadata-Record-Offset', + 201 : 'CoverOffset', + 202 : 'ThumbOffset', + 203 : 'HasFakeCover', + 204 : 'Creator-Software', + 205 : 'Creator-Major-Version', + 206 : 'Creator-Minor-Version', + 207 : 'Creator-Build-Number', + 401 : 'Clipping-Limit', + 402 : 'Publisher-Limit', + 404 : 'Text-to-Speech-Disabled', + 406 : 'Rental-Expiration-Time', + } + id_map_hexstrings = { + 208 : 'Watermark_(hex)', + 209 : 'Tamper-Proof-Keys_(hex)', + 300 : 'Font-Signature_(hex)', + 403 : 'Unknown_(403)_(hex)', + 405 : 'Ownership-Type_(hex)', + 407 : 'Unknown_(407)_(hex)', + 420 : 'Multimedia-Content-Reference_(hex)', + 450 : 'Locations_Match_(hex)', + 451 : 'Full-Story-Length_(hex)', + 452 : 'Sample-Start_Location_(hex)', + 453 : 'Sample-End-Location_(hex)', + } + + def __init__(self, sect, sectNumber): + self.sect = sect + self.start = sectNumber + self.header = self.sect.loadSection(self.start) + if len(self.header)>20 and self.header[16:20] == b'MOBI': + self.sect.setsectiondescription(0,"Mobipocket Header") + self.palm = False + elif self.sect.ident == b'TEXtREAd': + self.sect.setsectiondescription(0, "PalmDOC Header") + self.palm = True + else: + raise unpackException('Unknown File Format') + + self.records, = struct.unpack_from(b'>H', self.header, 0x8) + + # set defaults in case this is a PalmDOC + self.title = self.sect.palmname.decode('latin-1', errors='replace') + self.length = len(self.header)-16 + self.type = 3 + self.codepage = 1252 + self.codec = 'windows-1252' + self.unique_id = 0 + self.version = 0 + self.hasExth = False + self.exth = b'' + self.exth_offset = self.length + 16 + self.exth_length = 0 + self.crypto_type = 0 + self.firstnontext = self.start+self.records + 1 + self.firstresource = self.start+self.records + 1 + self.ncxidx = 0xffffffff + self.metaOrthIndex = 0xffffffff + self.metaInflIndex = 0xffffffff + self.skelidx = 0xffffffff + self.fragidx = 0xffffffff + self.guideidx = 0xffffffff + self.fdst = 0xffffffff + self.mlstart = self.sect.loadSection(self.start+1)[:4] + self.rawSize = 0 + self.metadata = dict_() + + # set up for decompression/unpacking + self.compression, = struct.unpack_from(b'>H', self.header, 0x0) + if self.compression == 0x4448: + reader = HuffcdicReader() + huffoff, huffnum = struct.unpack_from(b'>LL', self.header, 0x70) + huffoff = huffoff + self.start + self.sect.setsectiondescription(huffoff,"Huffman Compression Seed") + reader.loadHuff(self.sect.loadSection(huffoff)) + for i in range(1, huffnum): + self.sect.setsectiondescription(huffoff+i,"Huffman CDIC Compression Seed %d" % i) + reader.loadCdic(self.sect.loadSection(huffoff+i)) + self.unpack = reader.unpack + elif self.compression == 2: + self.unpack = PalmdocReader().unpack + elif self.compression == 1: + self.unpack = UncompressedReader().unpack + else: + raise unpackException('invalid compression type: 0x%4x' % self.compression) + + if self.palm: + return + + self.length, 
self.type, self.codepage, self.unique_id, self.version = struct.unpack(b'>LLLLL', self.header[20:40]) + codec_map = { + 1252 : 'windows-1252', + 65001: 'utf-8', + } + if self.codepage in codec_map: + self.codec = codec_map[self.codepage] + + # title + toff, tlen = struct.unpack(b'>II', self.header[0x54:0x5c]) + tend = toff + tlen + self.title=self.header[toff:tend].decode(self.codec, errors='replace') + + exth_flag, = struct.unpack(b'>L', self.header[0x80:0x84]) + self.hasExth = exth_flag & 0x40 + self.exth_offset = self.length + 16 + self.exth_length = 0 + if self.hasExth: + self.exth_length, = struct.unpack_from(b'>L', self.header, self.exth_offset+4) + self.exth_length = ((self.exth_length + 3)>>2)<<2 # round to next 4 byte boundary + self.exth = self.header[self.exth_offset:self.exth_offset+self.exth_length] + + # parse the exth / metadata + self.parseMetaData() + + # self.mlstart = self.sect.loadSection(self.start+1) + # self.mlstart = self.mlstart[0:4] + self.crypto_type, = struct.unpack_from(b'>H', self.header, 0xC) + + # Start sector for additional files such as images, fonts, resources, etc + # Can be missing so fall back to default set previously + ofst, = struct.unpack_from(b'>L', self.header, 0x6C) + if ofst != 0xffffffff: + self.firstresource = ofst + self.start + ofst, = struct.unpack_from(b'>L', self.header, 0x50) + if ofst != 0xffffffff: + self.firstnontext = ofst + self.start + + if self.isPrintReplica(): + return + + if self.version < 8: + # Dictionary metaOrthIndex + self.metaOrthIndex, = struct.unpack_from(b'>L', self.header, 0x28) + if self.metaOrthIndex != 0xffffffff: + self.metaOrthIndex += self.start + + # Dictionary metaInflIndex + self.metaInflIndex, = struct.unpack_from(b'>L', self.header, 0x2C) + if self.metaInflIndex != 0xffffffff: + self.metaInflIndex += self.start + + # handle older headers without any ncxindex info and later + # specifically 0xe4 headers + if self.length + 16 < 0xf8: + return + + # NCX Index + self.ncxidx, = struct.unpack(b'>L', self.header[0xf4:0xf8]) + if self.ncxidx != 0xffffffff: + self.ncxidx += self.start + + # K8 specific Indexes + if self.start != 0 or self.version == 8: + # Index into <xml> file skeletons in RawML + self.skelidx, = struct.unpack_from(b'>L', self.header, 0xfc) + if self.skelidx != 0xffffffff: + self.skelidx += self.start + + # Index into <div> sections in RawML + self.fragidx, = struct.unpack_from(b'>L', self.header, 0xf8) + if self.fragidx != 0xffffffff: + self.fragidx += self.start + + # Index into Other files + self.guideidx, = struct.unpack_from(b'>L', self.header, 0x104) + if self.guideidx != 0xffffffff: + self.guideidx += self.start + + # dictionaries do not seem to use the same approach in K8's + # so disable them + self.metaOrthIndex = 0xffffffff + self.metaInflIndex = 0xffffffff + + # need to use the FDST record to find out how to properly unpack + # the rawML into pieces + # it is simply a table of start and end locations for each flow piece + self.fdst, = struct.unpack_from(b'>L', self.header, 0xc0) + self.fdstcnt, = struct.unpack_from(b'>L', self.header, 0xc4) + # if cnt is 1 or less, fdst section mumber can be garbage + if self.fdstcnt <= 1: + self.fdst = 0xffffffff + if self.fdst != 0xffffffff: + self.fdst += self.start + # setting of fdst section description properly handled in mobi_kf8proc + + def dump_exth(self): + # determine text encoding + codec=self.codec + if (not self.hasExth) or (self.exth_length) == 0 or (self.exth == b''): + return + num_items, = struct.unpack(b'>L', self.exth[8:12]) + pos = 
12 + print("Key Size Description Value") + for _ in range(num_items): + id, size = struct.unpack(b'>LL', self.exth[pos:pos+8]) + contentsize = size-8 + content = self.exth[pos + 8: pos + size] + if id in MobiHeader.id_map_strings: + exth_name = MobiHeader.id_map_strings[id] + print('{0: >3d} {1: >4d} {2: <30s} {3:s}'.format(id, contentsize, exth_name, content.decode(codec, errors='replace'))) + elif id in MobiHeader.id_map_values: + exth_name = MobiHeader.id_map_values[id] + if size == 9: + value, = struct.unpack(b'B',content) + print('{0:3d} byte {1:<30s} {2:d}'.format(id, exth_name, value)) + elif size == 10: + value, = struct.unpack(b'>H',content) + print('{0:3d} word {1:<30s} 0x{2:0>4X} ({2:d})'.format(id, exth_name, value)) + elif size == 12: + value, = struct.unpack(b'>L',content) + print('{0:3d} long {1:<30s} 0x{2:0>8X} ({2:d})'.format(id, exth_name, value)) + else: + print('{0: >3d} {1: >4d} {2: <30s} (0x{3:s})'.format(id, contentsize, "Bad size for "+exth_name, hexlify(content))) + elif id in MobiHeader.id_map_hexstrings: + exth_name = MobiHeader.id_map_hexstrings[id] + print('{0:3d} {1:4d} {2:<30s} 0x{3:s}'.format(id, contentsize, exth_name, hexlify(content))) + else: + exth_name = "Unknown EXTH ID {0:d}".format(id) + print("{0: >3d} {1: >4d} {2: <30s} 0x{3:s}".format(id, contentsize, exth_name, hexlify(content))) + pos += size + return + + def dumpheader(self): + # first 16 bytes are not part of the official mobiheader + # but we will treat it as such + # so section 0 is 16 (decimal) + self.length in total == at least 0x108 bytes for Mobi 8 headers + print("Dumping section %d, Mobipocket Header version: %d, total length %d" % (self.start,self.version, self.length+16)) + self.hdr = {} + # set it up for the proper header version + if self.version == 0: + self.mobi_header = MobiHeader.palmdoc_header + self.mobi_header_sorted_keys = MobiHeader.palmdoc_header_sorted_keys + elif self.version < 8: + self.mobi_header = MobiHeader.mobi6_header + self.mobi_header_sorted_keys = MobiHeader.mobi6_header_sorted_keys + else: + self.mobi_header = MobiHeader.mobi8_header + self.mobi_header_sorted_keys = MobiHeader.mobi8_header_sorted_keys + + # parse the header information + for key in self.mobi_header_sorted_keys: + (pos, format, tot_len) = self.mobi_header[key] + if pos < (self.length + 16): + val, = struct.unpack_from(format, self.header, pos) + self.hdr[key] = val + + if 'title_offset' in self.hdr: + title_offset = self.hdr['title_offset'] + title_length = self.hdr['title_length'] + else: + title_offset = 0 + title_length = 0 + if title_offset == 0: + title_offset = len(self.header) + title_length = 0 + self.title = self.sect.palmname.decode('latin-1', errors='replace') + else: + self.title = self.header[title_offset:title_offset+title_length].decode(self.codec, errors='replace') + # title record always padded with two nul bytes and then padded with nuls to next 4 byte boundary + title_length = ((title_length+2+3)>>2)<<2 + + self.extra1 = self.header[self.exth_offset+self.exth_length:title_offset] + self.extra2 = self.header[title_offset+title_length:] + + print("Mobipocket header from section %d" % self.start) + print(" Offset Value Hex Dec Description") + for key in self.mobi_header_sorted_keys: + (pos, format, tot_len) = self.mobi_header[key] + if pos < (self.length + 16): + if key != 'magic': + fmt_string = "0x{0:0>3X} ({0:3d}){1: >" + str(9-2*tot_len) +"s}0x{2:0>" + str(2*tot_len) + "X} {2:10d} {3:s}" + else: + self.hdr[key] = unicode_str(self.hdr[key]) + fmt_string = "0x{0:0>3X} 
({0:3d}){2:>11s} {3:s}" + print(fmt_string.format(pos, " ",self.hdr[key], key)) + print("") + + if self.exth_length > 0: + print("EXTH metadata, offset %d, padded length %d" % (self.exth_offset,self.exth_length)) + self.dump_exth() + print("") + + if len(self.extra1) > 0: + print("Extra data between EXTH and Title, length %d" % len(self.extra1)) + print(hexlify(self.extra1)) + print("") + + if title_length > 0: + print("Title in header at offset %d, padded length %d: '%s'" %(title_offset,title_length,self.title)) + print("") + + if len(self.extra2) > 0: + print("Extra data between Title and end of header, length %d" % len(self.extra2)) + print(hexlify(self.extra2)) + print("") + + def isPrintReplica(self): + return self.mlstart[0:4] == b"%MOP" + + def isK8(self): + return self.start != 0 or self.version == 8 + + def isEncrypted(self): + return self.crypto_type != 0 + + def hasNCX(self): + return self.ncxidx != 0xffffffff + + def isDictionary(self): + return self.metaOrthIndex != 0xffffffff + + def getncxIndex(self): + return self.ncxidx + + def decompress(self, data): + return self.unpack(data) + + def Language(self): + langcode = struct.unpack(b'!L', self.header[0x5c:0x60])[0] + langid = langcode & 0xFF + sublangid = (langcode >> 8) & 0xFF + return getLanguage(langid, sublangid) + + def DictInLanguage(self): + if self.isDictionary(): + langcode = struct.unpack(b'!L', self.header[0x60:0x64])[0] + langid = langcode & 0xFF + sublangid = (langcode >> 10) & 0xFF + if langid != 0: + return getLanguage(langid, sublangid) + return False + + def DictOutLanguage(self): + if self.isDictionary(): + langcode = struct.unpack(b'!L', self.header[0x64:0x68])[0] + langid = langcode & 0xFF + sublangid = (langcode >> 10) & 0xFF + if langid != 0: + return getLanguage(langid, sublangid) + return False + + def getRawML(self): + def getSizeOfTrailingDataEntry(data): + num = 0 + for v in data[-4:]: + if bord(v) & 0x80: + num = 0 + num = (num << 7) | (bord(v) & 0x7f) + return num + def trimTrailingDataEntries(data): + for _ in range(trailers): + num = getSizeOfTrailingDataEntry(data) + data = data[:-num] + if multibyte: + num = (ord(data[-1:]) & 3) + 1 + data = data[:-num] + return data + multibyte = 0 + trailers = 0 + if self.sect.ident == b'BOOKMOBI': + mobi_length, = struct.unpack_from(b'>L', self.header, 0x14) + mobi_version, = struct.unpack_from(b'>L', self.header, 0x68) + if (mobi_length >= 0xE4) and (mobi_version >= 5): + flags, = struct.unpack_from(b'>H', self.header, 0xF2) + multibyte = flags & 1 + while flags > 1: + if flags & 2: + trailers += 1 + flags = flags >> 1 + # get raw mobi markup languge + print("Unpacking raw markup language") + dataList = [] + # offset = 0 + for i in range(1, self.records+1): + data = trimTrailingDataEntries(self.sect.loadSection(self.start + i)) + dataList.append(self.unpack(data)) + if self.isK8(): + self.sect.setsectiondescription(self.start + i,"KF8 Text Section {0:d}".format(i)) + elif self.version == 0: + self.sect.setsectiondescription(self.start + i,"PalmDOC Text Section {0:d}".format(i)) + else: + self.sect.setsectiondescription(self.start + i,"Mobipocket Text Section {0:d}".format(i)) + rawML = b''.join(dataList) + self.rawSize = len(rawML) + return rawML + + # all metadata is stored in a dictionary with key and returns a *list* of values + # a list is used to allow for multiple creators, multiple contributors, etc + def parseMetaData(self): + def addValue(name, value): + if name not in self.metadata: + self.metadata[name] = [value] + else: + 
self.metadata[name].append(value) + + codec=self.codec + if self.hasExth: + extheader=self.exth + _length, num_items = struct.unpack(b'>LL', extheader[4:12]) + extheader = extheader[12:] + pos = 0 + for _ in range(num_items): + id, size = struct.unpack(b'>LL', extheader[pos:pos+8]) + content = extheader[pos + 8: pos + size] + if id in MobiHeader.id_map_strings: + name = MobiHeader.id_map_strings[id] + addValue(name, content.decode(codec, errors='replace')) + elif id in MobiHeader.id_map_values: + name = MobiHeader.id_map_values[id] + if size == 9: + value, = struct.unpack(b'B',content) + addValue(name, unicode_str(str(value))) + elif size == 10: + value, = struct.unpack(b'>H',content) + addValue(name, unicode_str(str(value))) + elif size == 12: + value, = struct.unpack(b'>L',content) + # handle special case of missing CoverOffset or missing ThumbOffset + if id == 201 or id == 202: + if value != 0xffffffff: + addValue(name, unicode_str(str(value))) + else: + addValue(name, unicode_str(str(value))) + else: + print("Warning: Bad key, size, value combination detected in EXTH ", id, size, hexlify(content)) + addValue(name, hexlify(content)) + elif id in MobiHeader.id_map_hexstrings: + name = MobiHeader.id_map_hexstrings[id] + addValue(name, hexlify(content)) + else: + name = unicode_str(str(id)) + ' (hex)' + addValue(name, hexlify(content)) + pos += size + + # add the basics to the metadata each as a list element + self.metadata['Language'] = [self.Language()] + self.metadata['Title'] = [unicode_str(self.title,self.codec)] + self.metadata['Codec'] = [self.codec] + self.metadata['UniqueID'] = [unicode_str(str(self.unique_id))] + # if no asin create one using a uuid + if 'ASIN' not in self.metadata: + self.metadata['ASIN'] = [unicode_str(str(uuid.uuid4()))] + # if no cdeType set it to "EBOK" + if 'cdeType' not in self.metadata: + self.metadata['cdeType'] = ['EBOK'] + + def getMetaData(self): + return self.metadata + + def describeHeader(self, DUMP): + print("Mobi Version:", self.version) + print("Codec:", self.codec) + print("Title:", self.title) + if 'Updated_Title' in self.metadata: + print("EXTH Title:", self.metadata['Updated_Title'][0]) + if self.compression == 0x4448: + print("Huffdic compression") + elif self.compression == 2: + print("Palmdoc compression") + elif self.compression == 1: + print("No compression") + if DUMP: + self.dumpheader() diff --git a/src/epy_reader/tools/KindleUnpack/mobi_html.py b/src/epy_reader/tools/KindleUnpack/mobi_html.py new file mode 100644 index 0000000..eda766c --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_html.py @@ -0,0 +1,439 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import PY2, utf8_str + +if PY2: + range = xrange + +import re +# note: re requites the pattern to be the exact same type as the data to be searched in python3 +# but u"" is not allowed for the pattern itself only b"" + +from .mobi_utils import fromBase32 + +class HTMLProcessor: + + def __init__(self, files, metadata, rscnames): + self.files = files + self.metadata = metadata + self.rscnames = rscnames + # for original style mobis, default to including all image files in the opf manifest + self.used = {} + for name in rscnames: + self.used[name] = 'used' + + def findAnchors(self, rawtext, indx_data, positionMap): + # process the raw text + # find anchors... 
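
# A minimal sketch of the filepos scheme handled below: old-style MOBI links
# carry absolute byte offsets (filepos=NNNN) rather than hrefs, so findAnchors()
# first collects the targets and injects <a id="fileposNNNN"/> markers at those
# byte positions. Same regex as below; rawtext here is synthetic.
import re

rawtext = b'<p>See <a filepos=0000000042>note</a></p>'
link_pattern = re.compile(br'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''', re.IGNORECASE)
targets = [int(m.group(1)) for m in link_pattern.finditer(rawtext)]
print(targets)  # -> [42]: an <a id="filepos42"/> anchor belongs at byte 42
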
+ print("Find link anchors") + link_pattern = re.compile(br'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''', re.IGNORECASE) + # TEST NCX: merge in filepos from indx + pos_links = [int(m.group(1)) for m in link_pattern.finditer(rawtext)] + if indx_data: + pos_indx = [e['pos'] for e in indx_data if e['pos']>0] + pos_links = list(set(pos_links + pos_indx)) + + for position in pos_links: + if position in positionMap: + positionMap[position] = positionMap[position] + utf8_str('<a id="filepos%d" />' % position) + else: + positionMap[position] = utf8_str('<a id="filepos%d" />' % position) + + # apply dictionary metadata and anchors + print("Insert data into html") + pos = 0 + lastPos = len(rawtext) + dataList = [] + for end in sorted(positionMap.keys()): + if end == 0 or end > lastPos: + continue # something's up - can't put a tag in outside <html>...</html> + dataList.append(rawtext[pos:end]) + dataList.append(positionMap[end]) + pos = end + dataList.append(rawtext[pos:]) + srctext = b"".join(dataList) + rawtext = None + dataList = None + self.srctext = srctext + self.indx_data = indx_data + return srctext + + def insertHREFS(self): + srctext = self.srctext + rscnames = self.rscnames + metadata = self.metadata + + # put in the hrefs + print("Insert hrefs into html") + # There doesn't seem to be a standard, so search as best as we can + + link_pattern = re.compile(br'''<a([^>]*?)filepos=['"]{0,1}0*(\d+)['"]{0,1}([^>]*?)>''', re.IGNORECASE) + srctext = link_pattern.sub(br'''<a\1href="#filepos\2"\3>''', srctext) + + # remove empty anchors + print("Remove empty anchors from html") + srctext = re.sub(br"<a\s*/>",br"", srctext) + srctext = re.sub(br"<a\s*>\s*</a>",br"", srctext) + + # convert image references + print("Insert image references into html") + # split string into image tag pieces and other pieces + image_pattern = re.compile(br'''(<img.*?>)''', re.IGNORECASE) + image_index_pattern = re.compile(br'''recindex=['"]{0,1}([0-9]+)['"]{0,1}''', re.IGNORECASE) + srcpieces = image_pattern.split(srctext) + srctext = self.srctext = None + + # all odd pieces are image tags (nulls string on even pieces if no space between them in srctext) + for i in range(1, len(srcpieces), 2): + tag = srcpieces[i] + for m in image_index_pattern.finditer(tag): + imageNumber = int(m.group(1)) + imageName = rscnames[imageNumber-1] + if imageName is None: + print("Error: Referenced image %s was not recognized as a valid image" % imageNumber) + else: + replacement = b'src="Images/' + utf8_str(imageName) + b'"' + tag = image_index_pattern.sub(replacement, tag, 1) + srcpieces[i] = tag + srctext = b"".join(srcpieces) + + # add in character set meta into the html header if needed + if 'Codec' in metadata: + srctext = srctext[0:12]+b'<meta http-equiv="content-type" content="text/html; charset='+utf8_str(metadata.get('Codec')[0])+b'" />'+srctext[12:] + return srctext, self.used + + +class XHTMLK8Processor: + + def __init__(self, rscnames, k8proc): + self.rscnames = rscnames + self.k8proc = k8proc + self.used = {} + + def buildXHTML(self): + + # first need to update all links that are internal which + # are based on positions within the xhtml files **BEFORE** + # cutting and pasting any pieces into the xhtml text files + + # kindle:pos:fid:XXXX:off:YYYYYYYYYY (used for internal link within xhtml) + # XXXX is the offset in records into divtbl + # YYYYYYYYYYYY is a base32 number you add to the divtbl insertpos to get final position + + # pos:fid pattern + posfid_pattern = re.compile(br'''(<a.*?href=.*?>)''', re.IGNORECASE) + 
posfid_index_pattern = re.compile(br'''['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']''') + + parts = [] + print("Building proper xhtml for each file") + for i in range(self.k8proc.getNumberOfParts()): + part = self.k8proc.getPart(i) + [partnum, dir, filename, beg, end, aidtext] = self.k8proc.getPartInfo(i) + + # internal links + srcpieces = posfid_pattern.split(part) + for j in range(1, len(srcpieces),2): + tag = srcpieces[j] + if tag.startswith(b'<'): + for m in posfid_index_pattern.finditer(tag): + posfid = m.group(1) + offset = m.group(2) + filename, idtag = self.k8proc.getIDTagByPosFid(posfid, offset) + if idtag == b'': + replacement= b'"' + utf8_str(filename) + b'"' + else: + replacement = b'"' + utf8_str(filename) + b'#' + idtag + b'"' + tag = posfid_index_pattern.sub(replacement, tag, 1) + srcpieces[j] = tag + part = b"".join(srcpieces) + parts.append(part) + + # we are free to cut and paste as we see fit + # we can safely remove all of the Kindlegen generated aid tags + # change aid ids that are in k8proc.linked_aids to xhtml ids + find_tag_with_aid_pattern = re.compile(br'''(<[^>]*\said\s*=[^>]*>)''', re.IGNORECASE) + within_tag_aid_position_pattern = re.compile(br'''\said\s*=['"]([^'"]*)['"]''') + for i in range(len(parts)): + part = parts[i] + srcpieces = find_tag_with_aid_pattern.split(part) + for j in range(len(srcpieces)): + tag = srcpieces[j] + if tag.startswith(b'<'): + for m in within_tag_aid_position_pattern.finditer(tag): + try: + aid = m.group(1) + except IndexError: + aid = None + replacement = b'' + if aid in self.k8proc.linked_aids: + replacement = b' id="aid-' + aid + b'"' + tag = within_tag_aid_position_pattern.sub(replacement, tag, 1) + srcpieces[j] = tag + part = b"".join(srcpieces) + parts[i] = part + + # we can safely replace all of the Kindlegen generated data-AmznPageBreak tags + # with page-break-after style patterns + find_tag_with_AmznPageBreak_pattern = re.compile(br'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''', re.IGNORECASE) + within_tag_AmznPageBreak_position_pattern = re.compile(br'''\sdata-AmznPageBreak=['"]([^'"]*)['"]''') + for i in range(len(parts)): + part = parts[i] + srcpieces = find_tag_with_AmznPageBreak_pattern.split(part) + for j in range(len(srcpieces)): + tag = srcpieces[j] + if tag.startswith(b'<'): + srcpieces[j] = within_tag_AmznPageBreak_position_pattern.sub( + lambda m:b' style="page-break-after:' + m.group(1) + b'"', tag) + part = b"".join(srcpieces) + parts[i] = part + + # we have to handle substitutions for the flows pieces first as they may + # be inlined into the xhtml text + # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images) + # kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc) + # kindle:embed:XXXX (used for fonts) + + flows = [] + flows.append(None) + flowinfo = [] + flowinfo.append([None, None, None, None]) + + # regular expression search patterns + img_pattern = re.compile(br'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE) + img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE) + + tag_pattern = re.compile(br'''(<[^>]*>)''') + flow_pattern = re.compile(br'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE) + + url_pattern = re.compile(br'''(url\(.*?\))''', re.IGNORECASE) + url_img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)\?mime=image/[^\)]*["')]''', re.IGNORECASE) + font_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)["')]''', re.IGNORECASE) + url_css_index_pattern = 
re.compile(br'''kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*''', re.IGNORECASE) + url_svg_image_pattern = re.compile(br'''kindle:flow:([0-9|A-V]+)\?mime=image/svg\+xml[^\)]*''', re.IGNORECASE) + + for i in range(1, self.k8proc.getNumberOfFlows()): + [ftype, format, dir, filename] = self.k8proc.getFlowInfo(i) + flowpart = self.k8proc.getFlow(i) + + # links to raster image files from image tags + # image_pattern + srcpieces = img_pattern.split(flowpart) + for j in range(1, len(srcpieces),2): + tag = srcpieces[j] + if tag.startswith(b'<im'): + for m in img_index_pattern.finditer(tag): + imageNumber = fromBase32(m.group(1)) + imageName = self.rscnames[imageNumber-1] + if imageName is not None: + replacement = b'"../Images/' + utf8_str(imageName) + b'"' + self.used[imageName] = 'used' + tag = img_index_pattern.sub(replacement, tag, 1) + else: + print("Error: Referenced image %s was not recognized as a valid image in %s" % (imageNumber, tag)) + srcpieces[j] = tag + flowpart = b"".join(srcpieces) + + # replacements inside css url(): + srcpieces = url_pattern.split(flowpart) + for j in range(1, len(srcpieces),2): + tag = srcpieces[j] + + # process links to raster image files + for m in url_img_index_pattern.finditer(tag): + imageNumber = fromBase32(m.group(1)) + imageName = self.rscnames[imageNumber-1] + osep = m.group()[0:1] + csep = m.group()[-1:] + if imageName is not None: + replacement = osep + b'../Images/' + utf8_str(imageName) + csep + self.used[imageName] = 'used' + tag = url_img_index_pattern.sub(replacement, tag, 1) + else: + print("Error: Referenced image %s was not recognized as a valid image in %s" % (imageNumber, tag)) + + # process links to fonts + for m in font_index_pattern.finditer(tag): + fontNumber = fromBase32(m.group(1)) + fontName = self.rscnames[fontNumber-1] + osep = m.group()[0:1] + csep = m.group()[-1:] + if fontName is None: + print("Error: Referenced font %s was not recognized as a valid font in %s" % (fontNumber, tag)) + else: + replacement = osep + b'../Fonts/' + utf8_str(fontName) + csep + tag = font_index_pattern.sub(replacement, tag, 1) + self.used[fontName] = 'used' + + # process links to other css pieces + for m in url_css_index_pattern.finditer(tag): + num = fromBase32(m.group(1)) + [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num) + replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"' + tag = url_css_index_pattern.sub(replacement, tag, 1) + self.used[fnm] = 'used' + + # process links to svg images + for m in url_svg_image_pattern.finditer(tag): + num = fromBase32(m.group(1)) + [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num) + replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"' + tag = url_svg_image_pattern.sub(replacement, tag, 1) + self.used[fnm] = 'used' + + srcpieces[j] = tag + flowpart = b"".join(srcpieces) + + # store away in our own copy + flows.append(flowpart) + + # I do not think this case exists and even if it does exist, it needs to be done in a separate + # pass to prevent inlining a flow piece into another flow piece before the inserted one or the + # target one has been fully processed + + # but keep it around if it ends up we do need it + + # flow pattern not inside url() + # srcpieces = tag_pattern.split(flowpart) + # for j in range(1, len(srcpieces),2): + # tag = srcpieces[j] + # if tag.startswith(b'<'): + # for m in flow_pattern.finditer(tag): + # num = fromBase32(m.group(1)) + # [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num) + # flowtext = self.k8proc.getFlow(num) + # if fmt == b'inline': + # 
tag = flowtext + # else: + # replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"' + # tag = flow_pattern.sub(replacement, tag, 1) + # self.used[fnm] = 'used' + # srcpieces[j] = tag + # flowpart = b"".join(srcpieces) + + # now handle the main text xhtml parts + + # Handle the flow items in the XHTML text pieces + # kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc) + tag_pattern = re.compile(br'''(<[^>]*>)''') + flow_pattern = re.compile(br'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE) + for i in range(len(parts)): + part = parts[i] + [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i] + # flow pattern + srcpieces = tag_pattern.split(part) + for j in range(1, len(srcpieces),2): + tag = srcpieces[j] + if tag.startswith(b'<'): + for m in flow_pattern.finditer(tag): + num = fromBase32(m.group(1)) + if num > 0 and num < len(self.k8proc.flowinfo): + [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num) + flowpart = flows[num] + if fmt == b'inline': + tag = flowpart + else: + replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"' + tag = flow_pattern.sub(replacement, tag, 1) + self.used[fnm] = 'used' + else: + print("warning: ignoring non-existent flow link", tag, " value 0x%x" % num) + srcpieces[j] = tag + part = b''.join(srcpieces) + + # store away modified version + parts[i] = part + + # Handle any embedded raster images links in style= attributes urls + style_pattern = re.compile(br'''(<[a-zA-Z0-9]+\s[^>]*style\s*=\s*[^>]*>)''', re.IGNORECASE) + img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE) + + for i in range(len(parts)): + part = parts[i] + [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i] + + # replace urls in style attributes + srcpieces = style_pattern.split(part) + for j in range(1, len(srcpieces),2): + tag = srcpieces[j] + if b'kindle:embed' in tag: + for m in img_index_pattern.finditer(tag): + imageNumber = fromBase32(m.group(1)) + imageName = self.rscnames[imageNumber-1] + osep = m.group()[0:1] + csep = m.group()[-1:] + if imageName is not None: + replacement = osep + b'../Images/'+ utf8_str(imageName) + csep + self.used[imageName] = 'used' + tag = img_index_pattern.sub(replacement, tag, 1) + else: + print("Error: Referenced image %s in style url was not recognized in %s" % (imageNumber, tag)) + srcpieces[j] = tag + part = b"".join(srcpieces) + + # store away modified version + parts[i] = part + + # Handle any embedded raster images links in the xhtml text + # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images) + img_pattern = re.compile(br'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE) + img_index_pattern = re.compile(br'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''') + + for i in range(len(parts)): + part = parts[i] + [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i] + + # links to raster image files + # image_pattern + srcpieces = img_pattern.split(part) + for j in range(1, len(srcpieces),2): + tag = srcpieces[j] + if tag.startswith(b'<im'): + for m in img_index_pattern.finditer(tag): + imageNumber = fromBase32(m.group(1)) + imageName = self.rscnames[imageNumber-1] + if imageName is not None: + replacement = b'"../Images/' + utf8_str(imageName) + b'"' + self.used[imageName] = 'used' + tag = img_index_pattern.sub(replacement, tag, 1) + else: + print("Error: Referenced image %s was not recognized as a valid image in %s" % (imageNumber, tag)) + srcpieces[j] = tag + part = b"".join(srcpieces) + 
# store away modified version
+            parts[i] = part
+
+        # finally perform any general cleanups needed to make valid XHTML
+        # these include:
+        #   in svg tags replace "preserveaspectratio" attributes with "preserveAspectRatio"
+        #   in svg tags replace "viewbox" attributes with "viewBox"
+        #   in <li> remove value="XX" attributes since these are illegal
+        tag_pattern = re.compile(br'''(<[^>]*>)''')
+        li_value_pattern = re.compile(br'''\svalue\s*=\s*['"][^'"]*['"]''', re.IGNORECASE)
+
+        for i in range(len(parts)):
+            part = parts[i]
+            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
+
+            # tag pattern
+            srcpieces = tag_pattern.split(part)
+            for j in range(1, len(srcpieces),2):
+                tag = srcpieces[j]
+                if tag.startswith(b'<svg') or tag.startswith(b'<SVG'):
+                    tag = tag.replace(b'preserveaspectratio',b'preserveAspectRatio')
+                    tag = tag.replace(b'viewbox',b'viewBox')
+                elif tag.startswith(b'<li ') or tag.startswith(b'<LI '):
+                    tagpieces = li_value_pattern.split(tag)
+                    tag = b"".join(tagpieces)
+                srcpieces[j] = tag
+            part = b"".join(srcpieces)
+            # store away modified version
+            parts[i] = part
+
+        self.k8proc.setFlows(flows)
+        self.k8proc.setParts(parts)
+
+        return self.used
diff --git a/src/epy_reader/tools/KindleUnpack/mobi_index.py b/src/epy_reader/tools/KindleUnpack/mobi_index.py
new file mode 100644
index 0000000..397aaf8
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/mobi_index.py
@@ -0,0 +1,276 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+from .compatibility_utils import PY2, bchr, bstr, bord
+if PY2:
+    range = xrange
+
+import struct
+# note: struct pack, unpack, unpack_from all require bytestring format
+# data all the way up to at least python 2.7.5, python 3 okay with bytestring
+
+from .mobi_utils import toHex
+
+class MobiIndex:
+
+    def __init__(self, sect, DEBUG=False):
+        self.sect = sect
+        self.DEBUG = DEBUG
+
+    def getIndexData(self, idx, label="Unknown"):
+        sect = self.sect
+        outtbl = []
+        ctoc_text = {}
+        if idx != 0xffffffff:
+            sect.setsectiondescription(idx,"{0} Main INDX section".format(label))
+            data = sect.loadSection(idx)
+            idxhdr, hordt1, hordt2 = self.parseINDXHeader(data)
+            IndexCount = idxhdr['count']
+            # handle the case of multiple sections used for CTOC
+            rec_off = 0
+            off = idx + IndexCount + 1
+            for j in range(idxhdr['nctoc']):
+                cdata = sect.loadSection(off + j)
+                sect.setsectiondescription(off+j, label + ' CTOC Data ' + str(j))
+                ctocdict = self.readCTOC(cdata)
+                for k in ctocdict:
+                    ctoc_text[k + rec_off] = ctocdict[k]
+                rec_off += 0x10000
+            tagSectionStart = idxhdr['len']
+            controlByteCount, tagTable = readTagSection(tagSectionStart, data)
+            if self.DEBUG:
+                print("ControlByteCount is", controlByteCount)
+                print("IndexCount is", IndexCount)
+                print("TagTable: %s" % tagTable)
+            for i in range(idx + 1, idx + 1 + IndexCount):
+                sect.setsectiondescription(i,"{0} Extra {1:d} INDX section".format(label,i-idx))
+                data = sect.loadSection(i)
+                hdrinfo, ordt1, ordt2 = self.parseINDXHeader(data)
+                idxtPos = hdrinfo['start']
+                entryCount = hdrinfo['count']
+                if self.DEBUG:
+                    print(idxtPos, entryCount)
+                # loop through to build up the IDXT position starts
+                idxPositions = []
+                for j in range(entryCount):
+                    pos, = struct.unpack_from(b'>H', data, idxtPos + 4 + (2 * j))
+                    idxPositions.append(pos)
+                # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
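+                # Appending the IDXT offset itself as a sentinel gives each
+                # entry j an end bound at idxPositions[j+1]; any zero fill
+                # between the last entry and the IDXT tag is tolerated by
+                # getTagMap, which only warns about non-zero leftovers.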
+ idxPositions.append(idxtPos) + # for each entry in the IDXT build up the tagMap and any associated text + for j in range(entryCount): + startPos = idxPositions[j] + endPos = idxPositions[j+1] + textLength = ord(data[startPos:startPos+1]) + text = data[startPos+1:startPos+1+textLength] + if hordt2 is not None: + text = b''.join(bchr(hordt2[bord(x)]) for x in text) + tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos) + outtbl.append([text, tagMap]) + if self.DEBUG: + print(tagMap) + print(text) + return outtbl, ctoc_text + + def parseINDXHeader(self, data): + "read INDX header" + if not data[:4] == b'INDX': + print("Warning: index section is not INDX") + return False + words = ( + 'len', 'nul1', 'type', 'gen', 'start', 'count', 'code', + 'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc' + ) + num = len(words) + values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)]) + header = {} + for n in range(num): + header[words[n]] = values[n] + + ordt1 = None + ordt2 = None + + ocnt, oentries, op1, op2, otagx = struct.unpack_from(b'>LLLLL',data, 0xa4) + if header['code'] == 0xfdea or ocnt != 0 or oentries > 0: + # horribly hacked up ESP (sample) mobi books use two ORDT sections but never specify + # them in the proper place in the header. They seem to be codepage 65002 which seems + # to be some sort of strange EBCDIC utf-8 or 16 encoded strings + + # so we need to look for them and store them away to process leading text + # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries + # we only ever seem to use the seocnd but ... + assert(ocnt == 1) + assert(data[op1:op1+4] == b'ORDT') + assert(data[op2:op2+4] == b'ORDT') + ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4) + ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4) + + if self.DEBUG: + print("parsed INDX header:") + for n in words: + print(n, "%X" % header[n],) + print("") + return header, ordt1, ordt2 + + def readCTOC(self, txtdata): + # read all blocks from CTOC + ctoc_data = {} + offset = 0 + while offset<len(txtdata): + if PY2: + if txtdata[offset] == b'\0': + break + else: + if txtdata[offset] == 0: + break + idx_offs = offset + # first n bytes: name len as vwi + pos, ilen = getVariableWidthValue(txtdata, offset) + offset += pos + # <len> next bytes: name + name = txtdata[offset:offset+ilen] + offset += ilen + if self.DEBUG: + print("name length is ", ilen) + print(idx_offs, name) + ctoc_data[idx_offs] = name + return ctoc_data + + +def getVariableWidthValue(data, offset): + ''' + Decode variable width value from given bytes. + + @param data: The bytes to decode. + @param offset: The start offset into data. + @return: Tuple of consumed bytes count and decoded value. + ''' + value = 0 + consumed = 0 + finished = False + while not finished: + v = data[offset + consumed: offset + consumed + 1] + consumed += 1 + if ord(v) & 0x80: + finished = True + value = (value << 7) | (ord(v) & 0x7f) + return consumed, value + + +def readTagSection(start, data): + ''' + Read tag section from given data. + + @param start: The start position in the data. + @param data: The data to process. + @return: Tuple of control byte count and list of tag tuples. + ''' + controlByteCount = 0 + tags = [] + if data[start:start+4] == b"TAGX": + firstEntryOffset, = struct.unpack_from(b'>L', data, start + 0x04) + controlByteCount, = struct.unpack_from(b'>L', data, start + 0x08) + + # Skip the first 12 bytes already read above. 
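+        # Each TAGX entry that follows is 4 bytes: tag id, values per entry,
+        # bit mask, and an end-of-control-bytes flag.  For example (entry
+        # bytes illustrative), b'\x02\x01\x02\x00' would declare tag 2 with
+        # one value per index entry behind mask 0x02.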
+ for i in range(12, firstEntryOffset, 4): + pos = start + i + tags.append((ord(data[pos:pos+1]), ord(data[pos+1:pos+2]), ord(data[pos+2:pos+3]), ord(data[pos+3:pos+4]))) + return controlByteCount, tags + + +def countSetBits(value, bits=8): + ''' + Count the set bits in the given value. + + @param value: Integer value. + @param bits: The number of bits of the input value (defaults to 8). + @return: Number of set bits. + ''' + count = 0 + for _ in range(bits): + if value & 0x01 == 0x01: + count += 1 + value = value >> 1 + return count + + +def getTagMap(controlByteCount, tagTable, entryData, startPos, endPos): + ''' + Create a map of tags and values from the given byte section. + + @param controlByteCount: The number of control bytes. + @param tagTable: The tag table. + @param entryData: The data to process. + @param startPos: The starting position in entryData. + @param endPos: The end position in entryData or None if it is unknown. + @return: Hashmap of tag and list of values. + ''' + tags = [] + tagHashMap = {} + controlByteIndex = 0 + dataStart = startPos + controlByteCount + + for tag, valuesPerEntry, mask, endFlag in tagTable: + if endFlag == 0x01: + controlByteIndex += 1 + continue + cbyte = ord(entryData[startPos + controlByteIndex:startPos + controlByteIndex+1]) + if 0: + print("Control Byte Index %0x , Control Byte Value %0x" % (controlByteIndex, cbyte)) + + value = ord(entryData[startPos + controlByteIndex:startPos + controlByteIndex+1]) & mask + if value != 0: + if value == mask: + if countSetBits(mask) > 1: + # If all bits of masked value are set and the mask has more than one bit, a variable width value + # will follow after the control bytes which defines the length of bytes (NOT the value count!) + # which will contain the corresponding variable width values. + consumed, value = getVariableWidthValue(entryData, dataStart) + dataStart += consumed + tags.append((tag, None, value, valuesPerEntry)) + else: + tags.append((tag, 1, None, valuesPerEntry)) + else: + # Shift bits to get the masked value. + while mask & 0x01 == 0: + mask = mask >> 1 + value = value >> 1 + tags.append((tag, value, None, valuesPerEntry)) + for tag, valueCount, valueBytes, valuesPerEntry in tags: + values = [] + if valueCount is not None: + # Read valueCount * valuesPerEntry variable width values. + for _ in range(valueCount): + for _ in range(valuesPerEntry): + consumed, data = getVariableWidthValue(entryData, dataStart) + dataStart += consumed + values.append(data) + else: + # Convert valueBytes to variable width values. + totalConsumed = 0 + while totalConsumed < valueBytes: + # Does this work for valuesPerEntry != 1? + consumed, data = getVariableWidthValue(entryData, dataStart) + dataStart += consumed + totalConsumed += consumed + values.append(data) + if totalConsumed != valueBytes: + print("Error: Should consume %s bytes, but consumed %s" % (valueBytes, totalConsumed)) + tagHashMap[tag] = values + # Test that all bytes have been processed if endPos is given. + if endPos is not None and dataStart != endPos: + # The last entry might have some zero padding bytes, so complain only if non zero bytes are left. 
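+        # (Offsets above were decoded with getVariableWidthValue: big-endian
+        # 7-bit groups with the high bit 0x80 marking the final byte, so e.g.
+        # getVariableWidthValue(b'\x0b\x85', 0) == (2, 0x585).)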
+            for char in entryData[dataStart:endPos]:
+                if bord(char) != 0:
+                    print("Warning: There are unprocessed index bytes left: %s" % toHex(entryData[dataStart:endPos]))
+                    if 0:
+                        print("controlByteCount: %s" % controlByteCount)
+                        print("tagTable: %s" % tagTable)
+                        print("data: %s" % toHex(entryData[startPos:endPos]))
+                        print("tagHashMap: %s" % tagHashMap)
+                    break
+
+    return tagHashMap
diff --git a/src/epy_reader/tools/KindleUnpack/mobi_k8proc.py b/src/epy_reader/tools/KindleUnpack/mobi_k8proc.py
new file mode 100644
index 0000000..5b8274e
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/mobi_k8proc.py
@@ -0,0 +1,496 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+from .compatibility_utils import PY2, bstr, utf8_str
+
+if PY2:
+    range = xrange
+
+import os
+
+import struct
+# note: struct pack, unpack, unpack_from all require bytestring format
+# data all the way up to at least python 2.7.5, python 3 okay with bytestring
+
+import re
+# note: re requires the pattern to be the exact same type as the data to be searched in python3
+# but u"" is not allowed for the pattern itself only b""
+
+from .mobi_index import MobiIndex
+from .mobi_utils import fromBase32
+from .unipath import pathof
+
+_guide_types = [b'cover',b'title-page',b'toc',b'index',b'glossary',b'acknowledgements',
+                b'bibliography',b'colophon',b'copyright-page',b'dedication',
+                b'epigraph',b'foreward',b'loi',b'lot',b'notes',b'preface',b'text']
+
+# locate beginning and ending positions of tag with specific aid attribute
+def locate_beg_end_of_tag(ml, aid):
+    pattern = utf8_str(r'''<[^>]*\said\s*=\s*['"]%s['"][^>]*>''' % aid)
+    aid_pattern = re.compile(pattern,re.IGNORECASE)
+    for m in re.finditer(aid_pattern, ml):
+        plt = m.start()
+        pgt = ml.find(b'>',plt+1)
+        return plt, pgt
+    return 0, 0
+
+
+# iterate over all tags in block in reverse order, i.e. from the last tag to the first tag
+def reverse_tag_iter(block):
+    end = len(block)
+    while True:
+        pgt = block.rfind(b'>', 0, end)
+        if pgt == -1:
+            break
+        plt = block.rfind(b'<', 0, pgt)
+        if plt == -1:
+            break
+        yield block[plt:pgt+1]
+        end = plt
+
+
+class K8Processor:
+
+    def __init__(self, mh, sect, files, debug=False):
+        self.sect = sect
+        self.files = files
+        self.mi = MobiIndex(sect)
+        self.mh = mh
+        self.skelidx = mh.skelidx
+        self.fragidx = mh.fragidx
+        self.guideidx = mh.guideidx
+        self.fdst = mh.fdst
+        self.flowmap = {}
+        self.flows = None
+        self.flowinfo = []
+        self.parts = None
+        self.partinfo = []
+        self.linked_aids = set()
+        self.fdsttbl = [0, 0xffffffff]
+        self.DEBUG = debug
+
+        # read in and parse the FDST info which is very similar in format to the Palm DB section
+        # parsing except it provides offsets into rawML file and not the Palm DB file
+        # this is needed to split up the final css, svg, etc flow section
+        # that can exist at the end of the rawML file
+        if self.fdst != 0xffffffff:
+            header = self.sect.loadSection(self.fdst)
+            if header[0:4] == b"FDST":
+                num_sections, = struct.unpack_from(b'>L', header, 0x08)
+                self.fdsttbl = struct.unpack_from(bstr('>%dL' % (num_sections*2)), header, 12)[::2] + (mh.rawSize, )
+                sect.setsectiondescription(self.fdst,"KF8 FDST INDX")
+                if self.DEBUG:
+                    print("\nFDST Section Map:  %d sections" % num_sections)
+                    for j in range(num_sections):
+                        print("Section %d: 0x%08X - 0x%08X" % (j, self.fdsttbl[j],self.fdsttbl[j+1]))
+            else:
+                print("\nError: K8 Mobi with Missing FDST info")
+
+        # read/process skeleton index info to create the skeleton table
+        skeltbl = []
+        if self.skelidx != 0xffffffff:
+            # for i in range(2):
+            #     fname = 'skel%04d.dat' % i
+            #     data = self.sect.loadSection(self.skelidx + i)
+            #     with open(pathof(fname), 'wb') as f:
+            #         f.write(data)
+            outtbl, ctoc_text = self.mi.getIndexData(self.skelidx, "KF8 Skeleton")
+            fileptr = 0
+            for [text, tagMap] in outtbl:
+                # file number, skeleton name, fragtbl record count, start position, length
+                skeltbl.append([fileptr, text, tagMap[1][0], tagMap[6][0], tagMap[6][1]])
+                fileptr += 1
+        self.skeltbl = skeltbl
+        if self.DEBUG:
+            print("\nSkel Table:  %d entries" % len(self.skeltbl))
+            print("table: filenum, skeleton name, frag tbl record count, start position, length")
+            for j in range(len(self.skeltbl)):
+                print(self.skeltbl[j])
+
+        # read/process the fragment index to create the fragment table
+        fragtbl = []
+        if self.fragidx != 0xffffffff:
+            # for i in range(3):
+            #     fname = 'frag%04d.dat' % i
+            #     data = self.sect.loadSection(self.fragidx + i)
+            #     with open(pathof(fname), 'wb') as f:
+            #         f.write(data)
+            outtbl, ctoc_text = self.mi.getIndexData(self.fragidx, "KF8 Fragment")
+            for [text, tagMap] in outtbl:
+                # insert position, ctoc offset (aidtext), file number, sequence number, start position, length
+                ctocoffset = tagMap[2][0]
+                ctocdata = ctoc_text[ctocoffset]
+                fragtbl.append([int(text), ctocdata, tagMap[3][0], tagMap[4][0], tagMap[6][0], tagMap[6][1]])
+        self.fragtbl = fragtbl
+        if self.DEBUG:
+            print("\nFragment Table: %d entries" % len(self.fragtbl))
+            print("table: file position, link id text, file num, sequence number, start position, length")
+            for j in range(len(self.fragtbl)):
+                print(self.fragtbl[j])
+
+        # read / process guide index for guide elements of opf
+        guidetbl = []
+        if self.guideidx != 0xffffffff:
+            # for i in range(3):
+            #     fname = 'guide%04d.dat' % i
+            #     data = self.sect.loadSection(self.guideidx + i)
+            #     with open(pathof(fname), 'wb') as f:
+            #         f.write(data)
+            outtbl, ctoc_text = 
self.mi.getIndexData(self.guideidx, "KF8 Guide elements)") + for [text, tagMap] in outtbl: + # ref_type, ref_title, frag number + ctocoffset = tagMap[1][0] + ref_title = ctoc_text[ctocoffset] + ref_type = text + fileno = None + if 3 in tagMap: + fileno = tagMap[3][0] + if 6 in tagMap: + fileno = tagMap[6][0] + guidetbl.append([ref_type, ref_title, fileno]) + self.guidetbl = guidetbl + if self.DEBUG: + print("\nGuide Table: %d entries" % len(self.guidetbl)) + print("table: ref_type, ref_title, fragtbl entry number") + for j in range(len(self.guidetbl)): + print(self.guidetbl[j]) + + def buildParts(self, rawML): + # now split the rawML into its flow pieces + self.flows = [] + for j in range(0, len(self.fdsttbl)-1): + start = self.fdsttbl[j] + end = self.fdsttbl[j+1] + self.flows.append(rawML[start:end]) + + # the first piece represents the xhtml text + text = self.flows[0] + self.flows[0] = b'' + + # walk the <skeleton> and fragment tables to build original source xhtml files + # *without* destroying any file position information needed for later href processing + # and create final list of file separation start: stop points and etc in partinfo + if self.DEBUG: + print("\nRebuilding flow piece 0: the main body of the ebook") + self.parts = [] + self.partinfo = [] + fragptr = 0 + baseptr = 0 + cnt = 0 + filename = 'part%04d.xhtml' % cnt + for [skelnum, skelname, fragcnt, skelpos, skellen] in self.skeltbl: + baseptr = skelpos + skellen + skeleton = text[skelpos: baseptr] + aidtext = "0" + for i in range(fragcnt): + [insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[fragptr] + aidtext = idtext[12:-2] + if i == 0: + filename = 'part%04d.xhtml' % filenum + slice = text[baseptr: baseptr + length] + insertpos = insertpos - skelpos + head = skeleton[:insertpos] + tail = skeleton[insertpos:] + actual_inspos = insertpos + if (tail.find(b'>') < tail.find(b'<') or head.rfind(b'>') < head.rfind(b'<')): + # There is an incomplete tag in either the head or tail. + # This can happen for some badly formed KF8 files + print('The fragment table for %s has incorrect insert position. Calculating manually.' % skelname) + bp, ep = locate_beg_end_of_tag(skeleton, aidtext) + if bp != ep: + actual_inspos = ep + 1 + startpos + if insertpos != actual_inspos: + print("fixed corrupt fragment table insert position", insertpos+skelpos, actual_inspos+skelpos) + insertpos = actual_inspos + self.fragtbl[fragptr][0] = actual_inspos + skelpos + skeleton = skeleton[0:insertpos] + slice + skeleton[insertpos:] + baseptr = baseptr + length + fragptr += 1 + cnt += 1 + self.parts.append(skeleton) + self.partinfo.append([skelnum, 'Text', filename, skelpos, baseptr, aidtext]) + + assembled_text = b''.join(self.parts) + if self.DEBUG: + outassembled = os.path.join(self.files.k8dir, 'assembled_text.dat') + with open(pathof(outassembled),'wb') as f: + f.write(assembled_text) + + # The primary css style sheet is typically stored next followed by any + # snippets of code that were previously inlined in the + # original xhtml but have been stripped out and placed here. + # This can include local CDATA snippets and and svg sections. 
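+        # For example, a css flow piece that contains "[CDATA[" is wrapped in
+        # a <style type="text/css"> block and inlined, while an svg flow
+        # piece that itself pulls in a raster <image .../> is spliced
+        # straight into the xhtml (see the checks below).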
+ + # The problem is that for most browsers and ereaders, you can not + # use <img src="imageXXXX.svg" /> to import any svg image that itself + # properly uses an <image/> tag to import some raster image - it + # should work according to the spec but does not for almost all browsers + # and ereaders and causes epub validation issues because those raster + # images are in manifest but not in xhtml text - since they only + # referenced from an svg image + + # So we need to check the remaining flow pieces to see if they are css + # or svg images. if svg images, we must check if they have an <image /> + # and if so inline them into the xhtml text pieces. + + # there may be other sorts of pieces stored here but until we see one + # in the wild to reverse engineer we won't be able to tell + self.flowinfo.append([None, None, None, None]) + svg_tag_pattern = re.compile(br'''(<svg[^>]*>)''', re.IGNORECASE) + image_tag_pattern = re.compile(br'''(<image[^>]*>)''', re.IGNORECASE) + for j in range(1,len(self.flows)): + flowpart = self.flows[j] + nstr = '%04d' % j + m = re.search(svg_tag_pattern, flowpart) + if m is not None: + # svg + ptype = b'svg' + start = m.start() + m2 = re.search(image_tag_pattern, flowpart) + if m2 is not None: + pformat = b'inline' + pdir = None + fname = None + # strip off anything before <svg if inlining + flowpart = flowpart[start:] + else: + pformat = b'file' + pdir = "Images" + fname = 'svgimg' + nstr + '.svg' + else: + # search for CDATA and if exists inline it + if flowpart.find(b'[CDATA[') >= 0: + ptype = b'css' + flowpart = b'<style type="text/css">\n' + flowpart + b'\n</style>\n' + pformat = b'inline' + pdir = None + fname = None + else: + # css - assume as standalone css file + ptype = b'css' + pformat = b'file' + pdir = "Styles" + fname = 'style' + nstr + '.css' + + self.flows[j] = flowpart + self.flowinfo.append([ptype, pformat, pdir, fname]) + + if self.DEBUG: + print("\nFlow Map: %d entries" % len(self.flowinfo)) + for fi in self.flowinfo: + print(fi) + print("\n") + + print("\nXHTML File Part Position Information: %d entries" % len(self.partinfo)) + for pi in self.partinfo: + print(pi) + + if False: # self.Debug: + # dump all of the locations of the aid tags used in TEXT + # find id links only inside of tags + # inside any < > pair find all "aid=' and return whatever is inside the quotes + # [^>]* means match any amount of chars except for '>' char + # [^'"] match any amount of chars except for the quote character + # \s* means match any amount of whitespace + print("\npositions of all aid= pieces") + id_pattern = re.compile(br'''<[^>]*\said\s*=\s*['"]([^'"]*)['"][^>]*>''',re.IGNORECASE) + for m in re.finditer(id_pattern, rawML): + [filename, partnum, start, end] = self.getFileInfo(m.start()) + [seqnum, idtext] = self.getFragTblInfo(m.start()) + value = fromBase32(m.group(1)) + print(" aid: %s value: %d at: %d -> part: %d, start: %d, end: %d" % (m.group(1), value, m.start(), partnum, start, end)) + print(" %s fragtbl entry %d" % (idtext, seqnum)) + + return + + # get information fragment table entry by pos + def getFragTblInfo(self, pos): + for j in range(len(self.fragtbl)): + [insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[j] + if pos >= insertpos and pos < (insertpos + length): + # why are these "in: and before: added here + return seqnum, b'in: ' + idtext + if pos < insertpos: + return seqnum, b'before: ' + idtext + return None, None + + # get information about the part (file) that exists at pos in original rawML + def getFileInfo(self, 
pos): + for [partnum, pdir, filename, start, end, aidtext] in self.partinfo: + if pos >= start and pos < end: + return filename, partnum, start, end + return None, None, None, None + + # accessor functions to properly protect the internal structure + def getNumberOfParts(self): + return len(self.parts) + + def getPart(self,i): + if i >= 0 and i < len(self.parts): + return self.parts[i] + return None + + def getPartInfo(self, i): + if i >= 0 and i < len(self.partinfo): + return self.partinfo[i] + return None + + def getNumberOfFlows(self): + return len(self.flows) + + def getFlow(self,i): + # note flows[0] is empty - it was all of the original text + if i > 0 and i < len(self.flows): + return self.flows[i] + return None + + def getFlowInfo(self,i): + # note flowinfo[0] is empty - it was all of the original text + if i > 0 and i < len(self.flowinfo): + return self.flowinfo[i] + return None + + def getIDTagByPosFid(self, posfid, offset): + # first convert kindle:pos:fid and offset info to position in file + # (fromBase32 can handle both string types on input) + row = fromBase32(posfid) + off = fromBase32(offset) + [insertpos, idtext, filenum, seqnm, startpos, length] = self.fragtbl[row] + pos = insertpos + off + fname, pn, skelpos, skelend = self.getFileInfo(pos) + if fname is None: + # pos does not exist + # default to skeleton pos instead + print("Link To Position", pos, "does not exist, retargeting to top of target") + pos = self.skeltbl[filenum][3] + fname, pn, skelpos, skelend = self.getFileInfo(pos) + # an existing "id=" or "name=" attribute must exist in original xhtml otherwise it would not have worked for linking. + # Amazon seems to have added its own additional "aid=" inside tags whose contents seem to represent + # some position information encoded into Base32 name. 
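+        # (e.g. a target kindle:pos:fid:0004:off:0000000010, both values
+        # illustrative, selects fragtbl row fromBase32('0004') == 4 and adds
+        # the byte offset fromBase32('0000000010') == 32 to that row's
+        # insert position)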
+ # so find the closest "id=" before position the file by actually searching in that file + idtext = self.getIDTag(pos) + return fname, idtext + + def getIDTag(self, pos): + # find the first tag with a named anchor (name or id attribute) before pos + fname, pn, skelpos, skelend = self.getFileInfo(pos) + if pn is None and skelpos is None: + print("Error: getIDTag - no file contains ", pos) + textblock = self.parts[pn] + npos = pos - skelpos + # if npos inside a tag then search all text before the its end of tag marker + pgt = textblock.find(b'>',npos) + plt = textblock.find(b'<',npos) + if plt == npos or pgt < plt: + npos = pgt + 1 + # find id and name attributes only inside of tags + # use a reverse tag search since that is faster + # inside any < > pair find "id=" and "name=" attributes return it + # [^>]* means match any amount of chars except for '>' char + # [^'"] match any amount of chars except for the quote character + # \s* means match any amount of whitespace + textblock = textblock[0:npos] + id_pattern = re.compile(br'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE) + name_pattern = re.compile(br'''<[^>]*\sname\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE) + aid_pattern = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''') + for tag in reverse_tag_iter(textblock): + # any ids in the body should default to top of file + if tag[0:6] == b'<body ': + return b'' + if tag[0:6] != b'<meta ': + m = id_pattern.match(tag) or name_pattern.match(tag) + if m is not None: + return m.group(1) + m = aid_pattern.match(tag) + if m is not None: + self.linked_aids.add(m.group(1)) + return b'aid-' + m.group(1) + return b'' + + # do we need to do deep copying + def setParts(self, parts): + assert(len(parts) == len(self.parts)) + for i in range(len(parts)): + self.parts[i] = parts[i] + + # do we need to do deep copying + def setFlows(self, flows): + assert(len(flows) == len(self.flows)) + for i in range(len(flows)): + self.flows[i] = flows[i] + + # get information about the part (file) that exists at pos in original rawML + def getSkelInfo(self, pos): + for [partnum, pdir, filename, start, end, aidtext] in self.partinfo: + if pos >= start and pos < end: + return [partnum, pdir, filename, start, end, aidtext] + return [None, None, None, None, None, None] + + # fileno is actually a reference into fragtbl (a fragment) + def getGuideText(self): + guidetext = b'' + for [ref_type, ref_title, fileno] in self.guidetbl: + if ref_type == b'thumbimagestandard': + continue + if ref_type not in _guide_types and not ref_type.startswith(b'other.'): + if ref_type == b'start': + ref_type = b'text' + else: + ref_type = b'other.' + ref_type + [pos, idtext, filenum, seqnm, startpos, length] = self.fragtbl[fileno] + [pn, pdir, filename, skelpos, skelend, aidtext] = self.getSkelInfo(pos) + idtext = self.getIDTag(pos) + linktgt = filename.encode('utf-8') + if idtext != b'': + linktgt += b'#' + idtext + guidetext += b'<reference type="'+ref_type+b'" title="'+ref_title+b'" href="'+utf8_str(pdir)+b'/'+linktgt+b'" />\n' + # opf is encoded utf-8 so must convert any titles properly + guidetext = (guidetext.decode(self.mh.codec)).encode("utf-8") + return guidetext + + def getPageIDTag(self, pos): + # find the first tag with a named anchor (name or id attribute) before pos + # but page map offsets need to little more leeway so if the offset points + # into a tag look for the next ending tag "/>" or "</" and start your search from there. 
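+        # (e.g. if pos lands inside "<p id='x'>", npos is pushed forward to
+        # the next "/>" or "</" so the enclosing tag, and with it the id,
+        # stays inside the block scanned in reverse below)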
+ fname, pn, skelpos, skelend = self.getFileInfo(pos) + if pn is None and skelpos is None: + print("Error: getIDTag - no file contains ", pos) + textblock = self.parts[pn] + npos = pos - skelpos + # if npos inside a tag then search all text before next ending tag + pgt = textblock.find(b'>',npos) + plt = textblock.find(b'<',npos) + if plt == npos or pgt < plt: + # we are in a tag + # so find first ending tag + pend1 = textblock.find(b'/>', npos) + pend2 = textblock.find(b'</', npos) + if pend1 != -1 and pend2 != -1: + pend = min(pend1, pend2) + else: + pend = max(pend1, pend2) + if pend != -1: + npos = pend + else: + npos = pgt + 1 + # find id and name attributes only inside of tags + # use a reverse tag search since that is faster + # inside any < > pair find "id=" and "name=" attributes return it + # [^>]* means match any amount of chars except for '>' char + # [^'"] match any amount of chars except for the quote character + # \s* means match any amount of whitespace + textblock = textblock[0:npos] + id_pattern = re.compile(br'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE) + name_pattern = re.compile(br'''<[^>]*\sname\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE) + for tag in reverse_tag_iter(textblock): + # any ids in the body should default to top of file + if tag[0:6] == b'<body ': + return b'' + if tag[0:6] != b'<meta ': + m = id_pattern.match(tag) or name_pattern.match(tag) + if m is not None: + return m.group(1) + return b'' diff --git a/src/epy_reader/tools/KindleUnpack/mobi_k8resc.py b/src/epy_reader/tools/KindleUnpack/mobi_k8resc.py new file mode 100644 index 0000000..1e58e84 --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_k8resc.py @@ -0,0 +1,271 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +DEBUG_USE_ORDERED_DICTIONARY = False # OrderedDict is supoorted >= python 2.7. +""" set to True to use OrderedDict for K8RESCProcessor.parsetag.tattr.""" + +if DEBUG_USE_ORDERED_DICTIONARY: + from collections import OrderedDict as dict_ +else: + dict_ = dict + +from .compatibility_utils import unicode_str + +from .mobi_utils import fromBase32 + +_OPF_PARENT_TAGS = ['xml', 'package', 'metadata', 'dc-metadata', + 'x-metadata', 'manifest', 'spine', 'tours', 'guide'] + +class K8RESCProcessor(object): + + def __init__(self, data, debug=False): + self._debug = debug + self.resc = None + self.opos = 0 + self.extrameta = [] + self.cover_name = None + self.spine_idrefs = {} + self.spine_order = [] + self.spine_pageattributes = {} + self.spine_ppd = None + # need3 indicate the book has fields which require epub3. + # but the estimation of the source epub version from the fields is difficult. + self.need3 = False + self.package_ver = None + self.extra_metadata = [] + self.refines_metadata = [] + self.extra_attributes = [] + # get header + start_pos = data.find(b'<') + self.resc_header = data[:start_pos] + # get resc data length + start = self.resc_header.find(b'=') + 1 + end = self.resc_header.find(b'&', start) + resc_size = 0 + if end > 0: + resc_size = fromBase32(self.resc_header[start:end]) + resc_rawbytes = len(data) - start_pos + if resc_rawbytes == resc_size: + self.resc_length = resc_size + else: + # Most RESC has a nul string at its tail but some do not. 
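+            # The scan below trusts the NUL terminator over the declared
+            # size; if no NUL is found the raw byte count is used, and any
+            # mismatch with the declared size is reported as a warning.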
+            end_pos = data.find(b'\x00', start_pos)
+            if end_pos < 0:
+                self.resc_length = resc_rawbytes
+            else:
+                self.resc_length = end_pos - start_pos
+            if self.resc_length != resc_size:
+                print("Warning: RESC section length ({:d} bytes) does not match its size ({:d} bytes).".format(self.resc_length, resc_size))
+        # now parse RESC after converting it to unicode from utf-8
+        try:
+            self.resc = unicode_str(data[start_pos:start_pos+self.resc_length])
+        except UnicodeDecodeError:
+            self.resc = unicode_str(data[start_pos:start_pos+self.resc_length], enc='latin-1')
+        self.parseData()
+
+    def prepend_to_spine(self, key, idref, linear, properties):
+        self.spine_order = [key] + self.spine_order
+        self.spine_idrefs[key] = idref
+        attributes = {}
+        if linear is not None:
+            attributes['linear'] = linear
+        if properties is not None:
+            attributes['properties'] = properties
+        self.spine_pageattributes[key] = attributes
+
+    # RESC tag iterator
+    def resc_tag_iter(self):
+        tcontent = last_tattr = None
+        prefix = ['']
+        while True:
+            text, tag = self.parseresc()
+            if text is None and tag is None:
+                break
+            if text is not None:
+                tcontent = text.rstrip(' \r\n')
+            else:  # we have a tag
+                ttype, tname, tattr = self.parsetag(tag)
+                if ttype == 'begin':
+                    tcontent = None
+                    prefix.append(tname + '.')
+                    if tname in _OPF_PARENT_TAGS:
+                        yield ''.join(prefix), tname, tattr, tcontent
+                    else:
+                        last_tattr = tattr
+                else:  # single or end
+                    if ttype == 'end':
+                        prefix.pop()
+                        tattr = last_tattr
+                        last_tattr = None
+                        if tname in _OPF_PARENT_TAGS:
+                            tname += '-end'
+                    yield ''.join(prefix), tname, tattr, tcontent
+                    tcontent = None
+
+    # now parse the RESC to extract spine and extra metadata info
+    def parseData(self):
+        for prefix, tname, tattr, tcontent in self.resc_tag_iter():
+            if self._debug:
+                print("    Parsing RESC: ", prefix, tname, tattr, tcontent)
+            if tname == 'package':
+                self.package_ver = tattr.get('version', '2.0')
+                package_prefix = tattr.get('prefix','')
+                if self.package_ver.startswith('3') or package_prefix.startswith('rendition'):
+                    self.need3 = True
+            if tname == 'spine':
+                self.spine_ppd = tattr.get('page-progression-direction', None)
+                if self.spine_ppd is not None and self.spine_ppd == 'rtl':
+                    self.need3 = True
+            if tname == 'itemref':
+                skelid = tattr.pop('skelid', None)
+                if skelid is None and len(self.spine_order) == 0:
+                    # assume it is the removed initial coverpage
+                    skelid = 'coverpage'
+                    tattr['linear'] = 'no'
+                self.spine_order.append(skelid)
+                idref = tattr.pop('idref', None)
+                if idref is not None:
+                    idref = 'x_' + idref
+                self.spine_idrefs[skelid] = idref
+                if 'id' in tattr:
+                    del tattr['id']
+                # tattr["id"] = 'x_' + tattr["id"]
+                if 'properties' in tattr:
+                    self.need3 = True
+                self.spine_pageattributes[skelid] = tattr
+            if tname == 'meta' or tname.startswith('dc:'):
+                if 'refines' in tattr or 'property' in tattr:
+                    self.need3 = True
+                if tattr.get('name','') == 'cover':
+                    cover_name = tattr.get('content',None)
+                    if cover_name is not None:
+                        cover_name = 'x_' + cover_name
+                        self.cover_name = cover_name
+                else:
+                    self.extrameta.append([tname, tattr, tcontent])
+
+    # parse and return either leading text or the next tag
+    def parseresc(self):
+        p = self.opos
+        if p >= len(self.resc):
+            return None, None
+        if self.resc[p] != '<':
+            res = self.resc.find('<',p)
+            if res == -1:
+                res = len(self.resc)
+            self.opos = res
+            return self.resc[p:res], None
+        # handle comment as a special case
+        if self.resc[p:p+4] == '<!--':
+            te = self.resc.find('-->',p+1)
+            if te != -1:
+                te = te+2
+        else:
+            te = 
self.resc.find('>',p+1) + ntb = self.resc.find('<',p+1) + if ntb != -1 and ntb < te: + self.opos = ntb + return self.resc[p:ntb], None + self.opos = te + 1 + return None, self.resc[p:te+1] + + # parses tag to identify: [tname, ttype, tattr] + # tname: tag name + # ttype: tag type ('begin', 'end' or 'single'); + # tattr: dictionary of tag atributes + def parsetag(self, s): + p = 1 + tname = None + ttype = None + tattr = dict_() + while s[p:p+1] == ' ' : + p += 1 + if s[p:p+1] == '/': + ttype = 'end' + p += 1 + while s[p:p+1] == ' ' : + p += 1 + b = p + while s[p:p+1] not in ('>', '/', ' ', '"', "'",'\r','\n') : + p += 1 + tname=s[b:p].lower() + # some special cases + if tname == '?xml': + tname = 'xml' + if tname == '!--': + ttype = 'single' + comment = s[p:-3].strip() + tattr['comment'] = comment + if ttype is None: + # parse any attributes of begin or single tags + while s.find('=',p) != -1 : + while s[p:p+1] == ' ' : + p += 1 + b = p + while s[p:p+1] != '=' : + p += 1 + aname = s[b:p].lower() + aname = aname.rstrip(' ') + p += 1 + while s[p:p+1] == ' ' : + p += 1 + if s[p:p+1] in ('"', "'") : + p = p + 1 + b = p + while s[p:p+1] not in ('"', "'"): + p += 1 + val = s[b:p] + p += 1 + else : + b = p + while s[p:p+1] not in ('>', '/', ' ') : + p += 1 + val = s[b:p] + tattr[aname] = val + if ttype is None: + ttype = 'begin' + if s.find('/',p) >= 0: + ttype = 'single' + return ttype, tname, tattr + + def taginfo_toxml(self, taginfo): + res = [] + tname, tattr, tcontent = taginfo + res.append('<' + tname) + if tattr is not None: + for key in tattr: + res.append(' ' + key + '="'+tattr[key]+'"') + if tcontent is not None: + res.append('>' + tcontent + '</' + tname + '>\n') + else: + res.append('/>\n') + return "".join(res) + + def hasSpine(self): + return len(self.spine_order) > 0 + + def needEPUB3(self): + return self.need3 + + def hasRefines(self): + for [tname, tattr, tcontent] in self.extrameta: + if 'refines' in tattr: + return True + return False + + def createMetadata(self, epubver): + for taginfo in self.extrameta: + tname, tattr, tcontent = taginfo + if 'refines' in tattr: + if epubver == 'F' and 'property' in tattr: + attr = ' id="%s" opf:%s="%s"\n' % (tattr['refines'], tattr['property'], tcontent) + self.extra_attributes.append(attr) + else: + tag = self.taginfo_toxml(taginfo) + self.refines_metadata.append(tag) + else: + tag = self.taginfo_toxml(taginfo) + self.extra_metadata.append(tag) diff --git a/src/epy_reader/tools/KindleUnpack/mobi_nav.py b/src/epy_reader/tools/KindleUnpack/mobi_nav.py new file mode 100644 index 0000000..16fb0be --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_nav.py @@ -0,0 +1,187 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import unicode_str +import os +from .unipath import pathof + +import re +# note: re requites the pattern to be the exact same type as the data to be searched in python3 +# but u"" is not allowed for the pattern itself only b"" + +DEBUG_NAV = False + +FORCE_DEFAULT_TITLE = False +""" Set to True to force to use the default title. """ + +NAVIGATION_FINENAME = 'nav.xhtml' +""" The name for the navigation document. """ + +DEFAULT_TITLE = 'Navigation' +""" The default title for the navigation document. 
""" + +class NAVProcessor(object): + + def __init__(self, files): + self.files = files + self.navname = NAVIGATION_FINENAME + + def buildLandmarks(self, guidetext): + header = '' + header += ' <nav epub:type="landmarks" id="landmarks" hidden="">\n' + header += ' <h2>Guide</h2>\n' + header += ' <ol>\n' + element = ' <li><a epub:type="{:s}" href="{:s}">{:s}</a></li>\n' + footer = '' + footer += ' </ol>\n' + footer += ' </nav>\n' + + type_map = { + 'cover' : 'cover', + 'title-page' : 'title-page', + # ?: 'frontmatter', + 'text' : 'bodymatter', + # ?: 'backmatter', + 'toc' : 'toc', + 'loi' : 'loi', + 'lot' : 'lot', + 'preface' : 'preface', + 'bibliography' : 'bibliography', + 'index' : 'index', + 'glossary' : 'glossary', + 'acknowledgements' : 'acknowledgements', + 'colophon' : None, + 'copyright-page' : None, + 'dedication' : None, + 'epigraph' : None, + 'foreword' : None, + 'notes' : None + } + + re_type = re.compile(r'\s+type\s*=\s*"(.*?)"', re.I) + re_title = re.compile(r'\s+title\s*=\s*"(.*?)"', re.I) + re_link = re.compile(r'\s+href\s*=\s*"(.*?)"', re.I) + dir_ = os.path.relpath(self.files.k8text, self.files.k8oebps).replace('\\', '/') + + data = '' + references = re.findall(r'<reference\s+.*?>', unicode_str(guidetext), re.I) + for reference in references: + mo_type = re_type.search(reference) + mo_title = re_title.search(reference) + mo_link = re_link.search(reference) + if mo_type is not None: + type_ = type_map.get(mo_type.group(1), None) + else: + type_ = None + if mo_title is not None: + title = mo_title.group(1) + else: + title = None + if mo_link is not None: + link = mo_link.group(1) + else: + link = None + + if type_ is not None and title is not None and link is not None: + link = os.path.relpath(link, dir_).replace('\\', '/') + data += element.format(type_, link, title) + if len(data) > 0: + return header + data + footer + else: + return '' + + def buildTOC(self, indx_data): + header = '' + header += ' <nav epub:type="toc" id="toc">\n' + header += ' <h1>Table of contents</h1>\n' + footer = ' </nav>\n' + + # recursive part + def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1): + if start>len(indx_data) or end>len(indx_data): + print("Warning (in buildTOC): missing INDX child entries", start, end, len(indx_data)) + return '' + if DEBUG_NAV: + print("recursINDX (in buildTOC) lvl %d from %d to %d" % (lvl, start, end)) + xhtml = '' + if start <= 0: + start = 0 + if end <= 0: + end = len(indx_data) + if lvl > max_lvl: + max_lvl = lvl + + indent1 = ' ' * (2 + lvl * 2) + indent2 = ' ' * (3 + lvl * 2) + xhtml += indent1 + '<ol>\n' + for i in range(start, end): + e = indx_data[i] + htmlfile = e['filename'] + desttag = e['idtag'] + text = e['text'] + if not e['hlvl'] == lvl: + continue + num += 1 + if desttag == '': + link = htmlfile + else: + link = '{:s}#{:s}'.format(htmlfile, desttag) + xhtml += indent2 + '<li>' + entry = '<a href="{:}">{:s}</a>'.format(link, text) + xhtml += entry + # recurs + if e['child1'] >= 0: + xhtml += '\n' + xhtmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1, + e['child1'], e['childn'] + 1) + xhtml += xhtmlrec + xhtml += indent2 + # close entry + xhtml += '</li>\n' + xhtml += indent1 + '</ol>\n' + return xhtml, max_lvl, num + + data, max_lvl, num = recursINDX() + if not len(indx_data) == num: + print("Warning (in buildTOC): different number of entries in NCX", len(indx_data), num) + return header + data + footer + + def buildNAV(self, ncx_data, guidetext, title, lang): + print("Building Navigation Document.") + if FORCE_DEFAULT_TITLE: + title = 
DEFAULT_TITLE + nav_header = '' + nav_header += '<?xml version="1.0" encoding="utf-8"?>\n<!DOCTYPE html>' + nav_header += '<html xmlns="http://www.w3.org/1999/xhtml"' + nav_header += ' xmlns:epub="http://www.idpf.org/2007/ops"' + nav_header += ' lang="{0:s}" xml:lang="{0:s}">\n'.format(lang) + nav_header += '<head>\n<title>{:s}</title>\n'.format(title) + nav_header += '<meta charset="UTF-8" />\n' + nav_header += '<style type="text/css">\n' + nav_header += 'nav#landmarks { display:none; }\n' + nav_header += 'ol { list-style-type: none; }' + nav_header += '</style>\n</head>\n<body>\n' + nav_footer = '</body>\n</html>\n' + + landmarks = self.buildLandmarks(guidetext) + toc = self.buildTOC(ncx_data) + + data = nav_header + data += landmarks + data += toc + data += nav_footer + return data + + def getNAVName(self): + return self.navname + + def writeNAV(self, ncx_data, guidetext, metadata): + # build the xhtml + # print("Write Navigation Document.") + xhtml = self.buildNAV(ncx_data, guidetext, metadata.get('Title')[0], metadata.get('Language')[0]) + fname = os.path.join(self.files.k8text, self.navname) + with open(pathof(fname), 'wb') as f: + f.write(xhtml.encode('utf-8')) diff --git a/src/epy_reader/tools/KindleUnpack/mobi_ncx.py b/src/epy_reader/tools/KindleUnpack/mobi_ncx.py new file mode 100644 index 0000000..60ef9a0 --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_ncx.py @@ -0,0 +1,275 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +import os +from .unipath import pathof +from .compatibility_utils import unescapeit + + +import re +# note: re requites the pattern to be the exact same type as the data to be searched in python3 +# but u"" is not allowed for the pattern itself only b"" + +from xml.sax.saxutils import escape as xmlescape + +from .mobi_utils import toBase32 +from .mobi_index import MobiIndex + +DEBUG_NCX = False + +class ncxExtract: + + def __init__(self, mh, files): + self.mh = mh + self.sect = self.mh.sect + self.files = files + self.isNCX = False + self.mi = MobiIndex(self.sect) + self.ncxidx = self.mh.ncxidx + self.indx_data = None + + def parseNCX(self): + indx_data = [] + tag_fieldname_map = { + 1: ['pos',0], + 2: ['len',0], + 3: ['noffs',0], + 4: ['hlvl',0], + 5: ['koffs',0], + 6: ['pos_fid',0], + 21: ['parent',0], + 22: ['child1',0], + 23: ['childn',0] + } + if self.ncxidx != 0xffffffff: + outtbl, ctoc_text = self.mi.getIndexData(self.ncxidx, "NCX") + if DEBUG_NCX: + print(ctoc_text) + print(outtbl) + num = 0 + for [text, tagMap] in outtbl: + tmp = { + 'name': text.decode('utf-8'), + 'pos': -1, + 'len': 0, + 'noffs': -1, + 'text' : "Unknown Text", + 'hlvl' : -1, + 'kind' : "Unknown Kind", + 'pos_fid' : None, + 'parent' : -1, + 'child1' : -1, + 'childn' : -1, + 'num' : num + } + for tag in tag_fieldname_map: + [fieldname, i] = tag_fieldname_map[tag] + if tag in tagMap: + fieldvalue = tagMap[tag][i] + if tag == 6: + pos_fid = toBase32(fieldvalue,4).decode('utf-8') + fieldvalue2 = tagMap[tag][i+1] + pos_off = toBase32(fieldvalue2,10).decode('utf-8') + fieldvalue = 'kindle:pos:fid:%s:off:%s' % (pos_fid, pos_off) + tmp[fieldname] = fieldvalue + if tag == 3: + toctext = ctoc_text.get(fieldvalue, 'Unknown Text') + toctext = toctext.decode(self.mh.codec) + tmp['text'] = toctext + if tag == 5: + kindtext = ctoc_text.get(fieldvalue, 'Unknown Kind') + kindtext = kindtext.decode(self.mh.codec) + tmp['kind'] = kindtext + 
indx_data.append(tmp) + if DEBUG_NCX: + print("record number: ", num) + print("name: ", tmp['name'],) + print("position", tmp['pos']," length: ", tmp['len']) + print("text: ", tmp['text']) + print("kind: ", tmp['kind']) + print("heading level: ", tmp['hlvl']) + print("parent:", tmp['parent']) + print("first child: ",tmp['child1']," last child: ", tmp['childn']) + print("pos_fid is ", tmp['pos_fid']) + print("\n\n") + num += 1 + self.indx_data = indx_data + return indx_data + + def buildNCX(self, htmlfile, title, ident, lang): + indx_data = self.indx_data + + ncx_header = \ +'''<?xml version='1.0' encoding='utf-8'?> +<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="%s"> +<head> +<meta content="%s" name="dtb:uid"/> +<meta content="%d" name="dtb:depth"/> +<meta content="mobiunpack.py" name="dtb:generator"/> +<meta content="0" name="dtb:totalPageCount"/> +<meta content="0" name="dtb:maxPageNumber"/> +</head> +<docTitle> +<text>%s</text> +</docTitle> +<navMap> +''' + + ncx_footer = \ +''' </navMap> +</ncx> +''' + + ncx_entry = \ +'''<navPoint id="%s" playOrder="%d"> +<navLabel> +<text>%s</text> +</navLabel> +<content src="%s"/>''' + + # recursive part + def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1): + if start>len(indx_data) or end>len(indx_data): + print("Warning: missing INDX child entries", start, end, len(indx_data)) + return '' + if DEBUG_NCX: + print("recursINDX lvl %d from %d to %d" % (lvl, start, end)) + xml = '' + if start <= 0: + start = 0 + if end <= 0: + end = len(indx_data) + if lvl > max_lvl: + max_lvl = lvl + indent = ' ' * (2 + lvl) + + for i in range(start, end): + e = indx_data[i] + if not e['hlvl'] == lvl: + continue + # open entry + num += 1 + link = '%s#filepos%d' % (htmlfile, e['pos']) + tagid = 'np_%d' % num + entry = ncx_entry % (tagid, num, xmlescape(unescapeit(e['text'])), link) + entry = re.sub(re.compile('^', re.M), indent, entry, 0) + xml += entry + '\n' + # recurs + if e['child1']>=0: + xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1, + e['child1'], e['childn'] + 1) + xml += xmlrec + # close entry + xml += indent + '</navPoint>\n' + return xml, max_lvl, num + + body, max_lvl, num = recursINDX() + header = ncx_header % (lang, ident, max_lvl + 1, title) + ncx = header + body + ncx_footer + if not len(indx_data) == num: + print("Warning: different number of entries in NCX", len(indx_data), num) + return ncx + + def writeNCX(self, metadata): + # build the xml + self.isNCX = True + print("Write ncx") + # htmlname = os.path.basename(self.files.outbase) + # htmlname += '.html' + htmlname = 'book.html' + xml = self.buildNCX(htmlname, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language')[0]) + # write the ncx file + # ncxname = os.path.join(self.files.mobi7dir, self.files.getInputFileBasename() + '.ncx') + ncxname = os.path.join(self.files.mobi7dir, 'toc.ncx') + with open(pathof(ncxname), 'wb') as f: + f.write(xml.encode('utf-8')) + + def buildK8NCX(self, indx_data, title, ident, lang): + ncx_header = \ +'''<?xml version='1.0' encoding='utf-8'?> +<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="%s"> +<head> +<meta content="%s" name="dtb:uid"/> +<meta content="%d" name="dtb:depth"/> +<meta content="mobiunpack.py" name="dtb:generator"/> +<meta content="0" name="dtb:totalPageCount"/> +<meta content="0" name="dtb:maxPageNumber"/> +</head> +<docTitle> +<text>%s</text> +</docTitle> +<navMap> +''' + + ncx_footer = \ +''' </navMap> +</ncx> +''' + + ncx_entry = \ +'''<navPoint id="%s" 
playOrder="%d"> +<navLabel> +<text>%s</text> +</navLabel> +<content src="%s"/>''' + + # recursive part + def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1): + if start>len(indx_data) or end>len(indx_data): + print("Warning: missing INDX child entries", start, end, len(indx_data)) + return '' + if DEBUG_NCX: + print("recursINDX lvl %d from %d to %d" % (lvl, start, end)) + xml = '' + if start <= 0: + start = 0 + if end <= 0: + end = len(indx_data) + if lvl > max_lvl: + max_lvl = lvl + indent = ' ' * (2 + lvl) + + for i in range(start, end): + e = indx_data[i] + htmlfile = e['filename'] + desttag = e['idtag'] + if not e['hlvl'] == lvl: + continue + # open entry + num += 1 + if desttag == '': + link = 'Text/%s' % htmlfile + else: + link = 'Text/%s#%s' % (htmlfile, desttag) + tagid = 'np_%d' % num + entry = ncx_entry % (tagid, num, xmlescape(unescapeit(e['text'])), link) + entry = re.sub(re.compile('^', re.M), indent, entry, 0) + xml += entry + '\n' + # recurs + if e['child1']>=0: + xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1, + e['child1'], e['childn'] + 1) + xml += xmlrec + # close entry + xml += indent + '</navPoint>\n' + return xml, max_lvl, num + + body, max_lvl, num = recursINDX() + header = ncx_header % (lang, ident, max_lvl + 1, title) + ncx = header + body + ncx_footer + if not len(indx_data) == num: + print("Warning: different number of entries in NCX", len(indx_data), num) + return ncx + + def writeK8NCX(self, ncx_data, metadata): + # build the xml + self.isNCX = True + print("Write K8 ncx") + xml = self.buildK8NCX(ncx_data, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language')[0]) + bname = 'toc.ncx' + ncxname = os.path.join(self.files.k8oebps,bname) + with open(pathof(ncxname), 'wb') as f: + f.write(xml.encode('utf-8')) diff --git a/src/epy_reader/tools/KindleUnpack/mobi_opf.py b/src/epy_reader/tools/KindleUnpack/mobi_opf.py new file mode 100644 index 0000000..742d776 --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_opf.py @@ -0,0 +1,686 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import unicode_str, unescapeit +from .compatibility_utils import lzip + +from .unipath import pathof + +from xml.sax.saxutils import escape as xmlescape + +import os +import uuid +from datetime import datetime + +# In EPUB3, NCX and <guide> MAY exist in OPF, although the NCX is superseded +# by the Navigation Document and the <guide> is deprecated. Currently, EPUB3_WITH_NCX +# and EPUB3_WITH_GUIDE are set to True due to compatibility with epub2 reading systems. +# They might be change to set to False in the future. + +EPUB3_WITH_NCX = True # Do not set to False except for debug. +""" Set to True to create a toc.ncx when converting to epub3. """ + +EPUB3_WITH_GUIDE = True # Do not set to False except for debug. +""" Set to True to create a guide element in an opf when converting to epub3. """ + +EPUB_OPF = 'content.opf' +""" The name for the OPF of EPUB. """ + +TOC_NCX = 'toc.ncx' +""" The name for the TOC of EPUB2. """ + +NAVIGATION_DOCUMENT = 'nav.xhtml' +""" The name for the navigation document of EPUB3. """ + +BEGIN_INFO_ONLY = '<!-- BEGIN INFORMATION ONLY ' +""" The comment to indicate the beginning of metadata which will be ignored by kindlegen. """ + +END_INFO_ONLY = 'END INFORMATION ONLY -->' +""" The comment to indicate the end of metadata which will be ignored by kindlegen. 
""" + +EXTH_TITLE_FURIGANA = 'Title-Pronunciation' +""" The name for Title Furigana(similar to file-as) set by KDP. """ + +EXTH_CREATOR_FURIGANA = 'Author-Pronunciation' +""" The name for Creator Furigana(similar to file-as) set by KDP. """ + +EXTH_PUBLISHER_FURIGANA = 'Publisher-Pronunciation' +""" The name for Publisher Furigana(similar to file-as) set by KDP. """ + +EXTRA_ENTITIES = {'"': '"', "'": "'"} + +class OPFProcessor(object): + + def __init__(self, files, metadata, fileinfo, rscnames, hasNCX, mh, usedmap, pagemapxml='', guidetext='', k8resc=None, epubver='2'): + self.files = files + self.metadata = metadata + self.fileinfo = fileinfo + self.rscnames = rscnames + self.has_ncx = hasNCX + self.codec = mh.codec + self.isK8 = mh.isK8() + self.printReplica = mh.isPrintReplica() + self.guidetext = unicode_str(guidetext) + self.used = usedmap + self.k8resc = k8resc + self.covername = None + self.cover_id = 'cover_img' + if self.k8resc is not None and self.k8resc.cover_name is not None: + # update cover id info from RESC if available + self.cover_id = self.k8resc.cover_name + # Create a unique urn uuid + self.BookId = unicode_str(str(uuid.uuid4())) + self.pagemap = pagemapxml + + self.ncxname = None + self.navname = None + + # page-progression-direction is only set in spine + self.page_progression_direction = metadata.pop('page-progression-direction', [None])[0] + if 'rl' in metadata.get('primary-writing-mode', [''])[0]: + self.page_progression_direction = 'rtl' + self.epubver = epubver # the epub version set by user + self.target_epubver = epubver # the epub vertion set by user or detected automatically + if self.epubver == 'A': + self.target_epubver = self.autodetectEPUBVersion() + elif self.epubver == 'F': + self.target_epubver = '2' + elif self.epubver != '2' and self.epubver != '3': + self.target_epubver = '2' + + # id for rifine attributes + self.title_id = {} + self.creator_id = {} + self.publisher_id = {} + # extra attributes + self.title_attrib = {} + self.creator_attrib = {} + self.publisher_attrib = {} + self.extra_attributes = [] # for force epub2 option + # Create epub3 metadata from EXTH. + self.exth_solved_refines_metadata = [] + self.exth_refines_metadata = [] + self.exth_fixedlayout_metadata = [] + + self.defineRefinesID() + self.processRefinesMetadata() + if self.k8resc is not None: + # Create metadata in RESC section. + self.k8resc.createMetadata(epubver) + if self.target_epubver == "3": + self.createMetadataForFixedlayout() + + def escapeit(self, sval, EXTRAS=None): + # note, xmlescape and unescape do not work with utf-8 bytestrings + sval = unicode_str(sval) + if EXTRAS: + res = xmlescape(unescapeit(sval), EXTRAS) + else: + res = xmlescape(unescapeit(sval)) + return res + + def createMetaTag(self, data, property, content, refid=''): + refines = '' + if refid: + refines = ' refines="#%s"' % refid + data.append('<meta property="%s"%s>%s</meta>\n' % (property, refines, content)) + + def buildOPFMetadata(self, start_tag, has_obfuscated_fonts=False): + # convert from EXTH metadata format to target epub version metadata + # epub 3 will ignore <meta name="xxxx" content="yyyy" /> style metatags + # but allows them to be present for backwards compatibility + # instead the new format is + # <meta property="xxxx" id="iiii" ... 
> property_value</meta>
+        # and DCMES elements such as:
+        #   <dc:blah id="iiii">value</dc:blah>
+
+        metadata = self.metadata
+        k8resc = self.k8resc
+
+        META_TAGS = ['Drm Server Id', 'Drm Commerce Id', 'Drm Ebookbase Book Id', 'ASIN', 'ThumbOffset', 'Fake Cover',
+                     'Creator Software', 'Creator Major Version', 'Creator Minor Version', 'Creator Build Number',
+                     'Watermark', 'Clipping Limit', 'Publisher Limit', 'Text to Speech Disabled', 'CDE Type',
+                     'Updated Title', 'Font Signature (hex)', 'Tamper Proof Keys (hex)',]
+
+        # def handleTag(data, metadata, key, tag, ids={}):
+        def handleTag(data, metadata, key, tag, attrib={}):
+            '''Format metadata values.
+
+            @param data: List of formatted metadata entries.
+            @param metadata: The metadata dictionary.
+            @param key: The key of the metadata value to handle.
+            @param tag: The opf tag that corresponds to the metadata value.
+            ###@param ids: The ids in tags for refines property of epub3.
+            @param attrib: The extra attribute for refines or opf prefixes.
+            '''
+            if key in metadata:
+                for i, value in enumerate(metadata[key]):
+                    closingTag = tag.split(" ")[0]
+                    res = '<%s%s>%s</%s>\n' % (tag, attrib.get(i, ''), self.escapeit(value), closingTag)
+                    data.append(res)
+                del metadata[key]
+
+        # these are allowed but ignored by epub3
+        def handleMetaPairs(data, metadata, key, name):
+            if key in metadata:
+                for value in metadata[key]:
+                    res = '<meta name="%s" content="%s" />\n' % (name, self.escapeit(value, EXTRA_ENTITIES))
+                    data.append(res)
+                del metadata[key]
+
+        data = []
+        data.append(start_tag + '\n')
+        # Handle standard metadata
+        if 'Title' in metadata:
+            handleTag(data, metadata, 'Title', 'dc:title', self.title_attrib)
+        else:
+            data.append('<dc:title>Untitled</dc:title>\n')
+        handleTag(data, metadata, 'Language', 'dc:language')
+        if 'UniqueID' in metadata:
+            handleTag(data, metadata, 'UniqueID', 'dc:identifier id="uid"')
+        else:
+            # No unique ID in original, give it a generic one.
+            data.append('<dc:identifier id="uid">0</dc:identifier>\n')
+
+        if self.target_epubver == '3':
+            # epub version 3 minimal metadata requires a dcterms:modified date tag
+            self.createMetaTag(data, 'dcterms:modified', datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"))
+
+        if self.isK8 and has_obfuscated_fonts:
+            # Use the randomly generated urn:uuid so obfuscated fonts work.
+            # It doesn't need to be _THE_ unique identifier to work as a key
+            # for obfuscated fonts in Sigil, ADE and calibre. It just has
+            # to use the opf:scheme="UUID" and have the urn:uuid: prefix.
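+            # For illustration (uuid value assumed), the epub2 branch below emits
+            #     <dc:identifier opf:scheme="UUID">urn:uuid:5a8c...</dc:identifier>
+            # while the epub3 branch drops the opf:scheme attribute.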
+ if self.target_epubver == '3': + data.append('<dc:identifier>urn:uuid:'+self.BookId+'</dc:identifier>\n') + else: + data.append('<dc:identifier opf:scheme="UUID">urn:uuid:'+self.BookId+'</dc:identifier>\n') + + handleTag(data, metadata, 'Creator', 'dc:creator', self.creator_attrib) + handleTag(data, metadata, 'Contributor', 'dc:contributor') + handleTag(data, metadata, 'Publisher', 'dc:publisher', self.publisher_attrib) + handleTag(data, metadata, 'Source', 'dc:source') + handleTag(data, metadata, 'Type', 'dc:type') + if self.target_epubver == '3': + if 'ISBN' in metadata: + for i, value in enumerate(metadata['ISBN']): + res = '<dc:identifier>urn:isbn:%s</dc:identifier>\n' % self.escapeit(value) + data.append(res) + else: + handleTag(data, metadata, 'ISBN', 'dc:identifier opf:scheme="ISBN"') + if 'Subject' in metadata: + if 'SubjectCode' in metadata: + codeList = metadata['SubjectCode'] + del metadata['SubjectCode'] + else: + codeList = None + for i in range(len(metadata['Subject'])): + if codeList and i < len(codeList): + data.append('<dc:subject BASICCode="'+codeList[i]+'">') + else: + data.append('<dc:subject>') + data.append(self.escapeit(metadata['Subject'][i])+'</dc:subject>\n') + del metadata['Subject'] + handleTag(data, metadata, 'Description', 'dc:description') + if self.target_epubver == '3': + if 'Published' in metadata: + for i, value in enumerate(metadata['Published']): + res = '<dc:date>%s</dc:date>\n' % self.escapeit(value) + data.append(res) + else: + handleTag(data, metadata, 'Published', 'dc:date opf:event="publication"') + handleTag(data, metadata, 'Rights', 'dc:rights') + + if self.epubver == 'F': + if self.extra_attributes or k8resc is not None and k8resc.extra_attributes: + data.append('<!-- THE FOLLOWINGS ARE REQUIRED TO INSERT INTO <dc:xxx> MANUALLY\n') + if self.extra_attributes: + data += self.extra_attributes + if k8resc is not None and k8resc.extra_attributes: + data += k8resc.extra_attributes + data.append('-->\n') + else: + # Append refines metadata. + if self.exth_solved_refines_metadata: + data.append('<!-- Refines MetaData from EXTH -->\n') + data += self.exth_solved_refines_metadata + if self.exth_refines_metadata or k8resc is not None and k8resc.refines_metadata: + data.append('<!-- THE FOLLOWINGS ARE REQUIRED TO EDIT IDS MANUALLY\n') + if self.exth_refines_metadata: + data += self.exth_refines_metadata + if k8resc is not None and k8resc.refines_metadata: + data += k8resc.refines_metadata + data.append('-->\n') + + # Append metadata in RESC section. + if k8resc is not None and k8resc.extra_metadata: + data.append('<!-- Extra MetaData from RESC\n') + data += k8resc.extra_metadata + data.append('-->\n') + + if 'CoverOffset' in metadata: + imageNumber = int(metadata['CoverOffset'][0]) + self.covername = self.rscnames[imageNumber] + if self.covername is None: + print("Error: Cover image %s was not recognized as a valid image" % imageNumber) + else: + # <meta name="cover"> is obsoleted in EPUB3, but kindlegen v2.9 requires it. 
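+ # With the default cover_id this emits, e.g.:
+ #   <meta name="cover" content="cover_img" />
+ # and buildOPFManifest() below marks the same id with
+ # properties="cover-image" when targeting epub3.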
+ data.append('<meta name="cover" content="' + self.cover_id + '" />\n')
+ self.used[self.covername] = 'used'
+ del metadata['CoverOffset']
+
+ handleMetaPairs(data, metadata, 'Codec', 'output encoding')
+ # handle kindlegen specific tags
+ handleTag(data, metadata, 'DictInLanguage', 'DictionaryInLanguage')
+ handleTag(data, metadata, 'DictOutLanguage', 'DictionaryOutLanguage')
+ handleMetaPairs(data, metadata, 'RegionMagnification', 'RegionMagnification')
+ handleMetaPairs(data, metadata, 'book-type', 'book-type')
+ handleMetaPairs(data, metadata, 'zero-gutter', 'zero-gutter')
+ handleMetaPairs(data, metadata, 'zero-margin', 'zero-margin')
+ handleMetaPairs(data, metadata, 'primary-writing-mode', 'primary-writing-mode')
+ handleMetaPairs(data, metadata, 'fixed-layout', 'fixed-layout')
+ handleMetaPairs(data, metadata, 'orientation-lock', 'orientation-lock')
+ handleMetaPairs(data, metadata, 'original-resolution', 'original-resolution')
+
+ # these are not allowed in epub2 or 3 so convert them to meta name content pairs
+ # perhaps these could better be mapped into the dcterms namespace instead
+ handleMetaPairs(data, metadata, 'Review', 'review')
+ handleMetaPairs(data, metadata, 'Imprint', 'imprint')
+ handleMetaPairs(data, metadata, 'Adult', 'adult')
+ handleMetaPairs(data, metadata, 'DictShortName', 'DictionaryVeryShortName')
+
+ # these are needed by kobo books upon submission but not sure if legal metadata in epub2 or epub3
+ if 'Price' in metadata and 'Currency' in metadata:
+ priceList = metadata['Price']
+ currencyList = metadata['Currency']
+ if len(priceList) != len(currencyList):
+ print("Error: found %s price entries, but %s currency entries." % (len(priceList), len(currencyList)))
+ else:
+ for i in range(len(priceList)):
+ data.append('<SRP Currency="'+currencyList[i]+'">'+priceList[i]+'</SRP>\n')
+ del metadata['Price']
+ del metadata['Currency']
+
+ if self.target_epubver == '3':
+ # Append metadata for EPUB3.
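+ # e.g. a fixed-layout book contributes entries such as
+ #   <meta property="rendition:layout">pre-paginated</meta>
+ # (built by createMetadataForFixedlayout()).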
+ if self.exth_fixedlayout_metadata: + data.append('<!-- EPUB3 MedaData converted from EXTH -->\n') + data += self.exth_fixedlayout_metadata + + # all that remains is extra EXTH info we will store inside a comment inside meta name/content pairs + # so it can not impact anything and will be automatically stripped out if found again in a RESC section + data.append(BEGIN_INFO_ONLY + '\n') + if 'ThumbOffset' in metadata: + imageNumber = int(metadata['ThumbOffset'][0]) + # Some bad books give image indexes that are 'out of range' + try: + imageName = self.rscnames[imageNumber] + except: + print('Number given for Cover Thumbnail is out of range: %s' % imageNumber) + imageName = None + if imageName is None: + print("Error: Cover Thumbnail image %s was not recognized as a valid image" % imageNumber) + else: + data.append('<meta name="Cover ThumbNail Image" content="'+ 'Images/'+imageName+'" />\n') + # self.used[imageName] = 'used' # thumbnail image is always generated by Kindlegen, so don't include in manifest + self.used[imageName] = 'not used' + del metadata['ThumbOffset'] + for metaName in META_TAGS: + if metaName in metadata: + for value in metadata[metaName]: + data.append('<meta name="'+metaName+'" content="'+self.escapeit(value, EXTRA_ENTITIES)+'" />\n') + del metadata[metaName] + for key in list(metadata.keys()): + for value in metadata[key]: + data.append('<meta name="'+key+'" content="'+self.escapeit(value, EXTRA_ENTITIES)+'" />\n') + del metadata[key] + data.append(END_INFO_ONLY + '\n') + data.append('</metadata>\n') + return data + + def buildOPFManifest(self, ncxname, navname=None): + # buildManifest for mobi7, azw4, epub2 and epub3. + k8resc = self.k8resc + cover_id = self.cover_id + hasK8RescSpine = k8resc is not None and k8resc.hasSpine() + self.ncxname = ncxname + self.navname = navname + + data = [] + data.append('<manifest>\n') + media_map = { + '.jpg' : 'image/jpeg', + '.jpeg' : 'image/jpeg', + '.png' : 'image/png', + '.gif' : 'image/gif', + '.svg' : 'image/svg+xml', + '.xhtml': 'application/xhtml+xml', + '.html' : 'text/html', # for mobi7 + '.pdf' : 'application/pdf', # for azw4(print replica textbook) + '.ttf' : 'application/x-font-ttf', + '.otf' : 'application/x-font-opentype', # replaced? 
+ '.css' : 'text/css', + # '.html' : 'text/x-oeb1-document', # for mobi7 + # '.otf' : 'application/vnd.ms-opentype', # [OpenType] OpenType fonts + # '.woff' : 'application/font-woff', # [WOFF] WOFF fonts + # '.smil' : 'application/smil+xml', # [MediaOverlays301] EPUB Media Overlay documents + # '.pls' : 'application/pls+xml', # [PLS] Text-to-Speech (TTS) Pronunciation lexicons + # '.mp3' : 'audio/mpeg', + # '.mp4' : 'video/mp4', + # '.js' : 'text/javascript', # not supported in K8 + } + spinerefs = [] + + idcnt = 0 + for [key,dir,fname] in self.fileinfo: + name, ext = os.path.splitext(fname) + ext = ext.lower() + media = media_map.get(ext) + ref = "item%d" % idcnt + if hasK8RescSpine: + if key is not None and key in k8resc.spine_idrefs: + ref = k8resc.spine_idrefs[key] + properties = '' + if dir != '': + fpath = dir + '/' + fname + else: + fpath = fname + data.append('<item id="{0:}" media-type="{1:}" href="{2:}" {3:}/>\n'.format(ref, media, fpath, properties)) + + if ext in ['.xhtml', '.html']: + spinerefs.append(ref) + idcnt += 1 + + for fname in self.rscnames: + if fname is not None: + if self.used.get(fname,'not used') == 'not used': + continue + name, ext = os.path.splitext(fname) + ext = ext.lower() + media = media_map.get(ext,ext[1:]) + properties = '' + if fname == self.covername: + ref = cover_id + if self.target_epubver == '3': + properties = 'properties="cover-image"' + else: + ref = "item%d" % idcnt + if ext == '.ttf' or ext == '.otf': + if self.isK8: # fonts are only used in Mobi 8 + fpath = 'Fonts/' + fname + data.append('<item id="{0:}" media-type="{1:}" href="{2:}" {3:}/>\n'.format(ref, media, fpath, properties)) + else: + fpath = 'Images/' + fname + data.append('<item id="{0:}" media-type="{1:}" href="{2:}" {3:}/>\n'.format(ref, media, fpath, properties)) + idcnt += 1 + + if self.target_epubver == '3' and navname is not None: + data.append('<item id="nav" media-type="application/xhtml+xml" href="Text/' + navname + '" properties="nav"/>\n') + if self.has_ncx and ncxname is not None: + data.append('<item id="ncx" media-type="application/x-dtbncx+xml" href="' + ncxname +'" />\n') + if self.pagemap != '': + data.append('<item id="map" media-type="application/oebs-page-map+xml" href="page-map.xml" />\n') + data.append('</manifest>\n') + return [data, spinerefs] + + def buildOPFSpine(self, spinerefs, isNCX): + # build spine + k8resc = self.k8resc + hasK8RescSpine = k8resc is not None and k8resc.hasSpine() + data = [] + ppd = '' + if self.isK8 and self.page_progression_direction is not None: + ppd = ' page-progression-direction="{:s}"'.format(self.page_progression_direction) + ncx = '' + if isNCX: + ncx = ' toc="ncx"' + map='' + if self.pagemap != '': + map = ' page-map="map"' + if self.epubver == 'F': + if ppd: + ppd = '<!--' + ppd + ' -->' + spine_start_tag = '<spine{1:s}{2:s}>{0:s}\n'.format(ppd, map, ncx) + else: + spine_start_tag = '<spine{0:s}{1:s}{2:s}>\n'.format(ppd, map, ncx) + data.append(spine_start_tag) + + if hasK8RescSpine: + for key in k8resc.spine_order: + idref = k8resc.spine_idrefs[key] + attribs = k8resc.spine_pageattributes[key] + tag = '<itemref idref="%s"' % idref + for aname, val in list(attribs.items()): + if self.epubver == 'F' and aname == 'properties': + continue + if val is not None: + tag += ' %s="%s"' % (aname, val) + tag += '/>' + if self.epubver == 'F' and 'properties' in attribs: + val = attribs['properties'] + if val is not None: + tag += '<!-- properties="%s" -->' % val + tag += '\n' + data.append(tag) + else: + start = 0 + # special case the 
created coverpage if need be + [key, dir, fname] = self.fileinfo[0] + if key is not None and key == "coverpage": + entry = spinerefs[start] + data.append('<itemref idref="%s" linear="no"/>\n' % entry) + start += 1 + for entry in spinerefs[start:]: + data.append('<itemref idref="' + entry + '"/>\n') + data.append('</spine>\n') + return data + + def buildMobi7OPF(self): + # Build an OPF for mobi7 and azw4. + print("Building an opf for mobi7/azw4.") + data = [] + data.append('<?xml version="1.0" encoding="utf-8"?>\n') + data.append('<package version="2.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="uid">\n') + metadata_tag = '<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">' + opf_metadata = self.buildOPFMetadata(metadata_tag) + data += opf_metadata + if self.has_ncx: + # ncxname = self.files.getInputFileBasename() + '.ncx' + ncxname = 'toc.ncx' + else: + ncxname = None + [opf_manifest, spinerefs] = self.buildOPFManifest(ncxname) + data += opf_manifest + opf_spine = self.buildOPFSpine(spinerefs, self.has_ncx) + data += opf_spine + data.append('<tours>\n</tours>\n') + if not self.printReplica: + guide ='<guide>\n' + self.guidetext + '</guide>\n' + data.append(guide) + data.append('</package>\n') + return ''.join(data) + + def buildEPUBOPF(self, has_obfuscated_fonts=False): + print("Building an opf for mobi8 using epub version: ", self.target_epubver) + if self.target_epubver == '2': + has_ncx = self.has_ncx + has_guide = True + ncxname = None + ncxname = TOC_NCX + navname = None + package = '<package version="2.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="uid">\n' + tours = '<tours>\n</tours>\n' + metadata_tag = '<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">' + else: + has_ncx = EPUB3_WITH_NCX + has_guide = EPUB3_WITH_GUIDE + ncxname = None + if has_ncx: + ncxname = TOC_NCX + navname = NAVIGATION_DOCUMENT + package = '<package version="3.0" xmlns="http://www.idpf.org/2007/opf" prefix="rendition: http://www.idpf.org/vocab/rendition/#" unique-identifier="uid">\n' + tours = '' + metadata_tag = '<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">' + + data = [] + data.append('<?xml version="1.0" encoding="utf-8"?>\n') + data.append(package) + opf_metadata = self.buildOPFMetadata(metadata_tag, has_obfuscated_fonts) + data += opf_metadata + [opf_manifest, spinerefs] = self.buildOPFManifest(ncxname, navname) + data += opf_manifest + opf_spine = self.buildOPFSpine(spinerefs, has_ncx) + data += opf_spine + data.append(tours) + if has_guide: + guide ='<guide>\n' + self.guidetext + '</guide>\n' + data.append(guide) + data.append('</package>\n') + return ''.join(data) + + def writeOPF(self, has_obfuscated_fonts=False): + if self.isK8: + data = self.buildEPUBOPF(has_obfuscated_fonts) + outopf = os.path.join(self.files.k8oebps, EPUB_OPF) + with open(pathof(outopf), 'wb') as f: + f.write(data.encode('utf-8')) + return self.BookId + else: + data = self.buildMobi7OPF() + outopf = os.path.join(self.files.mobi7dir, 'content.opf') + with open(pathof(outopf), 'wb') as f: + f.write(data.encode('utf-8')) + return 0 + + def getBookId(self): + return self.BookId + + def getNCXName(self): + return self.ncxname + + def getNAVName(self): + return self.navname + + def getEPUBVersion(self): + return self.target_epubver + + def hasNCX(self): + return self.ncxname is not None and self.has_ncx + + def hasNAV(self): + return self.navname is not None + + def autodetectEPUBVersion(self): + # Determine 
EPUB version from metadata and RESC.
+ metadata = self.metadata
+ k8resc = self.k8resc
+ epubver = '2'
+ if 'true' == metadata.get('fixed-layout', [''])[0].lower():
+ epubver = '3'
+ elif metadata.get('orientation-lock', [''])[0].lower() in ['portrait', 'landscape']:
+ epubver = '3'
+ elif self.page_progression_direction == 'rtl':
+ epubver = '3'
+ elif EXTH_TITLE_FURIGANA in metadata:
+ epubver = '3'
+ elif EXTH_CREATOR_FURIGANA in metadata:
+ epubver = '3'
+ elif EXTH_PUBLISHER_FURIGANA in metadata:
+ epubver = '3'
+ elif k8resc is not None and k8resc.needEPUB3():
+ epubver = '3'
+ return epubver
+
+ def defineRefinesID(self):
+ # the following EXTH are set by KDP.
+ # 'Title_Furigana_(508)'
+ # 'Creator_Furigana_(517)',
+ # 'Publisher_Furigana_(522)'
+ # It is difficult to find the correspondence between Title, Creator, Publisher
+ # and EXTH 508, 517, 522 if they have two or more values, since KDP does not seem to preserve the order of EXTH 508, 517 and 522.
+ # It is also difficult to find the correspondence between them and tags which have refines attributes in RESC.
+ # So manual editing is required.
+ metadata = self.metadata
+
+ needRefinesId = False
+ if self.k8resc is not None:
+ needRefinesId = self.k8resc.hasRefines()
+ # Create ids for refines attributes
+ if (needRefinesId or EXTH_TITLE_FURIGANA in metadata) and 'Title' in metadata:
+ for i in range(len(metadata.get('Title'))):
+ self.title_id[i] = 'title%02d' % (i+1)
+
+ if (needRefinesId or EXTH_CREATOR_FURIGANA in metadata) and 'Creator' in metadata:
+ for i in range(len(metadata.get('Creator'))):
+ self.creator_id[i] = 'creator%02d' % (i+1)
+
+ if (needRefinesId or EXTH_PUBLISHER_FURIGANA in metadata) and 'Publisher' in metadata:
+ for i in range(len(metadata.get('Publisher'))):
+ self.publisher_id[i] = 'publisher%02d' % (i+1)
+
+ def processRefinesMetadata(self):
+ # create refines metadata defined in epub3, or convert the refines property to opf: attributes for epub2.
+ metadata = self.metadata
+
+ refines_list = [
+ [EXTH_TITLE_FURIGANA, self.title_id, self.title_attrib, 'title00'],
+ [EXTH_CREATOR_FURIGANA, self.creator_id, self.creator_attrib, 'creator00'],
+ [EXTH_PUBLISHER_FURIGANA, self.publisher_id, self.publisher_attrib, 'publisher00']
+ ]
+
+ create_refines_metadata = False
+ for EXTH in lzip(*refines_list)[0]:
+ if EXTH in metadata:
+ create_refines_metadata = True
+ break
+ if create_refines_metadata:
+ for [EXTH, id, attrib, defaultid] in refines_list:
+ if self.target_epubver == '3':
+ for i, value in list(id.items()):
+ attrib[i] = ' id="%s"' % value
+
+ if EXTH in metadata:
+ if len(metadata[EXTH]) == 1 and len(id) == 1:
+ self.createMetaTag(self.exth_solved_refines_metadata, 'file-as', metadata[EXTH][0], id[0])
+ else:
+ for i, value in enumerate(metadata[EXTH]):
+ self.createMetaTag(self.exth_refines_metadata, 'file-as', value, id.get(i, defaultid))
+ else:
+ if EXTH in metadata:
+ if len(metadata[EXTH]) == 1 and len(id) == 1:
+ attr = ' opf:file-as="%s"' % metadata[EXTH][0]
+ attrib[0] = attr
+ else:
+ for i, value in enumerate(metadata[EXTH]):
+ attr = ' id="#%s" opf:file-as="%s"\n' % (id.get(i, defaultid), value)
+ self.extra_attributes.append(attr)
+
+ def createMetadataForFixedlayout(self):
+ # convert fixed-layout EXTH metadata to epub3 format if needed.
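+ # Sketch of the conversion performed below (EXTH name/value -> EPUB3 rendition property):
+ #   fixed-layout: true                     -> rendition:layout = pre-paginated
+ #   fixed-layout: anything else            -> rendition:layout = reflowable
+ #   orientation-lock: portrait|landscape   -> rendition:orientation = <that value>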
+ metadata = self.metadata + + if 'fixed-layout' in metadata: + fixedlayout = metadata['fixed-layout'][0] + content = {'true' : 'pre-paginated'}.get(fixedlayout.lower(), 'reflowable') + self.createMetaTag(self.exth_fixedlayout_metadata, 'rendition:layout', content) + + if 'orientation-lock' in metadata: + content = metadata['orientation-lock'][0].lower() + if content == 'portrait' or content == 'landscape': + self.createMetaTag(self.exth_fixedlayout_metadata, 'rendition:orientation', content) + + # according to epub3 spec about correspondence with Amazon + # if 'original-resolution' is provided it needs to be converted to + # meta viewport property tag stored in the <head></head> of **each** + # xhtml page - so this tag would need to be handled by editing each part + # before reaching this routine + # we need to add support for this to the k8html routine + # if 'original-resolution' in metadata.keys(): + # resolution = metadata['original-resolution'][0].lower() + # width, height = resolution.split('x') + # if width.isdigit() and int(width) > 0 and height.isdigit() and int(height) > 0: + # viewport = 'width=%s, height=%s' % (width, height) + # self.createMetaTag(self.exth_fixedlayout_metadata, 'rendition:viewport', viewport) diff --git a/src/epy_reader/tools/KindleUnpack/mobi_pagemap.py b/src/epy_reader/tools/KindleUnpack/mobi_pagemap.py new file mode 100644 index 0000000..5228d4e --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_pagemap.py @@ -0,0 +1,158 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import PY2, unicode_str + +if PY2: + range = xrange + +import struct +# note: struct pack, unpack, unpack_from all require bytestring format +# data all the way up to at least python 2.7.5, python 3 okay with bytestring + +import re +# note: re requites the pattern to be the exact same type as the data to be searched in python3 +# but u"" is not allowed for the pattern itself only b"" + + +_TABLE = [('m', 1000), ('cm', 900), ('d', 500), ('cd', 400), ('c', 100), ('xc', 90), ('l', 50), ('xl', 40), ('x', 10), ('ix', 9), ('v', 5), ('iv', 4), ('i', 1)] + +def int_to_roman(i): + parts = [] + num = i + for letter, value in _TABLE: + while value <= num: + num -= value + parts.append(letter) + return ''.join(parts) + +def roman_to_int(s): + result = 0 + rnstr = s + for letter, value in _TABLE: + while rnstr.startswith(letter): + result += value + rnstr = rnstr[len(letter):] + return result + +_pattern = r'''\(([^\)]*)\)''' +_tup_pattern = re.compile(_pattern,re.IGNORECASE) + + +def _parseNames(numpages, data): + data = unicode_str(data) + pagenames = [] + pageMap = '' + for i in range(numpages): + pagenames.append(None) + for m in re.finditer(_tup_pattern, data): + tup = m.group(1) + if pageMap != '': + pageMap += ',' + pageMap += '(' + tup + ')' + spos, nametype, svalue = tup.split(",") + # print(spos, nametype, svalue) + if nametype == 'a' or nametype == 'r': + svalue = int(svalue) + spos = int(spos) + for i in range(spos - 1, numpages): + if nametype == 'r': + pname = int_to_roman(svalue) + svalue += 1 + elif nametype == 'a': + pname = "%s" % svalue + svalue += 1 + elif nametype == 'c': + sp = svalue.find('|') + if sp == -1: + pname = svalue + else: + pname = svalue[0:sp] + svalue = svalue[sp+1:] + else: + print("Error: unknown page numbering type", nametype) + pagenames[i] = pname + return pagenames, pageMap + + +class 
PageMapProcessor: + + def __init__(self, mh, data): + self.mh = mh + self.data = data + self.pagenames = [] + self.pageoffsets = [] + self.pageMap = '' + self.pm_len = 0 + self.pm_nn = 0 + self.pn_bits = 0 + self.pmoff = None + self.pmstr = '' + print("Extracting Page Map Information") + rev_len, = struct.unpack_from(b'>L', self.data, 0x10) + # skip over header, revision string length data, and revision string + ptr = 0x14 + rev_len + pm_1, self.pm_len, self.pm_nn, self.pm_bits = struct.unpack_from(b'>4H', self.data, ptr) + # print(pm_1, self.pm_len, self.pm_nn, self.pm_bits) + self.pmstr = self.data[ptr+8:ptr+8+self.pm_len] + self.pmoff = self.data[ptr+8+self.pm_len:] + offsize = b">L" + offwidth = 4 + if self.pm_bits == 16: + offsize = b">H" + offwidth = 2 + ptr = 0 + for i in range(self.pm_nn): + od, = struct.unpack_from(offsize, self.pmoff, ptr) + ptr += offwidth + self.pageoffsets.append(od) + self.pagenames, self.pageMap = _parseNames(self.pm_nn, self.pmstr) + + def getPageMap(self): + return self.pageMap + + def getNames(self): + return self.pagenames + + def getOffsets(self): + return self.pageoffsets + + # page-map.xml will be unicode but encoded to utf-8 immediately before being written to a file + def generateKF8PageMapXML(self, k8proc): + pagemapxml = '<page-map xmlns="http://www.idpf.org/2007/opf">\n' + for i in range(len(self.pagenames)): + pos = self.pageoffsets[i] + name = self.pagenames[i] + if name is not None and name != "": + [pn, dir, filename, skelpos, skelend, aidtext] = k8proc.getSkelInfo(pos) + idtext = unicode_str(k8proc.getPageIDTag(pos)) + linktgt = unicode_str(filename) + if idtext != '': + linktgt += '#' + idtext + pagemapxml += '<page name="%s" href="%s/%s" />\n' % (name, dir, linktgt) + pagemapxml += "</page-map>\n" + return pagemapxml + + def generateAPNX(self, apnx_meta): + if apnx_meta['format'] == 'MOBI_8': + content_header = '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","format":"%(format)s","fileRevisionId":"1","acr":"%(acr)s"}' %apnx_meta + else: + content_header = '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","fileRevisionId":"1"}' % apnx_meta + content_header = content_header.encode('utf-8') + page_header = '{"asin":"%(asin)s","pageMap":"%(pageMap)s"}' % apnx_meta + page_header = page_header.encode('utf-8') + apnx = struct.pack(b'>H',1) + struct.pack(b'>H',1) + apnx += struct.pack(b'>I', 12 + len(content_header)) + apnx += struct.pack(b'>I', len(content_header)) + apnx += content_header + apnx += struct.pack(b'>H', 1) + apnx += struct.pack(b'>H', len(page_header)) + apnx += struct.pack(b'>H', self.pm_nn) + apnx += struct.pack(b'>H', 32) + apnx += page_header + for page in self.pageoffsets: + apnx += struct.pack(b'>L', page) + return apnx diff --git a/src/epy_reader/tools/KindleUnpack/mobi_sectioner.py b/src/epy_reader/tools/KindleUnpack/mobi_sectioner.py new file mode 100644 index 0000000..81f62bb --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_sectioner.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import PY2, hexlify, bstr, bord, bchar + +import datetime + +if PY2: + range = xrange + +# note: struct pack, unpack, unpack_from all require bytestring format +# data all the way up to at least python 2.7.5, python 3 okay with bytestring +import struct + +from .unipath import pathof + +DUMP = False 
+""" Set to True to dump all possible information. """ + +class unpackException(Exception): + pass + + +def describe(data): + txtans = '' + hexans = hexlify(data) + for i in data: + if bord(i) < 32 or bord(i) > 127: + txtans += '?' + else: + txtans += bchar(i).decode('latin-1') + return '"' + txtans + '"' + ' 0x'+ hexans + +def datetimefrompalmtime(palmtime): + if palmtime > 0x7FFFFFFF: + pythondatetime = datetime.datetime(year=1904,month=1,day=1)+datetime.timedelta(seconds=palmtime) + else: + pythondatetime = datetime.datetime(year=1970,month=1,day=1)+datetime.timedelta(seconds=palmtime) + return pythondatetime + + +class Sectionizer: + + def __init__(self, filename): + self.data = b'' + with open(pathof(filename), 'rb') as f: + self.data = f.read() + self.palmheader = self.data[:78] + self.palmname = self.data[:32] + self.ident = self.palmheader[0x3C:0x3C+8] + self.num_sections, = struct.unpack_from(b'>H', self.palmheader, 76) + self.filelength = len(self.data) + sectionsdata = struct.unpack_from(bstr('>%dL' % (self.num_sections*2)), self.data, 78) + (self.filelength, 0) + self.sectionoffsets = sectionsdata[::2] + self.sectionattributes = sectionsdata[1::2] + self.sectiondescriptions = ["" for x in range(self.num_sections+1)] + self.sectiondescriptions[-1] = "File Length Only" + return + + def dumpsectionsinfo(self): + print("Section Offset Length UID Attribs Description") + for i in range(self.num_sections): + print("%3d %3X 0x%07X 0x%05X % 8d % 7d %s" % (i,i, self.sectionoffsets[i], self.sectionoffsets[ + i+1] - self.sectionoffsets[i], self.sectionattributes[i]&0xFFFFFF, (self.sectionattributes[i]>>24)&0xFF, self.sectiondescriptions[i])) + print("%3d %3X 0x%07X %s" % + (self.num_sections,self.num_sections, self.sectionoffsets[self.num_sections], self.sectiondescriptions[self.num_sections])) + + def setsectiondescription(self, section, description): + if section < len(self.sectiondescriptions): + self.sectiondescriptions[section] = description + else: + print("Section out of range: %d, description %s" % (section,description)) + + def dumppalmheader(self): + print("Palm Database Header") + print("Database name: " + repr(self.palmheader[:32])) + dbattributes, = struct.unpack_from(b'>H', self.palmheader, 32) + print("Bitfield attributes: 0x%0X" % dbattributes,) + if dbattributes != 0: + print(" (",) + if (dbattributes & 2): + print("Read-only; ",) + if (dbattributes & 4): + print("Dirty AppInfoArea; ",) + if (dbattributes & 8): + print("Needs to be backed up; ",) + if (dbattributes & 16): + print("OK to install over newer; ",) + if (dbattributes & 32): + print("Reset after installation; ",) + if (dbattributes & 64): + print("No copying by PalmPilot beaming; ",) + print(")") + else: + print("") + print("File version: %d" % struct.unpack_from(b'>H', self.palmheader, 34)[0]) + dbcreation, = struct.unpack_from(b'>L', self.palmheader, 36) + print("Creation Date: " + str(datetimefrompalmtime(dbcreation))+ (" (0x%0X)" % dbcreation)) + dbmodification, = struct.unpack_from(b'>L', self.palmheader, 40) + print("Modification Date: " + str(datetimefrompalmtime(dbmodification))+ (" (0x%0X)" % dbmodification)) + dbbackup, = struct.unpack_from(b'>L', self.palmheader, 44) + if dbbackup != 0: + print("Backup Date: " + str(datetimefrompalmtime(dbbackup))+ (" (0x%0X)" % dbbackup)) + print("Modification No.: %d" % struct.unpack_from(b'>L', self.palmheader, 48)[0]) + print("App Info offset: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 52)[0]) + print("Sort Info offset: 0x%0X" % struct.unpack_from(b'>L', 
self.palmheader, 56)[0]) + print("Type/Creator: %s/%s" % (repr(self.palmheader[60:64]), repr(self.palmheader[64:68]))) + print("Unique seed: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 68)[0]) + expectedzero, = struct.unpack_from(b'>L', self.palmheader, 72) + if expectedzero != 0: + print("Should be zero but isn't: %d" % struct.unpack_from(b'>L', self.palmheader, 72)[0]) + print("Number of sections: %d" % struct.unpack_from(b'>H', self.palmheader, 76)[0]) + return + + def loadSection(self, section): + before, after = self.sectionoffsets[section:section+2] + return self.data[before:after] diff --git a/src/epy_reader/tools/KindleUnpack/mobi_split.py b/src/epy_reader/tools/KindleUnpack/mobi_split.py new file mode 100755 index 0000000..3535029 --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_split.py @@ -0,0 +1,438 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +import struct +# note: struct pack, unpack, unpack_from all require bytestring format +# data all the way up to at least python 2.7.5, python 3 okay with bytestring + +from .unipath import pathof + + +# important pdb header offsets +unique_id_seed = 68 +number_of_pdb_records = 76 + +# important palmdoc header offsets +book_length = 4 +book_record_count = 8 +first_pdb_record = 78 + +# important rec0 offsets +length_of_book = 4 +mobi_header_base = 16 +mobi_header_length = 20 +mobi_type = 24 +mobi_version = 36 +first_non_text = 80 +title_offset = 84 +first_resc_record = 108 +first_content_index = 192 +last_content_index = 194 +kf8_fdst_index = 192 # for KF8 mobi headers +fcis_index = 200 +flis_index = 208 +srcs_index = 224 +srcs_count = 228 +primary_index = 244 +datp_index = 256 +huffoff = 112 +hufftbloff = 120 + +def getint(datain,ofs,sz=b'L'): + i, = struct.unpack_from(b'>'+sz,datain,ofs) + return i + +def writeint(datain,ofs,n,len=b'L'): + if len==b'L': + return datain[:ofs]+struct.pack(b'>L',n)+datain[ofs+4:] + else: + return datain[:ofs]+struct.pack(b'>H',n)+datain[ofs+2:] + +def getsecaddr(datain,secno): + nsec = getint(datain,number_of_pdb_records,b'H') + assert secno>=0 & secno<nsec,'secno %d out of range (nsec=%d)'%(secno,nsec) + secstart = getint(datain,first_pdb_record+secno*8) + if secno == nsec-1: + secend = len(datain) + else: + secend = getint(datain,first_pdb_record+(secno+1)*8) + return secstart,secend + +def readsection(datain,secno): + secstart, secend = getsecaddr(datain,secno) + return datain[secstart:secend] + +def writesection(datain,secno,secdata): # overwrite, accounting for different length + # dataout = deletesectionrange(datain,secno, secno) + # return insertsection(dataout, secno, secdata) + datalst = [] + nsec = getint(datain,number_of_pdb_records,b'H') + zerosecstart,zerosecend = getsecaddr(datain,0) + secstart,secend = getsecaddr(datain,secno) + dif = len(secdata) - (secend - secstart) + datalst.append(datain[:unique_id_seed]) + datalst.append(struct.pack(b'>L',2*nsec+1)) + datalst.append(datain[unique_id_seed+4:number_of_pdb_records]) + datalst.append(struct.pack(b'>H',nsec)) + newstart = zerosecstart + for i in range(0,secno): + ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) + datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval)) + datalst.append(struct.pack(b'>L', secstart) + struct.pack(b'>L', (2*secno))) + for i in range(secno+1,nsec): + ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) + 
ofs = ofs + dif + datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval)) + lpad = newstart - (first_pdb_record + 8*nsec) + if lpad > 0: + datalst.append(b'\0' * lpad) + datalst.append(datain[zerosecstart:secstart]) + datalst.append(secdata) + datalst.append(datain[secend:]) + dataout = b''.join(datalst) + return dataout + +def nullsection(datain,secno): # make it zero-length without deleting it + datalst = [] + nsec = getint(datain,number_of_pdb_records,b'H') + secstart, secend = getsecaddr(datain,secno) + zerosecstart, zerosecend = getsecaddr(datain, 0) + dif = secend-secstart + datalst.append(datain[:first_pdb_record]) + for i in range(0,secno+1): + ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) + datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval)) + for i in range(secno+1, nsec): + ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) + ofs = ofs - dif + datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval)) + lpad = zerosecstart - (first_pdb_record + 8*nsec) + if lpad > 0: + datalst.append(b'\0' * lpad) + datalst.append(datain[zerosecstart: secstart]) + datalst.append(datain[secend:]) + dataout = b''.join(datalst) + return dataout + +def deletesectionrange(datain,firstsec,lastsec): # delete a range of sections + datalst = [] + firstsecstart,firstsecend = getsecaddr(datain,firstsec) + lastsecstart,lastsecend = getsecaddr(datain,lastsec) + zerosecstart, zerosecend = getsecaddr(datain, 0) + dif = lastsecend - firstsecstart + 8*(lastsec-firstsec+1) + nsec = getint(datain,number_of_pdb_records,b'H') + datalst.append(datain[:unique_id_seed]) + datalst.append(struct.pack(b'>L',2*(nsec-(lastsec-firstsec+1))+1)) + datalst.append(datain[unique_id_seed+4:number_of_pdb_records]) + datalst.append(struct.pack(b'>H',nsec-(lastsec-firstsec+1))) + newstart = zerosecstart - 8*(lastsec-firstsec+1) + for i in range(0,firstsec): + ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) + ofs = ofs-8*(lastsec-firstsec+1) + datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval)) + for i in range(lastsec+1,nsec): + ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) + ofs = ofs - dif + flgval = 2*(i-(lastsec-firstsec+1)) + datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval)) + lpad = newstart - (first_pdb_record + 8*(nsec - (lastsec - firstsec + 1))) + if lpad > 0: + datalst.append(b'\0' * lpad) + datalst.append(datain[zerosecstart:firstsecstart]) + datalst.append(datain[lastsecend:]) + dataout = b''.join(datalst) + return dataout + +def insertsection(datain,secno,secdata): # insert a new section + datalst = [] + nsec = getint(datain,number_of_pdb_records,b'H') + # print("inserting secno" , secno, "into" ,nsec, "sections") + secstart,secend = getsecaddr(datain,secno) + zerosecstart,zerosecend = getsecaddr(datain,0) + dif = len(secdata) + datalst.append(datain[:unique_id_seed]) + datalst.append(struct.pack(b'>L',2*(nsec+1)+1)) + datalst.append(datain[unique_id_seed+4:number_of_pdb_records]) + datalst.append(struct.pack(b'>H',nsec+1)) + newstart = zerosecstart + 8 + for i in range(0,secno): + ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) + ofs += 8 + datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval)) + datalst.append(struct.pack(b'>L', secstart + 8) + struct.pack(b'>L', (2*secno))) + for i in range(secno,nsec): + ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) + ofs = ofs + dif + 8 + flgval = 2*(i+1) + 
datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval)) + lpad = newstart - (first_pdb_record + 8*(nsec + 1)) + if lpad > 0: + datalst.append(b'\0' * lpad) + datalst.append(datain[zerosecstart:secstart]) + datalst.append(secdata) + datalst.append(datain[secstart:]) + dataout = b''.join(datalst) + return dataout + + +def insertsectionrange(sectionsource,firstsec,lastsec,sectiontarget,targetsec): # insert a range of sections + # print("inserting secno" , firstsec, "to", lastsec, "into" ,targetsec, "sections") + # dataout = sectiontarget + # for idx in range(lastsec,firstsec-1,-1): + # dataout = insertsection(dataout,targetsec,readsection(sectionsource,idx)) + # return dataout + datalst = [] + nsec = getint(sectiontarget,number_of_pdb_records,b'H') + zerosecstart, zerosecend = getsecaddr(sectiontarget,0) + insstart, nul = getsecaddr(sectiontarget,targetsec) + nins = lastsec - firstsec + 1 + srcstart, nul = getsecaddr(sectionsource,firstsec) + nul, srcend = getsecaddr(sectionsource,lastsec) + newstart = zerosecstart + 8*nins + + datalst.append(sectiontarget[:unique_id_seed]) + datalst.append(struct.pack(b'>L',2*(nsec+nins)+1)) + datalst.append(sectiontarget[unique_id_seed+4:number_of_pdb_records]) + datalst.append(struct.pack(b'>H',nsec+nins)) + for i in range(0,targetsec): + ofs, flgval = struct.unpack_from(b'>2L',sectiontarget,first_pdb_record+i*8) + ofsnew = ofs + 8*nins + flgvalnew = flgval + datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L', flgvalnew)) + # print(ofsnew, flgvalnew, ofs, flgval) + srcstart0, nul = getsecaddr(sectionsource,firstsec) + for i in range(nins): + isrcstart, nul = getsecaddr(sectionsource,firstsec+i) + ofsnew = insstart + (isrcstart-srcstart0) + 8*nins + flgvalnew = 2*(targetsec+i) + datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L', flgvalnew)) + # print(ofsnew, flgvalnew) + dif = srcend - srcstart + for i in range(targetsec,nsec): + ofs, flgval = struct.unpack_from(b'>2L',sectiontarget,first_pdb_record+i*8) + ofsnew = ofs + dif + 8*nins + flgvalnew = 2*(i+nins) + datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L',flgvalnew)) + # print(ofsnew, flgvalnew, ofs, flgval) + lpad = newstart - (first_pdb_record + 8*(nsec + nins)) + if lpad > 0: + datalst.append(b'\0' * lpad) + datalst.append(sectiontarget[zerosecstart:insstart]) + datalst.append(sectionsource[srcstart:srcend]) + datalst.append(sectiontarget[insstart:]) + dataout = b''.join(datalst) + return dataout + +def get_exth_params(rec0): + ebase = mobi_header_base + getint(rec0,mobi_header_length) + elen = getint(rec0,ebase+4) + enum = getint(rec0,ebase+8) + return ebase,elen,enum + +def add_exth(rec0,exth_num,exth_bytes): + ebase,elen,enum = get_exth_params(rec0) + newrecsize = 8+len(exth_bytes) + newrec0 = rec0[0:ebase+4]+struct.pack(b'>L',elen+newrecsize)+struct.pack(b'>L',enum+1)+\ + struct.pack(b'>L',exth_num)+struct.pack(b'>L',newrecsize)+exth_bytes+rec0[ebase+12:] + newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)+newrecsize) + return newrec0 + +def read_exth(rec0,exth_num): + exth_values = [] + ebase,elen,enum = get_exth_params(rec0) + ebase = ebase+12 + while enum>0: + exth_id = getint(rec0,ebase) + if exth_id == exth_num: + # We might have multiple exths, so build a list. 
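+ # EXTH record layout as decoded here (offsets from the record start):
+ #   bytes 0-3: record id, bytes 4-7: total record length, bytes 8..length-1: payload
+ # so the payload appended below is rec0[ebase+8 : ebase+record_length].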
+ exth_values.append(rec0[ebase+8:ebase+getint(rec0,ebase+4)]) + enum = enum-1 + ebase = ebase+getint(rec0,ebase+4) + return exth_values + +def write_exth(rec0,exth_num,exth_bytes): + ebase,elen,enum = get_exth_params(rec0) + ebase_idx = ebase+12 + enum_idx = enum + while enum_idx>0: + exth_id = getint(rec0,ebase_idx) + if exth_id == exth_num: + dif = len(exth_bytes)+8-getint(rec0,ebase_idx+4) + newrec0 = rec0 + if dif != 0: + newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)+dif) + return newrec0[:ebase+4]+struct.pack(b'>L',elen+len(exth_bytes)+8-getint(rec0,ebase_idx+4))+\ + struct.pack(b'>L',enum)+rec0[ebase+12:ebase_idx+4]+\ + struct.pack(b'>L',len(exth_bytes)+8)+exth_bytes+\ + rec0[ebase_idx+getint(rec0,ebase_idx+4):] + enum_idx = enum_idx-1 + ebase_idx = ebase_idx+getint(rec0,ebase_idx+4) + return rec0 + +def del_exth(rec0,exth_num): + ebase,elen,enum = get_exth_params(rec0) + ebase_idx = ebase+12 + enum_idx = 0 + while enum_idx < enum: + exth_id = getint(rec0,ebase_idx) + exth_size = getint(rec0,ebase_idx+4) + if exth_id == exth_num: + newrec0 = rec0 + newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)-exth_size) + newrec0 = newrec0[:ebase_idx]+newrec0[ebase_idx+exth_size:] + newrec0 = newrec0[0:ebase+4]+struct.pack(b'>L',elen-exth_size)+struct.pack(b'>L',enum-1)+newrec0[ebase+12:] + return newrec0 + enum_idx += 1 + ebase_idx = ebase_idx+exth_size + return rec0 + + +class mobi_split: + + def __init__(self, infile): + datain = b'' + with open(pathof(infile), 'rb') as f: + datain = f.read() + datain_rec0 = readsection(datain,0) + ver = getint(datain_rec0,mobi_version) + self.combo = (ver!=8) + if not self.combo: + return + exth121 = read_exth(datain_rec0,121) + if len(exth121) == 0: + self.combo = False + return + else: + # only pay attention to first exth121 + # (there should only be one) + datain_kf8, = struct.unpack_from(b'>L',exth121[0],0) + if datain_kf8 == 0xffffffff: + self.combo = False + return + datain_kfrec0 =readsection(datain,datain_kf8) + + # create the standalone mobi7 + num_sec = getint(datain,number_of_pdb_records,b'H') + # remove BOUNDARY up to but not including ELF record + self.result_file7 = deletesectionrange(datain,datain_kf8-1,num_sec-2) + # check if there are SRCS records and delete them + srcs = getint(datain_rec0,srcs_index) + num_srcs = getint(datain_rec0,srcs_count) + if srcs != 0xffffffff and num_srcs > 0: + self.result_file7 = deletesectionrange(self.result_file7,srcs,srcs+num_srcs-1) + datain_rec0 = writeint(datain_rec0,srcs_index,0xffffffff) + datain_rec0 = writeint(datain_rec0,srcs_count,0) + # reset the EXTH 121 KF8 Boundary meta data to 0xffffffff + datain_rec0 = write_exth(datain_rec0,121, struct.pack(b'>L', 0xffffffff)) + # datain_rec0 = del_exth(datain_rec0,121) + # datain_rec0 = del_exth(datain_rec0,534) + # don't remove the EXTH 125 KF8 Count of Resources, seems to be present in mobi6 files as well + # set the EXTH 129 KF8 Masthead / Cover Image string to the null string + datain_rec0 = write_exth(datain_rec0,129, b'') + # don't remove the EXTH 131 KF8 Unidentified Count, seems to be present in mobi6 files as well + + # need to reset flags stored in 0x80-0x83 + # old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050 + # Bit Flags + # 0x1000 = Bit 12 indicates if embedded fonts are used or not + # 0x0800 = means this Header points to *shared* images/resource/fonts ?? + # 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8? 
+ # 0x0040 = exth exists + # 0x0010 = Not sure but this is always set so far + fval, = struct.unpack_from(b'>L',datain_rec0, 0x80) + # need to remove flag 0x0800 for KindlePreviewer 2.8 and unset Bit 12 for embedded fonts + fval = fval & 0x07FF + datain_rec0 = datain_rec0[:0x80] + struct.pack(b'>L',fval) + datain_rec0[0x84:] + + self.result_file7 = writesection(self.result_file7,0,datain_rec0) + + # no need to replace kf8 style fcis with mobi 7 one + # fcis_secnum, = struct.unpack_from(b'>L',datain_rec0, 0xc8) + # if fcis_secnum != 0xffffffff: + # fcis_info = readsection(datain, fcis_secnum) + # text_len, = struct.unpack_from(b'>L', fcis_info, 0x14) + # new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00' + # new_fcis += struct.pack(b'>L',text_len) + # new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00' + # self.result_file7 = writesection(self.result_file7, fcis_secnum, new_fcis) + + firstimage = getint(datain_rec0,first_resc_record) + lastimage = getint(datain_rec0,last_content_index,b'H') + # print("Old First Image, last Image", firstimage,lastimage) + if lastimage == 0xffff: + # find the lowest of the next sections and copy up to that. + ofs_list = [(fcis_index,b'L'),(flis_index,b'L'),(datp_index,b'L'),(hufftbloff, b'L')] + for ofs,sz in ofs_list: + n = getint(datain_rec0,ofs,sz) + # print("n",n) + if n > 0 and n < lastimage: + lastimage = n-1 + print("First Image, last Image", firstimage,lastimage) + + # Try to null out FONT and RES, but leave the (empty) PDB record so image refs remain valid + for i in range(firstimage,lastimage): + imgsec = readsection(self.result_file7,i) + if imgsec[0:4] in [b'RESC',b'FONT']: + self.result_file7 = nullsection(self.result_file7,i) + + # mobi7 finished + + # create standalone mobi8 + self.result_file8 = deletesectionrange(datain,0,datain_kf8-1) + target = getint(datain_kfrec0,first_resc_record) + self.result_file8 = insertsectionrange(datain,firstimage,lastimage,self.result_file8,target) + datain_kfrec0 =readsection(self.result_file8,0) + + # Only keep the correct EXTH 116 StartOffset, KG 2.5 carries over the one from the mobi7 part, which then points at garbage in the mobi8 part, and confuses FW 3.4 + kf8starts = read_exth(datain_kfrec0,116) + # If we have multiple StartOffset, keep only the last one + kf8start_count = len(kf8starts) + while kf8start_count > 1: + kf8start_count -= 1 + datain_kfrec0 = del_exth(datain_kfrec0,116) + + # update the EXTH 125 KF8 Count of Images/Fonts/Resources + datain_kfrec0 = write_exth(datain_kfrec0,125,struct.pack(b'>L',lastimage-firstimage+1)) + + # need to reset flags stored in 0x80-0x83 + # old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050 + # standalone mobi8 with exth: 0x0050 + # Bit Flags + # 0x1000 = Bit 12 indicates if embedded fonts are used or not + # 0x0800 = means this Header points to *shared* images/resource/fonts ?? + # 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8? 
+ # 0x0040 = exth exists + # 0x0010 = Not sure but this is always set so far + fval, = struct.unpack_from('>L',datain_kfrec0, 0x80) + fval = fval & 0x1FFF + fval |= 0x0800 + datain_kfrec0 = datain_kfrec0[:0x80] + struct.pack(b'>L',fval) + datain_kfrec0[0x84:] + + # properly update other index pointers that have been shifted by the insertion of images + ofs_list = [(kf8_fdst_index,b'L'),(fcis_index,b'L'),(flis_index,b'L'),(datp_index,b'L'),(hufftbloff, b'L')] + for ofs,sz in ofs_list: + n = getint(datain_kfrec0,ofs,sz) + if n != 0xffffffff: + datain_kfrec0 = writeint(datain_kfrec0,ofs,n+lastimage-firstimage+1,sz) + self.result_file8 = writesection(self.result_file8,0,datain_kfrec0) + + # no need to replace kf8 style fcis with mobi 7 one + # fcis_secnum, = struct.unpack_from(b'>L',datain_kfrec0, 0xc8) + # if fcis_secnum != 0xffffffff: + # fcis_info = readsection(self.result_file8, fcis_secnum) + # text_len, = struct.unpack_from(b'>L', fcis_info, 0x14) + # new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00' + # new_fcis += struct.pack(b'>L',text_len) + # new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00' + # self.result_file8 = writesection(self.result_file8, fcis_secnum, new_fcis) + + # mobi8 finished + + def getResult8(self): + return self.result_file8 + + def getResult7(self): + return self.result_file7 diff --git a/src/epy_reader/tools/KindleUnpack/mobi_uncompress.py b/src/epy_reader/tools/KindleUnpack/mobi_uncompress.py new file mode 100644 index 0000000..c5fad85 --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_uncompress.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import PY2, bchr, lmap, bstr + +if PY2: + range = xrange + +import struct +# note: struct pack, unpack, unpack_from all require bytestring format +# data all the way up to at least python 2.7.5, python 3 okay with bytestring + + +class unpackException(Exception): + pass + +class UncompressedReader: + + def unpack(self, data): + return data + +class PalmdocReader: + + def unpack(self, i): + o, p = b'', 0 + while p < len(i): + # for python 3 must use slice since i[p] returns int while slice returns character + c = ord(i[p:p+1]) + p += 1 + if (c >= 1 and c <= 8): + o += i[p:p+c] + p += c + elif (c < 128): + o += bchr(c) + elif (c >= 192): + o += b' ' + bchr(c ^ 128) + else: + if p < len(i): + c = (c << 8) | ord(i[p:p+1]) + p += 1 + m = (c >> 3) & 0x07ff + n = (c & 7) + 3 + if (m > n): + o += o[-m:n-m] + else: + for _ in range(n): + # because of completely ass-backwards decision by python mainters for python 3 + # we must use slice for bytes as i[p] returns int while slice returns character + if m == 1: + o += o[-m:] + else: + o += o[-m:-m+1] + return o + +class HuffcdicReader: + q = struct.Struct(b'>Q').unpack_from + + def loadHuff(self, huff): + if huff[0:8] != b'HUFF\x00\x00\x00\x18': + raise unpackException('invalid huff header') + off1, off2 = struct.unpack_from(b'>LL', huff, 8) + + def dict1_unpack(v): + codelen, term, maxcode = v&0x1f, v&0x80, v>>8 + assert codelen != 0 + if codelen <= 8: + assert term + maxcode = ((maxcode + 1) << (32 - codelen)) - 1 + return (codelen, term, maxcode) + self.dict1 = lmap(dict1_unpack, struct.unpack_from(b'>256L', huff, off1)) + + dict2 = struct.unpack_from(b'>64L', huff, off2) + self.mincode, self.maxcode = (), () + for 
codelen, mincode in enumerate((0,) + dict2[0::2]): + self.mincode += (mincode << (32 - codelen), ) + for codelen, maxcode in enumerate((0,) + dict2[1::2]): + self.maxcode += (((maxcode + 1) << (32 - codelen)) - 1, ) + + self.dictionary = [] + + def loadCdic(self, cdic): + if cdic[0:8] != b'CDIC\x00\x00\x00\x10': + raise unpackException('invalid cdic header') + phrases, bits = struct.unpack_from(b'>LL', cdic, 8) + n = min(1<<bits, phrases-len(self.dictionary)) + h = struct.Struct(b'>H').unpack_from + def getslice(off): + blen, = h(cdic, 16+off) + slice = cdic[18+off:18+off+(blen&0x7fff)] + return (slice, blen&0x8000) + self.dictionary += lmap(getslice, struct.unpack_from(bstr('>%dH' % n), cdic, 16)) + + def unpack(self, data): + q = HuffcdicReader.q + + bitsleft = len(data) * 8 + data += b"\x00\x00\x00\x00\x00\x00\x00\x00" + pos = 0 + x, = q(data, pos) + n = 32 + + s = b'' + while True: + if n <= 0: + pos += 4 + x, = q(data, pos) + n += 32 + code = (x >> n) & ((1 << 32) - 1) + + codelen, term, maxcode = self.dict1[code >> 24] + if not term: + while code < self.mincode[codelen]: + codelen += 1 + maxcode = self.maxcode[codelen] + + n -= codelen + bitsleft -= codelen + if bitsleft < 0: + break + + r = (maxcode - code) >> (32 - codelen) + slice, flag = self.dictionary[r] + if not flag: + self.dictionary[r] = None + slice = self.unpack(slice) + self.dictionary[r] = (slice, 1) + s += slice + return s diff --git a/src/epy_reader/tools/KindleUnpack/mobi_utils.py b/src/epy_reader/tools/KindleUnpack/mobi_utils.py new file mode 100644 index 0000000..6791e0d --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_utils.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab +# flake8: noqa + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import PY2, text_type, bchr, bord + +import binascii + +if PY2: + range = xrange + +from itertools import cycle + +def getLanguage(langID, sublangID): + mobilangdict = { + 54 : {0 : 'af'}, # Afrikaans + 28 : {0 : 'sq'}, # Albanian + 1 : {0 : 'ar' , 5 : 'ar-dz' , 15 : 'ar-bh' , 3 : 'ar-eg' , 2 : 'ar-iq', 11 : 'ar-jo' , 13 : 'ar-kw' , 12 : 'ar-lb' , 4: 'ar-ly', + 6 : 'ar-ma' , 8 : 'ar-om' , 16 : 'ar-qa' , 1 : 'ar-sa' , 10 : 'ar-sy' , 7 : 'ar-tn' , 14 : 'ar-ae' , 9 : 'ar-ye'}, + # Arabic, Arabic (Algeria), Arabic (Bahrain), Arabic (Egypt), Arabic + # (Iraq), Arabic (Jordan), Arabic (Kuwait), Arabic (Lebanon), Arabic + # (Libya), Arabic (Morocco), Arabic (Oman), Arabic (Qatar), Arabic + # (Saudi Arabia), Arabic (Syria), Arabic (Tunisia), Arabic (United Arab + # Emirates), Arabic (Yemen) + 43 : {0 : 'hy'}, # Armenian + 77 : {0 : 'as'}, # Assamese + 44 : {0 : 'az'}, # "Azeri (IANA: Azerbaijani) + 45 : {0 : 'eu'}, # Basque + 35 : {0 : 'be'}, # Belarusian + 69 : {0 : 'bn'}, # Bengali + 2 : {0 : 'bg'}, # Bulgarian + 3 : {0 : 'ca'}, # Catalan + 4 : {0 : 'zh' , 3 : 'zh-hk' , 2 : 'zh-cn' , 4 : 'zh-sg' , 1 : 'zh-tw'}, + # Chinese, Chinese (Hong Kong), Chinese (PRC), Chinese (Singapore), Chinese (Taiwan) + 26 : {0 : 'hr', 3 : 'sr'}, # Croatian, Serbian + 5 : {0 : 'cs'}, # Czech + 6 : {0 : 'da'}, # Danish + 19 : {0: 'nl', 1 : 'nl' , 2 : 'nl-be'}, # Dutch / Flemish, Dutch (Belgium) + 9 : {0: 'en', 1 : 'en' , 3 : 'en-au' , 40 : 'en-bz' , 4 : 'en-ca' , 6 : 'en-ie' , 8 : 'en-jm' , 5 : 'en-nz' , 13 : 'en-ph' , + 7 : 'en-za' , 11 : 'en-tt' , 2 : 'en-gb', 1 : 'en-us' , 12 : 'en-zw'}, + # English, English (Australia), English (Belize), English (Canada), + # 
English (Ireland), English (Jamaica), English (New Zealand), English + # (Philippines), English (South Africa), English (Trinidad), English + # (United Kingdom), English (United States), English (Zimbabwe) + 37 : {0 : 'et'}, # Estonian + 56 : {0 : 'fo'}, # Faroese + 41 : {0 : 'fa'}, # Farsi / Persian + 11 : {0 : 'fi'}, # Finnish + 12 : {0 : 'fr', 1 : 'fr' , 2 : 'fr-be' , 3 : 'fr-ca' , 5 : 'fr-lu' , 6 : 'fr-mc' , 4 : 'fr-ch'}, + # French, French (Belgium), French (Canada), French (Luxembourg), French (Monaco), French (Switzerland) + 55 : {0 : 'ka'}, # Georgian + 7 : {0 : 'de', 1 : 'de' , 3 : 'de-at' , 5 : 'de-li' , 4 : 'de-lu' , 2 : 'de-ch'}, + # German, German (Austria), German (Liechtenstein), German (Luxembourg), German (Switzerland) + 8 : {0 : 'el'}, # Greek, Modern (1453-) + 71 : {0 : 'gu'}, # Gujarati + 13 : {0 : 'he'}, # Hebrew (also code 'iw'?) + 57 : {0 : 'hi'}, # Hindi + 14 : {0 : 'hu'}, # Hungarian + 15 : {0 : 'is'}, # Icelandic + 33 : {0 : 'id'}, # Indonesian + 16 : {0 : 'it', 1 : 'it' , 2 : 'it-ch'}, # Italian, Italian (Switzerland) + 17 : {0 : 'ja'}, # Japanese + 75 : {0 : 'kn'}, # Kannada + 63 : {0 : 'kk'}, # Kazakh + 87 : {0 : 'x-kok'}, # Konkani (real language code is 'kok'?) + 18 : {0 : 'ko'}, # Korean + 38 : {0 : 'lv'}, # Latvian + 39 : {0 : 'lt'}, # Lithuanian + 47 : {0 : 'mk'}, # Macedonian + 62 : {0 : 'ms'}, # Malay + 76 : {0 : 'ml'}, # Malayalam + 58 : {0 : 'mt'}, # Maltese + 78 : {0 : 'mr'}, # Marathi + 97 : {0 : 'ne'}, # Nepali + 20 : {0 : 'no'}, # Norwegian + 72 : {0 : 'or'}, # Oriya + 21 : {0 : 'pl'}, # Polish + 22 : {0 : 'pt', 2 : 'pt' , 1 : 'pt-br'}, # Portuguese, Portuguese (Brazil) + 70 : {0 : 'pa'}, # Punjabi + 23 : {0 : 'rm'}, # "Rhaeto-Romanic" (IANA: Romansh) + 24 : {0 : 'ro'}, # Romanian + 25 : {0 : 'ru'}, # Russian + 59 : {0 : 'sz'}, # "Sami (Lappish)" (not an IANA language code) + # IANA code for "Northern Sami" is 'se' + # 'SZ' is the IANA region code for Swaziland + 79 : {0 : 'sa'}, # Sanskrit + 27 : {0 : 'sk'}, # Slovak + 36 : {0 : 'sl'}, # Slovenian + 46 : {0 : 'sb'}, # "Sorbian" (not an IANA language code) + # 'SB' is IANA region code for 'Solomon Islands' + # Lower Sorbian = 'dsb' + # Upper Sorbian = 'hsb' + # Sorbian Languages = 'wen' + 10 : {0 : 'es' , 4 : 'es' , 44 : 'es-ar' , 64 : 'es-bo' , 52 : 'es-cl' , 36 : 'es-co' , 20 : 'es-cr' , 28 : 'es-do' , + 48 : 'es-ec' , 68 : 'es-sv' , 16 : 'es-gt' , 72 : 'es-hn' , 8 : 'es-mx' , 76 : 'es-ni' , 24 : 'es-pa' , + 60 : 'es-py' , 40 : 'es-pe' , 80 : 'es-pr' , 56 : 'es-uy' , 32 : 'es-ve'}, + # Spanish, Spanish (Mobipocket bug?), Spanish (Argentina), Spanish + # (Bolivia), Spanish (Chile), Spanish (Colombia), Spanish (Costa Rica), + # Spanish (Dominican Republic), Spanish (Ecuador), Spanish (El + # Salvador), Spanish (Guatemala), Spanish (Honduras), Spanish (Mexico), + # Spanish (Nicaragua), Spanish (Panama), Spanish (Paraguay), Spanish + # (Peru), Spanish (Puerto Rico), Spanish (Uruguay), Spanish (Venezuela) + 48 : {0 : 'sx'}, # "Sutu" (not an IANA language code) + # "Sutu" is another name for "Southern Sotho"? 
+
+
+def toHex(byteList):
+    return binascii.hexlify(byteList)
+
+# returns base32 bytestring
+def toBase32(value, npad=4):
+    digits = b'0123456789ABCDEFGHIJKLMNOPQRSTUV'
+    num_string = b''
+    current = value
+    while current != 0:
+        next, remainder = divmod(current, 32)
+        rem_string = digits[remainder:remainder+1]
+        num_string = rem_string + num_string
+        current = next
+    if num_string == b'':
+        num_string = b'0'
+    pad = npad - len(num_string)
+    if pad > 0:
+        num_string = b'0' * pad + num_string
+    return num_string
+
+
+# converts base32 string to value
+def fromBase32(str_num):
+    if isinstance(str_num, text_type):
+        str_num = str_num.encode('latin-1')
+    scalelst = [1,32,1024,32768,1048576,33554432,1073741824,34359738368]
+    value = 0
+    j = 0
+    n = len(str_num)
+    scale = 0
+    for i in range(n):
+        c = str_num[n-i-1:n-i]
+        if c in b'0123456789':
+            v = ord(c) - ord(b'0')
+        else:
+            v = ord(c) - ord(b'A') + 10
+        if j < len(scalelst):
+            scale = scalelst[j]
+        else:
+            scale = scale * 32
+        j += 1
+        if v != 0:
+            value = value + (v * scale)
+    return value
+
+
+# note: if you decode a bytestring using 'latin-1' (or any other 0-255 encoding)
+# in place of ascii you will get a byte to half-word or integer
+# one to one mapping of values from 0 - 255
+
+def mangle_fonts(encryption_key, data):
+    if isinstance(encryption_key, text_type):
+        encryption_key = encryption_key.encode('latin-1')
+    crypt = data[:1024]
+    key = cycle(iter(map(bord, encryption_key)))
+    # encrypt = ''.join([chr(ord(x)^key.next()) for x in crypt])
+    encrypt = b''.join([bchr(bord(x)^next(key)) for x in crypt])
+    return encrypt + data[1024:]
diff --git a/src/epy_reader/tools/KindleUnpack/mobiml2xhtml.py b/src/epy_reader/tools/KindleUnpack/mobiml2xhtml.py
new file mode 100755
index 0000000..94fc671
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/mobiml2xhtml.py
@@ -0,0 +1,527 @@
+#!/usr/bin/python
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+
+# this program works in concert with the output from KindleUnpack
+
+'''
+Convert from Mobi ML to XHTML
+'''
+
+from __future__ import division, absolute_import, print_function
+
+import os
+import sys
+import re
+
+SPECIAL_HANDLING_TAGS = {
+    '?xml'     : ('xmlheader', -1),
+    '!--'      : ('comment', -3),
+    '!DOCTYPE' : ('doctype', -1),
+}
+
+SPECIAL_HANDLING_TYPES = ['xmlheader', 'doctype', 'comment']
+
+SELF_CLOSING_TAGS = ['br' , 'hr', 'input', 'img', 'image', 'meta', 'spacer', 'link', 'frame', 'base', 'col', 'reference']
+
+class MobiMLConverter(object):
+
+    PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
+    IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
+
+    def __init__(self, filename):
+        self.base_css_rules = 'blockquote { margin: 0em 0em 0em 1.25em }\n'
+        self.base_css_rules += 'p { margin: 0em }\n'
+        self.base_css_rules += '.bold { font-weight: bold }\n'
+        self.base_css_rules += '.italic { font-style: italic }\n'
+        self.base_css_rules += '.mbp_pagebreak { page-break-after: always; margin: 0; display: block }\n'
+        self.tag_css_rules = {}
+        self.tag_css_rule_cnt = 0
+        self.path = []
+        self.filename = filename
+        self.wipml = open(self.filename, 'r').read()
+        self.pos = 0
+        self.opfname = self.filename.rsplit('.',1)[0] + '.opf'
+        self.opos = 0
+        self.meta = ''
+        self.cssname = os.path.join(os.path.dirname(self.filename),'styles.css')
+        self.current_font_size = 3
+        self.font_history = []
+
+    def cleanup_html(self):
+        self.wipml = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.wipml)
+        self.wipml = self.wipml.replace('\r\n', '\n')
+        self.wipml = self.wipml.replace('> <', '>\n<')
+        self.wipml = self.wipml.replace('<mbp: ', '<mbp:')
+        # self.wipml = re.sub(r'<?xml[^>]*>', '', self.wipml)
+        self.wipml = self.wipml.replace('<br></br>','<br/>')
+
+    def replace_page_breaks(self):
+        self.wipml = self.PAGE_BREAK_PAT.sub(
+            '<div class="mbp_pagebreak" />',
+            self.wipml)
+
+    # parse leading text of ml and tag
+    def parseml(self):
+        p = self.pos
+        if p >= len(self.wipml):
+            return None
+        if self.wipml[p] != '<':
+            res = self.wipml.find('<',p)
+            if res == -1 :
+                res = len(self.wipml)
+            self.pos = res
+            return self.wipml[p:res], None
+        # handle comment as a special case to deal with multi-line comments
+        if self.wipml[p:p+4] == '<!--':
+            te = self.wipml.find('-->',p+1)
+            if te != -1:
+                te = te+2
+        else :
+            te = self.wipml.find('>',p+1)
+            ntb = self.wipml.find('<',p+1)
+            if ntb != -1 and ntb < te:
+                self.pos = ntb
+                return self.wipml[p:ntb], None
+        self.pos = te + 1
+        return None, self.wipml[p:te+1]
+
+    # parses string version of tag to identify its name,
+    # its type 'begin', 'end' or 'single',
+    # plus build a hashtable of its attributes
+    # code is written to handle the possibility of very poor formatting
+    def parsetag(self, s):
+        p = 1
+        # get the tag name
+        tname = None
+        ttype = None
+        tattr = {}
+        while s[p:p+1] == ' ' :
+            p += 1
+        if s[p:p+1] == '/':
+            ttype = 'end'
+            p += 1
+            while s[p:p+1] == ' ' :
+                p += 1
+        b = p
+        while s[p:p+1] not in ('>', '/', ' ', '"', "'", "\r", "\n") :
+            p += 1
+        tname = s[b:p].lower()
+        if tname == '!doctype':
+            tname = '!DOCTYPE'
+        # special cases
+        if tname in SPECIAL_HANDLING_TAGS:
+            ttype, backstep = SPECIAL_HANDLING_TAGS[tname]
+            tattr['special'] = s[p:backstep]
+        if ttype is None:
+            # parse any attributes
+            while s.find('=',p) != -1 :
+                while s[p:p+1] == ' ' :
+                    p += 1
+                b = p
+                while s[p:p+1] != '=' :
+                    p += 1
+                aname = s[b:p].lower()
+                aname = aname.rstrip(' ')
+                p += 1
+                while s[p:p+1] == ' ' :
+                    p += 1
+                if s[p:p+1] in ('"', "'") :
+                    p = p + 1
+                    b = p
+                    while s[p:p+1] not in ('"', "'") :
+                        p += 1
+                    val = s[b:p]
+                    p += 1
+                else :
+                    b = p
+                    while s[p:p+1] not in ('>', '/', ' ') :
+                        p += 1
+                    val = s[b:p]
+                tattr[aname] = val
+        # label beginning and single tags
+        if ttype is None:
+            ttype = 'begin'
+            if s.find(' /',p) >= 0:
+                ttype = 'single_ext'
+            elif s.find('/',p) >= 0:
+                ttype = 'single'
+        return ttype, tname, tattr
+
+    # main routine to convert from mobi markup language to html
+    def processml(self):
+
+        # are these really needed?
+        html_done = False
+        head_done = False
+        body_done = False
+
+        skip = False
+
+        htmlstr = ''
+        self.replace_page_breaks()
+        self.cleanup_html()
+
+        # now parse the cleaned up ml into standard xhtml
+        while True:
+
+            r = self.parseml()
+            if not r:
+                break
+
+            text, tag = r
+
+            if text:
+                if not skip:
+                    htmlstr += text
+
+            if tag:
+                ttype, tname, tattr = self.parsetag(tag)
+
+                # If we run into a DTD or xml declarations inside the body ... bail.
+                if tname in SPECIAL_HANDLING_TAGS and tname != 'comment' and body_done:
+                    htmlstr += '\n</body></html>'
+                    break
+
+                # make sure self-closing tags actually self-close
+                if ttype == 'begin' and tname in SELF_CLOSING_TAGS:
+                    ttype = 'single'
+
+                # make sure any end tags of self-closing tags are discarded
+                if ttype == 'end' and tname in SELF_CLOSING_TAGS:
+                    continue
+
+                # remove embedded guide and references from old mobis
+                if tname in ('guide', 'ncx', 'reference') and ttype in ('begin', 'single', 'single_ext'):
+                    tname = 'removeme:{0}'.format(tname)
+                    tattr = None
+                if tname in ('guide', 'ncx', 'reference', 'font', 'span') and ttype == 'end':
+                    if self.path[-1] == 'removeme:{0}'.format(tname):
+                        tname = 'removeme:{0}'.format(tname)
+                        tattr = None
+
+                # Get rid of font tags that only have a color attribute.
+                if tname == 'font' and ttype in ('begin', 'single', 'single_ext'):
+                    if 'color' in tattr and len(tattr) == 1:
+                        tname = 'removeme:{0}'.format(tname)
+                        tattr = None
+
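+                # For reference, the (ttype, tname, tattr) triples that parsetag()
+                # above produces for the rules here (illustrative sketch only):
+                #   '<p align="center">'       -> ('begin',      'p',    {'align': 'center'})
+                #   '</font>'                  -> ('end',        'font', {})
+                #   '<br/>'                    -> ('single',     'br',   {})
+                #   '<img recindex="00004" />' -> ('single_ext', 'img',  {'recindex': '00004'})
+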
+                # Get rid of empty spans in the markup.
+                if tname == 'span' and ttype in ('begin', 'single', 'single_ext') and not len(tattr):
+                    tname = 'removeme:{0}'.format(tname)
+
+                # need to handle fonts outside of the normal methods
+                # so font tags won't be added to the self.path since we keep track
+                # of font tags separately with self.font_history
+                if tname == 'font' and ttype == 'begin':
+                    # check for nested font start tags
+                    if len(self.font_history) > 0 :
+                        # inject a font end tag
+                        taginfo = ('end', 'font', None)
+                        htmlstr += self.processtag(taginfo)
+                    self.font_history.append((ttype, tname, tattr))
+                    # handle the current font start tag
+                    taginfo = (ttype, tname, tattr)
+                    htmlstr += self.processtag(taginfo)
+                    continue
+
+                # check for nested font tags and unnest them
+                if tname == 'font' and ttype == 'end':
+                    self.font_history.pop()
+                    # handle this font end tag
+                    taginfo = ('end', 'font', None)
+                    htmlstr += self.processtag(taginfo)
+                    # check if we were nested
+                    if len(self.font_history) > 0:
+                        # inject a copy of the most recent font start tag from history
+                        taginfo = self.font_history[-1]
+                        htmlstr += self.processtag(taginfo)
+                    continue
+
+                # keep track of nesting path
+                if ttype == 'begin':
+                    self.path.append(tname)
+                elif ttype == 'end':
+                    if tname != self.path[-1]:
+                        print('improper nesting: ', self.path, tname, ttype)
+                        if tname not in self.path:
+                            # handle case of end tag with no beginning by injecting empty begin tag
+                            taginfo = ('begin', tname, None)
+                            htmlstr += self.processtag(taginfo)
+                            print("    - fixed by injecting empty start tag ", tname)
+                            self.path.append(tname)
+                        elif len(self.path) > 1 and tname == self.path[-2]:
+                            # handle case of dangling missing end
+                            taginfo = ('end', self.path[-1], None)
+                            htmlstr += self.processtag(taginfo)
+                            print("    - fixed by injecting end tag ", self.path[-1])
+                            self.path.pop()
+                    self.path.pop()
+
+                if tname.startswith('removeme'):
+                    if ttype in ('begin', 'single', 'single_ext'):
+                        skip = True
+                    else:
+                        skip = False
+                else:
+                    taginfo = (ttype, tname, tattr)
+                    htmlstr += self.processtag(taginfo)
+
+                # handle potential issue of multiple html, head, and body sections
+                if tname == 'html' and ttype == 'begin' and not html_done:
+                    htmlstr += '\n'
+                    html_done = True
+
+                if tname == 'head' and ttype == 'begin' and not head_done:
+                    htmlstr += '\n'
+                    # also add in metadata and style link tags
+                    htmlstr += self.meta
+                    htmlstr += '<link href="styles.css" rel="stylesheet" type="text/css" />\n'
+                    head_done = True
+
+                if tname == 'body' and ttype == 'begin' and not body_done:
+                    htmlstr += '\n'
+                    body_done = True
+
+        # handle issue of possibly missing html, head, and body tags
+        # I have not seen this but the original did something like this so ...
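+        # e.g. (sketch) a bare fragment is wrapped inside out below:
+        #   '<p>hi</p>'                            no html/head/body encountered
+        #   -> '<body>\n<p>hi</p></body>\n'        body added first
+        #   -> '<head>\n...css link...</head>\n'   then head prepended
+        #   -> '<html>\n' ... '</html>\n'          finally html wraps it all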
+        if not body_done:
+            htmlstr = '<body>\n' + htmlstr + '</body>\n'
+        if not head_done:
+            headstr = '<head>\n'
+            headstr += self.meta
+            headstr += '<link href="styles.css" rel="stylesheet" type="text/css" />\n'
+            headstr += '</head>\n'
+            htmlstr = headstr + htmlstr
+        if not html_done:
+            htmlstr = '<html>\n' + htmlstr + '</html>\n'
+
+        # finally add DOCTYPE info
+        htmlstr = '<?xml version="1.0"?>\n<!DOCTYPE HTML PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n' + htmlstr
+
+        css = self.base_css_rules
+        for cls, rule in self.tag_css_rules.items():
+            css += '.%s { %s }\n' % (cls, rule)
+
+        return (htmlstr, css, self.cssname)
+
+    def ensure_unit(self, raw, unit='px'):
+        if re.search(r'\d+$', raw) is not None:
+            raw += unit
+        return raw
+
+    # flatten possibly modified tag back to string
+    def taginfo_tostring(self, taginfo):
+        (ttype, tname, tattr) = taginfo
+        if ttype is None or tname is None:
+            return ''
+        if ttype == 'end':
+            return '</%s>' % tname
+        if ttype in SPECIAL_HANDLING_TYPES and tattr is not None and 'special' in tattr:
+            info = tattr['special']
+            if ttype == 'comment':
+                return '<%s %s-->' % (tname, info)
+            else:
+                return '<%s %s>' % (tname, info)
+        res = []
+        res.append('<%s' % tname)
+        if tattr is not None:
+            for key in tattr:
+                res.append(' %s="%s"' % (key, tattr[key]))
+        if ttype == 'single':
+            res.append('/>')
+        elif ttype == 'single_ext':
+            res.append(' />')
+        else :
+            res.append('>')
+        return "".join(res)

+    # routines to convert mobi ml tag attributes to xhtml attributes and styles
+    def processtag(self, taginfo):
+        # Converting mobi font sizes to numerics
+        size_map = {
+            'xx-small': '1',
+            'x-small': '2',
+            'small': '3',
+            'medium': '4',
+            'large': '5',
+            'x-large': '6',
+            'xx-large': '7',
+        }
+
+        size_to_em_map = {
+            '1': '.65em',
+            '2': '.75em',
+            '3': '1em',
+            '4': '1.125em',
+            '5': '1.25em',
+            '6': '1.5em',
+            '7': '2em',
+        }
+
+        # current tag to work on
+        (ttype, tname, tattr) = taginfo
+        if not tattr:
+            tattr = {}
+
+        styles = []
+
+        if tname is None or tname.startswith('removeme'):
+            return ''
+
+        # have not seen an example of this yet so keep it here to be safe
+        # until this is better understood
+        if tname in ('country-region', 'place', 'placetype', 'placename',
+                     'state', 'city', 'street', 'address', 'content'):
+            tname = 'div' if tname == 'content' else 'span'
+            for key in list(tattr):
+                tattr.pop(key)
+
+        # handle general case of style, height, width, bgcolor in any tag
+        if 'style' in tattr:
+            style = tattr.pop('style').strip()
+            if style:
+                styles.append(style)
+
+        if 'align' in tattr:
+            align = tattr.pop('align').strip()
+            if align:
+                if tname in ('table', 'td', 'tr'):
+                    pass
+                else:
+                    styles.append('text-align: %s' % align)
+
+        if 'height' in tattr:
+            height = tattr.pop('height').strip()
+            if height and '<' not in height and '>' not in height and re.search(r'\d+', height):
+                if tname in ('table', 'td', 'tr'):
+                    pass
+                elif tname == 'img':
+                    tattr['height'] = height
+                else:
+                    styles.append('margin-top: %s' % self.ensure_unit(height))
+
+        if 'width' in tattr:
+            width = tattr.pop('width').strip()
+            if width and re.search(r'\d+', width):
+                if tname in ('table', 'td', 'tr'):
+                    pass
+                elif tname == 'img':
+                    tattr['width'] = width
+                else:
+                    styles.append('text-indent: %s' % self.ensure_unit(width))
+                    if width.startswith('-'):
+                        styles.append('margin-left: %s' % self.ensure_unit(width[1:]))
+
+        if 'bgcolor' in tattr:
+            # no proprietary html allowed
+            if tname == 'div':
+                del tattr['bgcolor']
+
+        elif tname == 'font':
+            # Change font tags to span tags
+            tname = 'span'
+            if ttype in ('begin', 'single', 'single_ext'):
+                # move the face attribute to css font-family
+                if 'face' in tattr:
+                    face = tattr.pop('face').strip()
+                    styles.append('font-family: "%s"' % face)
+
+                # Monitor the constantly changing font sizes, change them to ems and move
+                # them to css. The following will work for 'flat' font tags, but nested font tags
+                # will cause things to go wonky. Need to revert to the parent font tag's size
+                # when a closing tag is encountered.
+                if 'size' in tattr:
+                    sz = tattr.pop('size').strip().lower()
+                    try:
+                        float(sz)
+                    except ValueError:
+                        if sz in size_map:
+                            sz = size_map[sz]
+                    else:
+                        if sz.startswith('-') or sz.startswith('+'):
+                            sz = self.current_font_size + float(sz)
+                            if sz > 7:
+                                sz = 7
+                            elif sz < 1:
+                                sz = 1
+                            sz = str(int(sz))
+                    styles.append('font-size: %s' % size_to_em_map[sz])
+                    self.current_font_size = int(sz)
+
+        elif tname == 'img':
+            for attr in ('width', 'height'):
+                if attr in tattr:
+                    val = tattr[attr]
+                    if val.lower().endswith('em'):
+                        try:
+                            nval = float(val[:-2])
+                            nval *= 16 * (168.451/72)  # Assume this was set using the Kindle profile
+                            tattr[attr] = "%dpx"%int(nval)
+                        except:
+                            del tattr[attr]
+                    elif val.lower().endswith('%'):
+                        del tattr[attr]
+
+        # convert the anchor tags
+        if 'filepos-id' in tattr:
+            tattr['id'] = tattr.pop('filepos-id')
+            if 'name' in tattr and tattr['name'] != tattr['id']:
+                tattr['name'] = tattr['id']
+
+        if 'filepos' in tattr:
+            filepos = tattr.pop('filepos')
+            try:
+                tattr['href'] = "#filepos%d" % int(filepos)
+            except ValueError:
+                pass
+
+        if styles:
+            ncls = None
+            rule = '; '.join(styles)
+            for sel, srule in self.tag_css_rules.items():
+                if srule == rule:
+                    ncls = sel
+                    break
+            if ncls is None:
+                self.tag_css_rule_cnt += 1
+                ncls = 'rule_%d' % self.tag_css_rule_cnt
+                self.tag_css_rules[ncls] = rule
+            cls = tattr.get('class', '')
+            cls = cls + (' ' if cls else '') + ncls
+            tattr['class'] = cls
+
+        # convert updated tag back to string representation
+        if len(tattr) == 0:
+            tattr = None
+        taginfo = (ttype, tname, tattr)
+        return self.taginfo_tostring(taginfo)
+
+''' main only left in for testing outside of plugin '''
+
+def main(argv=sys.argv):
+    if len(argv) != 2:
+        return 1
+    else:
+        infile = argv[1]
+
+    try:
+        print('Converting Mobi Markup Language to XHTML')
+        mlc = MobiMLConverter(infile)
+        print('Processing ...')
+        htmlstr, css, cssname = mlc.processml()
+        outname = infile.rsplit('.',1)[0] + '_converted.html'
+        open(outname, 'w').write(htmlstr)
+        open(cssname, 'w').write(css)
+        print('Completed')
+        print('XHTML version of book can be found at: ' + outname)
+
+    except ValueError as e:
+        print("Error: %s" % e)
+        return 1
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
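The font-size bookkeeping in processtag() is the subtlest part of the converter, so a small stand-alone sketch may help. resolve_size and both maps here are illustrative copies, not names from the module, and the fallback for unknown named sizes is an assumption:

    # sketch: mirrors processtag()'s font-size handling for valid inputs
    size_map = {'xx-small': '1', 'x-small': '2', 'small': '3', 'medium': '4',
                'large': '5', 'x-large': '6', 'xx-large': '7'}
    size_to_em_map = {'1': '.65em', '2': '.75em', '3': '1em', '4': '1.125em',
                      '5': '1.25em', '6': '1.5em', '7': '2em'}

    def resolve_size(sz, current=3):
        # named sizes map to digits; '+n'/'-n' are relative to the enclosing
        # font size and clamped to the 1..7 range, as in the code above
        try:
            float(sz)
        except ValueError:
            sz = size_map.get(sz, str(current))
        else:
            if sz.startswith(('-', '+')):
                n = current + float(sz)
                sz = str(int(min(7, max(1, n))))
        return size_to_em_map[sz]

    print(resolve_size('+2'))       # 1.25em  (3 + 2 -> size 5)
    print(resolve_size('x-large'))  # 1.5em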
diff --git a/src/epy_reader/tools/KindleUnpack/unipath.py b/src/epy_reader/tools/KindleUnpack/unipath.py
new file mode 100755
index 0000000..2416279
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/unipath.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+# Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without modification,
+# are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this list of
+#    conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice, this list
+#    of conditions and the following disclaimer in the documentation and/or other materials
+#    provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
+# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+from .compatibility_utils import PY2, text_type, binary_type
+
+import sys
+import os
+
+# utility routines to convert all paths to be full unicode
+
+# Under Python 2, if a bytestring, try to convert it to unicode using sys.getfilesystemencoding
+# Under Python 3, if bytes, try to decode it to unicode using the filesystem encoding
+
+# Mac OS X and Windows will happily support full unicode paths
+# Linux can support full unicode paths but allows arbitrary byte paths which may be inconsistent with unicode
+
+fsencoding = sys.getfilesystemencoding()
+
+def pathof(s, enc=fsencoding):
+    if s is None:
+        return None
+    if isinstance(s, text_type):
+        return s
+    if isinstance(s, binary_type):
+        try:
+            return s.decode(enc)
+        except:
+            pass
+    return s
+
+def exists(s):
+    return os.path.exists(pathof(s))
+
+def isfile(s):
+    return os.path.isfile(pathof(s))
+
+def isdir(s):
+    return os.path.isdir(pathof(s))
+
+def mkdir(s):
+    return os.mkdir(pathof(s))
+
+def listdir(s):
+    rv = []
+    for file in os.listdir(pathof(s)):
+        rv.append(pathof(file))
+    return rv
+
+def getcwd():
+    if PY2:
+        return os.getcwdu()
+    return os.getcwd()
+
+def walk(top):
+    top = pathof(top)
+    rv = []
+    for base, dnames, names in os.walk(top):
+        base = pathof(base)
+        for name in names:
+            name = pathof(name)
+            rv.append(relpath(os.path.join(base, name), top))
+    return rv
+
+def relpath(path, start=None):
+    return os.path.relpath(pathof(path) , pathof(start))
+
+def abspath(path):
+    return os.path.abspath(pathof(path))
diff --git a/src/epy_reader/tools/KindleUnpack/unpack_structure.py b/src/epy_reader/tools/KindleUnpack/unpack_structure.py
new file mode 100644
index 0000000..2e66eb8
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/unpack_structure.py
@@ -0,0 +1,167 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+from .compatibility_utils import text_type
+
+from . import unipath
+from .unipath import pathof
+
+DUMP = False
+""" Set to True to dump all possible information. """
+
+import os
+
+import re
+# note: re requires the pattern to be the exact same type as the data to be searched in python3
+# but u"" is not allowed for the pattern itself only b""
+
+import zipfile
+import binascii
+from .mobi_utils import mangle_fonts
+
+class unpackException(Exception):
+    pass
+
+class ZipInfo(zipfile.ZipInfo):
+
+    def __init__(self, *args, **kwargs):
+        compress_type = kwargs.pop('compress_type', None)
+        super(ZipInfo, self).__init__(*args, **kwargs)
+        if compress_type is not None:
+            self.compress_type = compress_type
+
+class fileNames:
+
+    def __init__(self, infile, outdir):
+        self.infile = infile
+        self.outdir = outdir
+        if not unipath.exists(self.outdir):
+            unipath.mkdir(self.outdir)
+        self.mobi7dir = os.path.join(self.outdir,'mobi7')
+        if not unipath.exists(self.mobi7dir):
+            unipath.mkdir(self.mobi7dir)
+        self.imgdir = os.path.join(self.mobi7dir, 'Images')
+        if not unipath.exists(self.imgdir):
+            unipath.mkdir(self.imgdir)
+        self.hdimgdir = os.path.join(self.outdir,'HDImages')
+        if not unipath.exists(self.hdimgdir):
+            unipath.mkdir(self.hdimgdir)
+        self.outbase = os.path.join(self.outdir, os.path.splitext(os.path.split(infile)[1])[0])
+
+    def getInputFileBasename(self):
+        return os.path.splitext(os.path.basename(self.infile))[0]
+
+    def makeK8Struct(self):
+        self.k8dir = os.path.join(self.outdir,'mobi8')
+        if not unipath.exists(self.k8dir):
+            unipath.mkdir(self.k8dir)
+        self.k8metainf = os.path.join(self.k8dir,'META-INF')
+        if not unipath.exists(self.k8metainf):
+            unipath.mkdir(self.k8metainf)
+        self.k8oebps = os.path.join(self.k8dir,'OEBPS')
+        if not unipath.exists(self.k8oebps):
+            unipath.mkdir(self.k8oebps)
+        self.k8images = os.path.join(self.k8oebps,'Images')
+        if not unipath.exists(self.k8images):
+            unipath.mkdir(self.k8images)
+        self.k8fonts = os.path.join(self.k8oebps,'Fonts')
+        if not unipath.exists(self.k8fonts):
+            unipath.mkdir(self.k8fonts)
+        self.k8styles = os.path.join(self.k8oebps,'Styles')
+        if not unipath.exists(self.k8styles):
+            unipath.mkdir(self.k8styles)
+        self.k8text = os.path.join(self.k8oebps,'Text')
+        if not unipath.exists(self.k8text):
+            unipath.mkdir(self.k8text)
+
+    # recursive zip creation support routine
+    def zipUpDir(self, myzip, tdir, localname):
+        currentdir = tdir
+        if localname != "":
+            currentdir = os.path.join(currentdir,localname)
+        list = unipath.listdir(currentdir)
+        for file in list:
+            afilename = file
+            localfilePath = os.path.join(localname, afilename)
+            realfilePath = os.path.join(currentdir,file)
+            if unipath.isfile(realfilePath):
+                myzip.write(pathof(realfilePath), pathof(localfilePath), zipfile.ZIP_DEFLATED)
+            elif unipath.isdir(realfilePath):
+                self.zipUpDir(myzip, tdir, localfilePath)
+    def makeEPUB(self, usedmap, obfuscate_data, uid):
+        bname = os.path.join(self.k8dir, self.getInputFileBasename() + '.epub')
+        # Create an encryption key for Adobe font obfuscation
+        # based on the epub's uid
+        if isinstance(uid,text_type):
+            uid = uid.encode('ascii')
+        if obfuscate_data:
+            key = re.sub(br'[^a-fA-F0-9]', b'', uid)
+            key = binascii.unhexlify((key + key)[:32])
+
+        # copy over all images and fonts that are actually used in the ebook
+        # and remove all font files from mobi7 since not supported
+        imgnames = unipath.listdir(self.imgdir)
+        for name in imgnames:
+            if usedmap.get(name,'not used') == 'used':
+                filein = os.path.join(self.imgdir,name)
+                if name.endswith(".ttf"):
+                    fileout = os.path.join(self.k8fonts,name)
+                elif name.endswith(".otf"):
+                    fileout = os.path.join(self.k8fonts,name)
+                elif name.endswith(".failed"):
+                    fileout = os.path.join(self.k8fonts,name)
+                else:
+                    fileout = os.path.join(self.k8images,name)
+                data = b''
+                with open(pathof(filein),'rb') as f:
+                    data = f.read()
+                if obfuscate_data:
+                    if name in obfuscate_data:
+                        data = mangle_fonts(key, data)
+                open(pathof(fileout),'wb').write(data)
+                if name.endswith(".ttf") or name.endswith(".otf"):
+                    os.remove(pathof(filein))
+
+        # opf file name hard coded to "content.opf"
+        container = '<?xml version="1.0" encoding="UTF-8"?>\n'
+        container += '<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">\n'
+        container += '    <rootfiles>\n'
+        container += '<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>'
+        container += '    </rootfiles>\n</container>\n'
+        fileout = os.path.join(self.k8metainf,'container.xml')
+        with open(pathof(fileout),'wb') as f:
+            f.write(container.encode('utf-8'))
+
+        if obfuscate_data:
+            encryption = '<encryption xmlns="urn:oasis:names:tc:opendocument:xmlns:container" \
+xmlns:enc="http://www.w3.org/2001/04/xmlenc#" xmlns:deenc="http://ns.adobe.com/digitaleditions/enc">\n'
+            for font in obfuscate_data:
+                encryption += '  <enc:EncryptedData>\n'
+                encryption += '    <enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>\n'
+                encryption += '    <enc:CipherData>\n'
+                encryption += '      <enc:CipherReference URI="OEBPS/Fonts/' + font + '"/>\n'
+                encryption += '    </enc:CipherData>\n'
+                encryption += '  </enc:EncryptedData>\n'
+            encryption += '</encryption>\n'
+            fileout = os.path.join(self.k8metainf,'encryption.xml')
+            with open(pathof(fileout),'wb') as f:
+                f.write(encryption.encode('utf-8'))
+
+        # ready to build epub
+        self.outzip = zipfile.ZipFile(pathof(bname), 'w')
+
+        # add the mimetype file uncompressed
+        mimetype = b'application/epub+zip'
+        fileout = os.path.join(self.k8dir,'mimetype')
+        with open(pathof(fileout),'wb') as f:
+            f.write(mimetype)
+        nzinfo = ZipInfo('mimetype', compress_type=zipfile.ZIP_STORED)
+        nzinfo.external_attr = 0o600 << 16  # make this a normal file
+        self.outzip.writestr(nzinfo, mimetype)
+        self.zipUpDir(self.outzip,self.k8dir,'META-INF')
+        self.zipUpDir(self.outzip,self.k8dir,'OEBPS')
+        self.outzip.close()
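A closing note on the font obfuscation in makeEPUB(): mangle_fonts() (defined in mobi_utils above) XORs the first 1024 bytes of a font against the key derived from the package uid, so applying it a second time restores the original bytes. A self-contained sketch with a made-up uid; xor_first_kb is a hypothetical mirror of mangle_fonts, not part of the package:

    import binascii
    from itertools import cycle

    def xor_first_kb(key, data):
        # XOR the first 1024 bytes with the cycled key, as mangle_fonts does
        head = bytes(b ^ k for b, k in zip(data[:1024], cycle(key)))
        return head + data[1024:]

    # hypothetical epub uid, already reduced to hex digits
    uid = b'0a1b2c3d4e5f67890a1b2c3d4e5f6789'
    key = binascii.unhexlify((uid + uid)[:32])   # same derivation as makeEPUB

    font = b'OTTO' + bytes(2048)                 # stand-in for real font data
    assert xor_first_kb(key, xor_first_kb(key, font)) == font  # round-trips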