From 258c30d2e088cd4ab091a53794da3f93af79915d Mon Sep 17 00:00:00 2001
From: Benawi Adha
Date: Sun, 2 Oct 2022 21:22:38 +0700
Subject: Major refactor: break down the epy.py script into a package
 structure for easier development

Squashed commit of the following:

commit 01309b961a4ab32394bff0d90949b57435dfda47
Author: Benawi Adha
Date:   Sun Oct 2 21:15:04 2022 +0700

    Fix missing objects

commit aab2e773c30b255c81b1250b3b20967d5da40338
Author: Benawi Adha
Date:   Sun Oct 2 21:09:31 2022 +0700

    Update README.md

commit d4e98926bcd9b00ce0410ad71249d24e6315abc5
Author: Benawi Adha
Date:   Sun Oct 2 21:07:28 2022 +0700

    Add keywords in pyproject.toml

commit 432055af8245560a3ff2e046aef0b4e87da44930
Author: Benawi Adha
Date:   Sun Oct 2 21:04:34 2022 +0700

    Bump version and deprecate setup.py

commit 51dd15aab8f8ff5996f822f8378e813f0b9fb80d
Author: Benawi Adha
Date:   Sun Oct 2 20:56:38 2022 +0700

    Formatting

commit 81fb35e3b6fa0e27d79ef1da77202ed81eb99500
Author: Benawi Adha
Date:   Sun Oct 2 20:55:08 2022 +0700

    Fix speakers module

commit 3b852e7c59b38d5a28520038e35f50a95270d2f1
Author: Benawi Adha
Date:   Sat Oct 1 20:52:46 2022 +0700

    Fix circular import

commit 061e8a2649dabacd28a9e2f972559475316c654c
Author: Benawi Adha
Date:   Sat Oct 1 20:39:27 2022 +0700

    Run formatting

commit abc2d0ab156992c63dc04745d14a69679a60accb
Author: Benawi Adha
Date:   Sat Oct 1 20:39:00 2022 +0700

    Update isort and black config in pyproject

commit 5dc2e41bab5b997bd719bdc1561eb51ba0c17a83
Author: Benawi Adha
Date:   Sat Oct 1 20:31:00 2022 +0700

    Add app Config

commit ed485a2ea8281585bf86dc5772f0c6dd9c803cc4
Author: Benawi Adha
Date:   Sat Oct 1 20:23:02 2022 +0700

    Update debugpy script

commit 68b0553dd4d63eb4b847132c68ea4018587fa8ec
Author: Benawi Adha
Date:   Sat Oct 1 20:14:11 2022 +0700

    Connect reader to main script

commit 63c3dd176f18a784a4ed2e88aa72b13d1c2b0990
Author: Benawi Adha
Date:   Sat Oct 1 20:11:17 2022 +0700

    Implement reader

commit ce5eec8fb4e1db3870a16a07541365cd777d6c4c
Author: Benawi Adha
Date:   Sat Oct 1 19:29:49 2022 +0700

    Fix script in pyproject.toml

commit 941e8e49f1593731fb582d92084206772b3f0442
Author: Benawi Adha
Date:   Sat Oct 1 19:28:39 2022 +0700

    Rename modules

commit 5a3e7f766aee774c09b3b5336f3a2968e9cb1d0c
Author: Benawi Adha
Date:   Sat Oct 1 19:28:20 2022 +0700

    Rename tool method

commit 3c0503ff475cb7eff8b12d3be0bda7a38efe1072
Author: Benawi Adha
Date:   Sat Oct 1 19:27:03 2022 +0700

    Add ebooks lib

commit b5f71c3296a7d6f36454f6e1cbe84e15a45092ee
Author: Benawi Adha
Date:   Sat Oct 1 17:25:11 2022 +0700

    Initial reorganization
---
 Makefile | 13 +-
 README.md | 2 +
 epy_extras/KindleUnpack/__init__.py | 2 -
 epy_extras/KindleUnpack/compatibility_utils.py | 278 ----
 epy_extras/KindleUnpack/kindleunpack.py | 1029 ------------
 epy_extras/KindleUnpack/mobi_cover.py | 238 ---
 epy_extras/KindleUnpack/mobi_dict.py | 377 -----
 epy_extras/KindleUnpack/mobi_header.py | 936 ------------
 epy_extras/KindleUnpack/mobi_html.py | 439 ------
 epy_extras/KindleUnpack/mobi_index.py | 276 ----
 epy_extras/KindleUnpack/mobi_k8proc.py | 496 ------
 epy_extras/KindleUnpack/mobi_k8resc.py | 271 ----
 epy_extras/KindleUnpack/mobi_nav.py | 187 ---
 epy_extras/KindleUnpack/mobi_ncx.py | 275 ----
 epy_extras/KindleUnpack/mobi_opf.py | 686 ---------
 epy_extras/KindleUnpack/mobi_pagemap.py | 158 --
 epy_extras/KindleUnpack/mobi_sectioner.py | 120 --
 epy_extras/KindleUnpack/mobi_split.py | 438 ------
 epy_extras/KindleUnpack/mobi_uncompress.py | 131 --
 epy_extras/KindleUnpack/mobi_utils.py | 191 ---
 epy_extras/KindleUnpack/mobiml2xhtml.py | 527 -------
epy_extras/KindleUnpack/unipath.py | 93 -- epy_extras/KindleUnpack/unpack_structure.py | 167 -- epy_extras/__init__.py | 3 - poetry.lock | 502 +++++- pyproject.toml | 43 +- setup.py | 28 - src/epy_reader/__init__.py | 5 + src/epy_reader/__main__.py | 23 + src/epy_reader/board.py | 148 ++ src/epy_reader/cli.py | 171 +++ src/epy_reader/config.py | 80 + src/epy_reader/ebooks/__init__.py | 15 + src/epy_reader/ebooks/azw.py | 26 + src/epy_reader/ebooks/base.py | 48 + src/epy_reader/ebooks/epub.py | 202 +++ src/epy_reader/ebooks/fictionbook.py | 76 + src/epy_reader/ebooks/mobi.py | 69 + src/epy_reader/ebooks/url.py | 49 + src/epy_reader/lib.py | 63 + src/epy_reader/models.py | 232 +++ src/epy_reader/parser.py | 421 +++++ src/epy_reader/reader.py | 1610 ++++++++++++++++++++ src/epy_reader/settings.py | 133 ++ src/epy_reader/speakers/__init__.py | 9 + src/epy_reader/speakers/base.py | 21 + src/epy_reader/speakers/mimic.py | 31 + src/epy_reader/speakers/pico.py | 43 + src/epy_reader/state.py | 195 +++ src/epy_reader/tools/KindleUnpack/__init__.py | 2 + .../tools/KindleUnpack/compatibility_utils.py | 278 ++++ src/epy_reader/tools/KindleUnpack/kindleunpack.py | 1029 +++++++++++++ src/epy_reader/tools/KindleUnpack/mobi_cover.py | 238 +++ src/epy_reader/tools/KindleUnpack/mobi_dict.py | 377 +++++ src/epy_reader/tools/KindleUnpack/mobi_header.py | 936 ++++++++++++ src/epy_reader/tools/KindleUnpack/mobi_html.py | 439 ++++++ src/epy_reader/tools/KindleUnpack/mobi_index.py | 276 ++++ src/epy_reader/tools/KindleUnpack/mobi_k8proc.py | 496 ++++++ src/epy_reader/tools/KindleUnpack/mobi_k8resc.py | 271 ++++ src/epy_reader/tools/KindleUnpack/mobi_nav.py | 187 +++ src/epy_reader/tools/KindleUnpack/mobi_ncx.py | 275 ++++ src/epy_reader/tools/KindleUnpack/mobi_opf.py | 686 +++++++++ src/epy_reader/tools/KindleUnpack/mobi_pagemap.py | 158 ++ .../tools/KindleUnpack/mobi_sectioner.py | 120 ++ src/epy_reader/tools/KindleUnpack/mobi_split.py | 438 ++++++ .../tools/KindleUnpack/mobi_uncompress.py | 131 ++ src/epy_reader/tools/KindleUnpack/mobi_utils.py | 191 +++ src/epy_reader/tools/KindleUnpack/mobiml2xhtml.py | 527 +++++++ src/epy_reader/tools/KindleUnpack/unipath.py | 93 ++ .../tools/KindleUnpack/unpack_structure.py | 167 ++ src/epy_reader/tools/__init__.py | 3 + src/epy_reader/utils.py | 377 +++++ 72 files changed, 11877 insertions(+), 7394 deletions(-) delete mode 100644 epy_extras/KindleUnpack/__init__.py delete mode 100755 epy_extras/KindleUnpack/compatibility_utils.py delete mode 100644 epy_extras/KindleUnpack/kindleunpack.py delete mode 100644 epy_extras/KindleUnpack/mobi_cover.py delete mode 100644 epy_extras/KindleUnpack/mobi_dict.py delete mode 100644 epy_extras/KindleUnpack/mobi_header.py delete mode 100644 epy_extras/KindleUnpack/mobi_html.py delete mode 100644 epy_extras/KindleUnpack/mobi_index.py delete mode 100644 epy_extras/KindleUnpack/mobi_k8proc.py delete mode 100644 epy_extras/KindleUnpack/mobi_k8resc.py delete mode 100644 epy_extras/KindleUnpack/mobi_nav.py delete mode 100644 epy_extras/KindleUnpack/mobi_ncx.py delete mode 100644 epy_extras/KindleUnpack/mobi_opf.py delete mode 100644 epy_extras/KindleUnpack/mobi_pagemap.py delete mode 100644 epy_extras/KindleUnpack/mobi_sectioner.py delete mode 100755 epy_extras/KindleUnpack/mobi_split.py delete mode 100644 epy_extras/KindleUnpack/mobi_uncompress.py delete mode 100644 epy_extras/KindleUnpack/mobi_utils.py delete mode 100755 epy_extras/KindleUnpack/mobiml2xhtml.py delete mode 100755 epy_extras/KindleUnpack/unipath.py delete mode 100644 
epy_extras/KindleUnpack/unpack_structure.py delete mode 100644 epy_extras/__init__.py delete mode 100644 setup.py create mode 100644 src/epy_reader/__init__.py create mode 100644 src/epy_reader/__main__.py create mode 100644 src/epy_reader/board.py create mode 100644 src/epy_reader/cli.py create mode 100644 src/epy_reader/config.py create mode 100644 src/epy_reader/ebooks/__init__.py create mode 100644 src/epy_reader/ebooks/azw.py create mode 100644 src/epy_reader/ebooks/base.py create mode 100644 src/epy_reader/ebooks/epub.py create mode 100644 src/epy_reader/ebooks/fictionbook.py create mode 100644 src/epy_reader/ebooks/mobi.py create mode 100644 src/epy_reader/ebooks/url.py create mode 100644 src/epy_reader/lib.py create mode 100644 src/epy_reader/models.py create mode 100644 src/epy_reader/parser.py create mode 100644 src/epy_reader/reader.py create mode 100644 src/epy_reader/settings.py create mode 100644 src/epy_reader/speakers/__init__.py create mode 100644 src/epy_reader/speakers/base.py create mode 100644 src/epy_reader/speakers/mimic.py create mode 100644 src/epy_reader/speakers/pico.py create mode 100644 src/epy_reader/state.py create mode 100644 src/epy_reader/tools/KindleUnpack/__init__.py create mode 100755 src/epy_reader/tools/KindleUnpack/compatibility_utils.py create mode 100644 src/epy_reader/tools/KindleUnpack/kindleunpack.py create mode 100644 src/epy_reader/tools/KindleUnpack/mobi_cover.py create mode 100644 src/epy_reader/tools/KindleUnpack/mobi_dict.py create mode 100644 src/epy_reader/tools/KindleUnpack/mobi_header.py create mode 100644 src/epy_reader/tools/KindleUnpack/mobi_html.py create mode 100644 src/epy_reader/tools/KindleUnpack/mobi_index.py create mode 100644 src/epy_reader/tools/KindleUnpack/mobi_k8proc.py create mode 100644 src/epy_reader/tools/KindleUnpack/mobi_k8resc.py create mode 100644 src/epy_reader/tools/KindleUnpack/mobi_nav.py create mode 100644 src/epy_reader/tools/KindleUnpack/mobi_ncx.py create mode 100644 src/epy_reader/tools/KindleUnpack/mobi_opf.py create mode 100644 src/epy_reader/tools/KindleUnpack/mobi_pagemap.py create mode 100644 src/epy_reader/tools/KindleUnpack/mobi_sectioner.py create mode 100755 src/epy_reader/tools/KindleUnpack/mobi_split.py create mode 100644 src/epy_reader/tools/KindleUnpack/mobi_uncompress.py create mode 100644 src/epy_reader/tools/KindleUnpack/mobi_utils.py create mode 100755 src/epy_reader/tools/KindleUnpack/mobiml2xhtml.py create mode 100755 src/epy_reader/tools/KindleUnpack/unipath.py create mode 100644 src/epy_reader/tools/KindleUnpack/unpack_structure.py create mode 100644 src/epy_reader/tools/__init__.py create mode 100644 src/epy_reader/utils.py diff --git a/Makefile b/Makefile index 3c5fcca..3d03560 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,15 @@ .PHONY: tests -.DEFAULT_GOAL := tests +.DEFAULT_GOAL := check + +check: + mypy --follow-imports=silent src + +format: + isort src + black src debug: - python -m debugpy --listen 5678 --wait-for-client -m epy + python -m debugpy --listen 5678 --wait-for-client -m epy_reader dev: poetry install @@ -16,5 +23,5 @@ coverage: python -m http.server -d htmlcov release: - python setup.py sdist bdist_wheel + python -m build twine upload --skip-existing dist/* diff --git a/README.md b/README.md index 0dcfdd7..98a671a 100644 --- a/README.md +++ b/README.md @@ -134,3 +134,5 @@ so line scrolling navigation will act as scrolling page and textwidth is not adj inside epy (default key: `R`). - `v2022.2.5`: Fix process.join() issue for unstarted process. 
+
+- `v2022.10.2`: Major refactor: break the `epy.py` module down into a package structure for easier development.
diff --git a/epy_extras/KindleUnpack/__init__.py b/epy_extras/KindleUnpack/__init__.py
deleted file mode 100644
index 0077258..0000000
--- a/epy_extras/KindleUnpack/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/usr/bin/env python
-# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
diff --git a/epy_extras/KindleUnpack/compatibility_utils.py b/epy_extras/KindleUnpack/compatibility_utils.py
deleted file mode 100755
index c46c0bb..0000000
--- a/epy_extras/KindleUnpack/compatibility_utils.py
+++ /dev/null
@@ -1,278 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
-
-# Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without modification,
-# are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this list of
-# conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice, this list
-# of conditions and the following disclaimer in the documentation and/or other materials
-# provided with the distribution.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
-# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
-# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
-# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-from __future__ import unicode_literals, division, absolute_import, print_function
-
-import sys
-import codecs
-
-PY2 = sys.version_info[0] == 2
-PY3 = sys.version_info[0] == 3
-
-iswindows = sys.platform.startswith('win')
-
-try:
-    from urllib.parse import unquote
-except ImportError:
-    from urllib import unquote
-
-if PY2:
-    from HTMLParser import HTMLParser
-    _h = HTMLParser()
-elif sys.version_info[1] < 4:
-    import html.parser
-    _h = html.parser.HTMLParser()
-else:
-    import html as _h
-
-if PY3:
-    text_type = str
-    binary_type = bytes
-    # if will be printing arbitrary binary data to stdout on python 3
-    # sys.stdin = sys.stdin.detach()
-    # sys.stdout = sys.stdout.detach()
-    # sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
-else:
-    range = xrange
-    text_type = unicode
-    binary_type = str
-    # if will be printing unicode under python 2 need to protect
-    # against sys.stdout.encoding being None stupidly forcing ascii encoding of unicode
-    # sys.stdout = codecs.getwriter("utf-8")(sys.stdout)
-    # alternatively set environment variable as follows **before** launching python: export PYTHONIOENCODING=UTF-8
-
-# NOTE: Python 3 is completely broken when accessing single bytes in bytes strings
-# (and they amazingly claim by design and no bug!)
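# Illustration (editorial addition, not part of the original file): the
# portable single-byte access pattern this module settles on; bord() and
# bchar() are defined further below.
#
#     data = b'ABC'
#     data[0:1]       # b'A' on both Python 2 and 3; slicing is always safe
#     bord(data[0])   # 65 on both; the byte's ord() value as an int
#     bchar(data[0])  # b'A' on both; the byte as a one-byte bytestring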
- -# To illustrate: this works for unicode in Python 3 and for all Python 2.X for both bytestrings and unicode -# >>> o = '123456789' -# >>> o[-3] -# '7' -# >>> type(o[-3]) -# -# >>> type(o) -# - -# Unfortunately, this is what Python 3 does for no sane reason and only for bytestrings -# >>> o = b'123456789' -# >>> o[-3] -# 55 -# >>> type(o[-3]) -# -# >>> type(o) -# - -# This mind boggling behaviour also happens when indexing a bytestring and/or -# iteratoring over a bytestring. In other words it will return an int but not -# the byte itself!!!!!!! - -# The only way to access a single byte as a byte in bytestring and get the byte in both -# Python 2 and Python 3 is to use a slice - -# This problem is so common there are horrible hacks floating around the net to **try** -# to work around it, so that code that works on both Python 2 and Python 3 is possible. - -# So in order to write code that works on both Python 2 and Python 3 -# if you index or access a single byte and want its ord() then use the bord() function. -# If instead you want it as a single character byte use the bchar() function -# both of which are defined below. - -if PY3: - # Also Note: if decode a bytestring using 'latin-1' (or any other full range 0-255 encoding) - # in place of ascii you will get a byte value to half-word or integer value - # one-to-one mapping (in the 0 - 255 range) - - def bchr(s): - return bytes([s]) - - def bstr(s): - if isinstance(s, str): - return bytes(s, 'latin-1') - else: - return bytes(s) - - def bord(s): - return s - - def bchar(s): - return bytes([s]) - -else: - def bchr(s): - return chr(s) - - def bstr(s): - return str(s) - - def bord(s): - return ord(s) - - def bchar(s): - return s - -if PY3: - # list-producing versions of the major Python iterating functions - def lrange(*args, **kwargs): - return list(range(*args, **kwargs)) - - def lzip(*args, **kwargs): - return list(zip(*args, **kwargs)) - - def lmap(*args, **kwargs): - return list(map(*args, **kwargs)) - - def lfilter(*args, **kwargs): - return list(filter(*args, **kwargs)) -else: - import __builtin__ - # Python 2-builtin ranges produce lists - lrange = __builtin__.range - lzip = __builtin__.zip - lmap = __builtin__.map - lfilter = __builtin__.filter - -# In Python 3 you can no longer use .encode('hex') on a bytestring -# instead use the following on both platforms -import binascii -def hexlify(bdata): - return (binascii.hexlify(bdata)).decode('ascii') - -# If you: import struct -# Note: struct pack, unpack, unpack_from all *require* bytestring format -# data all the way up to at least Python 2.7.5, Python 3 is okay with either - -# If you: import re -# note: Python 3 "re" requires the pattern to be the exact same type as the data to be -# searched ... 
but u"" is not allowed for the pattern itself only b"" -# Python 2.X allows the pattern to be any type and converts it to match the data -# and returns the same type as the data - -# convert string to be utf-8 encoded -def utf8_str(p, enc='utf-8'): - if p is None: - return None - if isinstance(p, text_type): - return p.encode('utf-8') - if enc != 'utf-8': - return p.decode(enc).encode('utf-8') - return p - -# convert string to be unicode encoded -def unicode_str(p, enc='utf-8'): - if p is None: - return None - if isinstance(p, text_type): - return p - return p.decode(enc) - -ASCII_CHARS = set(chr(x) for x in range(128)) -URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ' - 'abcdefghijklmnopqrstuvwxyz' - '0123456789' '#' '_.-/~') -IRI_UNSAFE = ASCII_CHARS - URL_SAFE - -# returns a quoted IRI (not a URI) -def quoteurl(href): - if isinstance(href,binary_type): - href = href.decode('utf-8') - result = [] - for char in href: - if char in IRI_UNSAFE: - char = "%%%02x" % ord(char) - result.append(char) - return ''.join(result) - -# unquotes url/iri -def unquoteurl(href): - if isinstance(href,binary_type): - href = href.decode('utf-8') - href = unquote(href) - return href - -# unescape html -def unescapeit(sval): - return _h.unescape(sval) - -# Python 2.X commandline parsing under Windows has been horribly broken for years! -# Use the following code to emulate full unicode commandline parsing on Python 2 -# ie. To get sys.argv arguments and properly encode them as unicode - -def unicode_argv(): - global iswindows - global PY3 - if PY3: - return sys.argv - if iswindows: - # Versions 2.x of Python don't support Unicode in sys.argv on - # Windows, with the underlying Windows API instead replacing multi-byte - # characters with '?'. So use shell32.GetCommandLineArgvW to get sys.argv - # as a list of Unicode strings - from ctypes import POINTER, byref, cdll, c_int, windll - from ctypes.wintypes import LPCWSTR, LPWSTR - - GetCommandLineW = cdll.kernel32.GetCommandLineW - GetCommandLineW.argtypes = [] - GetCommandLineW.restype = LPCWSTR - - CommandLineToArgvW = windll.shell32.CommandLineToArgvW - CommandLineToArgvW.argtypes = [LPCWSTR, POINTER(c_int)] - CommandLineToArgvW.restype = POINTER(LPWSTR) - - cmd = GetCommandLineW() - argc = c_int(0) - argv = CommandLineToArgvW(cmd, byref(argc)) - if argc.value > 0: - # Remove Python executable and commands if present - start = argc.value - len(sys.argv) - return [argv[i] for i in - range(start, argc.value)] - # this should never happen - return None - else: - argv = [] - argvencoding = sys.stdin.encoding - if argvencoding is None: - argvencoding = sys.getfilesystemencoding() - if argvencoding is None: - argvencoding = 'utf-8' - for arg in sys.argv: - if isinstance(arg, text_type): - argv.append(arg) - else: - argv.append(arg.decode(argvencoding)) - return argv - - -# Python 2.X is broken in that it does not recognize CP65001 as UTF-8 -def add_cp65001_codec(): - if PY2: - try: - codecs.lookup('cp65001') - except LookupError: - codecs.register( - lambda name: name == 'cp65001' and codecs.lookup('utf-8') or None) - return diff --git a/epy_extras/KindleUnpack/kindleunpack.py b/epy_extras/KindleUnpack/kindleunpack.py deleted file mode 100644 index 317941a..0000000 --- a/epy_extras/KindleUnpack/kindleunpack.py +++ /dev/null @@ -1,1029 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab - -from __future__ import unicode_literals, division, absolute_import, print_function - -import os - -__path__ = ["lib", 
os.path.dirname(os.path.realpath(__file__)), "kindleunpack"] - -import sys -import codecs -import traceback - -from .compatibility_utils import PY2, binary_type, utf8_str, unicode_str -from .compatibility_utils import unicode_argv, add_cp65001_codec -from .compatibility_utils import hexlify - -add_cp65001_codec() - -from .unipath import pathof - -if PY2: - range = xrange - # since will be printing unicode under python 2 need to protect - # against sys.stdout.encoding being None stupidly forcing forcing ascii encoding - if sys.stdout.encoding is None: - sys.stdout = codecs.getwriter("utf-8")(sys.stdout) - else: - encoding = sys.stdout.encoding - sys.stdout = codecs.getwriter(encoding)(sys.stdout) - -# Changelog -# 0.11 - Version by adamselene -# 0.11pd - Tweaked version by pdurrant -# 0.12 - extracts pictures too, and all into a folder. -# 0.13 - added back in optional output dir for those who don't want it based on infile -# 0.14 - auto flush stdout and wrapped in main, added proper return codes -# 0.15 - added support for metadata -# 0.16 - metadata now starting to be output as an opf file (PD) -# 0.17 - Also created tweaked text as source for Mobipocket Creator -# 0.18 - removed raw mobi file completely but kept _meta.html file for ease of conversion -# 0.19 - added in metadata for ASIN, Updated Title and Rights to the opf -# 0.20 - remove _meta.html since no longer needed -# 0.21 - Fixed some typos in the opf output, and also updated handling -# of test for trailing data/multibyte characters -# 0.22 - Fixed problem with > 9 images -# 0.23 - Now output Start guide item -# 0.24 - Set firstaddl value for 'TEXtREAd' -# 0.25 - Now added character set metadata to html file for utf-8 files. -# 0.26 - Dictionary support added. Image handling speed improved. -# For huge files create temp files to speed up decoding. -# Language decoding fixed. Metadata is now converted to utf-8 when written to opf file. -# 0.27 - Add idx:entry attribute "scriptable" if dictionary contains entry length tags. -# Don't save non-image sections as images. Extract and save source zip file -# included by kindlegen as kindlegensrc.zip. -# 0.28 - Added back correct image file name extensions, created FastConcat class to simplify and clean up -# 0.29 - Metadata handling reworked, multiple entries of the same type are now supported. -# Several missing types added. -# FastConcat class has been removed as in-memory handling with lists is faster, even for huge files. -# 0.30 - Add support for outputting **all** metadata values - encode content with hex if of unknown type -# 0.31 - Now supports Print Replica ebooks, outputting PDF and mysterious data sections -# 0.32 - Now supports NCX file extraction/building. -# Overhauled the structure of mobiunpack to be more class oriented. 
-# 0.33 - Split Classes ito separate files and added prelim support for KF8 format eBooks -# 0.34 - Improved KF8 support, guide support, bug fixes -# 0.35 - Added splitting combo mobi7/mobi8 into standalone mobi7 and mobi8 files -# Also handle mobi8-only file properly -# 0.36 - very minor changes to support KF8 mobis with no flow items, no ncx, etc -# 0.37 - separate output, add command line switches to control, interface to Mobi_Unpack.pyw -# 0.38 - improve split function by resetting flags properly, fix bug in Thumbnail Images -# 0.39 - improve split function so that ToC info is not lost for standalone mobi8s -# 0.40 - make mobi7 split match official versions, add support for graphic novel metadata, -# improve debug for KF8 -# 0.41 - fix when StartOffset set to 0xffffffff, fix to work with older mobi versions, -# fix other minor metadata issues -# 0.42 - add new class interface to allow it to integrate more easily with internal calibre routines -# 0.43 - bug fixes for new class interface -# 0.44 - more bug fixes and fix for potnetial bug caused by not properly closing created zip archive -# 0.45 - sync to version in the new Mobi_Unpack plugin -# 0.46 - fixes for: obfuscated fonts, improper toc links and ncx, add support for opentype fonts -# 0.47 - minor opf improvements -# 0.48 - ncx link fixes -# 0.49 - use azw3 when splitting mobis -# 0.50 - unknown change -# 0.51 - fix for converting filepos links to hrefs, Added GPL3 notice, made KF8 extension just '.azw3' -# 0.52 - fix for cover metadata (no support for Mobipocket Creator) -# 0.53 - fix for proper identification of embedded fonts, added new metadata items -# 0.54 - Added error-handling so wonky embedded fonts don't bomb the whole unpack process, -# entity escape KF8 metadata to ensure valid OPF. -# 0.55 Strip extra StartOffset EXTH from the mobi8 header when splitting, keeping only the relevant one -# For mobi8 files, don't generate duplicate guide entries from the metadata if we could extract one -# from the OTH table. -# 0.56 - Added further entity escaping of OPF text. -# Allow unicode string file paths to be passed as arguments to the unpackBook method without blowing up later -# when the attempt to "re"-unicode a portion of that filename occurs in the process_all_mobi_headers method. -# 0.57 - Fixed eror when splitting Preview files downloaded from KDP website -# 0.58 - Output original kindlegen build log ('CMET' record) if included in the package. -# 0.58 - Include and extend functionality of DumpMobiHeader, replacing DEBUG with DUMP -# 0.59 - Much added DUMP functionality, including full dumping and descriptions of sections -# 0.60 - Bug fixes in opf, div tables, bad links, page breaks, section descriptions -# - plus a number of other bug fixed that were found by Sergey Dubinets -# - fixs for file/paths that require full unicode to work properly -# - replace subprocess with multiprocessing to remove need for unbuffered stdout -# 0.61 - renamed to be KindleUnpack and more unicode/utf-8 path bug fixes and other minor fixes -# 0.62 - fix for multiprocessing on Windows, split fixes, opf improvements -# 0.63 - Modified to process right to left page progression books properly. -# - Added some id_map_strings and RESC section processing; metadata and -# - spine in the RESC are integrated partly to content.opf. -# 0.63a- Separated K8 RESC processor to an individual file. Bug fixes. Added cover page creation. 
-# 0.64 - minor bug fixes to more properly handle unicode command lines, and support for more jpeg types -# 0.64a- Modifed to handle something irregular mobi and azw3 files. -# 0.64b- Modifed to create k8resc.spine for no RECS files. -# 0.65 - Bug fixes to shorten title and remove epub3 "properties" to make the output epub2 compliant -# 0.65a- Bug fixes to extract RESC section correctly, to prevent item id confliction -# - and to process multiline comments in RESC. -# 0.66 - Bug fix to deal with missing first resource information sometimes generated by calibre -# 0.66a- Fixed minor bugs, which probably do not affect the output anything -# 0.67 - Fixed Mobi Split functionality bug with azw3 images not being properly copied -# 0.68 - preliminary support for handling PAGE sections to create page-map.xml -# 0.69 - preliminary support for CONT and CRES for HD Images -# 0.70 - preliminary support for decoding apnx files when used with azw3 ebooks -# 0.71 - extensive refactoring of kindleunpack.py to make it more manageable -# 0.72 - many bug fixes from tkeo: fix pageProcessing, fix print replica, fix resc usage, fix font mangling, etc. -# 0.72a- fix for still broken PrintReplica support -# 0.72b- preview for primary epub3 support. A parameter epubver(default='2') is added to process_all_mobi_headers(), unpackBook(). -# 0.72c- preview for apnx page support -# 0.72d- more bugs fixed in preview features, much improved GUI with ability to dynaically grow the Log Window with preference support -# 0.72e- more bug fixes, Tk GUI adds support for epub version and HDImage use -# 0.72f- more bug fixes, implement use hd images if present -# 0.72g- minor bug fixes and cleanups from tkeo -# 0.72h- updated mobi_header and mobi_k8proc to use the correct fragment and guide terms in place of div and other -# to better match the terms that both Calibre and Amazon use internally to their own software -# 0.72x- very experimental conversion to use new mobi_k8resc.py and some of its associated changes -# 0.72y- more changes to simplify and integrate in epub3 support in a simpler manner -# 0.72z- remove redundancy in mobi_opf.py and bug fixes for mobi_k8resc.py -# 0.73 faster mobi split, numerous bug fixes in mobi_k8proc, mobi_header, mobi_opf, mobi_k8resc, etc -# 0.74 added refines metadata, fixed language code in ncx and title in nav, added support for opf: from refines -# 0.75 much improved dictioanry support including support for multiple inflection sections, minor mobi_opf fixes -# 0.76 pre-release version only fix name related issues in opf by not using original file name in mobi7 -# 0.77 bug fix for unpacking HDImages with included Fonts -# 0.80 converted to work with both python 2.7 and Python 3.3 and later -# 0.81 various fixes -# 0.82 Handle calibre-generated mobis that can have skeletons with no fragments -# 0.83 Fix header item 114 being mistakenly treated as a string instead of a value - -DUMP = False -""" Set to True to dump all possible information. """ - -WRITE_RAW_DATA = False -""" Set to True to create additional files with raw data for debugging/reverse engineering. """ - -SPLIT_COMBO_MOBIS = False -""" Set to True to split combination mobis into mobi7 and mobi8 pieces. """ - -CREATE_COVER_PAGE = True # XXX experimental -""" Create and insert a cover xhtml page. """ - -EOF_RECORD = b'\xe9\x8e' + b'\r\n' -""" The EOF record content. 
""" - -TERMINATION_INDICATOR1 = b'\x00' -TERMINATION_INDICATOR2 = b'\x00\x00' -TERMINATION_INDICATOR3 = b'\x00\x00\x00' - -KINDLEGENSRC_FILENAME = "kindlegensrc.zip" -""" The name for the kindlegen source archive. """ - -KINDLEGENLOG_FILENAME = "kindlegenbuild.log" -""" The name for the kindlegen build log. """ - -K8_BOUNDARY = b'BOUNDARY' -""" The section data that divides K8 mobi ebooks. """ - -import os -import struct -import re -import zlib -import getopt - -class unpackException(Exception): - pass - - -# import the kindleunpack support libraries -from .unpack_structure import fileNames -from .mobi_sectioner import Sectionizer, describe -from .mobi_header import MobiHeader, dump_contexth -from .mobi_utils import toBase32 -from .mobi_opf import OPFProcessor -from .mobi_html import HTMLProcessor, XHTMLK8Processor -from .mobi_ncx import ncxExtract -from .mobi_k8proc import K8Processor -from .mobi_split import mobi_split -from .mobi_k8resc import K8RESCProcessor -from .mobi_nav import NAVProcessor -from .mobi_cover import CoverProcessor, get_image_type -from .mobi_pagemap import PageMapProcessor -from .mobi_dict import dictSupport - - -def processSRCS(i, files, rscnames, sect, data): - # extract the source zip archive and save it. - print("File contains kindlegen source archive, extracting as %s" % KINDLEGENSRC_FILENAME) - srcname = os.path.join(files.outdir, KINDLEGENSRC_FILENAME) - with open(pathof(srcname), 'wb') as f: - f.write(data[16:]) - rscnames.append(None) - sect.setsectiondescription(i,"Zipped Source Files") - return rscnames - - -def processPAGE(i, files, rscnames, sect, data, mh, pagemapproc): - # process any page map information and create an apnx file - pagemapproc = PageMapProcessor(mh, data) - rscnames.append(None) - sect.setsectiondescription(i,"PageMap") - apnx_meta = {} - acr = sect.palmname.decode('latin-1').rstrip('\x00') - apnx_meta['acr'] = acr - apnx_meta['cdeType'] = mh.metadata['cdeType'][0] - apnx_meta['contentGuid'] = hex(int(mh.metadata['UniqueID'][0]))[2:] - apnx_meta['asin'] = mh.metadata['ASIN'][0] - apnx_meta['pageMap'] = pagemapproc.getPageMap() - if mh.version == 8: - apnx_meta['format'] = 'MOBI_8' - else: - apnx_meta['format'] = 'MOBI_7' - apnx_data = pagemapproc.generateAPNX(apnx_meta) - if mh.isK8(): - outname = os.path.join(files.outdir, 'mobi8-'+files.getInputFileBasename() + '.apnx') - else: - outname = os.path.join(files.outdir, 'mobi7-'+files.getInputFileBasename() + '.apnx') - with open(pathof(outname), 'wb') as f: - f.write(apnx_data) - return rscnames, pagemapproc - - -def processCMET(i, files, rscnames, sect, data): - # extract the build log - print("File contains kindlegen build log, extracting as %s" % KINDLEGENLOG_FILENAME) - srcname = os.path.join(files.outdir, KINDLEGENLOG_FILENAME) - with open(pathof(srcname), 'wb') as f: - f.write(data[10:]) - rscnames.append(None) - sect.setsectiondescription(i,"Kindlegen log") - return rscnames - - -# fonts only exist in KF8 ebooks -# Format: bytes 0 - 3: 'FONT' -# bytes 4 - 7: uncompressed size -# bytes 8 - 11: flags -# flag bit 0x0001 - zlib compression -# flag bit 0x0002 - obfuscated with xor string -# bytes 12 - 15: offset to start of compressed font data -# bytes 16 - 19: length of xor string stored before the start of the comnpress font data -# bytes 20 - 23: start of xor string -def processFONT(i, files, rscnames, sect, data, obfuscate_data, beg, rsc_ptr): - fontname = "font%05d" % i - ext = '.dat' - font_error = False - font_data = data - try: - usize, fflags, dstart, xor_len, xor_start = 
struct.unpack_from(b'>LLLLL',data,4) - except: - print("Failed to extract font: {0:s} from section {1:d}".format(fontname,i)) - font_error = True - ext = '.failed' - pass - if not font_error: - print("Extracting font:", fontname) - font_data = data[dstart:] - extent = len(font_data) - extent = min(extent, 1040) - if fflags & 0x0002: - # obfuscated so need to de-obfuscate the first 1040 bytes - key = bytearray(data[xor_start: xor_start+ xor_len]) - buf = bytearray(font_data) - for n in range(extent): - buf[n] ^= key[n%xor_len] - font_data = bytes(buf) - if fflags & 0x0001: - # ZLIB compressed data - font_data = zlib.decompress(font_data) - hdr = font_data[0:4] - if hdr == b'\0\1\0\0' or hdr == b'true' or hdr == b'ttcf': - ext = '.ttf' - elif hdr == b'OTTO': - ext = '.otf' - else: - print("Warning: unknown font header %s" % hexlify(hdr)) - if (ext == '.ttf' or ext == '.otf') and (fflags & 0x0002): - obfuscate_data.append(fontname + ext) - fontname += ext - outfnt = os.path.join(files.imgdir, fontname) - with open(pathof(outfnt), 'wb') as f: - f.write(font_data) - rscnames.append(fontname) - sect.setsectiondescription(i,"Font {0:s}".format(fontname)) - if rsc_ptr == -1: - rsc_ptr = i - beg - return rscnames, obfuscate_data, rsc_ptr - - -def processCRES(i, files, rscnames, sect, data, beg, rsc_ptr, use_hd): - # extract an HDImage - global DUMP - data = data[12:] - imgtype = get_image_type(None, data) - - if imgtype is None: - print("Warning: CRES Section %s does not contain a recognised resource" % i) - rscnames.append(None) - sect.setsectiondescription(i,"Mysterious CRES data, first four bytes %s" % describe(data[0:4])) - if DUMP: - fname = "unknown%05d.dat" % i - outname= os.path.join(files.outdir, fname) - with open(pathof(outname), 'wb') as f: - f.write(data) - sect.setsectiondescription(i,"Mysterious CRES data, first four bytes %s extracting as %s" % (describe(data[0:4]), fname)) - rsc_ptr += 1 - return rscnames, rsc_ptr - - if use_hd: - # overwrite corresponding lower res image with hd version - imgname = rscnames[rsc_ptr] - imgdest = files.imgdir - else: - imgname = "HDimage%05d.%s" % (i, imgtype) - imgdest = files.hdimgdir - print("Extracting HD image: {0:s} from section {1:d}".format(imgname,i)) - outimg = os.path.join(imgdest, imgname) - with open(pathof(outimg), 'wb') as f: - f.write(data) - rscnames.append(None) - sect.setsectiondescription(i,"Optional HD Image {0:s}".format(imgname)) - rsc_ptr += 1 - return rscnames, rsc_ptr - - -def processCONT(i, files, rscnames, sect, data): - global DUMP - # process a container header, most of this is unknown - # right now only extract its EXTH - dt = data[0:12] - if dt == b"CONTBOUNDARY": - rscnames.append(None) - sect.setsectiondescription(i,"CONTAINER BOUNDARY") - else: - sect.setsectiondescription(i,"CONT Header") - rscnames.append(None) - if DUMP: - cpage, = struct.unpack_from(b'>L', data, 12) - contexth = data[48:] - print("\n\nContainer EXTH Dump") - dump_contexth(cpage, contexth) - fname = "CONT_Header%05d.dat" % i - outname= os.path.join(files.outdir, fname) - with open(pathof(outname), 'wb') as f: - f.write(data) - return rscnames - - -def processkind(i, files, rscnames, sect, data): - global DUMP - dt = data[0:12] - if dt == b"kindle:embed": - if DUMP: - print("\n\nHD Image Container Description String") - print(data) - sect.setsectiondescription(i,"HD Image Container Description String") - rscnames.append(None) - return rscnames - - -# spine information from the original content.opf -def processRESC(i, files, rscnames, sect, data, 
k8resc): - global DUMP - if DUMP: - rescname = "RESC%05d.dat" % i - print("Extracting Resource: ", rescname) - outrsc = os.path.join(files.outdir, rescname) - with open(pathof(outrsc), 'wb') as f: - f.write(data) - if True: # try: - # parse the spine and metadata from RESC - k8resc = K8RESCProcessor(data[16:], DUMP) - else: # except: - print("Warning: cannot extract information from RESC.") - k8resc = None - rscnames.append(None) - sect.setsectiondescription(i,"K8 RESC section") - return rscnames, k8resc - - -def processImage(i, files, rscnames, sect, data, beg, rsc_ptr, cover_offset, thumb_offset): - global DUMP - # Extract an Image - imgtype = get_image_type(None, data) - if imgtype is None: - print("Warning: Section %s does not contain a recognised resource" % i) - rscnames.append(None) - sect.setsectiondescription(i,"Mysterious Section, first four bytes %s" % describe(data[0:4])) - if DUMP: - fname = "unknown%05d.dat" % i - outname= os.path.join(files.outdir, fname) - with open(pathof(outname), 'wb') as f: - f.write(data) - sect.setsectiondescription(i,"Mysterious Section, first four bytes %s extracting as %s" % (describe(data[0:4]), fname)) - return rscnames, rsc_ptr - - imgname = "image%05d.%s" % (i, imgtype) - if cover_offset is not None and i == beg + cover_offset: - imgname = "cover%05d.%s" % (i, imgtype) - if thumb_offset is not None and i == beg + thumb_offset: - imgname = "thumb%05d.%s" % (i, imgtype) - print("Extracting image: {0:s} from section {1:d}".format(imgname,i)) - outimg = os.path.join(files.imgdir, imgname) - with open(pathof(outimg), 'wb') as f: - f.write(data) - rscnames.append(imgname) - sect.setsectiondescription(i,"Image {0:s}".format(imgname)) - if rsc_ptr == -1: - rsc_ptr = i - beg - return rscnames, rsc_ptr - - -def processPrintReplica(metadata, files, rscnames, mh): - global DUMP - global WRITE_RAW_DATA - rawML = mh.getRawML() - if DUMP or WRITE_RAW_DATA: - outraw = os.path.join(files.outdir,files.getInputFileBasename() + '.rawpr') - with open(pathof(outraw),'wb') as f: - f.write(rawML) - - fileinfo = [] - print("Print Replica ebook detected") - try: - numTables, = struct.unpack_from(b'>L', rawML, 0x04) - tableIndexOffset = 8 + 4*numTables - # for each table, read in count of sections, assume first section is a PDF - # and output other sections as binary files - for i in range(numTables): - sectionCount, = struct.unpack_from(b'>L', rawML, 0x08 + 4*i) - for j in range(sectionCount): - sectionOffset, sectionLength, = struct.unpack_from(b'>LL', rawML, tableIndexOffset) - tableIndexOffset += 8 - if j == 0: - entryName = os.path.join(files.outdir, files.getInputFileBasename() + ('.%03d.pdf' % (i+1))) - else: - entryName = os.path.join(files.outdir, files.getInputFileBasename() + ('.%03d.%03d.data' % ((i+1),j))) - with open(pathof(entryName), 'wb') as f: - f.write(rawML[sectionOffset:(sectionOffset+sectionLength)]) - except Exception as e: - print('Error processing Print Replica: ' + str(e)) - - fileinfo.append([None,'', files.getInputFileBasename() + '.pdf']) - usedmap = {} - for name in rscnames: - if name is not None: - usedmap[name] = 'used' - opf = OPFProcessor(files, metadata, fileinfo, rscnames, False, mh, usedmap) - opf.writeOPF() - - -def processMobi8(mh, metadata, sect, files, rscnames, pagemapproc, k8resc, obfuscate_data, apnxfile=None, epubver='2'): - global DUMP - global WRITE_RAW_DATA - - # extract raw markup langauge - rawML = mh.getRawML() - if DUMP or WRITE_RAW_DATA: - outraw = os.path.join(files.k8dir,files.getInputFileBasename() + '.rawml') - 
with open(pathof(outraw),'wb') as f: - f.write(rawML) - - # KF8 require other indexes which contain parsing information and the FDST info - # to process the rawml back into the xhtml files, css files, svg image files, etc - k8proc = K8Processor(mh, sect, files, DUMP) - k8proc.buildParts(rawML) - - # collect information for the guide first - guidetext = unicode_str(k8proc.getGuideText()) - - # if the guide was empty, add in any guide info from metadata, such as StartOffset - if not guidetext and 'StartOffset' in metadata: - # Apparently, KG 2.5 carries over the StartOffset from the mobi7 part... - # Taking that into account, we only care about the *last* StartOffset, which - # should always be the correct one in these cases (the one actually pointing - # to the right place in the mobi8 part). - starts = metadata['StartOffset'] - last_start = starts[-1] - last_start = int(last_start) - if last_start == 0xffffffff: - last_start = 0 - seq, idtext = k8proc.getFragTblInfo(last_start) - filename, idtext = k8proc.getIDTagByPosFid(toBase32(seq), b'0000000000') - linktgt = filename - idtext = unicode_str(idtext, mh.codec) - if idtext != '': - linktgt += '#' + idtext - guidetext += '\n' % linktgt - - # if apnxfile is passed in use it for page map information - if apnxfile is not None and pagemapproc is None: - with open(apnxfile, 'rb') as f: - apnxdata = b"00000000" + f.read() - pagemapproc = PageMapProcessor(mh, apnxdata) - - # generate the page map - pagemapxml = '' - if pagemapproc is not None: - pagemapxml = pagemapproc.generateKF8PageMapXML(k8proc) - outpm = os.path.join(files.k8oebps,'page-map.xml') - with open(pathof(outpm),'wb') as f: - f.write(pagemapxml.encode('utf-8')) - if DUMP: - print(pagemapproc.getNames()) - print(pagemapproc.getOffsets()) - print("\n\nPage Map") - print(pagemapxml) - - # process the toc ncx - # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num - print("Processing ncx / toc") - ncx = ncxExtract(mh, files) - ncx_data = ncx.parseNCX() - # extend the ncx data with filenames and proper internal idtags - for i in range(len(ncx_data)): - ncxmap = ncx_data[i] - [junk1, junk2, junk3, fid, junk4, off] = ncxmap['pos_fid'].split(':') - filename, idtag = k8proc.getIDTagByPosFid(fid, off) - ncxmap['filename'] = filename - ncxmap['idtag'] = unicode_str(idtag) - ncx_data[i] = ncxmap - - # convert the rawML to a set of xhtml files - print("Building an epub-like structure") - htmlproc = XHTMLK8Processor(rscnames, k8proc) - usedmap = htmlproc.buildXHTML() - - # write out the xhtml svg, and css files - # fileinfo = [skelid|coverpage, dir, name] - fileinfo = [] - # first create a cover page if none exists - if CREATE_COVER_PAGE: - cover = CoverProcessor(files, metadata, rscnames) - cover_img = utf8_str(cover.getImageName()) - need_to_create_cover_page = False - if cover_img is not None: - if k8resc is None or not k8resc.hasSpine(): - part = k8proc.getPart(0) - if part.find(cover_img) == -1: - need_to_create_cover_page = True - else: - if "coverpage" not in k8resc.spine_idrefs: - part = k8proc.getPart(int(k8resc.spine_order[0])) - if part.find(cover_img) == -1: - k8resc.prepend_to_spine("coverpage", "inserted", "no", None) - if k8resc.spine_order[0] == "coverpage": - need_to_create_cover_page = True - if need_to_create_cover_page: - filename = cover.getXHTMLName() - fileinfo.append(["coverpage", 'Text', filename]) - guidetext += cover.guide_toxml() - cover.writeXHTML() - - n = k8proc.getNumberOfParts() - for i in range(n): - part = 
k8proc.getPart(i) - [skelnum, dir, filename, beg, end, aidtext] = k8proc.getPartInfo(i) - fileinfo.append([str(skelnum), dir, filename]) - fname = os.path.join(files.k8oebps,dir,filename) - with open(pathof(fname),'wb') as f: - f.write(part) - n = k8proc.getNumberOfFlows() - for i in range(1, n): - [ptype, pformat, pdir, filename] = k8proc.getFlowInfo(i) - flowpart = k8proc.getFlow(i) - if pformat == b'file': - fileinfo.append([None, pdir, filename]) - fname = os.path.join(files.k8oebps,pdir,filename) - with open(pathof(fname),'wb') as f: - f.write(flowpart) - - # create the opf - opf = OPFProcessor(files, metadata.copy(), fileinfo, rscnames, True, mh, usedmap, - pagemapxml=pagemapxml, guidetext=guidetext, k8resc=k8resc, epubver=epubver) - uuid = opf.writeOPF(bool(obfuscate_data)) - - if opf.hasNCX(): - # Create a toc.ncx. - ncx.writeK8NCX(ncx_data, metadata) - if opf.hasNAV(): - # Create a navigation document. - nav = NAVProcessor(files) - nav.writeNAV(ncx_data, guidetext, metadata) - - # make an epub-like structure of it all - print("Creating an epub-like file") - files.makeEPUB(usedmap, obfuscate_data, uuid) - - -def processMobi7(mh, metadata, sect, files, rscnames): - global DUMP - global WRITE_RAW_DATA - # An original Mobi - rawML = mh.getRawML() - if DUMP or WRITE_RAW_DATA: - outraw = os.path.join(files.mobi7dir,files.getInputFileBasename() + '.rawml') - with open(pathof(outraw),'wb') as f: - f.write(rawML) - - # process the toc ncx - # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num - ncx = ncxExtract(mh, files) - ncx_data = ncx.parseNCX() - ncx.writeNCX(metadata) - - positionMap = {} - - # if Dictionary build up the positionMap - if mh.isDictionary(): - if mh.DictInLanguage(): - metadata['DictInLanguage'] = [mh.DictInLanguage()] - if mh.DictOutLanguage(): - metadata['DictOutLanguage'] = [mh.DictOutLanguage()] - positionMap = dictSupport(mh, sect).getPositionMap() - - # convert the rawml back to Mobi ml - proc = HTMLProcessor(files, metadata, rscnames) - srctext = proc.findAnchors(rawML, ncx_data, positionMap) - srctext, usedmap = proc.insertHREFS() - - # write the proper mobi html - fileinfo=[] - # fname = files.getInputFileBasename() + '.html' - fname = 'book.html' - fileinfo.append([None,'', fname]) - outhtml = os.path.join(files.mobi7dir, fname) - with open(pathof(outhtml), 'wb') as f: - f.write(srctext) - - # extract guidetext from srctext - guidetext =b'' - # no pagemap support for older mobis - # pagemapxml = None - guidematch = re.search(br'''(.*)''',srctext,re.IGNORECASE+re.DOTALL) - if guidematch: - guidetext = guidematch.group(1) - # sometimes old mobi guide from srctext horribly written so need to clean up - guidetext = guidetext.replace(b"\r", b"") - guidetext = guidetext.replace(b']*>)''', re.IGNORECASE) - guidepieces = ref_tag_pattern.split(guidetext) - for i in range(1,len(guidepieces), 2): - reftag = guidepieces[i] - # remove any href there now to replace with filepos - reftag = re.sub(br'''href\s*=[^'"]*['"][^'"]*['"]''',b'', reftag) - # make sure the reference tag ends properly - if not reftag.endswith(b"/>"): - reftag = reftag[0:-1] + b"/>" - guidepieces[i] = reftag - guidetext = b''.join(guidepieces) - replacetext = br'''href="'''+utf8_str(fileinfo[0][2])+ br'''#filepos\1"''' - guidetext = re.sub(br'''filepos=['"]{0,1}0*(\d+)['"]{0,1}''', replacetext, guidetext) - guidetext += b'\n' - - if 'StartOffset' in metadata: - for value in metadata['StartOffset']: - if int(value) == 0xffffffff: - value = '0' - starting_offset = 
value - # get guide items from metadata - metaguidetext = b'\n' - guidetext += metaguidetext - - if isinstance(guidetext, binary_type): - guidetext = guidetext.decode(mh.codec) - - # create an OPF - opf = OPFProcessor(files, metadata, fileinfo, rscnames, ncx.isNCX, mh, usedmap, guidetext=guidetext) - opf.writeOPF() - - -def processUnknownSections(mh, sect, files, K8Boundary): - global DUMP - global TERMINATION_INDICATOR1 - global TERMINATION_INDICATOR2 - global TERMINATION_INDICATOR3 - if DUMP: - print("Unpacking any remaining unknown records") - beg = mh.start - end = sect.num_sections - if beg < K8Boundary: - # then we're processing the first part of a combination file - end = K8Boundary - for i in range(beg, end): - if sect.sectiondescriptions[i] == "": - data = sect.loadSection(i) - type = data[0:4] - if type == TERMINATION_INDICATOR3: - description = "Termination Marker 3 Nulls" - elif type == TERMINATION_INDICATOR2: - description = "Termination Marker 2 Nulls" - elif type == TERMINATION_INDICATOR1: - description = "Termination Marker 1 Null" - elif type == "INDX": - fname = "Unknown%05d_INDX.dat" % i - description = "Unknown INDX section" - if DUMP: - outname= os.path.join(files.outdir, fname) - with open(pathof(outname), 'wb') as f: - f.write(data) - print("Extracting %s: %s from section %d" % (description, fname, i)) - description = description + ", extracting as %s" % fname - else: - fname = "unknown%05d.dat" % i - description = "Mysterious Section, first four bytes %s" % describe(data[0:4]) - if DUMP: - outname= os.path.join(files.outdir, fname) - with open(pathof(outname), 'wb') as f: - f.write(data) - print("Extracting %s: %s from section %d" % (description, fname, i)) - description = description + ", extracting as %s" % fname - sect.setsectiondescription(i, description) - - -def process_all_mobi_headers(files, apnxfile, sect, mhlst, K8Boundary, k8only=False, epubver='2', use_hd=False): - global DUMP - global WRITE_RAW_DATA - rscnames = [] - rsc_ptr = -1 - k8resc = None - obfuscate_data = [] - for mh in mhlst: - pagemapproc = None - if mh.isK8(): - sect.setsectiondescription(mh.start,"KF8 Header") - mhname = os.path.join(files.outdir,"header_K8.dat") - print("Processing K8 section of book...") - elif mh.isPrintReplica(): - sect.setsectiondescription(mh.start,"Print Replica Header") - mhname = os.path.join(files.outdir,"header_PR.dat") - print("Processing PrintReplica section of book...") - else: - if mh.version == 0: - sect.setsectiondescription(mh.start, "PalmDoc Header".format(mh.version)) - else: - sect.setsectiondescription(mh.start,"Mobipocket {0:d} Header".format(mh.version)) - mhname = os.path.join(files.outdir,"header.dat") - print("Processing Mobipocket {0:d} section of book...".format(mh.version)) - - if DUMP: - # write out raw mobi header data - with open(pathof(mhname), 'wb') as f: - f.write(mh.header) - - # process each mobi header - metadata = mh.getMetaData() - mh.describeHeader(DUMP) - if mh.isEncrypted(): - raise unpackException('Book is encrypted') - - pagemapproc = None - - # first handle all of the different resource sections: images, resources, fonts, and etc - # build up a list of image names to use to postprocess the ebook - - print("Unpacking images, resources, fonts, etc") - beg = mh.firstresource - end = sect.num_sections - if beg < K8Boundary: - # processing first part of a combination file - end = K8Boundary - - # Not sure the try/except is necessary, but just in case - try: - thumb_offset = int(metadata.get('ThumbOffset', ['-1'])[0]) - except: - 
thumb_offset = None - - cover_offset = int(metadata.get('CoverOffset', ['-1'])[0]) - if not CREATE_COVER_PAGE: - cover_offset = None - - for i in range(beg, end): - data = sect.loadSection(i) - type = data[0:4] - - # handle the basics first - if type in [b"FLIS", b"FCIS", b"FDST", b"DATP"]: - if DUMP: - fname = unicode_str(type) + "%05d" % i - if mh.isK8(): - fname += "_K8" - fname += '.dat' - outname= os.path.join(files.outdir, fname) - with open(pathof(outname), 'wb') as f: - f.write(data) - print("Dumping section {0:d} type {1:s} to file {2:s} ".format(i,unicode_str(type),outname)) - sect.setsectiondescription(i,"Type {0:s}".format(unicode_str(type))) - rscnames.append(None) - elif type == b"SRCS": - rscnames = processSRCS(i, files, rscnames, sect, data) - elif type == b"PAGE": - rscnames, pagemapproc = processPAGE(i, files, rscnames, sect, data, mh, pagemapproc) - elif type == b"CMET": - rscnames = processCMET(i, files, rscnames, sect, data) - elif type == b"FONT": - rscnames, obfuscate_data, rsc_ptr = processFONT(i, files, rscnames, sect, data, obfuscate_data, beg, rsc_ptr) - elif type == b"CRES": - rscnames, rsc_ptr = processCRES(i, files, rscnames, sect, data, beg, rsc_ptr, use_hd) - elif type == b"CONT": - rscnames = processCONT(i, files, rscnames, sect, data) - elif type == b"kind": - rscnames = processkind(i, files, rscnames, sect, data) - elif type == b'\xa0\xa0\xa0\xa0': - sect.setsectiondescription(i,"Empty_HD_Image/Resource_Placeholder") - rscnames.append(None) - rsc_ptr += 1 - elif type == b"RESC": - rscnames, k8resc = processRESC(i, files, rscnames, sect, data, k8resc) - elif data == EOF_RECORD: - sect.setsectiondescription(i,"End Of File") - rscnames.append(None) - elif data[0:8] == b"BOUNDARY": - sect.setsectiondescription(i,"BOUNDARY Marker") - rscnames.append(None) - else: - # if reached here should be an image ow treat as unknown - rscnames, rsc_ptr = processImage(i, files, rscnames, sect, data, beg, rsc_ptr, cover_offset, thumb_offset) - # done unpacking resources - - # Print Replica - if mh.isPrintReplica() and not k8only: - processPrintReplica(metadata, files, rscnames, mh) - continue - - # KF8 (Mobi 8) - if mh.isK8(): - processMobi8(mh, metadata, sect, files, rscnames, pagemapproc, k8resc, obfuscate_data, apnxfile, epubver) - - # Old Mobi (Mobi 7) - elif not k8only: - processMobi7(mh, metadata, sect, files, rscnames) - - # process any remaining unknown sections of the palm file - processUnknownSections(mh, sect, files, K8Boundary) - - return - - -def unpackBook(infile, outdir, apnxfile=None, epubver='2', use_hd=False, dodump=False, dowriteraw=False, dosplitcombos=False): - global DUMP - global WRITE_RAW_DATA - global SPLIT_COMBO_MOBIS - if DUMP or dodump: - DUMP = True - if WRITE_RAW_DATA or dowriteraw: - WRITE_RAW_DATA = True - if SPLIT_COMBO_MOBIS or dosplitcombos: - SPLIT_COMBO_MOBIS = True - - infile = unicode_str(infile) - outdir = unicode_str(outdir) - if apnxfile is not None: - apnxfile = unicode_str(apnxfile) - - files = fileNames(infile, outdir) - - # process the PalmDoc database header and verify it is a mobi - sect = Sectionizer(infile) - if sect.ident != b'BOOKMOBI' and sect.ident != b'TEXtREAd': - raise unpackException('Invalid file format') - if DUMP: - sect.dumppalmheader() - else: - print("Palm DB type: %s, %d sections." % (sect.ident.decode('utf-8'),sect.num_sections)) - - # scan sections to see if this is a compound mobi file (K8 format) - # and build a list of all mobi headers to process. 
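# Illustration (editorial addition, not part of the original file): a combo
# Mobi7/KF8 file stores both books in a single PalmDB, separated by an
# 8-byte section whose payload is b'BOUNDARY' (K8_BOUNDARY above):
#
#     [ mobi7 header | mobi7 records ... | BOUNDARY | mobi8 header | mobi8 records ... ]
#
# The loop below detects that marker by looking for a section exactly
# 8 bytes long whose content equals K8_BOUNDARY, and then reads the KF8
# MobiHeader from the section immediately after it (i + 1).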
- mhlst = [] - mh = MobiHeader(sect,0) - # if this is a mobi8-only file hasK8 here will be true - mhlst.append(mh) - K8Boundary = -1 - - if mh.isK8(): - print("Unpacking a KF8 book...") - hasK8 = True - else: - # This is either a Mobipocket 7 or earlier, or a combi M7/KF8 - # Find out which - hasK8 = False - for i in range(len(sect.sectionoffsets)-1): - before, after = sect.sectionoffsets[i:i+2] - if (after - before) == 8: - data = sect.loadSection(i) - if data == K8_BOUNDARY: - sect.setsectiondescription(i,"Mobi/KF8 Boundary Section") - mh = MobiHeader(sect,i+1) - hasK8 = True - mhlst.append(mh) - K8Boundary = i - break - if hasK8: - print("Unpacking a Combination M{0:d}/KF8 book...".format(mh.version)) - if SPLIT_COMBO_MOBIS: - # if this is a combination mobi7-mobi8 file split them up - mobisplit = mobi_split(infile) - if mobisplit.combo: - outmobi7 = os.path.join(files.outdir, 'mobi7-'+files.getInputFileBasename() + '.mobi') - outmobi8 = os.path.join(files.outdir, 'mobi8-'+files.getInputFileBasename() + '.azw3') - with open(pathof(outmobi7), 'wb') as f: - f.write(mobisplit.getResult7()) - with open(pathof(outmobi8), 'wb') as f: - f.write(mobisplit.getResult8()) - else: - print("Unpacking a Mobipocket {0:d} book...".format(mh.version)) - - if hasK8: - files.makeK8Struct() - - process_all_mobi_headers(files, apnxfile, sect, mhlst, K8Boundary, False, epubver, use_hd) - - if DUMP: - sect.dumpsectionsinfo() - return - - -def usage(progname): - print("") - print("Description:") - print(" Unpacks an unencrypted Kindle/MobiPocket ebook to html and images") - print(" or an unencrypted Kindle/Print Replica ebook to PDF and images") - print(" into the specified output folder.") - print("Usage:") - print(" %s -r -s -p apnxfile -d -h --epub_version= infile [outdir]" % progname) - print("Options:") - print(" -h print this help message") - print(" -i use HD Images, if present, to overwrite reduced resolution images") - print(" -s split combination mobis into mobi7 and mobi8 ebooks") - print(" -p APNXFILE path to an .apnx file associated with the azw3 input (optional)") - print(" --epub_version= specify epub version to unpack to: 2, 3, A (for automatic) or ") - print(" F (force to fit to epub2 definitions), default is 2") - print(" -d dump headers and other info to output and extra files") - print(" -r write raw data to the output folder") - - -def main(argv=unicode_argv()): - global DUMP - global WRITE_RAW_DATA - global SPLIT_COMBO_MOBIS - - print("KindleUnpack v0.83") - print(" Based on initial mobipocket version Copyright © 2009 Charles M. Hannum ") - print(" Extensive Extensions and Improvements Copyright © 2009-2020 ") - print(" by: P. Durrant, K. Hendricks, S. 
Siebert, fandrieu, DiapDealer, nickredding, tkeo.") - print(" This program is free software: you can redistribute it and/or modify") - print(" it under the terms of the GNU General Public License as published by") - print(" the Free Software Foundation, version 3.") - - progname = os.path.basename(argv[0]) - try: - opts, args = getopt.getopt(argv[1:], "dhirsp:", ['epub_version=']) - except getopt.GetoptError as err: - print(str(err)) - usage(progname) - sys.exit(2) - - if len(args)<1: - usage(progname) - sys.exit(2) - - apnxfile = None - epubver = '2' - use_hd = False - - for o, a in opts: - if o == "-h": - usage(progname) - sys.exit(0) - if o == "-i": - use_hd = True - if o == "-d": - DUMP = True - if o == "-r": - WRITE_RAW_DATA = True - if o == "-s": - SPLIT_COMBO_MOBIS = True - if o == "-p": - apnxfile = a - if o == "--epub_version": - epubver = a - - if len(args) > 1: - infile, outdir = args - else: - infile = args[0] - outdir = os.path.splitext(infile)[0] - - infileext = os.path.splitext(infile)[1].upper() - if infileext not in ['.MOBI', '.PRC', '.AZW', '.AZW3', '.AZW4']: - print("Error: first parameter must be a Kindle/Mobipocket ebook or a Kindle/Print Replica ebook.") - return 1 - - try: - print('Unpacking Book...') - unpackBook(infile, outdir, apnxfile, epubver, use_hd) - print('Completed') - - except ValueError as e: - print("Error: %s" % e) - print(traceback.format_exc()) - return 1 - - return 0 - - -if __name__ == '__main__': - sys.exit(main()) diff --git a/epy_extras/KindleUnpack/mobi_cover.py b/epy_extras/KindleUnpack/mobi_cover.py deleted file mode 100644 index 3078ac4..0000000 --- a/epy_extras/KindleUnpack/mobi_cover.py +++ /dev/null @@ -1,238 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab - -from __future__ import unicode_literals, division, absolute_import, print_function - -from .compatibility_utils import unicode_str - -from .unipath import pathof -import os -import imghdr - -import struct -# note: struct pack, unpack, unpack_from all require bytestring format -# data all the way up to at least python 2.7.5, python 3 okay with bytestring - -USE_SVG_WRAPPER = True -""" Set to True to use svg wrapper for default. """ - -FORCE_DEFAULT_TITLE = False -""" Set to True to force to use the default title. """ - -COVER_PAGE_FINENAME = 'cover_page.xhtml' -""" The name for the cover page. """ - -DEFAULT_TITLE = 'Cover' -""" The default title for the cover page. """ - -MAX_WIDTH = 4096 -""" The max width for the svg cover page. """ - -MAX_HEIGHT = 4096 -""" The max height for the svg cover page. """ - - -def get_image_type(imgname, imgdata=None): - imgtype = unicode_str(imghdr.what(pathof(imgname), imgdata)) - - # imghdr only checks for JFIF or Exif JPEG files. Apparently, there are some - # with only the magic JPEG bytes out there... - # ImageMagick handles those, so, do it too. - if imgtype is None: - if imgdata is None: - with open(pathof(imgname), 'rb') as f: - imgdata = f.read() - if imgdata[0:2] == b'\xFF\xD8': - # Get last non-null bytes - last = len(imgdata) - while (imgdata[last-1:last] == b'\x00'): - last-=1 - # Be extra safe, check the trailing bytes, too. - if imgdata[last-2:last] == b'\xFF\xD9': - imgtype = "jpeg" - return imgtype - - -def get_image_size(imgname, imgdata=None): - '''Determine the image type of imgname (or imgdata) and return its size. - - Originally, - Determine the image type of fhandle and return its size. 
- from draco''' - if imgdata is None: - fhandle = open(pathof(imgname), 'rb') - head = fhandle.read(24) - else: - head = imgdata[0:24] - if len(head) != 24: - return - - imgtype = get_image_type(imgname, imgdata) - if imgtype == 'png': - check = struct.unpack(b'>i', head[4:8])[0] - if check != 0x0d0a1a0a: - return - width, height = struct.unpack(b'>ii', head[16:24]) - elif imgtype == 'gif': - width, height = struct.unpack(b'H', fhandle.read(2))[0] - 2 - # We are at a SOFn block - fhandle.seek(1, 1) # Skip `precision' byte. - height, width = struct.unpack(b'>HH', fhandle.read(4)) - except Exception: # IGNORE:W0703 - return - elif imgtype == 'jpeg' and imgdata is not None: - try: - pos = 0 - size = 2 - ftype = 0 - while not 0xc0 <= ftype <= 0xcf: - pos += size - byte = imgdata[pos:pos+1] - pos += 1 - while ord(byte) == 0xff: - byte = imgdata[pos:pos+1] - pos += 1 - ftype = ord(byte) - size = struct.unpack(b'>H', imgdata[pos:pos+2])[0] - 2 - pos += 2 - # We are at a SOFn block - pos += 1 # Skip `precision' byte. - height, width = struct.unpack(b'>HH', imgdata[pos:pos+4]) - pos += 4 - except Exception: # IGNORE:W0703 - return - else: - return - return width, height - -# XXX experimental -class CoverProcessor(object): - - """Create a cover page. - - """ - def __init__(self, files, metadata, rscnames, imgname=None, imgdata=None): - self.files = files - self.metadata = metadata - self.rscnames = rscnames - self.cover_page = COVER_PAGE_FINENAME - self.use_svg = USE_SVG_WRAPPER # Use svg wrapper. - self.lang = metadata.get('Language', ['en'])[0] - # This should ensure that if the methods to find the cover image's - # dimensions should fail for any reason, the SVG routine will not be used. - [self.width, self.height] = (-1,-1) - if FORCE_DEFAULT_TITLE: - self.title = DEFAULT_TITLE - else: - self.title = metadata.get('Title', [DEFAULT_TITLE])[0] - - self.cover_image = None - if imgname is not None: - self.cover_image = imgname - elif 'CoverOffset' in metadata: - imageNumber = int(metadata['CoverOffset'][0]) - cover_image = self.rscnames[imageNumber] - if cover_image is not None: - self.cover_image = cover_image - else: - print('Warning: Cannot identify the cover image.') - if self.use_svg: - try: - if imgdata is None: - fname = os.path.join(files.imgdir, self.cover_image) - [self.width, self.height] = get_image_size(fname) - else: - [self.width, self.height] = get_image_size(None, imgdata) - except: - self.use_svg = False - width = self.width - height = self.height - if width < 0 or height < 0 or width > MAX_WIDTH or height > MAX_HEIGHT: - self.use_svg = False - return - - def getImageName(self): - return self.cover_image - - def getXHTMLName(self): - return self.cover_page - - def buildXHTML(self): - print('Building a cover page.') - files = self.files - cover_image = self.cover_image - title = self.title - lang = self.lang - - image_dir = os.path.normpath(os.path.relpath(files.k8images, files.k8text)) - image_path = os.path.join(image_dir, cover_image).replace('\\', '/') - - if not self.use_svg: - data = '' - data += '' - data += 'L', idata, 0x14) - count, = struct.unpack_from(b'>L', idata, 0x18) - self.starts.append(start) - self.counts.append(count) - - def lookup(self, lookupvalue): - i = 0 - rvalue = lookupvalue - while rvalue >= self.counts[i]: - rvalue = rvalue - self.counts[i] - i += 1 - if i == len(self.counts): - print("Error: Problem with multiple inflections data sections") - return lookupvalue, self.starts[0], self.counts[0], self.infldatas[0] - return rvalue, self.starts[i], 
self.counts[i], self.infldatas[i] - - def offsets(self, value): - rvalue, start, count, data = self.lookup(value) - offset, = struct.unpack_from(b'>H', data, start + 4 + (2 * rvalue)) - if rvalue + 1 < count: - nextOffset, = struct.unpack_from(b'>H',data, start + 4 + (2 * (rvalue + 1))) - else: - nextOffset = None - return offset, nextOffset, data - - -class dictSupport(object): - - def __init__(self, mh, sect): - self.mh = mh - self.header = mh.header - self.sect = sect - self.metaOrthIndex = mh.metaOrthIndex - self.metaInflIndex = mh.metaInflIndex - - def parseHeader(self, data): - "read INDX header" - if not data[:4] == b'INDX': - print("Warning: index section is not INDX") - return False - words = ( - 'len', 'nul1', 'type', 'gen', 'start', 'count', 'code', - 'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc' - ) - num = len(words) - values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)]) - header = {} - for n in range(num): - header[words[n]] = values[n] - - ordt1 = None - ordt2 = None - - otype, oentries, op1, op2, otagx = struct.unpack_from(b'>LLLLL',data, 0xa4) - header['otype'] = otype - header['oentries'] = oentries - - if DEBUG_DICT: - print("otype %d, oentries %d, op1 %d, op2 %d, otagx %d" % (otype, oentries, op1, op2, otagx)) - - if header['code'] == 0xfdea or oentries > 0: - # some dictionaries seem to be codepage 65002 (0xFDEA) which seems - # to be some sort of strange EBCDIC utf-8 or 16 encoded strings - # So we need to look for them and store them away to process leading text - # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries - # we only ever seem to use the second but ... - # - # if otype = 0, ORDT table uses 16 bit values as offsets into the table - # if otype = 1, ORDT table uses 8 bit values as offsets inot the table - - assert(data[op1:op1+4] == b'ORDT') - assert(data[op2:op2+4] == b'ORDT') - ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4) - ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4) - - if DEBUG_DICT: - print("parsed INDX header:") - for key in header: - print(key, "%x" % header[key],) - print("\n") - return header, ordt1, ordt2 - - def getPositionMap(self): - sect = self.sect - - positionMap = {} - - metaOrthIndex = self.metaOrthIndex - metaInflIndex = self.metaInflIndex - - decodeInflection = True - if metaOrthIndex != 0xFFFFFFFF: - print("Info: Document contains orthographic index, handle as dictionary") - if metaInflIndex == 0xFFFFFFFF: - decodeInflection = False - else: - metaInflIndexData = sect.loadSection(metaInflIndex) - - print("\nParsing metaInflIndexData") - midxhdr, mhordt1, mhordt2 = self.parseHeader(metaInflIndexData) - - metaIndexCount = midxhdr['count'] - idatas = [] - for j in range(metaIndexCount): - idatas.append(sect.loadSection(metaInflIndex + 1 + j)) - dinfl = InflectionData(idatas) - - inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount) - tagSectionStart = midxhdr['len'] - inflectionControlByteCount, inflectionTagTable = readTagSection(tagSectionStart, metaInflIndexData) - if DEBUG_DICT: - print("inflectionTagTable: %s" % inflectionTagTable) - if self.hasTag(inflectionTagTable, 0x07): - print("Error: Dictionary uses obsolete inflection rule scheme which is not yet supported") - decodeInflection = False - - data = sect.loadSection(metaOrthIndex) - - print("\nParsing metaOrthIndex") - idxhdr, hordt1, hordt2 = self.parseHeader(data) - - tagSectionStart = idxhdr['len'] - controlByteCount, tagTable = readTagSection(tagSectionStart, data) - orthIndexCount = idxhdr['count'] - 
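The thirteen header words that parseHeader pulls out above can be read in a single struct call; a standalone restatement under the same layout (parse_indx_words is a hypothetical name):

    import struct

    INDX_WORDS = ('len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
                  'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc')

    def parse_indx_words(data):
        # 4-byte b'INDX' magic, then 13 big-endian uint32 fields
        if data[:4] != b'INDX':
            raise ValueError('not an INDX section')
        return dict(zip(INDX_WORDS, struct.unpack_from(b'>13L', data, 4)))
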
print("orthIndexCount is", orthIndexCount) - if DEBUG_DICT: - print("orthTagTable: %s" % tagTable) - if hordt2 is not None: - print("orth entry uses ordt2 lookup table of type ", idxhdr['otype']) - hasEntryLength = self.hasTag(tagTable, 0x02) - if not hasEntryLength: - print("Info: Index doesn't contain entry length tags") - - print("Read dictionary index data") - for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount): - data = sect.loadSection(i) - hdrinfo, ordt1, ordt2 = self.parseHeader(data) - idxtPos = hdrinfo['start'] - entryCount = hdrinfo['count'] - idxPositions = [] - for j in range(entryCount): - pos, = struct.unpack_from(b'>H', data, idxtPos + 4 + (2 * j)) - idxPositions.append(pos) - # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!) - idxPositions.append(idxtPos) - for j in range(entryCount): - startPos = idxPositions[j] - endPos = idxPositions[j+1] - textLength = ord(data[startPos:startPos+1]) - text = data[startPos+1:startPos+1+textLength] - if hordt2 is not None: - utext = u"" - if idxhdr['otype'] == 0: - pattern = b'>H' - inc = 2 - else: - pattern = b'>B' - inc = 1 - pos = 0 - while pos < textLength: - off, = struct.unpack_from(pattern, text, pos) - if off < len(hordt2): - utext += unichr(hordt2[off]) - else: - utext += unichr(off) - pos += inc - text = utext.encode('utf-8') - - tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos) - if 0x01 in tagMap: - if decodeInflection and 0x2a in tagMap: - inflectionGroups = self.getInflectionGroups(text, inflectionControlByteCount, inflectionTagTable, - dinfl, inflNameData, tagMap[0x2a]) - else: - inflectionGroups = b'' - assert len(tagMap[0x01]) == 1 - entryStartPosition = tagMap[0x01][0] - if hasEntryLength: - # The idx:entry attribute "scriptable" must be present to create entry length tags. - ml = b'' + inflectionGroups + b'' - if entryStartPosition in positionMap: - positionMap[entryStartPosition] = positionMap[entryStartPosition] + ml - else: - positionMap[entryStartPosition] = ml - assert len(tagMap[0x02]) == 1 - entryEndPosition = entryStartPosition + tagMap[0x02][0] - if entryEndPosition in positionMap: - positionMap[entryEndPosition] = b"" + positionMap[entryEndPosition] - else: - positionMap[entryEndPosition] = b"" - - else: - indexTags = b'\n\n' + inflectionGroups + b'\n' - if entryStartPosition in positionMap: - positionMap[entryStartPosition] = positionMap[entryStartPosition] + indexTags - else: - positionMap[entryStartPosition] = indexTags - return positionMap - - def hasTag(self, tagTable, tag): - ''' - Test if tag table contains given tag. - - @param tagTable: The tag table. - @param tag: The tag to search. - @return: True if tag table contains given tag; False otherwise. - ''' - for currentTag, _, _, _ in tagTable: - if currentTag == tag: - return True - return False - - def getInflectionGroups(self, mainEntry, controlByteCount, tagTable, dinfl, inflectionNames, groupList): - ''' - Create string which contains the inflection groups with inflection rules as mobipocket tags. - - @param mainEntry: The word to inflect. - @param controlByteCount: The number of control bytes. - @param tagTable: The tag table. - @param data: The Inflection data object to properly select the right inflection data section to use - @param inflectionNames: The inflection rule name data. - @param groupList: The list of inflection groups to process. - @return: String with inflection groups and rules or empty string if required tags are not available. 
- ''' - result = b"" - for value in groupList: - offset, nextOffset, data = dinfl.offsets(value) - - # First byte seems to be always 0x00 and must be skipped. - assert ord(data[offset:offset+1]) == 0x00 - tagMap = getTagMap(controlByteCount, tagTable, data, offset + 1, nextOffset) - - # Make sure that the required tags are available. - if 0x05 not in tagMap: - print("Error: Required tag 0x05 not found in tagMap") - return "" - if 0x1a not in tagMap: - print("Error: Required tag 0x1a not found in tagMap") - return b'' - - result += b'' - - for i in range(len(tagMap[0x05])): - - # Get name of inflection rule. - value = tagMap[0x05][i] - consumed, textLength = getVariableWidthValue(inflectionNames, value) - inflectionName = inflectionNames[value+consumed:value+consumed+textLength] - - # Get and apply inflection rule across possibly multiple inflection data sections - value = tagMap[0x1a][i] - rvalue, start, count, data = dinfl.lookup(value) - offset, = struct.unpack_from(b'>H', data, start + 4 + (2 * rvalue)) - textLength = ord(data[offset:offset+1]) - inflection = self.applyInflectionRule(mainEntry, data, offset+1, offset+1+textLength) - if inflection is not None: - result += b' ' - - result += b'' - return result - - def applyInflectionRule(self, mainEntry, inflectionRuleData, start, end): - ''' - Apply inflection rule. - - @param mainEntry: The word to inflect. - @param inflectionRuleData: The inflection rules. - @param start: The start position of the inflection rule to use. - @param end: The end position of the inflection rule to use. - @return: The string with the inflected word or None if an error occurs. - ''' - mode = -1 - byteArray = array.array(array_format, mainEntry) - position = len(byteArray) - for charOffset in range(start, end): - char = inflectionRuleData[charOffset:charOffset+1] - abyte = ord(char) - if abyte >= 0x0a and abyte <= 0x13: - # Move cursor backwards - offset = abyte - 0x0a - if mode not in [0x02, 0x03]: - mode = 0x02 - position = len(byteArray) - position -= offset - elif abyte > 0x13: - if mode == -1: - print("Error: Unexpected first byte %i of inflection rule" % abyte) - return None - elif position == -1: - print("Error: Unexpected first byte %i of inflection rule" % abyte) - return None - else: - if mode == 0x01: - # Insert at word start - byteArray.insert(position, abyte) - position += 1 - elif mode == 0x02: - # Insert at word end - byteArray.insert(position, abyte) - elif mode == 0x03: - # Delete at word end - position -= 1 - deleted = byteArray.pop(position) - if bchr(deleted) != char: - if DEBUG_DICT: - print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted))) - print("Error: Delete operation of inflection rule failed") - return None - elif mode == 0x04: - # Delete at word start - deleted = byteArray.pop(position) - if bchr(deleted) != char: - if DEBUG_DICT: - print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted))) - print("Error: Delete operation of inflection rule failed") - return None - else: - print("Error: Inflection rule mode %x is not implemented" % mode) - return None - elif abyte == 0x01: - # Insert at word start - if mode not in [0x01, 0x04]: - position = 0 - mode = abyte - elif abyte == 0x02: - # Insert at word end - if mode not in [0x02, 0x03]: - position = len(byteArray) - mode = abyte - elif abyte == 0x03: - # Delete at word end - if mode not in [0x02, 0x03]: - position = len(byteArray) - mode = abyte - elif abyte == 0x04: - # Delete at word start - if 
mode not in [0x01, 0x04]: - position = 0 - # Delete at word start - mode = abyte - else: - print("Error: Inflection rule mode %x is not implemented" % abyte) - return None - return utf8_str(byteArray.tostring()) diff --git a/epy_extras/KindleUnpack/mobi_header.py b/epy_extras/KindleUnpack/mobi_header.py deleted file mode 100644 index a15f636..0000000 --- a/epy_extras/KindleUnpack/mobi_header.py +++ /dev/null @@ -1,936 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab - -from __future__ import unicode_literals, division, absolute_import, print_function - -DEBUG_USE_ORDERED_DICTIONARY = False # OrderedDict is supoorted >= python 2.7. -""" set to True to use OrderedDict for MobiHeader.metadata.""" - -if DEBUG_USE_ORDERED_DICTIONARY: - from collections import OrderedDict as dict_ -else: - dict_ = dict - -from .compatibility_utils import PY2, unicode_str, hexlify, bord - -if PY2: - range = xrange - -import struct -import uuid - -# import the mobiunpack support libraries -from .mobi_utils import getLanguage -from .mobi_uncompress import HuffcdicReader, PalmdocReader, UncompressedReader - -class unpackException(Exception): - pass - - -def sortedHeaderKeys(mheader): - hdrkeys = sorted(list(mheader.keys()), key=lambda akey: mheader[akey][0]) - return hdrkeys - - -# HD Containers have their own headers and their own EXTH -# this is just guesswork so far, making big assumption that -# metavalue key numbers remain the same in the CONT EXTH - -# Note: The layout of the CONT Header is still unknown -# so just deal with their EXTH sections for now - -def dump_contexth(cpage, extheader): - # determine text encoding - codec = 'windows-1252' - codec_map = { - 1252 : 'windows-1252', - 65001: 'utf-8', - } - if cpage in codec_map: - codec = codec_map[cpage] - if extheader == b'': - return - id_map_strings = { - 1 : 'Drm Server Id', - 2 : 'Drm Commerce Id', - 3 : 'Drm Ebookbase Book Id', - 4 : 'Drm Ebookbase Dep Id', - 100 : 'Creator', - 101 : 'Publisher', - 102 : 'Imprint', - 103 : 'Description', - 104 : 'ISBN', - 105 : 'Subject', - 106 : 'Published', - 107 : 'Review', - 108 : 'Contributor', - 109 : 'Rights', - 110 : 'SubjectCode', - 111 : 'Type', - 112 : 'Source', - 113 : 'ASIN', - # 114 : 'versionNumber', - 117 : 'Adult', - 118 : 'Retail-Price', - 119 : 'Retail-Currency', - 120 : 'TSC', - 122 : 'fixed-layout', - 123 : 'book-type', - 124 : 'orientation-lock', - 126 : 'original-resolution', - 127 : 'zero-gutter', - 128 : 'zero-margin', - 129 : 'MetadataResourceURI', - 132 : 'RegionMagnification', - 150 : 'LendingEnabled', - 200 : 'DictShortName', - 501 : 'cdeType', - 502 : 'last_update_time', - 503 : 'Updated_Title', - 504 : 'CDEContentKey', - 505 : 'AmazonContentReference', - 506 : 'Title-Language', - 507 : 'Title-Display-Direction', - 508 : 'Title-Pronunciation', - 509 : 'Title-Collation', - 510 : 'Secondary-Title', - 511 : 'Secondary-Title-Language', - 512 : 'Secondary-Title-Direction', - 513 : 'Secondary-Title-Pronunciation', - 514 : 'Secondary-Title-Collation', - 515 : 'Author-Language', - 516 : 'Author-Display-Direction', - 517 : 'Author-Pronunciation', - 518 : 'Author-Collation', - 519 : 'Author-Type', - 520 : 'Publisher-Language', - 521 : 'Publisher-Display-Direction', - 522 : 'Publisher-Pronunciation', - 523 : 'Publisher-Collation', - 524 : 'Content-Language-Tag', - 525 : 'primary-writing-mode', - 526 : 'NCX-Ingested-By-Software', - 527 : 'page-progression-direction', - 528 : 'override-kindle-fonts', - 529 : 'Compression-Upgraded', - 530 : 
'Soft-Hyphens-In-Content', - 531 : 'Dictionary_In_Langague', - 532 : 'Dictionary_Out_Language', - 533 : 'Font_Converted', - 534 : 'Amazon_Creator_Info', - 535 : 'Creator-Build-Tag', - 536 : 'HD-Media-Containers-Info', # CONT_Header is 0, Ends with CONTAINER_BOUNDARY (or Asset_Type?) - 538 : 'Resource-Container-Fidelity', - 539 : 'HD-Container-Mimetype', - 540 : 'Sample-For_Special-Purpose', - 541 : 'Kindletool-Operation-Information', - 542 : 'Container_Id', - 543 : 'Asset-Type', # FONT_CONTAINER, BW_CONTAINER, HD_CONTAINER - 544 : 'Unknown_544', - } - id_map_values = { - 114 : 'versionNumber', - 115 : 'sample', - 116 : 'StartOffset', - 121 : 'Mobi8-Boundary-Section', - 125 : 'Embedded-Record-Count', - 130 : 'Offline-Sample', - 131 : 'Metadata-Record-Offset', - 201 : 'CoverOffset', - 202 : 'ThumbOffset', - 203 : 'HasFakeCover', - 204 : 'Creator-Software', - 205 : 'Creator-Major-Version', - 206 : 'Creator-Minor-Version', - 207 : 'Creator-Build-Number', - 401 : 'Clipping-Limit', - 402 : 'Publisher-Limit', - 404 : 'Text-to-Speech-Disabled', - 406 : 'Rental-Expiration-Time', - } - id_map_hexstrings = { - 208 : 'Watermark_(hex)', - 209 : 'Tamper-Proof-Keys_(hex)', - 300 : 'Font-Signature_(hex)', - 403 : 'Unknown_(403)_(hex)', - 405 : 'Ownership-Type_(hex)', - 407 : 'Unknown_(407)_(hex)', - 420 : 'Multimedia-Content-Reference_(hex)', - 450 : 'Locations_Match_(hex)', - 451 : 'Full-Story-Length_(hex)', - 452 : 'Sample-Start_Location_(hex)', - 453 : 'Sample-End-Location_(hex)', - } - _length, num_items = struct.unpack(b'>LL', extheader[4:12]) - extheader = extheader[12:] - pos = 0 - for _ in range(num_items): - id, size = struct.unpack(b'>LL', extheader[pos:pos+8]) - content = extheader[pos + 8: pos + size] - if id in id_map_strings: - name = id_map_strings[id] - print('\n Key: "%s"\n Value: "%s"' % (name, content.decode(codec, errors='replace'))) - elif id in id_map_values: - name = id_map_values[id] - if size == 9: - value, = struct.unpack(b'B',content) - print('\n Key: "%s"\n Value: 0x%01x' % (name, value)) - elif size == 10: - value, = struct.unpack(b'>H',content) - print('\n Key: "%s"\n Value: 0x%02x' % (name, value)) - elif size == 12: - value, = struct.unpack(b'>L',content) - print('\n Key: "%s"\n Value: 0x%04x' % (name, value)) - else: - print("\nError: Value for %s has unexpected size of %s" % (name, size)) - elif id in id_map_hexstrings: - name = id_map_hexstrings[id] - print('\n Key: "%s"\n Value: 0x%s' % (name, hexlify(content))) - else: - print("\nWarning: Unknown metadata with id %s found" % id) - name = str(id) + ' (hex)' - print(' Key: "%s"\n Value: 0x%s' % (name, hexlify(content))) - pos += size - return - - -class MobiHeader: - # all values are packed in big endian format - palmdoc_header = { - 'compression_type' : (0x00, b'>H', 2), - 'fill0' : (0x02, b'>H', 2), - 'text_length' : (0x04, b'>L', 4), - 'text_records' : (0x08, b'>H', 2), - 'max_section_size' : (0x0a, b'>H', 2), - 'read_pos ' : (0x0c, b'>L', 4), - } - - mobi6_header = { - 'compression_type' : (0x00, b'>H', 2), - 'fill0' : (0x02, b'>H', 2), - 'text_length' : (0x04, b'>L', 4), - 'text_records' : (0x08, b'>H', 2), - 'max_section_size' : (0x0a, b'>H', 2), - 'crypto_type' : (0x0c, b'>H', 2), - 'fill1' : (0x0e, b'>H', 2), - 'magic' : (0x10, b'4s', 4), - 'header_length (from MOBI)' : (0x14, b'>L', 4), - 'type' : (0x18, b'>L', 4), - 'codepage' : (0x1c, b'>L', 4), - 'unique_id' : (0x20, b'>L', 4), - 'version' : (0x24, b'>L', 4), - 'metaorthindex' : (0x28, b'>L', 4), - 'metainflindex' : (0x2c, b'>L', 4), - 'index_names' : (0x30, 
b'>L', 4), - 'index_keys' : (0x34, b'>L', 4), - 'extra_index0' : (0x38, b'>L', 4), - 'extra_index1' : (0x3c, b'>L', 4), - 'extra_index2' : (0x40, b'>L', 4), - 'extra_index3' : (0x44, b'>L', 4), - 'extra_index4' : (0x48, b'>L', 4), - 'extra_index5' : (0x4c, b'>L', 4), - 'first_nontext' : (0x50, b'>L', 4), - 'title_offset' : (0x54, b'>L', 4), - 'title_length' : (0x58, b'>L', 4), - 'language_code' : (0x5c, b'>L', 4), - 'dict_in_lang' : (0x60, b'>L', 4), - 'dict_out_lang' : (0x64, b'>L', 4), - 'min_version' : (0x68, b'>L', 4), - 'first_resc_offset' : (0x6c, b'>L', 4), - 'huff_offset' : (0x70, b'>L', 4), - 'huff_num' : (0x74, b'>L', 4), - 'huff_tbl_offset' : (0x78, b'>L', 4), - 'huff_tbl_len' : (0x7c, b'>L', 4), - 'exth_flags' : (0x80, b'>L', 4), - 'fill3_a' : (0x84, b'>L', 4), - 'fill3_b' : (0x88, b'>L', 4), - 'fill3_c' : (0x8c, b'>L', 4), - 'fill3_d' : (0x90, b'>L', 4), - 'fill3_e' : (0x94, b'>L', 4), - 'fill3_f' : (0x98, b'>L', 4), - 'fill3_g' : (0x9c, b'>L', 4), - 'fill3_h' : (0xa0, b'>L', 4), - 'unknown0' : (0xa4, b'>L', 4), - 'drm_offset' : (0xa8, b'>L', 4), - 'drm_count' : (0xac, b'>L', 4), - 'drm_size' : (0xb0, b'>L', 4), - 'drm_flags' : (0xb4, b'>L', 4), - 'fill4_a' : (0xb8, b'>L', 4), - 'fill4_b' : (0xbc, b'>L', 4), - 'first_content' : (0xc0, b'>H', 2), - 'last_content' : (0xc2, b'>H', 2), - 'unknown0' : (0xc4, b'>L', 4), - 'fcis_offset' : (0xc8, b'>L', 4), - 'fcis_count' : (0xcc, b'>L', 4), - 'flis_offset' : (0xd0, b'>L', 4), - 'flis_count' : (0xd4, b'>L', 4), - 'unknown1' : (0xd8, b'>L', 4), - 'unknown2' : (0xdc, b'>L', 4), - 'srcs_offset' : (0xe0, b'>L', 4), - 'srcs_count' : (0xe4, b'>L', 4), - 'unknown3' : (0xe8, b'>L', 4), - 'unknown4' : (0xec, b'>L', 4), - 'fill5' : (0xf0, b'>H', 2), - 'traildata_flags' : (0xf2, b'>H', 2), - 'ncx_index' : (0xf4, b'>L', 4), - 'unknown5' : (0xf8, b'>L', 4), - 'unknown6' : (0xfc, b'>L', 4), - 'datp_offset' : (0x100, b'>L', 4), - 'unknown7' : (0x104, b'>L', 4), - 'Unknown ' : (0x108, b'>L', 4), - 'Unknown ' : (0x10C, b'>L', 4), - 'Unknown ' : (0x110, b'>L', 4), - 'Unknown ' : (0x114, b'>L', 4), - 'Unknown ' : (0x118, b'>L', 4), - 'Unknown ' : (0x11C, b'>L', 4), - 'Unknown ' : (0x120, b'>L', 4), - 'Unknown ' : (0x124, b'>L', 4), - 'Unknown ' : (0x128, b'>L', 4), - 'Unknown ' : (0x12C, b'>L', 4), - 'Unknown ' : (0x130, b'>L', 4), - 'Unknown ' : (0x134, b'>L', 4), - 'Unknown ' : (0x138, b'>L', 4), - 'Unknown ' : (0x11C, b'>L', 4), - } - - mobi8_header = { - 'compression_type' : (0x00, b'>H', 2), - 'fill0' : (0x02, b'>H', 2), - 'text_length' : (0x04, b'>L', 4), - 'text_records' : (0x08, b'>H', 2), - 'max_section_size' : (0x0a, b'>H', 2), - 'crypto_type' : (0x0c, b'>H', 2), - 'fill1' : (0x0e, b'>H', 2), - 'magic' : (0x10, b'4s', 4), - 'header_length (from MOBI)' : (0x14, b'>L', 4), - 'type' : (0x18, b'>L', 4), - 'codepage' : (0x1c, b'>L', 4), - 'unique_id' : (0x20, b'>L', 4), - 'version' : (0x24, b'>L', 4), - 'metaorthindex' : (0x28, b'>L', 4), - 'metainflindex' : (0x2c, b'>L', 4), - 'index_names' : (0x30, b'>L', 4), - 'index_keys' : (0x34, b'>L', 4), - 'extra_index0' : (0x38, b'>L', 4), - 'extra_index1' : (0x3c, b'>L', 4), - 'extra_index2' : (0x40, b'>L', 4), - 'extra_index3' : (0x44, b'>L', 4), - 'extra_index4' : (0x48, b'>L', 4), - 'extra_index5' : (0x4c, b'>L', 4), - 'first_nontext' : (0x50, b'>L', 4), - 'title_offset' : (0x54, b'>L', 4), - 'title_length' : (0x58, b'>L', 4), - 'language_code' : (0x5c, b'>L', 4), - 'dict_in_lang' : (0x60, b'>L', 4), - 'dict_out_lang' : (0x64, b'>L', 4), - 'min_version' : (0x68, b'>L', 4), - 'first_resc_offset' : 
(0x6c, b'>L', 4), - 'huff_offset' : (0x70, b'>L', 4), - 'huff_num' : (0x74, b'>L', 4), - 'huff_tbl_offset' : (0x78, b'>L', 4), - 'huff_tbl_len' : (0x7c, b'>L', 4), - 'exth_flags' : (0x80, b'>L', 4), - 'fill3_a' : (0x84, b'>L', 4), - 'fill3_b' : (0x88, b'>L', 4), - 'fill3_c' : (0x8c, b'>L', 4), - 'fill3_d' : (0x90, b'>L', 4), - 'fill3_e' : (0x94, b'>L', 4), - 'fill3_f' : (0x98, b'>L', 4), - 'fill3_g' : (0x9c, b'>L', 4), - 'fill3_h' : (0xa0, b'>L', 4), - 'unknown0' : (0xa4, b'>L', 4), - 'drm_offset' : (0xa8, b'>L', 4), - 'drm_count' : (0xac, b'>L', 4), - 'drm_size' : (0xb0, b'>L', 4), - 'drm_flags' : (0xb4, b'>L', 4), - 'fill4_a' : (0xb8, b'>L', 4), - 'fill4_b' : (0xbc, b'>L', 4), - 'fdst_offset' : (0xc0, b'>L', 4), - 'fdst_flow_count' : (0xc4, b'>L', 4), - 'fcis_offset' : (0xc8, b'>L', 4), - 'fcis_count' : (0xcc, b'>L', 4), - 'flis_offset' : (0xd0, b'>L', 4), - 'flis_count' : (0xd4, b'>L', 4), - 'unknown1' : (0xd8, b'>L', 4), - 'unknown2' : (0xdc, b'>L', 4), - 'srcs_offset' : (0xe0, b'>L', 4), - 'srcs_count' : (0xe4, b'>L', 4), - 'unknown3' : (0xe8, b'>L', 4), - 'unknown4' : (0xec, b'>L', 4), - 'fill5' : (0xf0, b'>H', 2), - 'traildata_flags' : (0xf2, b'>H', 2), - 'ncx_index' : (0xf4, b'>L', 4), - 'fragment_index' : (0xf8, b'>L', 4), - 'skeleton_index' : (0xfc, b'>L', 4), - 'datp_offset' : (0x100, b'>L', 4), - 'guide_index' : (0x104, b'>L', 4), - 'Unknown ' : (0x108, b'>L', 4), - 'Unknown ' : (0x10C, b'>L', 4), - 'Unknown ' : (0x110, b'>L', 4), - 'Unknown ' : (0x114, b'>L', 4), - 'Unknown ' : (0x118, b'>L', 4), - 'Unknown ' : (0x11C, b'>L', 4), - 'Unknown ' : (0x120, b'>L', 4), - 'Unknown ' : (0x124, b'>L', 4), - 'Unknown ' : (0x128, b'>L', 4), - 'Unknown ' : (0x12C, b'>L', 4), - 'Unknown ' : (0x130, b'>L', 4), - 'Unknown ' : (0x134, b'>L', 4), - 'Unknown ' : (0x138, b'>L', 4), - 'Unknown ' : (0x11C, b'>L', 4), - } - - palmdoc_header_sorted_keys = sortedHeaderKeys(palmdoc_header) - mobi6_header_sorted_keys = sortedHeaderKeys(mobi6_header) - mobi8_header_sorted_keys = sortedHeaderKeys(mobi8_header) - - id_map_strings = { - 1 : 'Drm Server Id', - 2 : 'Drm Commerce Id', - 3 : 'Drm Ebookbase Book Id', - 4 : 'Drm Ebookbase Dep Id', - 100 : 'Creator', - 101 : 'Publisher', - 102 : 'Imprint', - 103 : 'Description', - 104 : 'ISBN', - 105 : 'Subject', - 106 : 'Published', - 107 : 'Review', - 108 : 'Contributor', - 109 : 'Rights', - 110 : 'SubjectCode', - 111 : 'Type', - 112 : 'Source', - 113 : 'ASIN', - # 114 : 'versionNumber', - 117 : 'Adult', - 118 : 'Retail-Price', - 119 : 'Retail-Currency', - 120 : 'TSC', - 122 : 'fixed-layout', - 123 : 'book-type', - 124 : 'orientation-lock', - 126 : 'original-resolution', - 127 : 'zero-gutter', - 128 : 'zero-margin', - 129 : 'MetadataResourceURI', - 132 : 'RegionMagnification', - 150 : 'LendingEnabled', - 200 : 'DictShortName', - 501 : 'cdeType', - 502 : 'last_update_time', - 503 : 'Updated_Title', - 504 : 'CDEContentKey', - 505 : 'AmazonContentReference', - 506 : 'Title-Language', - 507 : 'Title-Display-Direction', - 508 : 'Title-Pronunciation', - 509 : 'Title-Collation', - 510 : 'Secondary-Title', - 511 : 'Secondary-Title-Language', - 512 : 'Secondary-Title-Direction', - 513 : 'Secondary-Title-Pronunciation', - 514 : 'Secondary-Title-Collation', - 515 : 'Author-Language', - 516 : 'Author-Display-Direction', - 517 : 'Author-Pronunciation', - 518 : 'Author-Collation', - 519 : 'Author-Type', - 520 : 'Publisher-Language', - 521 : 'Publisher-Display-Direction', - 522 : 'Publisher-Pronunciation', - 523 : 'Publisher-Collation', - 524 : 'Content-Language-Tag', - 525 : 
'primary-writing-mode', - 526 : 'NCX-Ingested-By-Software', - 527 : 'page-progression-direction', - 528 : 'override-kindle-fonts', - 529 : 'Compression-Upgraded', - 530 : 'Soft-Hyphens-In-Content', - 531 : 'Dictionary_In_Langague', - 532 : 'Dictionary_Out_Language', - 533 : 'Font_Converted', - 534 : 'Amazon_Creator_Info', - 535 : 'Creator-Build-Tag', - 536 : 'HD-Media-Containers-Info', # CONT_Header is 0, Ends with CONTAINER_BOUNDARY (or Asset_Type?) - 538 : 'Resource-Container-Fidelity', - 539 : 'HD-Container-Mimetype', - 540 : 'Sample-For_Special-Purpose', - 541 : 'Kindletool-Operation-Information', - 542 : 'Container_Id', - 543 : 'Asset-Type', # FONT_CONTAINER, BW_CONTAINER, HD_CONTAINER - 544 : 'Unknown_544', - } - id_map_values = { - 114 : 'versionNumber', - 115 : 'sample', - 116 : 'StartOffset', - 121 : 'Mobi8-Boundary-Section', - 125 : 'Embedded-Record-Count', - 130 : 'Offline-Sample', - 131 : 'Metadata-Record-Offset', - 201 : 'CoverOffset', - 202 : 'ThumbOffset', - 203 : 'HasFakeCover', - 204 : 'Creator-Software', - 205 : 'Creator-Major-Version', - 206 : 'Creator-Minor-Version', - 207 : 'Creator-Build-Number', - 401 : 'Clipping-Limit', - 402 : 'Publisher-Limit', - 404 : 'Text-to-Speech-Disabled', - 406 : 'Rental-Expiration-Time', - } - id_map_hexstrings = { - 208 : 'Watermark_(hex)', - 209 : 'Tamper-Proof-Keys_(hex)', - 300 : 'Font-Signature_(hex)', - 403 : 'Unknown_(403)_(hex)', - 405 : 'Ownership-Type_(hex)', - 407 : 'Unknown_(407)_(hex)', - 420 : 'Multimedia-Content-Reference_(hex)', - 450 : 'Locations_Match_(hex)', - 451 : 'Full-Story-Length_(hex)', - 452 : 'Sample-Start_Location_(hex)', - 453 : 'Sample-End-Location_(hex)', - } - - def __init__(self, sect, sectNumber): - self.sect = sect - self.start = sectNumber - self.header = self.sect.loadSection(self.start) - if len(self.header)>20 and self.header[16:20] == b'MOBI': - self.sect.setsectiondescription(0,"Mobipocket Header") - self.palm = False - elif self.sect.ident == b'TEXtREAd': - self.sect.setsectiondescription(0, "PalmDOC Header") - self.palm = True - else: - raise unpackException('Unknown File Format') - - self.records, = struct.unpack_from(b'>H', self.header, 0x8) - - # set defaults in case this is a PalmDOC - self.title = self.sect.palmname.decode('latin-1', errors='replace') - self.length = len(self.header)-16 - self.type = 3 - self.codepage = 1252 - self.codec = 'windows-1252' - self.unique_id = 0 - self.version = 0 - self.hasExth = False - self.exth = b'' - self.exth_offset = self.length + 16 - self.exth_length = 0 - self.crypto_type = 0 - self.firstnontext = self.start+self.records + 1 - self.firstresource = self.start+self.records + 1 - self.ncxidx = 0xffffffff - self.metaOrthIndex = 0xffffffff - self.metaInflIndex = 0xffffffff - self.skelidx = 0xffffffff - self.fragidx = 0xffffffff - self.guideidx = 0xffffffff - self.fdst = 0xffffffff - self.mlstart = self.sect.loadSection(self.start+1)[:4] - self.rawSize = 0 - self.metadata = dict_() - - # set up for decompression/unpacking - self.compression, = struct.unpack_from(b'>H', self.header, 0x0) - if self.compression == 0x4448: - reader = HuffcdicReader() - huffoff, huffnum = struct.unpack_from(b'>LL', self.header, 0x70) - huffoff = huffoff + self.start - self.sect.setsectiondescription(huffoff,"Huffman Compression Seed") - reader.loadHuff(self.sect.loadSection(huffoff)) - for i in range(1, huffnum): - self.sect.setsectiondescription(huffoff+i,"Huffman CDIC Compression Seed %d" % i) - reader.loadCdic(self.sect.loadSection(huffoff+i)) - self.unpack = reader.unpack - 
elif self.compression == 2: - self.unpack = PalmdocReader().unpack - elif self.compression == 1: - self.unpack = UncompressedReader().unpack - else: - raise unpackException('invalid compression type: 0x%4x' % self.compression) - - if self.palm: - return - - self.length, self.type, self.codepage, self.unique_id, self.version = struct.unpack(b'>LLLLL', self.header[20:40]) - codec_map = { - 1252 : 'windows-1252', - 65001: 'utf-8', - } - if self.codepage in codec_map: - self.codec = codec_map[self.codepage] - - # title - toff, tlen = struct.unpack(b'>II', self.header[0x54:0x5c]) - tend = toff + tlen - self.title=self.header[toff:tend].decode(self.codec, errors='replace') - - exth_flag, = struct.unpack(b'>L', self.header[0x80:0x84]) - self.hasExth = exth_flag & 0x40 - self.exth_offset = self.length + 16 - self.exth_length = 0 - if self.hasExth: - self.exth_length, = struct.unpack_from(b'>L', self.header, self.exth_offset+4) - self.exth_length = ((self.exth_length + 3)>>2)<<2 # round to next 4 byte boundary - self.exth = self.header[self.exth_offset:self.exth_offset+self.exth_length] - - # parse the exth / metadata - self.parseMetaData() - - # self.mlstart = self.sect.loadSection(self.start+1) - # self.mlstart = self.mlstart[0:4] - self.crypto_type, = struct.unpack_from(b'>H', self.header, 0xC) - - # Start sector for additional files such as images, fonts, resources, etc - # Can be missing so fall back to default set previously - ofst, = struct.unpack_from(b'>L', self.header, 0x6C) - if ofst != 0xffffffff: - self.firstresource = ofst + self.start - ofst, = struct.unpack_from(b'>L', self.header, 0x50) - if ofst != 0xffffffff: - self.firstnontext = ofst + self.start - - if self.isPrintReplica(): - return - - if self.version < 8: - # Dictionary metaOrthIndex - self.metaOrthIndex, = struct.unpack_from(b'>L', self.header, 0x28) - if self.metaOrthIndex != 0xffffffff: - self.metaOrthIndex += self.start - - # Dictionary metaInflIndex - self.metaInflIndex, = struct.unpack_from(b'>L', self.header, 0x2C) - if self.metaInflIndex != 0xffffffff: - self.metaInflIndex += self.start - - # handle older headers without any ncxindex info and later - # specifically 0xe4 headers - if self.length + 16 < 0xf8: - return - - # NCX Index - self.ncxidx, = struct.unpack(b'>L', self.header[0xf4:0xf8]) - if self.ncxidx != 0xffffffff: - self.ncxidx += self.start - - # K8 specific Indexes - if self.start != 0 or self.version == 8: - # Index into file skeletons in RawML - self.skelidx, = struct.unpack_from(b'>L', self.header, 0xfc) - if self.skelidx != 0xffffffff: - self.skelidx += self.start - - # Index into
sections in RawML - self.fragidx, = struct.unpack_from(b'>L', self.header, 0xf8) - if self.fragidx != 0xffffffff: - self.fragidx += self.start - - # Index into Other files - self.guideidx, = struct.unpack_from(b'>L', self.header, 0x104) - if self.guideidx != 0xffffffff: - self.guideidx += self.start - - # dictionaries do not seem to use the same approach in K8's - # so disable them - self.metaOrthIndex = 0xffffffff - self.metaInflIndex = 0xffffffff - - # need to use the FDST record to find out how to properly unpack - # the rawML into pieces - # it is simply a table of start and end locations for each flow piece - self.fdst, = struct.unpack_from(b'>L', self.header, 0xc0) - self.fdstcnt, = struct.unpack_from(b'>L', self.header, 0xc4) - # if cnt is 1 or less, fdst section mumber can be garbage - if self.fdstcnt <= 1: - self.fdst = 0xffffffff - if self.fdst != 0xffffffff: - self.fdst += self.start - # setting of fdst section description properly handled in mobi_kf8proc - - def dump_exth(self): - # determine text encoding - codec=self.codec - if (not self.hasExth) or (self.exth_length) == 0 or (self.exth == b''): - return - num_items, = struct.unpack(b'>L', self.exth[8:12]) - pos = 12 - print("Key Size Description Value") - for _ in range(num_items): - id, size = struct.unpack(b'>LL', self.exth[pos:pos+8]) - contentsize = size-8 - content = self.exth[pos + 8: pos + size] - if id in MobiHeader.id_map_strings: - exth_name = MobiHeader.id_map_strings[id] - print('{0: >3d} {1: >4d} {2: <30s} {3:s}'.format(id, contentsize, exth_name, content.decode(codec, errors='replace'))) - elif id in MobiHeader.id_map_values: - exth_name = MobiHeader.id_map_values[id] - if size == 9: - value, = struct.unpack(b'B',content) - print('{0:3d} byte {1:<30s} {2:d}'.format(id, exth_name, value)) - elif size == 10: - value, = struct.unpack(b'>H',content) - print('{0:3d} word {1:<30s} 0x{2:0>4X} ({2:d})'.format(id, exth_name, value)) - elif size == 12: - value, = struct.unpack(b'>L',content) - print('{0:3d} long {1:<30s} 0x{2:0>8X} ({2:d})'.format(id, exth_name, value)) - else: - print('{0: >3d} {1: >4d} {2: <30s} (0x{3:s})'.format(id, contentsize, "Bad size for "+exth_name, hexlify(content))) - elif id in MobiHeader.id_map_hexstrings: - exth_name = MobiHeader.id_map_hexstrings[id] - print('{0:3d} {1:4d} {2:<30s} 0x{3:s}'.format(id, contentsize, exth_name, hexlify(content))) - else: - exth_name = "Unknown EXTH ID {0:d}".format(id) - print("{0: >3d} {1: >4d} {2: <30s} 0x{3:s}".format(id, contentsize, exth_name, hexlify(content))) - pos += size - return - - def dumpheader(self): - # first 16 bytes are not part of the official mobiheader - # but we will treat it as such - # so section 0 is 16 (decimal) + self.length in total == at least 0x108 bytes for Mobi 8 headers - print("Dumping section %d, Mobipocket Header version: %d, total length %d" % (self.start,self.version, self.length+16)) - self.hdr = {} - # set it up for the proper header version - if self.version == 0: - self.mobi_header = MobiHeader.palmdoc_header - self.mobi_header_sorted_keys = MobiHeader.palmdoc_header_sorted_keys - elif self.version < 8: - self.mobi_header = MobiHeader.mobi6_header - self.mobi_header_sorted_keys = MobiHeader.mobi6_header_sorted_keys - else: - self.mobi_header = MobiHeader.mobi8_header - self.mobi_header_sorted_keys = MobiHeader.mobi8_header_sorted_keys - - # parse the header information - for key in self.mobi_header_sorted_keys: - (pos, format, tot_len) = self.mobi_header[key] - if pos < (self.length + 16): - val, = 
struct.unpack_from(format, self.header, pos) - self.hdr[key] = val - - if 'title_offset' in self.hdr: - title_offset = self.hdr['title_offset'] - title_length = self.hdr['title_length'] - else: - title_offset = 0 - title_length = 0 - if title_offset == 0: - title_offset = len(self.header) - title_length = 0 - self.title = self.sect.palmname.decode('latin-1', errors='replace') - else: - self.title = self.header[title_offset:title_offset+title_length].decode(self.codec, errors='replace') - # title record always padded with two nul bytes and then padded with nuls to next 4 byte boundary - title_length = ((title_length+2+3)>>2)<<2 - - self.extra1 = self.header[self.exth_offset+self.exth_length:title_offset] - self.extra2 = self.header[title_offset+title_length:] - - print("Mobipocket header from section %d" % self.start) - print(" Offset Value Hex Dec Description") - for key in self.mobi_header_sorted_keys: - (pos, format, tot_len) = self.mobi_header[key] - if pos < (self.length + 16): - if key != 'magic': - fmt_string = "0x{0:0>3X} ({0:3d}){1: >" + str(9-2*tot_len) +"s}0x{2:0>" + str(2*tot_len) + "X} {2:10d} {3:s}" - else: - self.hdr[key] = unicode_str(self.hdr[key]) - fmt_string = "0x{0:0>3X} ({0:3d}){2:>11s} {3:s}" - print(fmt_string.format(pos, " ",self.hdr[key], key)) - print("") - - if self.exth_length > 0: - print("EXTH metadata, offset %d, padded length %d" % (self.exth_offset,self.exth_length)) - self.dump_exth() - print("") - - if len(self.extra1) > 0: - print("Extra data between EXTH and Title, length %d" % len(self.extra1)) - print(hexlify(self.extra1)) - print("") - - if title_length > 0: - print("Title in header at offset %d, padded length %d: '%s'" %(title_offset,title_length,self.title)) - print("") - - if len(self.extra2) > 0: - print("Extra data between Title and end of header, length %d" % len(self.extra2)) - print(hexlify(self.extra2)) - print("") - - def isPrintReplica(self): - return self.mlstart[0:4] == b"%MOP" - - def isK8(self): - return self.start != 0 or self.version == 8 - - def isEncrypted(self): - return self.crypto_type != 0 - - def hasNCX(self): - return self.ncxidx != 0xffffffff - - def isDictionary(self): - return self.metaOrthIndex != 0xffffffff - - def getncxIndex(self): - return self.ncxidx - - def decompress(self, data): - return self.unpack(data) - - def Language(self): - langcode = struct.unpack(b'!L', self.header[0x5c:0x60])[0] - langid = langcode & 0xFF - sublangid = (langcode >> 8) & 0xFF - return getLanguage(langid, sublangid) - - def DictInLanguage(self): - if self.isDictionary(): - langcode = struct.unpack(b'!L', self.header[0x60:0x64])[0] - langid = langcode & 0xFF - sublangid = (langcode >> 10) & 0xFF - if langid != 0: - return getLanguage(langid, sublangid) - return False - - def DictOutLanguage(self): - if self.isDictionary(): - langcode = struct.unpack(b'!L', self.header[0x64:0x68])[0] - langid = langcode & 0xFF - sublangid = (langcode >> 10) & 0xFF - if langid != 0: - return getLanguage(langid, sublangid) - return False - - def getRawML(self): - def getSizeOfTrailingDataEntry(data): - num = 0 - for v in data[-4:]: - if bord(v) & 0x80: - num = 0 - num = (num << 7) | (bord(v) & 0x7f) - return num - def trimTrailingDataEntries(data): - for _ in range(trailers): - num = getSizeOfTrailingDataEntry(data) - data = data[:-num] - if multibyte: - num = (ord(data[-1:]) & 3) + 1 - data = data[:-num] - return data - multibyte = 0 - trailers = 0 - if self.sect.ident == b'BOOKMOBI': - mobi_length, = struct.unpack_from(b'>L', self.header, 0x14) - 
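Two statements down, getRawML unpacks the trail-data flag word at offset 0xF2 and then loops over its bits; restated on its own (count_trailing_entries is a hypothetical name), the rule is: bit 0 marks the multibyte-overlap bytes at each record's end, and every other set bit adds one trailing data entry to strip.

    def count_trailing_entries(traildata_flags):
        # bit 0: multibyte overlap bytes present at the record end;
        # each remaining set bit contributes one trailing data entry
        multibyte = traildata_flags & 1
        trailers = 0
        flags = traildata_flags >> 1
        while flags:
            if flags & 1:
                trailers += 1
            flags >>= 1
        return multibyte, trailers
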
mobi_version, = struct.unpack_from(b'>L', self.header, 0x68) - if (mobi_length >= 0xE4) and (mobi_version >= 5): - flags, = struct.unpack_from(b'>H', self.header, 0xF2) - multibyte = flags & 1 - while flags > 1: - if flags & 2: - trailers += 1 - flags = flags >> 1 - # get raw mobi markup languge - print("Unpacking raw markup language") - dataList = [] - # offset = 0 - for i in range(1, self.records+1): - data = trimTrailingDataEntries(self.sect.loadSection(self.start + i)) - dataList.append(self.unpack(data)) - if self.isK8(): - self.sect.setsectiondescription(self.start + i,"KF8 Text Section {0:d}".format(i)) - elif self.version == 0: - self.sect.setsectiondescription(self.start + i,"PalmDOC Text Section {0:d}".format(i)) - else: - self.sect.setsectiondescription(self.start + i,"Mobipocket Text Section {0:d}".format(i)) - rawML = b''.join(dataList) - self.rawSize = len(rawML) - return rawML - - # all metadata is stored in a dictionary with key and returns a *list* of values - # a list is used to allow for multiple creators, multiple contributors, etc - def parseMetaData(self): - def addValue(name, value): - if name not in self.metadata: - self.metadata[name] = [value] - else: - self.metadata[name].append(value) - - codec=self.codec - if self.hasExth: - extheader=self.exth - _length, num_items = struct.unpack(b'>LL', extheader[4:12]) - extheader = extheader[12:] - pos = 0 - for _ in range(num_items): - id, size = struct.unpack(b'>LL', extheader[pos:pos+8]) - content = extheader[pos + 8: pos + size] - if id in MobiHeader.id_map_strings: - name = MobiHeader.id_map_strings[id] - addValue(name, content.decode(codec, errors='replace')) - elif id in MobiHeader.id_map_values: - name = MobiHeader.id_map_values[id] - if size == 9: - value, = struct.unpack(b'B',content) - addValue(name, unicode_str(str(value))) - elif size == 10: - value, = struct.unpack(b'>H',content) - addValue(name, unicode_str(str(value))) - elif size == 12: - value, = struct.unpack(b'>L',content) - # handle special case of missing CoverOffset or missing ThumbOffset - if id == 201 or id == 202: - if value != 0xffffffff: - addValue(name, unicode_str(str(value))) - else: - addValue(name, unicode_str(str(value))) - else: - print("Warning: Bad key, size, value combination detected in EXTH ", id, size, hexlify(content)) - addValue(name, hexlify(content)) - elif id in MobiHeader.id_map_hexstrings: - name = MobiHeader.id_map_hexstrings[id] - addValue(name, hexlify(content)) - else: - name = unicode_str(str(id)) + ' (hex)' - addValue(name, hexlify(content)) - pos += size - - # add the basics to the metadata each as a list element - self.metadata['Language'] = [self.Language()] - self.metadata['Title'] = [unicode_str(self.title,self.codec)] - self.metadata['Codec'] = [self.codec] - self.metadata['UniqueID'] = [unicode_str(str(self.unique_id))] - # if no asin create one using a uuid - if 'ASIN' not in self.metadata: - self.metadata['ASIN'] = [unicode_str(str(uuid.uuid4()))] - # if no cdeType set it to "EBOK" - if 'cdeType' not in self.metadata: - self.metadata['cdeType'] = ['EBOK'] - - def getMetaData(self): - return self.metadata - - def describeHeader(self, DUMP): - print("Mobi Version:", self.version) - print("Codec:", self.codec) - print("Title:", self.title) - if 'Updated_Title' in self.metadata: - print("EXTH Title:", self.metadata['Updated_Title'][0]) - if self.compression == 0x4448: - print("Huffdic compression") - elif self.compression == 2: - print("Palmdoc compression") - elif self.compression == 1: - print("No compression") - 
if DUMP: - self.dumpheader() diff --git a/epy_extras/KindleUnpack/mobi_html.py b/epy_extras/KindleUnpack/mobi_html.py deleted file mode 100644 index eda766c..0000000 --- a/epy_extras/KindleUnpack/mobi_html.py +++ /dev/null @@ -1,439 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab - -from __future__ import unicode_literals, division, absolute_import, print_function - -from .compatibility_utils import PY2, utf8_str - -if PY2: - range = xrange - -import re -# note: re requires the pattern to be the exact same type as the data to be searched in python3 -# but u"" is not allowed for the pattern itself only b"" - -from .mobi_utils import fromBase32 - -class HTMLProcessor: - - def __init__(self, files, metadata, rscnames): - self.files = files - self.metadata = metadata - self.rscnames = rscnames - # for original style mobis, default to including all image files in the opf manifest - self.used = {} - for name in rscnames: - self.used[name] = 'used' - - def findAnchors(self, rawtext, indx_data, positionMap): - # process the raw text - # find anchors... - print("Find link anchors") - link_pattern = re.compile(br'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''', re.IGNORECASE) - # TEST NCX: merge in filepos from indx - pos_links = [int(m.group(1)) for m in link_pattern.finditer(rawtext)] - if indx_data: - pos_indx = [e['pos'] for e in indx_data if e['pos']>0] - pos_links = list(set(pos_links + pos_indx)) - - for position in pos_links: - if position in positionMap: - positionMap[position] = positionMap[position] + utf8_str('<a id="filepos%d" />' % position) - else: - positionMap[position] = utf8_str('<a id="filepos%d" />' % position) - - # apply dictionary metadata and anchors - print("Insert data into html") - pos = 0 - lastPos = len(rawtext) - dataList = [] - for end in sorted(positionMap.keys()): - if end == 0 or end > lastPos: - continue # something's up - can't put a tag in outside <html>...</html>
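The splice performed by the loop in progress here is easier to see with toy data; a standalone rendition of the same technique (names invented for the demo):

    raw = b'0123456789'
    position_map = {3: b'[a]', 7: b'[b]'}   # offset -> queued markup

    pieces, pos = [], 0
    for end in sorted(position_map):
        pieces.append(raw[pos:end])        # untouched text up to the anchor
        pieces.append(position_map[end])   # markup queued for that filepos
        pos = end
    pieces.append(raw[pos:])
    assert b''.join(pieces) == b'012[a]3456[b]789'
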
- dataList.append(rawtext[pos:end]) - dataList.append(positionMap[end]) - pos = end - dataList.append(rawtext[pos:]) - srctext = b"".join(dataList) - rawtext = None - dataList = None - self.srctext = srctext - self.indx_data = indx_data - return srctext - - def insertHREFS(self): - srctext = self.srctext - rscnames = self.rscnames - metadata = self.metadata - - # put in the hrefs - print("Insert hrefs into html") - # There doesn't seem to be a standard, so search as best as we can - - link_pattern = re.compile(br''']*?)filepos=['"]{0,1}0*(\d+)['"]{0,1}([^>]*?)>''', re.IGNORECASE) - srctext = link_pattern.sub(br'''''', srctext) - - # remove empty anchors - print("Remove empty anchors from html") - srctext = re.sub(br"",br"", srctext) - srctext = re.sub(br"\s*",br"", srctext) - - # convert image references - print("Insert image references into html") - # split string into image tag pieces and other pieces - image_pattern = re.compile(br'''()''', re.IGNORECASE) - image_index_pattern = re.compile(br'''recindex=['"]{0,1}([0-9]+)['"]{0,1}''', re.IGNORECASE) - srcpieces = image_pattern.split(srctext) - srctext = self.srctext = None - - # all odd pieces are image tags (nulls string on even pieces if no space between them in srctext) - for i in range(1, len(srcpieces), 2): - tag = srcpieces[i] - for m in image_index_pattern.finditer(tag): - imageNumber = int(m.group(1)) - imageName = rscnames[imageNumber-1] - if imageName is None: - print("Error: Referenced image %s was not recognized as a valid image" % imageNumber) - else: - replacement = b'src="Images/' + utf8_str(imageName) + b'"' - tag = image_index_pattern.sub(replacement, tag, 1) - srcpieces[i] = tag - srctext = b"".join(srcpieces) - - # add in character set meta into the html header if needed - if 'Codec' in metadata: - srctext = srctext[0:12]+b''+srctext[12:] - return srctext, self.used - - -class XHTMLK8Processor: - - def __init__(self, rscnames, k8proc): - self.rscnames = rscnames - self.k8proc = k8proc - self.used = {} - - def buildXHTML(self): - - # first need to update all links that are internal which - # are based on positions within the xhtml files **BEFORE** - # cutting and pasting any pieces into the xhtml text files - - # kindle:pos:fid:XXXX:off:YYYYYYYYYY (used for internal link within xhtml) - # XXXX is the offset in records into divtbl - # YYYYYYYYYYYY is a base32 number you add to the divtbl insertpos to get final position - - # pos:fid pattern - posfid_pattern = re.compile(br'''()''', re.IGNORECASE) - posfid_index_pattern = re.compile(br'''['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']''') - - parts = [] - print("Building proper xhtml for each file") - for i in range(self.k8proc.getNumberOfParts()): - part = self.k8proc.getPart(i) - [partnum, dir, filename, beg, end, aidtext] = self.k8proc.getPartInfo(i) - - # internal links - srcpieces = posfid_pattern.split(part) - for j in range(1, len(srcpieces),2): - tag = srcpieces[j] - if tag.startswith(b'<'): - for m in posfid_index_pattern.finditer(tag): - posfid = m.group(1) - offset = m.group(2) - filename, idtag = self.k8proc.getIDTagByPosFid(posfid, offset) - if idtag == b'': - replacement= b'"' + utf8_str(filename) + b'"' - else: - replacement = b'"' + utf8_str(filename) + b'#' + idtag + b'"' - tag = posfid_index_pattern.sub(replacement, tag, 1) - srcpieces[j] = tag - part = b"".join(srcpieces) - parts.append(part) - - # we are free to cut and paste as we see fit - # we can safely remove all of the Kindlegen generated aid tags - # change aid ids that are in 
k8proc.linked_aids to xhtml ids - find_tag_with_aid_pattern = re.compile(br'''(<[^>]*\said\s*=[^>]*>)''', re.IGNORECASE) - within_tag_aid_position_pattern = re.compile(br'''\said\s*=['"]([^'"]*)['"]''') - for i in range(len(parts)): - part = parts[i] - srcpieces = find_tag_with_aid_pattern.split(part) - for j in range(len(srcpieces)): - tag = srcpieces[j] - if tag.startswith(b'<'): - for m in within_tag_aid_position_pattern.finditer(tag): - try: - aid = m.group(1) - except IndexError: - aid = None - replacement = b'' - if aid in self.k8proc.linked_aids: - replacement = b' id="aid-' + aid + b'"' - tag = within_tag_aid_position_pattern.sub(replacement, tag, 1) - srcpieces[j] = tag - part = b"".join(srcpieces) - parts[i] = part - - # we can safely replace all of the Kindlegen generated data-AmznPageBreak tags - # with page-break-after style patterns - find_tag_with_AmznPageBreak_pattern = re.compile(br'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''', re.IGNORECASE) - within_tag_AmznPageBreak_position_pattern = re.compile(br'''\sdata-AmznPageBreak=['"]([^'"]*)['"]''') - for i in range(len(parts)): - part = parts[i] - srcpieces = find_tag_with_AmznPageBreak_pattern.split(part) - for j in range(len(srcpieces)): - tag = srcpieces[j] - if tag.startswith(b'<'): - srcpieces[j] = within_tag_AmznPageBreak_position_pattern.sub( - lambda m:b' style="page-break-after:' + m.group(1) + b'"', tag) - part = b"".join(srcpieces) - parts[i] = part - - # we have to handle substitutions for the flows pieces first as they may - # be inlined into the xhtml text - # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images) - # kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc) - # kindle:embed:XXXX (used for fonts) - - flows = [] - flows.append(None) - flowinfo = [] - flowinfo.append([None, None, None, None]) - - # regular expression search patterns - img_pattern = re.compile(br'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE) - img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE) - - tag_pattern = re.compile(br'''(<[^>]*>)''') - flow_pattern = re.compile(br'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE) - - url_pattern = re.compile(br'''(url\(.*?\))''', re.IGNORECASE) - url_img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)\?mime=image/[^\)]*["')]''', re.IGNORECASE) - font_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)["')]''', re.IGNORECASE) - url_css_index_pattern = re.compile(br'''kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*''', re.IGNORECASE) - url_svg_image_pattern = re.compile(br'''kindle:flow:([0-9|A-V]+)\?mime=image/svg\+xml[^\)]*''', re.IGNORECASE) - - for i in range(1, self.k8proc.getNumberOfFlows()): - [ftype, format, dir, filename] = self.k8proc.getFlowInfo(i) - flowpart = self.k8proc.getFlow(i) - - # links to raster image files from image tags - # image_pattern - srcpieces = img_pattern.split(flowpart) - for j in range(1, len(srcpieces),2): - tag = srcpieces[j] - if tag.startswith(b']*>)''') - flow_pattern = re.compile(br'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE) - for i in range(len(parts)): - part = parts[i] - [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i] - # flow pattern - srcpieces = tag_pattern.split(part) - for j in range(1, len(srcpieces),2): - tag = srcpieces[j] - if tag.startswith(b'<'): - for m in flow_pattern.finditer(tag): - num = fromBase32(m.group(1)) - if num > 0 and num < len(self.k8proc.flowinfo): - [typ, fmt, pdir, 
fnm] = self.k8proc.getFlowInfo(num) - flowpart = flows[num] - if fmt == b'inline': - tag = flowpart - else: - replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"' - tag = flow_pattern.sub(replacement, tag, 1) - self.used[fnm] = 'used' - else: - print("warning: ignoring non-existent flow link", tag, " value 0x%x" % num) - srcpieces[j] = tag - part = b''.join(srcpieces) - - # store away modified version - parts[i] = part - - # Handle any embedded raster images links in style= attributes urls - style_pattern = re.compile(br'''(<[a-zA-Z0-9]+\s[^>]*style\s*=\s*[^>]*>)''', re.IGNORECASE) - img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE) - - for i in range(len(parts)): - part = parts[i] - [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i] - - # replace urls in style attributes - srcpieces = style_pattern.split(part) - for j in range(1, len(srcpieces),2): - tag = srcpieces[j] - if b'kindle:embed' in tag: - for m in img_index_pattern.finditer(tag): - imageNumber = fromBase32(m.group(1)) - imageName = self.rscnames[imageNumber-1] - osep = m.group()[0:1] - csep = m.group()[-1:] - if imageName is not None: - replacement = osep + b'../Images/'+ utf8_str(imageName) + csep - self.used[imageName] = 'used' - tag = img_index_pattern.sub(replacement, tag, 1) - else: - print("Error: Referenced image %s in style url was not recognized in %s" % (imageNumber, tag)) - srcpieces[j] = tag - part = b"".join(srcpieces) - - # store away modified version - parts[i] = part - - # Handle any embedded raster images links in the xhtml text - # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images) - img_pattern = re.compile(br'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE) - img_index_pattern = re.compile(br'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''') - - for i in range(len(parts)): - part = parts[i] - [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i] - - # links to raster image files - # image_pattern - srcpieces = img_pattern.split(part) - for j in range(1, len(srcpieces),2): - tag = srcpieces[j] - if tag.startswith(b' remove value="XX" attributes since these are illegal - tag_pattern = re.compile(br'''(<[^>]*>)''') - li_value_pattern = re.compile(br'''\svalue\s*=\s*['"][^'"]*['"]''', re.IGNORECASE) - - for i in range(len(parts)): - part = parts[i] - [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i] - - # tag pattern - srcpieces = tag_pattern.split(part) - for j in range(1, len(srcpieces),2): - tag = srcpieces[j] - if tag.startswith(b'H', data, idxtPos + 4 + (2 * j)) - idxPositions.append(pos) - # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!) 
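# Aside: a standalone sketch of the IDXT slicing idea used above. IDXT stores
# big-endian 16-bit entry offsets after its 4-byte tag, and consecutive
# offsets (with the IDXT position itself as the final sentinel) delimit the
# index entries. The sample blob below is invented for demonstration.
import struct

def slice_entries(data: bytes, idxt_pos: int, entry_count: int) -> list:
    positions = []
    for j in range(entry_count):
        pos, = struct.unpack_from(b'>H', data, idxt_pos + 4 + 2 * j)
        positions.append(pos)
    positions.append(idxt_pos)  # last entry ends where the IDXT section begins
    return [data[positions[j]:positions[j + 1]] for j in range(entry_count)]

blob = b'\x00' * 4 + b'AB' + b'IDXT' + struct.pack(b'>H', 4)
assert slice_entries(blob, 6, 1) == [b'AB']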
- idxPositions.append(idxtPos) - # for each entry in the IDXT build up the tagMap and any associated text - for j in range(entryCount): - startPos = idxPositions[j] - endPos = idxPositions[j+1] - textLength = ord(data[startPos:startPos+1]) - text = data[startPos+1:startPos+1+textLength] - if hordt2 is not None: - text = b''.join(bchr(hordt2[bord(x)]) for x in text) - tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos) - outtbl.append([text, tagMap]) - if self.DEBUG: - print(tagMap) - print(text) - return outtbl, ctoc_text - - def parseINDXHeader(self, data): - "read INDX header" - if not data[:4] == b'INDX': - print("Warning: index section is not INDX") - return False - words = ( - 'len', 'nul1', 'type', 'gen', 'start', 'count', 'code', - 'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc' - ) - num = len(words) - values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)]) - header = {} - for n in range(num): - header[words[n]] = values[n] - - ordt1 = None - ordt2 = None - - ocnt, oentries, op1, op2, otagx = struct.unpack_from(b'>LLLLL',data, 0xa4) - if header['code'] == 0xfdea or ocnt != 0 or oentries > 0: - # horribly hacked up ESP (sample) mobi books use two ORDT sections but never specify - # them in the proper place in the header. They seem to be codepage 65002 which seems - # to be some sort of strange EBCDIC utf-8 or 16 encoded strings - - # so we need to look for them and store them away to process leading text - # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries - # we only ever seem to use the seocnd but ... - assert(ocnt == 1) - assert(data[op1:op1+4] == b'ORDT') - assert(data[op2:op2+4] == b'ORDT') - ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4) - ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4) - - if self.DEBUG: - print("parsed INDX header:") - for n in words: - print(n, "%X" % header[n],) - print("") - return header, ordt1, ordt2 - - def readCTOC(self, txtdata): - # read all blocks from CTOC - ctoc_data = {} - offset = 0 - while offset next bytes: name - name = txtdata[offset:offset+ilen] - offset += ilen - if self.DEBUG: - print("name length is ", ilen) - print(idx_offs, name) - ctoc_data[idx_offs] = name - return ctoc_data - - -def getVariableWidthValue(data, offset): - ''' - Decode variable width value from given bytes. - - @param data: The bytes to decode. - @param offset: The start offset into data. - @return: Tuple of consumed bytes count and decoded value. - ''' - value = 0 - consumed = 0 - finished = False - while not finished: - v = data[offset + consumed: offset + consumed + 1] - consumed += 1 - if ord(v) & 0x80: - finished = True - value = (value << 7) | (ord(v) & 0x7f) - return consumed, value - - -def readTagSection(start, data): - ''' - Read tag section from given data. - - @param start: The start position in the data. - @param data: The data to process. - @return: Tuple of control byte count and list of tag tuples. - ''' - controlByteCount = 0 - tags = [] - if data[start:start+4] == b"TAGX": - firstEntryOffset, = struct.unpack_from(b'>L', data, start + 0x04) - controlByteCount, = struct.unpack_from(b'>L', data, start + 0x08) - - # Skip the first 12 bytes already read above. - for i in range(12, firstEntryOffset, 4): - pos = start + i - tags.append((ord(data[pos:pos+1]), ord(data[pos+1:pos+2]), ord(data[pos+2:pos+3]), ord(data[pos+3:pos+4]))) - return controlByteCount, tags - - -def countSetBits(value, bits=8): - ''' - Count the set bits in the given value. 
- - @param value: Integer value. - @param bits: The number of bits of the input value (defaults to 8). - @return: Number of set bits. - ''' - count = 0 - for _ in range(bits): - if value & 0x01 == 0x01: - count += 1 - value = value >> 1 - return count - - -def getTagMap(controlByteCount, tagTable, entryData, startPos, endPos): - ''' - Create a map of tags and values from the given byte section. - - @param controlByteCount: The number of control bytes. - @param tagTable: The tag table. - @param entryData: The data to process. - @param startPos: The starting position in entryData. - @param endPos: The end position in entryData or None if it is unknown. - @return: Hashmap of tag and list of values. - ''' - tags = [] - tagHashMap = {} - controlByteIndex = 0 - dataStart = startPos + controlByteCount - - for tag, valuesPerEntry, mask, endFlag in tagTable: - if endFlag == 0x01: - controlByteIndex += 1 - continue - cbyte = ord(entryData[startPos + controlByteIndex:startPos + controlByteIndex+1]) - if 0: - print("Control Byte Index %0x , Control Byte Value %0x" % (controlByteIndex, cbyte)) - - value = ord(entryData[startPos + controlByteIndex:startPos + controlByteIndex+1]) & mask - if value != 0: - if value == mask: - if countSetBits(mask) > 1: - # If all bits of masked value are set and the mask has more than one bit, a variable width value - # will follow after the control bytes which defines the length of bytes (NOT the value count!) - # which will contain the corresponding variable width values. - consumed, value = getVariableWidthValue(entryData, dataStart) - dataStart += consumed - tags.append((tag, None, value, valuesPerEntry)) - else: - tags.append((tag, 1, None, valuesPerEntry)) - else: - # Shift bits to get the masked value. - while mask & 0x01 == 0: - mask = mask >> 1 - value = value >> 1 - tags.append((tag, value, None, valuesPerEntry)) - for tag, valueCount, valueBytes, valuesPerEntry in tags: - values = [] - if valueCount is not None: - # Read valueCount * valuesPerEntry variable width values. - for _ in range(valueCount): - for _ in range(valuesPerEntry): - consumed, data = getVariableWidthValue(entryData, dataStart) - dataStart += consumed - values.append(data) - else: - # Convert valueBytes to variable width values. - totalConsumed = 0 - while totalConsumed < valueBytes: - # Does this work for valuesPerEntry != 1? - consumed, data = getVariableWidthValue(entryData, dataStart) - dataStart += consumed - totalConsumed += consumed - values.append(data) - if totalConsumed != valueBytes: - print("Error: Should consume %s bytes, but consumed %s" % (valueBytes, totalConsumed)) - tagHashMap[tag] = values - # Test that all bytes have been processed if endPos is given. - if endPos is not None and dataStart != endPos: - # The last entry might have some zero padding bytes, so complain only if non zero bytes are left. 
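# Aside: the variable-width integers consumed above (via getVariableWidthValue)
# pack 7 value bits per byte; the byte whose high bit (0x80) is set terminates
# the value. A minimal re-implementation of that decoding rule:
def decode_varwidth(data: bytes, offset: int = 0):
    value = 0
    consumed = 0
    while True:
        b = data[offset + consumed]
        consumed += 1
        value = (value << 7) | (b & 0x7F)
        if b & 0x80:  # high bit set -> final byte of this value
            return consumed, value

assert decode_varwidth(bytes([0x01, 0x81])) == (2, 0x81)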
- for char in entryData[dataStart:endPos]: - if bord(char) != 0: - print("Warning: There are unprocessed index bytes left: %s" % toHex(entryData[dataStart:endPos])) - if 0: - print("controlByteCount: %s" % controlByteCount) - print("tagTable: %s" % tagTable) - print("data: %s" % toHex(entryData[startPos:endPos])) - print("tagHashMap: %s" % tagHashMap) - break - - return tagHashMap diff --git a/epy_extras/KindleUnpack/mobi_k8proc.py b/epy_extras/KindleUnpack/mobi_k8proc.py deleted file mode 100644 index 5b8274e..0000000 --- a/epy_extras/KindleUnpack/mobi_k8proc.py +++ /dev/null @@ -1,496 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab - -from __future__ import unicode_literals, division, absolute_import, print_function - -from .compatibility_utils import PY2, bstr, utf8_str - -if PY2: - range = xrange - -import os - -import struct -# note: struct pack, unpack, unpack_from all require bytestring format -# data all the way up to at least python 2.7.5, python 3 okay with bytestring - -import re -# note: re requites the pattern to be the exact same type as the data to be searched in python3 -# but u"" is not allowed for the pattern itself only b"" - -from .mobi_index import MobiIndex -from .mobi_utils import fromBase32 -from .unipath import pathof - -_guide_types = [b'cover',b'title-page',b'toc',b'index',b'glossary',b'acknowledgements', - b'bibliography',b'colophon',b'copyright-page',b'dedication', - b'epigraph',b'foreward',b'loi',b'lot',b'notes',b'preface',b'text'] - -# locate beginning and ending positions of tag with specific aid attribute -def locate_beg_end_of_tag(ml, aid): - pattern = utf8_str(r'''<[^>]*\said\s*=\s*['"]%s['"][^>]*>''' % aid) - aid_pattern = re.compile(pattern,re.IGNORECASE) - for m in re.finditer(aid_pattern, ml): - plt = m.start() - pgt = ml.find(b'>',plt+1) - return plt, pgt - return 0, 0 - - -# iterate over all tags in block in reverse order, i.e. 
last ta to first tag -def reverse_tag_iter(block): - end = len(block) - while True: - pgt = block.rfind(b'>', 0, end) - if pgt == -1: - break - plt = block.rfind(b'<', 0, pgt) - if plt == -1: - break - yield block[plt:pgt+1] - end = plt - - -class K8Processor: - - def __init__(self, mh, sect, files, debug=False): - self.sect = sect - self.files = files - self.mi = MobiIndex(sect) - self.mh = mh - self.skelidx = mh.skelidx - self.fragidx = mh.fragidx - self.guideidx = mh.guideidx - self.fdst = mh.fdst - self.flowmap = {} - self.flows = None - self.flowinfo = [] - self.parts = None - self.partinfo = [] - self.linked_aids = set() - self.fdsttbl= [0,0xffffffff] - self.DEBUG = debug - - # read in and parse the FDST info which is very similar in format to the Palm DB section - # parsing except it provides offsets into rawML file and not the Palm DB file - # this is needed to split up the final css, svg, etc flow section - # that can exist at the end of the rawML file - if self.fdst != 0xffffffff: - header = self.sect.loadSection(self.fdst) - if header[0:4] == b"FDST": - num_sections, = struct.unpack_from(b'>L', header, 0x08) - self.fdsttbl = struct.unpack_from(bstr('>%dL' % (num_sections*2)), header, 12)[::2] + (mh.rawSize, ) - sect.setsectiondescription(self.fdst,"KF8 FDST INDX") - if self.DEBUG: - print("\nFDST Section Map: %d sections" % num_sections) - for j in range(num_sections): - print("Section %d: 0x%08X - 0x%08X" % (j, self.fdsttbl[j],self.fdsttbl[j+1])) - else: - print("\nError: K8 Mobi with Missing FDST info") - - # read/process skeleton index info to create the skeleton table - skeltbl = [] - if self.skelidx != 0xffffffff: - # for i in range(2): - # fname = 'skel%04d.dat' % i - # data = self.sect.loadSection(self.skelidx + i) - # with open(pathof(fname), 'wb') as f: - # f.write(data) - outtbl, ctoc_text = self.mi.getIndexData(self.skelidx, "KF8 Skeleton") - fileptr = 0 - for [text, tagMap] in outtbl: - # file number, skeleton name, fragtbl record count, start position, length - skeltbl.append([fileptr, text, tagMap[1][0], tagMap[6][0], tagMap[6][1]]) - fileptr += 1 - self.skeltbl = skeltbl - if self.DEBUG: - print("\nSkel Table: %d entries" % len(self.skeltbl)) - print("table: filenum, skeleton name, frag tbl record count, start position, length") - for j in range(len(self.skeltbl)): - print(self.skeltbl[j]) - - # read/process the fragment index to create the fragment table - fragtbl = [] - if self.fragidx != 0xffffffff: - # for i in range(3): - # fname = 'frag%04d.dat' % i - # data = self.sect.loadSection(self.fragidx + i) - # with open(pathof(fname), 'wb') as f: - # f.write(data) - outtbl, ctoc_text = self.mi.getIndexData(self.fragidx, "KF8 Fragment") - for [text, tagMap] in outtbl: - # insert position, ctoc offset (aidtext), file number, sequence number, start position, length - ctocoffset = tagMap[2][0] - ctocdata = ctoc_text[ctocoffset] - fragtbl.append([int(text), ctocdata, tagMap[3][0], tagMap[4][0], tagMap[6][0], tagMap[6][1]]) - self.fragtbl = fragtbl - if self.DEBUG: - print("\nFragment Table: %d entries" % len(self.fragtbl)) - print("table: file position, link id text, file num, sequence number, start position, length") - for j in range(len(self.fragtbl)): - print(self.fragtbl[j]) - - # read / process guide index for guide elements of opf - guidetbl = [] - if self.guideidx != 0xffffffff: - # for i in range(3): - # fname = 'guide%04d.dat' % i - # data = self.sect.loadSection(self.guideidx + i) - # with open(pathof(fname), 'wb') as f: - # f.write(data) - outtbl, ctoc_text = 
self.mi.getIndexData(self.guideidx, "KF8 Guide elements)") - for [text, tagMap] in outtbl: - # ref_type, ref_title, frag number - ctocoffset = tagMap[1][0] - ref_title = ctoc_text[ctocoffset] - ref_type = text - fileno = None - if 3 in tagMap: - fileno = tagMap[3][0] - if 6 in tagMap: - fileno = tagMap[6][0] - guidetbl.append([ref_type, ref_title, fileno]) - self.guidetbl = guidetbl - if self.DEBUG: - print("\nGuide Table: %d entries" % len(self.guidetbl)) - print("table: ref_type, ref_title, fragtbl entry number") - for j in range(len(self.guidetbl)): - print(self.guidetbl[j]) - - def buildParts(self, rawML): - # now split the rawML into its flow pieces - self.flows = [] - for j in range(0, len(self.fdsttbl)-1): - start = self.fdsttbl[j] - end = self.fdsttbl[j+1] - self.flows.append(rawML[start:end]) - - # the first piece represents the xhtml text - text = self.flows[0] - self.flows[0] = b'' - - # walk the and fragment tables to build original source xhtml files - # *without* destroying any file position information needed for later href processing - # and create final list of file separation start: stop points and etc in partinfo - if self.DEBUG: - print("\nRebuilding flow piece 0: the main body of the ebook") - self.parts = [] - self.partinfo = [] - fragptr = 0 - baseptr = 0 - cnt = 0 - filename = 'part%04d.xhtml' % cnt - for [skelnum, skelname, fragcnt, skelpos, skellen] in self.skeltbl: - baseptr = skelpos + skellen - skeleton = text[skelpos: baseptr] - aidtext = "0" - for i in range(fragcnt): - [insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[fragptr] - aidtext = idtext[12:-2] - if i == 0: - filename = 'part%04d.xhtml' % filenum - slice = text[baseptr: baseptr + length] - insertpos = insertpos - skelpos - head = skeleton[:insertpos] - tail = skeleton[insertpos:] - actual_inspos = insertpos - if (tail.find(b'>') < tail.find(b'<') or head.rfind(b'>') < head.rfind(b'<')): - # There is an incomplete tag in either the head or tail. - # This can happen for some badly formed KF8 files - print('The fragment table for %s has incorrect insert position. Calculating manually.' % skelname) - bp, ep = locate_beg_end_of_tag(skeleton, aidtext) - if bp != ep: - actual_inspos = ep + 1 + startpos - if insertpos != actual_inspos: - print("fixed corrupt fragment table insert position", insertpos+skelpos, actual_inspos+skelpos) - insertpos = actual_inspos - self.fragtbl[fragptr][0] = actual_inspos + skelpos - skeleton = skeleton[0:insertpos] + slice + skeleton[insertpos:] - baseptr = baseptr + length - fragptr += 1 - cnt += 1 - self.parts.append(skeleton) - self.partinfo.append([skelnum, 'Text', filename, skelpos, baseptr, aidtext]) - - assembled_text = b''.join(self.parts) - if self.DEBUG: - outassembled = os.path.join(self.files.k8dir, 'assembled_text.dat') - with open(pathof(outassembled),'wb') as f: - f.write(assembled_text) - - # The primary css style sheet is typically stored next followed by any - # snippets of code that were previously inlined in the - # original xhtml but have been stripped out and placed here. - # This can include local CDATA snippets and and svg sections. 
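# Aside: a toy model of the skeleton/fragment reassembly that buildParts
# performs above; each fragment is spliced into the growing skeleton at its
# recorded insert position. The data below is invented for illustration.
def assemble(skeleton: bytes, fragments) -> bytes:
    # fragments: iterable of (insertpos, payload), with positions measured
    # against the skeleton including any previously spliced-in payloads
    for insertpos, payload in fragments:
        skeleton = skeleton[:insertpos] + payload + skeleton[insertpos:]
    return skeleton

assert assemble(b'<body></body>', [(6, b'<p>hi</p>')]) == b'<body><p>hi</p></body>'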
- - # The problem is that for most browsers and ereaders, you can not - # use to import any svg image that itself - # properly uses an tag to import some raster image - it - # should work according to the spec but does not for almost all browsers - # and ereaders and causes epub validation issues because those raster - # images are in manifest but not in xhtml text - since they only - # referenced from an svg image - - # So we need to check the remaining flow pieces to see if they are css - # or svg images. if svg images, we must check if they have an - # and if so inline them into the xhtml text pieces. - - # there may be other sorts of pieces stored here but until we see one - # in the wild to reverse engineer we won't be able to tell - self.flowinfo.append([None, None, None, None]) - svg_tag_pattern = re.compile(br'''(]*>)''', re.IGNORECASE) - image_tag_pattern = re.compile(br'''(]*>)''', re.IGNORECASE) - for j in range(1,len(self.flows)): - flowpart = self.flows[j] - nstr = '%04d' % j - m = re.search(svg_tag_pattern, flowpart) - if m is not None: - # svg - ptype = b'svg' - start = m.start() - m2 = re.search(image_tag_pattern, flowpart) - if m2 is not None: - pformat = b'inline' - pdir = None - fname = None - # strip off anything before = 0: - ptype = b'css' - flowpart = b'\n' - pformat = b'inline' - pdir = None - fname = None - else: - # css - assume as standalone css file - ptype = b'css' - pformat = b'file' - pdir = "Styles" - fname = 'style' + nstr + '.css' - - self.flows[j] = flowpart - self.flowinfo.append([ptype, pformat, pdir, fname]) - - if self.DEBUG: - print("\nFlow Map: %d entries" % len(self.flowinfo)) - for fi in self.flowinfo: - print(fi) - print("\n") - - print("\nXHTML File Part Position Information: %d entries" % len(self.partinfo)) - for pi in self.partinfo: - print(pi) - - if False: # self.Debug: - # dump all of the locations of the aid tags used in TEXT - # find id links only inside of tags - # inside any < > pair find all "aid=' and return whatever is inside the quotes - # [^>]* means match any amount of chars except for '>' char - # [^'"] match any amount of chars except for the quote character - # \s* means match any amount of whitespace - print("\npositions of all aid= pieces") - id_pattern = re.compile(br'''<[^>]*\said\s*=\s*['"]([^'"]*)['"][^>]*>''',re.IGNORECASE) - for m in re.finditer(id_pattern, rawML): - [filename, partnum, start, end] = self.getFileInfo(m.start()) - [seqnum, idtext] = self.getFragTblInfo(m.start()) - value = fromBase32(m.group(1)) - print(" aid: %s value: %d at: %d -> part: %d, start: %d, end: %d" % (m.group(1), value, m.start(), partnum, start, end)) - print(" %s fragtbl entry %d" % (idtext, seqnum)) - - return - - # get information fragment table entry by pos - def getFragTblInfo(self, pos): - for j in range(len(self.fragtbl)): - [insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[j] - if pos >= insertpos and pos < (insertpos + length): - # why are these "in: and before: added here - return seqnum, b'in: ' + idtext - if pos < insertpos: - return seqnum, b'before: ' + idtext - return None, None - - # get information about the part (file) that exists at pos in original rawML - def getFileInfo(self, pos): - for [partnum, pdir, filename, start, end, aidtext] in self.partinfo: - if pos >= start and pos < end: - return filename, partnum, start, end - return None, None, None, None - - # accessor functions to properly protect the internal structure - def getNumberOfParts(self): - return len(self.parts) - - def getPart(self,i): 
- if i >= 0 and i < len(self.parts): - return self.parts[i] - return None - - def getPartInfo(self, i): - if i >= 0 and i < len(self.partinfo): - return self.partinfo[i] - return None - - def getNumberOfFlows(self): - return len(self.flows) - - def getFlow(self,i): - # note flows[0] is empty - it was all of the original text - if i > 0 and i < len(self.flows): - return self.flows[i] - return None - - def getFlowInfo(self,i): - # note flowinfo[0] is empty - it was all of the original text - if i > 0 and i < len(self.flowinfo): - return self.flowinfo[i] - return None - - def getIDTagByPosFid(self, posfid, offset): - # first convert kindle:pos:fid and offset info to position in file - # (fromBase32 can handle both string types on input) - row = fromBase32(posfid) - off = fromBase32(offset) - [insertpos, idtext, filenum, seqnm, startpos, length] = self.fragtbl[row] - pos = insertpos + off - fname, pn, skelpos, skelend = self.getFileInfo(pos) - if fname is None: - # pos does not exist - # default to skeleton pos instead - print("Link To Position", pos, "does not exist, retargeting to top of target") - pos = self.skeltbl[filenum][3] - fname, pn, skelpos, skelend = self.getFileInfo(pos) - # an existing "id=" or "name=" attribute must exist in original xhtml otherwise it would not have worked for linking. - # Amazon seems to have added its own additional "aid=" inside tags whose contents seem to represent - # some position information encoded into Base32 name. - # so find the closest "id=" before position the file by actually searching in that file - idtext = self.getIDTag(pos) - return fname, idtext - - def getIDTag(self, pos): - # find the first tag with a named anchor (name or id attribute) before pos - fname, pn, skelpos, skelend = self.getFileInfo(pos) - if pn is None and skelpos is None: - print("Error: getIDTag - no file contains ", pos) - textblock = self.parts[pn] - npos = pos - skelpos - # if npos inside a tag then search all text before the its end of tag marker - pgt = textblock.find(b'>',npos) - plt = textblock.find(b'<',npos) - if plt == npos or pgt < plt: - npos = pgt + 1 - # find id and name attributes only inside of tags - # use a reverse tag search since that is faster - # inside any < > pair find "id=" and "name=" attributes return it - # [^>]* means match any amount of chars except for '>' char - # [^'"] match any amount of chars except for the quote character - # \s* means match any amount of whitespace - textblock = textblock[0:npos] - id_pattern = re.compile(br'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE) - name_pattern = re.compile(br'''<[^>]*\sname\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE) - aid_pattern = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''') - for tag in reverse_tag_iter(textblock): - # any ids in the body should default to top of file - if tag[0:6] == b'= start and pos < end: - return [partnum, pdir, filename, start, end, aidtext] - return [None, None, None, None, None, None] - - # fileno is actually a reference into fragtbl (a fragment) - def getGuideText(self): - guidetext = b'' - for [ref_type, ref_title, fileno] in self.guidetbl: - if ref_type == b'thumbimagestandard': - continue - if ref_type not in _guide_types and not ref_type.startswith(b'other.'): - if ref_type == b'start': - ref_type = b'text' - else: - ref_type = b'other.' 
+ ref_type - [pos, idtext, filenum, seqnm, startpos, length] = self.fragtbl[fileno] - [pn, pdir, filename, skelpos, skelend, aidtext] = self.getSkelInfo(pos) - idtext = self.getIDTag(pos) - linktgt = filename.encode('utf-8') - if idtext != b'': - linktgt += b'#' + idtext - guidetext += b'\n' - # opf is encoded utf-8 so must convert any titles properly - guidetext = (guidetext.decode(self.mh.codec)).encode("utf-8") - return guidetext - - def getPageIDTag(self, pos): - # find the first tag with a named anchor (name or id attribute) before pos - # but page map offsets need to little more leeway so if the offset points - # into a tag look for the next ending tag "/>" or "',npos) - plt = textblock.find(b'<',npos) - if plt == npos or pgt < plt: - # we are in a tag - # so find first ending tag - pend1 = textblock.find(b'/>', npos) - pend2 = textblock.find(b' pair find "id=" and "name=" attributes return it - # [^>]* means match any amount of chars except for '>' char - # [^'"] match any amount of chars except for the quote character - # \s* means match any amount of whitespace - textblock = textblock[0:npos] - id_pattern = re.compile(br'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE) - name_pattern = re.compile(br'''<[^>]*\sname\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE) - for tag in reverse_tag_iter(textblock): - # any ids in the body should default to top of file - if tag[0:6] == b'= python 2.7. -""" set to True to use OrderedDict for K8RESCProcessor.parsetag.tattr.""" - -if DEBUG_USE_ORDERED_DICTIONARY: - from collections import OrderedDict as dict_ -else: - dict_ = dict - -from .compatibility_utils import unicode_str - -from .mobi_utils import fromBase32 - -_OPF_PARENT_TAGS = ['xml', 'package', 'metadata', 'dc-metadata', - 'x-metadata', 'manifest', 'spine', 'tours', 'guide'] - -class K8RESCProcessor(object): - - def __init__(self, data, debug=False): - self._debug = debug - self.resc = None - self.opos = 0 - self.extrameta = [] - self.cover_name = None - self.spine_idrefs = {} - self.spine_order = [] - self.spine_pageattributes = {} - self.spine_ppd = None - # need3 indicate the book has fields which require epub3. - # but the estimation of the source epub version from the fields is difficult. - self.need3 = False - self.package_ver = None - self.extra_metadata = [] - self.refines_metadata = [] - self.extra_attributes = [] - # get header - start_pos = data.find(b'<') - self.resc_header = data[:start_pos] - # get resc data length - start = self.resc_header.find(b'=') + 1 - end = self.resc_header.find(b'&', start) - resc_size = 0 - if end > 0: - resc_size = fromBase32(self.resc_header[start:end]) - resc_rawbytes = len(data) - start_pos - if resc_rawbytes == resc_size: - self.resc_length = resc_size - else: - # Most RESC has a nul string at its tail but some do not. 
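# Aside: the RESC length above, like the kindle:pos:fid values handled
# earlier, uses Kindle's base-32 alphabet (digits 0-9 then A-V). A
# simplified, bytes-only re-implementation of the fromBase32 helper
# imported from mobi_utils:
def from_base32(s: bytes) -> int:
    value = 0
    for ch in s.upper():
        value = value * 32 + (ch - 0x30 if ch <= 0x39 else ch - 0x41 + 10)
    return value

assert from_base32(b'V') == 31
assert from_base32(b'10') == 32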
- end_pos = data.find(b'\x00', start_pos) - if end_pos < 0: - self.resc_length = resc_rawbytes - else: - self.resc_length = end_pos - start_pos - if self.resc_length != resc_size: - print("Warning: RESC section length({:d}bytes) does not match its size({:d}bytes).".format(self.resc_length, resc_size)) - # now parse RESC after converting it to unicode from utf-8 - try: - self.resc = unicode_str(data[start_pos:start_pos+self.resc_length]) - except UnicodeDecodeError: - self.resc = unicode_str(data[start_pos:start_pos+self.resc_length], enc='latin-1') - self.parseData() - - def prepend_to_spine(self, key, idref, linear, properties): - self.spine_order = [key] + self.spine_order - self.spine_idrefs[key] = idref - attributes = {} - if linear is not None: - attributes['linear'] = linear - if properties is not None: - attributes['properties'] = properties - self.spine_pageattributes[key] = attributes - - # RESC tag iterator - def resc_tag_iter(self): - tcontent = last_tattr = None - prefix = [''] - while True: - text, tag = self.parseresc() - if text is None and tag is None: - break - if text is not None: - tcontent = text.rstrip(' \r\n') - else: # we have a tag - ttype, tname, tattr = self.parsetag(tag) - if ttype == 'begin': - tcontent = None - prefix.append(tname + '.') - if tname in _OPF_PARENT_TAGS: - yield ''.join(prefix), tname, tattr, tcontent - else: - last_tattr = tattr - else: # single or end - if ttype == 'end': - prefix.pop() - tattr = last_tattr - last_tattr = None - if tname in _OPF_PARENT_TAGS: - tname += '-end' - yield ''.join(prefix), tname, tattr, tcontent - tcontent = None - - # now parse the RESC to extract spine and extra metadata info - def parseData(self): - for prefix, tname, tattr, tcontent in self.resc_tag_iter(): - if self._debug: - print(" Parsing RESC: ", prefix, tname, tattr, tcontent) - if tname == 'package': - self.package_ver = tattr.get('version', '2.0') - package_prefix = tattr.get('prefix','') - if self.package_ver.startswith('3') or package_prefix.startswith('rendition'): - self.need3 = True - if tname == 'spine': - self.spine_ppd = tattr.get('page-progession-direction', None) - if self.spine_ppd is not None and self.spine_ppd == 'rtl': - self.need3 = True - if tname == 'itemref': - skelid = tattr.pop('skelid', None) - if skelid is None and len(self.spine_order) == 0: - # assume it was removed initial coverpage - skelid = 'coverpage' - tattr['linear'] = 'no' - self.spine_order.append(skelid) - idref = tattr.pop('idref', None) - if idref is not None: - idref = 'x_' + idref - self.spine_idrefs[skelid] = idref - if 'id' in tattr: - del tattr['id'] - # tattr["id"] = 'x_' + tattr["id"] - if 'properties' in tattr: - self.need3 = True - self.spine_pageattributes[skelid] = tattr - if tname == 'meta' or tname.startswith('dc:'): - if 'refines' in tattr or 'property' in tattr: - self.need3 = True - if tattr.get('name','') == 'cover': - cover_name = tattr.get('content',None) - if cover_name is not None: - cover_name = 'x_' + cover_name - self.cover_name = cover_name - else: - self.extrameta.append([tname, tattr, tcontent]) - - # parse and return either leading text or the next tag - def parseresc(self): - p = self.opos - if p >= len(self.resc): - return None, None - if self.resc[p] != '<': - res = self.resc.find('<',p) - if res == -1 : - res = len(self.resc) - self.opos = res - return self.resc[p:res], None - # handle comment as a special case - if self.resc[p:p+4] == '',p+1) - if te != -1: - te = te+2 - else: - te = self.resc.find('>',p+1) - ntb = 
self.resc.find('<',p+1) - if ntb != -1 and ntb < te: - self.opos = ntb - return self.resc[p:ntb], None - self.opos = te + 1 - return None, self.resc[p:te+1] - - # parses tag to identify: [tname, ttype, tattr] - # tname: tag name - # ttype: tag type ('begin', 'end' or 'single'); - # tattr: dictionary of tag atributes - def parsetag(self, s): - p = 1 - tname = None - ttype = None - tattr = dict_() - while s[p:p+1] == ' ' : - p += 1 - if s[p:p+1] == '/': - ttype = 'end' - p += 1 - while s[p:p+1] == ' ' : - p += 1 - b = p - while s[p:p+1] not in ('>', '/', ' ', '"', "'",'\r','\n') : - p += 1 - tname=s[b:p].lower() - # some special cases - if tname == '?xml': - tname = 'xml' - if tname == '!--': - ttype = 'single' - comment = s[p:-3].strip() - tattr['comment'] = comment - if ttype is None: - # parse any attributes of begin or single tags - while s.find('=',p) != -1 : - while s[p:p+1] == ' ' : - p += 1 - b = p - while s[p:p+1] != '=' : - p += 1 - aname = s[b:p].lower() - aname = aname.rstrip(' ') - p += 1 - while s[p:p+1] == ' ' : - p += 1 - if s[p:p+1] in ('"', "'") : - p = p + 1 - b = p - while s[p:p+1] not in ('"', "'"): - p += 1 - val = s[b:p] - p += 1 - else : - b = p - while s[p:p+1] not in ('>', '/', ' ') : - p += 1 - val = s[b:p] - tattr[aname] = val - if ttype is None: - ttype = 'begin' - if s.find('/',p) >= 0: - ttype = 'single' - return ttype, tname, tattr - - def taginfo_toxml(self, taginfo): - res = [] - tname, tattr, tcontent = taginfo - res.append('<' + tname) - if tattr is not None: - for key in tattr: - res.append(' ' + key + '="'+tattr[key]+'"') - if tcontent is not None: - res.append('>' + tcontent + '\n') - else: - res.append('/>\n') - return "".join(res) - - def hasSpine(self): - return len(self.spine_order) > 0 - - def needEPUB3(self): - return self.need3 - - def hasRefines(self): - for [tname, tattr, tcontent] in self.extrameta: - if 'refines' in tattr: - return True - return False - - def createMetadata(self, epubver): - for taginfo in self.extrameta: - tname, tattr, tcontent = taginfo - if 'refines' in tattr: - if epubver == 'F' and 'property' in tattr: - attr = ' id="%s" opf:%s="%s"\n' % (tattr['refines'], tattr['property'], tcontent) - self.extra_attributes.append(attr) - else: - tag = self.taginfo_toxml(taginfo) - self.refines_metadata.append(tag) - else: - tag = self.taginfo_toxml(taginfo) - self.extra_metadata.append(tag) diff --git a/epy_extras/KindleUnpack/mobi_nav.py b/epy_extras/KindleUnpack/mobi_nav.py deleted file mode 100644 index 16fb0be..0000000 --- a/epy_extras/KindleUnpack/mobi_nav.py +++ /dev/null @@ -1,187 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab - -from __future__ import unicode_literals, division, absolute_import, print_function - -from .compatibility_utils import unicode_str -import os -from .unipath import pathof - -import re -# note: re requites the pattern to be the exact same type as the data to be searched in python3 -# but u"" is not allowed for the pattern itself only b"" - -DEBUG_NAV = False - -FORCE_DEFAULT_TITLE = False -""" Set to True to force to use the default title. """ - -NAVIGATION_FINENAME = 'nav.xhtml' -""" The name for the navigation document. """ - -DEFAULT_TITLE = 'Navigation' -""" The default title for the navigation document. 
""" - -class NAVProcessor(object): - - def __init__(self, files): - self.files = files - self.navname = NAVIGATION_FINENAME - - def buildLandmarks(self, guidetext): - header = '' - header += ' \n' - - type_map = { - 'cover' : 'cover', - 'title-page' : 'title-page', - # ?: 'frontmatter', - 'text' : 'bodymatter', - # ?: 'backmatter', - 'toc' : 'toc', - 'loi' : 'loi', - 'lot' : 'lot', - 'preface' : 'preface', - 'bibliography' : 'bibliography', - 'index' : 'index', - 'glossary' : 'glossary', - 'acknowledgements' : 'acknowledgements', - 'colophon' : None, - 'copyright-page' : None, - 'dedication' : None, - 'epigraph' : None, - 'foreword' : None, - 'notes' : None - } - - re_type = re.compile(r'\s+type\s*=\s*"(.*?)"', re.I) - re_title = re.compile(r'\s+title\s*=\s*"(.*?)"', re.I) - re_link = re.compile(r'\s+href\s*=\s*"(.*?)"', re.I) - dir_ = os.path.relpath(self.files.k8text, self.files.k8oebps).replace('\\', '/') - - data = '' - references = re.findall(r'', unicode_str(guidetext), re.I) - for reference in references: - mo_type = re_type.search(reference) - mo_title = re_title.search(reference) - mo_link = re_link.search(reference) - if mo_type is not None: - type_ = type_map.get(mo_type.group(1), None) - else: - type_ = None - if mo_title is not None: - title = mo_title.group(1) - else: - title = None - if mo_link is not None: - link = mo_link.group(1) - else: - link = None - - if type_ is not None and title is not None and link is not None: - link = os.path.relpath(link, dir_).replace('\\', '/') - data += element.format(type_, link, title) - if len(data) > 0: - return header + data + footer - else: - return '' - - def buildTOC(self, indx_data): - header = '' - header += ' \n' - - # recursive part - def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1): - if start>len(indx_data) or end>len(indx_data): - print("Warning (in buildTOC): missing INDX child entries", start, end, len(indx_data)) - return '' - if DEBUG_NAV: - print("recursINDX (in buildTOC) lvl %d from %d to %d" % (lvl, start, end)) - xhtml = '' - if start <= 0: - start = 0 - if end <= 0: - end = len(indx_data) - if lvl > max_lvl: - max_lvl = lvl - - indent1 = ' ' * (2 + lvl * 2) - indent2 = ' ' * (3 + lvl * 2) - xhtml += indent1 + '
<ol>\n' - for i in range(start, end): - e = indx_data[i] - htmlfile = e['filename'] - desttag = e['idtag'] - text = e['text'] - if not e['hlvl'] == lvl: - continue - num += 1 - if desttag == '': - link = htmlfile - else: - link = '{:s}#{:s}'.format(htmlfile, desttag) - xhtml += indent2 + '
<li>' - entry = '<a href="{:s}">{:s}</a>'.format(link, text) - xhtml += entry - # recurs - if e['child1'] >= 0: - xhtml += '\n' - xhtmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1, - e['child1'], e['childn'] + 1) - xhtml += xhtmlrec - xhtml += indent2 - # close entry - xhtml += '
</li>\n' - xhtml += indent1 + '</ol>
\n' - return xhtml, max_lvl, num - - data, max_lvl, num = recursINDX() - if not len(indx_data) == num: - print("Warning (in buildTOC): different number of entries in NCX", len(indx_data), num) - return header + data + footer - - def buildNAV(self, ncx_data, guidetext, title, lang): - print("Building Navigation Document.") - if FORCE_DEFAULT_TITLE: - title = DEFAULT_TITLE - nav_header = '' - nav_header += '\n' - nav_header += ' - - - - - - - - - -%s - - -''' - - ncx_footer = \ -''' - -''' - - ncx_entry = \ -''' - -%s - -''' - - # recursive part - def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1): - if start>len(indx_data) or end>len(indx_data): - print("Warning: missing INDX child entries", start, end, len(indx_data)) - return '' - if DEBUG_NCX: - print("recursINDX lvl %d from %d to %d" % (lvl, start, end)) - xml = '' - if start <= 0: - start = 0 - if end <= 0: - end = len(indx_data) - if lvl > max_lvl: - max_lvl = lvl - indent = ' ' * (2 + lvl) - - for i in range(start, end): - e = indx_data[i] - if not e['hlvl'] == lvl: - continue - # open entry - num += 1 - link = '%s#filepos%d' % (htmlfile, e['pos']) - tagid = 'np_%d' % num - entry = ncx_entry % (tagid, num, xmlescape(unescapeit(e['text'])), link) - entry = re.sub(re.compile('^', re.M), indent, entry, 0) - xml += entry + '\n' - # recurs - if e['child1']>=0: - xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1, - e['child1'], e['childn'] + 1) - xml += xmlrec - # close entry - xml += indent + '\n' - return xml, max_lvl, num - - body, max_lvl, num = recursINDX() - header = ncx_header % (lang, ident, max_lvl + 1, title) - ncx = header + body + ncx_footer - if not len(indx_data) == num: - print("Warning: different number of entries in NCX", len(indx_data), num) - return ncx - - def writeNCX(self, metadata): - # build the xml - self.isNCX = True - print("Write ncx") - # htmlname = os.path.basename(self.files.outbase) - # htmlname += '.html' - htmlname = 'book.html' - xml = self.buildNCX(htmlname, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language')[0]) - # write the ncx file - # ncxname = os.path.join(self.files.mobi7dir, self.files.getInputFileBasename() + '.ncx') - ncxname = os.path.join(self.files.mobi7dir, 'toc.ncx') - with open(pathof(ncxname), 'wb') as f: - f.write(xml.encode('utf-8')) - - def buildK8NCX(self, indx_data, title, ident, lang): - ncx_header = \ -''' - - - - - - - - - -%s - - -''' - - ncx_footer = \ -''' - -''' - - ncx_entry = \ -''' - -%s - -''' - - # recursive part - def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1): - if start>len(indx_data) or end>len(indx_data): - print("Warning: missing INDX child entries", start, end, len(indx_data)) - return '' - if DEBUG_NCX: - print("recursINDX lvl %d from %d to %d" % (lvl, start, end)) - xml = '' - if start <= 0: - start = 0 - if end <= 0: - end = len(indx_data) - if lvl > max_lvl: - max_lvl = lvl - indent = ' ' * (2 + lvl) - - for i in range(start, end): - e = indx_data[i] - htmlfile = e['filename'] - desttag = e['idtag'] - if not e['hlvl'] == lvl: - continue - # open entry - num += 1 - if desttag == '': - link = 'Text/%s' % htmlfile - else: - link = 'Text/%s#%s' % (htmlfile, desttag) - tagid = 'np_%d' % num - entry = ncx_entry % (tagid, num, xmlescape(unescapeit(e['text'])), link) - entry = re.sub(re.compile('^', re.M), indent, entry, 0) - xml += entry + '\n' - # recurs - if e['child1']>=0: - xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1, - e['child1'], e['childn'] + 1) - xml += xmlrec - # close entry - xml += indent + '\n' - 
return xml, max_lvl, num - - body, max_lvl, num = recursINDX() - header = ncx_header % (lang, ident, max_lvl + 1, title) - ncx = header + body + ncx_footer - if not len(indx_data) == num: - print("Warning: different number of entries in NCX", len(indx_data), num) - return ncx - - def writeK8NCX(self, ncx_data, metadata): - # build the xml - self.isNCX = True - print("Write K8 ncx") - xml = self.buildK8NCX(ncx_data, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language')[0]) - bname = 'toc.ncx' - ncxname = os.path.join(self.files.k8oebps,bname) - with open(pathof(ncxname), 'wb') as f: - f.write(xml.encode('utf-8')) diff --git a/epy_extras/KindleUnpack/mobi_opf.py b/epy_extras/KindleUnpack/mobi_opf.py deleted file mode 100644 index 742d776..0000000 --- a/epy_extras/KindleUnpack/mobi_opf.py +++ /dev/null @@ -1,686 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab - -from __future__ import unicode_literals, division, absolute_import, print_function - -from .compatibility_utils import unicode_str, unescapeit -from .compatibility_utils import lzip - -from .unipath import pathof - -from xml.sax.saxutils import escape as xmlescape - -import os -import uuid -from datetime import datetime - -# In EPUB3, NCX and MAY exist in OPF, although the NCX is superseded -# by the Navigation Document and the is deprecated. Currently, EPUB3_WITH_NCX -# and EPUB3_WITH_GUIDE are set to True due to compatibility with epub2 reading systems. -# They might be change to set to False in the future. - -EPUB3_WITH_NCX = True # Do not set to False except for debug. -""" Set to True to create a toc.ncx when converting to epub3. """ - -EPUB3_WITH_GUIDE = True # Do not set to False except for debug. -""" Set to True to create a guide element in an opf when converting to epub3. """ - -EPUB_OPF = 'content.opf' -""" The name for the OPF of EPUB. """ - -TOC_NCX = 'toc.ncx' -""" The name for the TOC of EPUB2. """ - -NAVIGATION_DOCUMENT = 'nav.xhtml' -""" The name for the navigation document of EPUB3. """ - -BEGIN_INFO_ONLY = '' -""" The comment to indicate the end of metadata which will be ignored by kindlegen. """ - -EXTH_TITLE_FURIGANA = 'Title-Pronunciation' -""" The name for Title Furigana(similar to file-as) set by KDP. """ - -EXTH_CREATOR_FURIGANA = 'Author-Pronunciation' -""" The name for Creator Furigana(similar to file-as) set by KDP. """ - -EXTH_PUBLISHER_FURIGANA = 'Publisher-Pronunciation' -""" The name for Publisher Furigana(similar to file-as) set by KDP. 
""" - -EXTRA_ENTITIES = {'"': '"', "'": "'"} - -class OPFProcessor(object): - - def __init__(self, files, metadata, fileinfo, rscnames, hasNCX, mh, usedmap, pagemapxml='', guidetext='', k8resc=None, epubver='2'): - self.files = files - self.metadata = metadata - self.fileinfo = fileinfo - self.rscnames = rscnames - self.has_ncx = hasNCX - self.codec = mh.codec - self.isK8 = mh.isK8() - self.printReplica = mh.isPrintReplica() - self.guidetext = unicode_str(guidetext) - self.used = usedmap - self.k8resc = k8resc - self.covername = None - self.cover_id = 'cover_img' - if self.k8resc is not None and self.k8resc.cover_name is not None: - # update cover id info from RESC if available - self.cover_id = self.k8resc.cover_name - # Create a unique urn uuid - self.BookId = unicode_str(str(uuid.uuid4())) - self.pagemap = pagemapxml - - self.ncxname = None - self.navname = None - - # page-progression-direction is only set in spine - self.page_progression_direction = metadata.pop('page-progression-direction', [None])[0] - if 'rl' in metadata.get('primary-writing-mode', [''])[0]: - self.page_progression_direction = 'rtl' - self.epubver = epubver # the epub version set by user - self.target_epubver = epubver # the epub vertion set by user or detected automatically - if self.epubver == 'A': - self.target_epubver = self.autodetectEPUBVersion() - elif self.epubver == 'F': - self.target_epubver = '2' - elif self.epubver != '2' and self.epubver != '3': - self.target_epubver = '2' - - # id for rifine attributes - self.title_id = {} - self.creator_id = {} - self.publisher_id = {} - # extra attributes - self.title_attrib = {} - self.creator_attrib = {} - self.publisher_attrib = {} - self.extra_attributes = [] # for force epub2 option - # Create epub3 metadata from EXTH. - self.exth_solved_refines_metadata = [] - self.exth_refines_metadata = [] - self.exth_fixedlayout_metadata = [] - - self.defineRefinesID() - self.processRefinesMetadata() - if self.k8resc is not None: - # Create metadata in RESC section. - self.k8resc.createMetadata(epubver) - if self.target_epubver == "3": - self.createMetadataForFixedlayout() - - def escapeit(self, sval, EXTRAS=None): - # note, xmlescape and unescape do not work with utf-8 bytestrings - sval = unicode_str(sval) - if EXTRAS: - res = xmlescape(unescapeit(sval), EXTRAS) - else: - res = xmlescape(unescapeit(sval)) - return res - - def createMetaTag(self, data, property, content, refid=''): - refines = '' - if refid: - refines = ' refines="#%s"' % refid - data.append('%s\n' % (property, refines, content)) - - def buildOPFMetadata(self, start_tag, has_obfuscated_fonts=False): - # convert from EXTH metadata format to target epub version metadata - # epub 3 will ignore style metatags - # but allows them to be present for backwards compatibility - # instead the new format is - # property_value - # and DCMES elements such as: - # value - - metadata = self.metadata - k8resc = self.k8resc - - META_TAGS = ['Drm Server Id', 'Drm Commerce Id', 'Drm Ebookbase Book Id', 'ASIN', 'ThumbOffset', 'Fake Cover', - 'Creator Software', 'Creator Major Version', 'Creator Minor Version', 'Creator Build Number', - 'Watermark', 'Clipping Limit', 'Publisher Limit', 'Text to Speech Disabled', 'CDE Type', - 'Updated Title', 'Font Signature (hex)', 'Tamper Proof Keys (hex)',] - - # def handleTag(data, metadata, key, tag, ids={}): - def handleTag(data, metadata, key, tag, attrib={}): - '''Format metadata values. - - @param data: List of formatted metadata entries. - @param metadata: The metadata dictionary. 
- @param key: The key of the metadata value to handle. - @param tag: The opf tag corresponds to the metadata value. - ###@param ids: The ids in tags for refines property of epub3. - @param attrib: The extra attibute for refines or opf prefixs. - ''' - if key in metadata: - for i, value in enumerate(metadata[key]): - closingTag = tag.split(" ")[0] - res = '<%s%s>%s\n' % (tag, attrib.get(i, ''), self.escapeit(value), closingTag) - data.append(res) - del metadata[key] - - # these are allowed but ignored by epub3 - def handleMetaPairs(data, metadata, key, name): - if key in metadata: - for value in metadata[key]: - res = '\n' % (name, self.escapeit(value, EXTRA_ENTITIES)) - data.append(res) - del metadata[key] - - data = [] - data.append(start_tag + '\n') - # Handle standard metadata - if 'Title' in metadata: - handleTag(data, metadata, 'Title', 'dc:title', self.title_attrib) - else: - data.append('Untitled\n') - handleTag(data, metadata, 'Language', 'dc:language') - if 'UniqueID' in metadata: - handleTag(data, metadata, 'UniqueID', 'dc:identifier id="uid"') - else: - # No unique ID in original, give it a generic one. - data.append('0\n') - - if self.target_epubver == '3': - # epub version 3 minimal metadata requires a dcterms:modifed date tag - self.createMetaTag(data, 'dcterms:modified', datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")) - - if self.isK8 and has_obfuscated_fonts: - # Use the random generated urn:uuid so obuscated fonts work. - # It doesn't need to be _THE_ unique identifier to work as a key - # for obfuscated fonts in Sigil, ADE and calibre. Its just has - # to use the opf:scheme="UUID" and have the urn:uuid: prefix. - if self.target_epubver == '3': - data.append('urn:uuid:'+self.BookId+'\n') - else: - data.append('urn:uuid:'+self.BookId+'\n') - - handleTag(data, metadata, 'Creator', 'dc:creator', self.creator_attrib) - handleTag(data, metadata, 'Contributor', 'dc:contributor') - handleTag(data, metadata, 'Publisher', 'dc:publisher', self.publisher_attrib) - handleTag(data, metadata, 'Source', 'dc:source') - handleTag(data, metadata, 'Type', 'dc:type') - if self.target_epubver == '3': - if 'ISBN' in metadata: - for i, value in enumerate(metadata['ISBN']): - res = 'urn:isbn:%s\n' % self.escapeit(value) - data.append(res) - else: - handleTag(data, metadata, 'ISBN', 'dc:identifier opf:scheme="ISBN"') - if 'Subject' in metadata: - if 'SubjectCode' in metadata: - codeList = metadata['SubjectCode'] - del metadata['SubjectCode'] - else: - codeList = None - for i in range(len(metadata['Subject'])): - if codeList and i < len(codeList): - data.append('') - else: - data.append('') - data.append(self.escapeit(metadata['Subject'][i])+'\n') - del metadata['Subject'] - handleTag(data, metadata, 'Description', 'dc:description') - if self.target_epubver == '3': - if 'Published' in metadata: - for i, value in enumerate(metadata['Published']): - res = '%s\n' % self.escapeit(value) - data.append(res) - else: - handleTag(data, metadata, 'Published', 'dc:date opf:event="publication"') - handleTag(data, metadata, 'Rights', 'dc:rights') - - if self.epubver == 'F': - if self.extra_attributes or k8resc is not None and k8resc.extra_attributes: - data.append('\n') - else: - # Append refines metadata. - if self.exth_solved_refines_metadata: - data.append('\n') - data += self.exth_solved_refines_metadata - if self.exth_refines_metadata or k8resc is not None and k8resc.refines_metadata: - data.append('\n') - - # Append metadata in RESC section. 
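# Aside: createMetaTag (defined earlier in this class) emits the epub3
# refines metadata appended around here. Its output has roughly this shape;
# the exact format string is a hedged reconstruction:
def meta_tag(property_: str, content: str, refid: str = '') -> str:
    refines = ' refines="#%s"' % refid if refid else ''
    return '<meta property="%s"%s>%s</meta>\n' % (property_, refines, content)

# e.g. meta_tag('file-as', 'Tolstoy, Leo', 'creator01') ->
# <meta property="file-as" refines="#creator01">Tolstoy, Leo</meta>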
- if k8resc is not None and k8resc.extra_metadata: - data.append('\n') - - if 'CoverOffset' in metadata: - imageNumber = int(metadata['CoverOffset'][0]) - self.covername = self.rscnames[imageNumber] - if self.covername is None: - print("Error: Cover image %s was not recognized as a valid image" % imageNumber) - else: - # is obsoleted in EPUB3, but kindlegen v2.9 requires it. - data.append('\n') - self.used[self.covername] = 'used' - del metadata['CoverOffset'] - - handleMetaPairs(data, metadata, 'Codec', 'output encoding') - # handle kindlegen specifc tags - handleTag(data, metadata, 'DictInLanguage', 'DictionaryInLanguage') - handleTag(data, metadata, 'DictOutLanguage', 'DictionaryOutLanguage') - handleMetaPairs(data, metadata, 'RegionMagnification', 'RegionMagnification') - handleMetaPairs(data, metadata, 'book-type', 'book-type') - handleMetaPairs(data, metadata, 'zero-gutter', 'zero-gutter') - handleMetaPairs(data, metadata, 'zero-margin', 'zero-margin') - handleMetaPairs(data, metadata, 'primary-writing-mode', 'primary-writing-mode') - handleMetaPairs(data, metadata, 'fixed-layout', 'fixed-layout') - handleMetaPairs(data, metadata, 'orientation-lock', 'orientation-lock') - handleMetaPairs(data, metadata, 'original-resolution', 'original-resolution') - - # these are not allowed in epub2 or 3 so convert them to meta name content pairs - # perhaps these could better be mapped into the dcterms namespace instead - handleMetaPairs(data, metadata, 'Review', 'review') - handleMetaPairs(data, metadata, 'Imprint', 'imprint') - handleMetaPairs(data, metadata, 'Adult', 'adult') - handleMetaPairs(data, metadata, 'DictShortName', 'DictionaryVeryShortName') - - # these are needed by kobo books upon submission but not sure if legal metadata in epub2 or epub3 - if 'Price' in metadata and 'Currency' in metadata: - priceList = metadata['Price'] - currencyList = metadata['Currency'] - if len(priceList) != len(currencyList): - print("Error: found %s price entries, but %s currency entries.") - else: - for i in range(len(priceList)): - data.append(''+priceList[i]+'\n') - del metadata['Price'] - del metadata['Currency'] - - if self.target_epubver == '3': - # Append metadata for EPUB3. 
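# Aside: handleMetaPairs above emits legacy name/content pairs that epub3
# readers ignore but kindlegen understands; a hedged reconstruction of the
# shape of its output:
def meta_pair(name: str, content: str) -> str:
    return '<meta name="%s" content="%s"/>\n' % (name, content)

# e.g. meta_pair('fixed-layout', 'true') -> <meta name="fixed-layout" content="true"/>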
- if self.exth_fixedlayout_metadata: - data.append('\n') - data += self.exth_fixedlayout_metadata - - # all that remains is extra EXTH info we will store inside a comment inside meta name/content pairs - # so it can not impact anything and will be automatically stripped out if found again in a RESC section - data.append(BEGIN_INFO_ONLY + '\n') - if 'ThumbOffset' in metadata: - imageNumber = int(metadata['ThumbOffset'][0]) - # Some bad books give image indexes that are 'out of range' - try: - imageName = self.rscnames[imageNumber] - except: - print('Number given for Cover Thumbnail is out of range: %s' % imageNumber) - imageName = None - if imageName is None: - print("Error: Cover Thumbnail image %s was not recognized as a valid image" % imageNumber) - else: - data.append('\n') - # self.used[imageName] = 'used' # thumbnail image is always generated by Kindlegen, so don't include in manifest - self.used[imageName] = 'not used' - del metadata['ThumbOffset'] - for metaName in META_TAGS: - if metaName in metadata: - for value in metadata[metaName]: - data.append('\n') - del metadata[metaName] - for key in list(metadata.keys()): - for value in metadata[key]: - data.append('\n') - del metadata[key] - data.append(END_INFO_ONLY + '\n') - data.append('\n') - return data - - def buildOPFManifest(self, ncxname, navname=None): - # buildManifest for mobi7, azw4, epub2 and epub3. - k8resc = self.k8resc - cover_id = self.cover_id - hasK8RescSpine = k8resc is not None and k8resc.hasSpine() - self.ncxname = ncxname - self.navname = navname - - data = [] - data.append('\n') - media_map = { - '.jpg' : 'image/jpeg', - '.jpeg' : 'image/jpeg', - '.png' : 'image/png', - '.gif' : 'image/gif', - '.svg' : 'image/svg+xml', - '.xhtml': 'application/xhtml+xml', - '.html' : 'text/html', # for mobi7 - '.pdf' : 'application/pdf', # for azw4(print replica textbook) - '.ttf' : 'application/x-font-ttf', - '.otf' : 'application/x-font-opentype', # replaced? 
- '.css' : 'text/css', - # '.html' : 'text/x-oeb1-document', # for mobi7 - # '.otf' : 'application/vnd.ms-opentype', # [OpenType] OpenType fonts - # '.woff' : 'application/font-woff', # [WOFF] WOFF fonts - # '.smil' : 'application/smil+xml', # [MediaOverlays301] EPUB Media Overlay documents - # '.pls' : 'application/pls+xml', # [PLS] Text-to-Speech (TTS) Pronunciation lexicons - # '.mp3' : 'audio/mpeg', - # '.mp4' : 'video/mp4', - # '.js' : 'text/javascript', # not supported in K8 - } - spinerefs = [] - - idcnt = 0 - for [key,dir,fname] in self.fileinfo: - name, ext = os.path.splitext(fname) - ext = ext.lower() - media = media_map.get(ext) - ref = "item%d" % idcnt - if hasK8RescSpine: - if key is not None and key in k8resc.spine_idrefs: - ref = k8resc.spine_idrefs[key] - properties = '' - if dir != '': - fpath = dir + '/' + fname - else: - fpath = fname - data.append('\n'.format(ref, media, fpath, properties)) - - if ext in ['.xhtml', '.html']: - spinerefs.append(ref) - idcnt += 1 - - for fname in self.rscnames: - if fname is not None: - if self.used.get(fname,'not used') == 'not used': - continue - name, ext = os.path.splitext(fname) - ext = ext.lower() - media = media_map.get(ext,ext[1:]) - properties = '' - if fname == self.covername: - ref = cover_id - if self.target_epubver == '3': - properties = 'properties="cover-image"' - else: - ref = "item%d" % idcnt - if ext == '.ttf' or ext == '.otf': - if self.isK8: # fonts are only used in Mobi 8 - fpath = 'Fonts/' + fname - data.append('\n'.format(ref, media, fpath, properties)) - else: - fpath = 'Images/' + fname - data.append('\n'.format(ref, media, fpath, properties)) - idcnt += 1 - - if self.target_epubver == '3' and navname is not None: - data.append('\n') - if self.has_ncx and ncxname is not None: - data.append('\n') - if self.pagemap != '': - data.append('\n') - data.append('\n') - return [data, spinerefs] - - def buildOPFSpine(self, spinerefs, isNCX): - # build spine - k8resc = self.k8resc - hasK8RescSpine = k8resc is not None and k8resc.hasSpine() - data = [] - ppd = '' - if self.isK8 and self.page_progression_direction is not None: - ppd = ' page-progression-direction="{:s}"'.format(self.page_progression_direction) - ncx = '' - if isNCX: - ncx = ' toc="ncx"' - map='' - if self.pagemap != '': - map = ' page-map="map"' - if self.epubver == 'F': - if ppd: - ppd = '' - spine_start_tag = '{0:s}\n'.format(ppd, map, ncx) - else: - spine_start_tag = '\n'.format(ppd, map, ncx) - data.append(spine_start_tag) - - if hasK8RescSpine: - for key in k8resc.spine_order: - idref = k8resc.spine_idrefs[key] - attribs = k8resc.spine_pageattributes[key] - tag = '\n' % entry) - start += 1 - for entry in spinerefs[start:]: - data.append('\n') - data.append('\n') - return data - - def buildMobi7OPF(self): - # Build an OPF for mobi7 and azw4. 
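# Aside: a condensed sketch of the manifest/spine pattern used by
# buildOPFManifest and buildOPFSpine above: map each file extension to a
# media type, emit an <item> per file and an <itemref> per xhtml item.
# The media map is abbreviated here and the ids are illustrative.
import os

MEDIA_MAP = {'.xhtml': 'application/xhtml+xml', '.css': 'text/css',
             '.jpg': 'image/jpeg', '.png': 'image/png'}

def manifest_and_spine(paths):
    items, refs = [], []
    for n, path in enumerate(paths):
        ext = os.path.splitext(path)[1].lower()
        media = MEDIA_MAP.get(ext, 'application/octet-stream')
        items.append('<item id="item%d" media-type="%s" href="%s"/>' % (n, media, path))
        if ext == '.xhtml':
            refs.append('<itemref idref="item%d"/>' % n)
    return items, refs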
- print("Building an opf for mobi7/azw4.") - data = [] - data.append('\n') - data.append('\n') - metadata_tag = '' - opf_metadata = self.buildOPFMetadata(metadata_tag) - data += opf_metadata - if self.has_ncx: - # ncxname = self.files.getInputFileBasename() + '.ncx' - ncxname = 'toc.ncx' - else: - ncxname = None - [opf_manifest, spinerefs] = self.buildOPFManifest(ncxname) - data += opf_manifest - opf_spine = self.buildOPFSpine(spinerefs, self.has_ncx) - data += opf_spine - data.append('\n\n') - if not self.printReplica: - guide ='\n' + self.guidetext + '\n' - data.append(guide) - data.append('\n') - return ''.join(data) - - def buildEPUBOPF(self, has_obfuscated_fonts=False): - print("Building an opf for mobi8 using epub version: ", self.target_epubver) - if self.target_epubver == '2': - has_ncx = self.has_ncx - has_guide = True - ncxname = None - ncxname = TOC_NCX - navname = None - package = '\n' - tours = '\n\n' - metadata_tag = '' - else: - has_ncx = EPUB3_WITH_NCX - has_guide = EPUB3_WITH_GUIDE - ncxname = None - if has_ncx: - ncxname = TOC_NCX - navname = NAVIGATION_DOCUMENT - package = '\n' - tours = '' - metadata_tag = '' - - data = [] - data.append('\n') - data.append(package) - opf_metadata = self.buildOPFMetadata(metadata_tag, has_obfuscated_fonts) - data += opf_metadata - [opf_manifest, spinerefs] = self.buildOPFManifest(ncxname, navname) - data += opf_manifest - opf_spine = self.buildOPFSpine(spinerefs, has_ncx) - data += opf_spine - data.append(tours) - if has_guide: - guide ='\n' + self.guidetext + '\n' - data.append(guide) - data.append('\n') - return ''.join(data) - - def writeOPF(self, has_obfuscated_fonts=False): - if self.isK8: - data = self.buildEPUBOPF(has_obfuscated_fonts) - outopf = os.path.join(self.files.k8oebps, EPUB_OPF) - with open(pathof(outopf), 'wb') as f: - f.write(data.encode('utf-8')) - return self.BookId - else: - data = self.buildMobi7OPF() - outopf = os.path.join(self.files.mobi7dir, 'content.opf') - with open(pathof(outopf), 'wb') as f: - f.write(data.encode('utf-8')) - return 0 - - def getBookId(self): - return self.BookId - - def getNCXName(self): - return self.ncxname - - def getNAVName(self): - return self.navname - - def getEPUBVersion(self): - return self.target_epubver - - def hasNCX(self): - return self.ncxname is not None and self.has_ncx - - def hasNAV(self): - return self.navname is not None - - def autodetectEPUBVersion(self): - # Determine EPUB version from metadata and RESC. - metadata = self.metadata - k8resc = self.k8resc - epubver = '2' - if 'true' == metadata.get('fixed-layout', [''])[0].lower(): - epubver = '3' - elif metadata.get('orientation-lock', [''])[0].lower() in ['portrait', 'landscape']: - epubver = '3' - elif self.page_progression_direction == 'rtl': - epubver = '3' - elif EXTH_TITLE_FURIGANA in metadata: - epubver = '3' - elif EXTH_CREATOR_FURIGANA in metadata: - epubver = '3' - elif EXTH_PUBLISHER_FURIGANA in metadata: - epubver = '3' - elif k8resc is not None and k8resc.needEPUB3(): - epubver = '3' - return epubver - - def defineRefinesID(self): - # the following EXTH are set by KDP. - # 'Title_Furigana_(508)' - # 'Creator_Furigana_(517)', - # 'Publisher_Furigana_(522)' - # It is difficult to find correspondence between Title, Creator, Publisher - # and EXTH 508,512, 522 if they have more than two values since KDP seems not preserve the oders of EXTH 508,512 and 522. - # It is also difficult to find correspondence between them and tags which have refine attributes in RESC. - # So editing manually is required. 
- metadata = self.metadata
-
- needRefinesId = False
- if self.k8resc is not None:
- needRefinesId = self.k8resc.hasRefines()
- # Create ids for refines attributes
- if (needRefinesId or EXTH_TITLE_FURIGANA in metadata) and 'Title' in metadata:
- for i in range(len(metadata.get('Title'))):
- self.title_id[i] = 'title%02d' % (i+1)
-
- if (needRefinesId or EXTH_CREATOR_FURIGANA in metadata) and 'Creator' in metadata:
- for i in range(len(metadata.get('Creator'))):
- self.creator_id[i] = 'creator%02d' % (i+1)
-
- if (needRefinesId or EXTH_PUBLISHER_FURIGANA in metadata) and 'Publisher' in metadata:
- for i in range(len(metadata.get('Publisher'))):
- self.publisher_id[i] = 'publisher%02d' % (i+1)
-
- def processRefinesMetadata(self):
- # create the refines metadata defined in epub3, or convert the refines property to opf: attributes for epub2.
- metadata = self.metadata
-
- refines_list = [
- [EXTH_TITLE_FURIGANA, self.title_id, self.title_attrib, 'title00'],
- [EXTH_CREATOR_FURIGANA, self.creator_id, self.creator_attrib, 'creator00'],
- [EXTH_PUBLISHER_FURIGANA, self.publisher_id, self.publisher_attrib, 'publisher00']
- ]
-
- create_refines_metadata = False
- for EXTH in lzip(*refines_list)[0]:
- if EXTH in metadata:
- create_refines_metadata = True
- break
- if create_refines_metadata:
- for [EXTH, id, attrib, defaultid] in refines_list:
- if self.target_epubver == '3':
- for i, value in list(id.items()):
- attrib[i] = ' id="%s"' % value
-
- if EXTH in metadata:
- if len(metadata[EXTH]) == 1 and len(id) == 1:
- self.createMetaTag(self.exth_solved_refines_metadata, 'file-as', metadata[EXTH][0], id[0])
- else:
- for i, value in enumerate(metadata[EXTH]):
- self.createMetaTag(self.exth_refines_metadata, 'file-as', value, id.get(i, defaultid))
- else:
- if EXTH in metadata:
- if len(metadata[EXTH]) == 1 and len(id) == 1:
- attr = ' opf:file-as="%s"' % metadata[EXTH][0]
- attrib[0] = attr
- else:
- for i, value in enumerate(metadata[EXTH]):
- attr = ' id="#%s" opf:file-as="%s"\n' % (id.get(i, defaultid), value)
- self.extra_attributes.append(attr)
-
- def createMetadataForFixedlayout(self):
- # convert fixed-layout metadata to epub3 format if needed.
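# A sketch of the intended conversion, assuming input metadata of
#   {'fixed-layout': ['true'], 'orientation-lock': ['portrait']}
# and meta tags written per the EPUB3 rendition vocabulary:
#   <meta property="rendition:layout">pre-paginated</meta>
#   <meta property="rendition:orientation">portrait</meta>
# Any fixed-layout value other than 'true' falls back to 'reflowable'.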
- metadata = self.metadata
-
- if 'fixed-layout' in metadata:
- fixedlayout = metadata['fixed-layout'][0]
- content = {'true' : 'pre-paginated'}.get(fixedlayout.lower(), 'reflowable')
- self.createMetaTag(self.exth_fixedlayout_metadata, 'rendition:layout', content)
-
- if 'orientation-lock' in metadata:
- content = metadata['orientation-lock'][0].lower()
- if content == 'portrait' or content == 'landscape':
- self.createMetaTag(self.exth_fixedlayout_metadata, 'rendition:orientation', content)
-
- # according to the epub3 spec about correspondence with Amazon,
- # if 'original-resolution' is provided it needs to be converted to a
- # meta viewport property tag stored in the <head> of **each**
- # xhtml page - so this tag would need to be handled by editing each part
- # before reaching this routine
- # we need to add support for this to the k8html routine
- # if 'original-resolution' in metadata.keys():
- # resolution = metadata['original-resolution'][0].lower()
- # width, height = resolution.split('x')
- # if width.isdigit() and int(width) > 0 and height.isdigit() and int(height) > 0:
- # viewport = 'width=%s, height=%s' % (width, height)
- # self.createMetaTag(self.exth_fixedlayout_metadata, 'rendition:viewport', viewport)
diff --git a/epy_extras/KindleUnpack/mobi_pagemap.py b/epy_extras/KindleUnpack/mobi_pagemap.py
deleted file mode 100644
index 5228d4e..0000000
--- a/epy_extras/KindleUnpack/mobi_pagemap.py
+++ /dev/null
@@ -1,158 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
-
-from __future__ import unicode_literals, division, absolute_import, print_function
-
-from .compatibility_utils import PY2, unicode_str
-
-if PY2:
- range = xrange
-
-import struct
-# note: struct pack, unpack, unpack_from all require bytestring format
-# data all the way up to at least python 2.7.5, python 3 okay with bytestring
-
-import re
-# note: re requires the pattern to be the exact same type as the data to be searched in python3
-# but u"" is not allowed for the pattern itself, only b""
-
-
-_TABLE = [('m', 1000), ('cm', 900), ('d', 500), ('cd', 400), ('c', 100), ('xc', 90), ('l', 50), ('xl', 40), ('x', 10), ('ix', 9), ('v', 5), ('iv', 4), ('i', 1)]
-
-def int_to_roman(i):
- parts = []
- num = i
- for letter, value in _TABLE:
- while value <= num:
- num -= value
- parts.append(letter)
- return ''.join(parts)
-
-def roman_to_int(s):
- result = 0
- rnstr = s
- for letter, value in _TABLE:
- while rnstr.startswith(letter):
- result += value
- rnstr = rnstr[len(letter):]
- return result
-
-_pattern = r'''\(([^\)]*)\)'''
-_tup_pattern = re.compile(_pattern,re.IGNORECASE)
-
-
-def _parseNames(numpages, data):
- data = unicode_str(data)
- pagenames = []
- pageMap = ''
- for i in range(numpages):
- pagenames.append(None)
- for m in re.finditer(_tup_pattern, data):
- tup = m.group(1)
- if pageMap != '':
- pageMap += ','
- pageMap += '(' + tup + ')'
- spos, nametype, svalue = tup.split(",")
- # print(spos, nametype, svalue)
- if nametype == 'a' or nametype == 'r':
- svalue = int(svalue)
- spos = int(spos)
- for i in range(spos - 1, numpages):
- if nametype == 'r':
- pname = int_to_roman(svalue)
- svalue += 1
- elif nametype == 'a':
- pname = "%s" % svalue
- svalue += 1
- elif nametype == 'c':
- sp = svalue.find('|')
- if sp == -1:
- pname = svalue
- else:
- pname = svalue[0:sp]
- svalue = svalue[sp+1:]
- else:
- print("Error: unknown page numbering type", nametype)
- pagenames[i] = pname
- return pagenames, pageMap
-
-
-class PageMapProcessor:
-
- def __init__(self, mh,
data): - self.mh = mh - self.data = data - self.pagenames = [] - self.pageoffsets = [] - self.pageMap = '' - self.pm_len = 0 - self.pm_nn = 0 - self.pn_bits = 0 - self.pmoff = None - self.pmstr = '' - print("Extracting Page Map Information") - rev_len, = struct.unpack_from(b'>L', self.data, 0x10) - # skip over header, revision string length data, and revision string - ptr = 0x14 + rev_len - pm_1, self.pm_len, self.pm_nn, self.pm_bits = struct.unpack_from(b'>4H', self.data, ptr) - # print(pm_1, self.pm_len, self.pm_nn, self.pm_bits) - self.pmstr = self.data[ptr+8:ptr+8+self.pm_len] - self.pmoff = self.data[ptr+8+self.pm_len:] - offsize = b">L" - offwidth = 4 - if self.pm_bits == 16: - offsize = b">H" - offwidth = 2 - ptr = 0 - for i in range(self.pm_nn): - od, = struct.unpack_from(offsize, self.pmoff, ptr) - ptr += offwidth - self.pageoffsets.append(od) - self.pagenames, self.pageMap = _parseNames(self.pm_nn, self.pmstr) - - def getPageMap(self): - return self.pageMap - - def getNames(self): - return self.pagenames - - def getOffsets(self): - return self.pageoffsets - - # page-map.xml will be unicode but encoded to utf-8 immediately before being written to a file - def generateKF8PageMapXML(self, k8proc): - pagemapxml = '\n' - for i in range(len(self.pagenames)): - pos = self.pageoffsets[i] - name = self.pagenames[i] - if name is not None and name != "": - [pn, dir, filename, skelpos, skelend, aidtext] = k8proc.getSkelInfo(pos) - idtext = unicode_str(k8proc.getPageIDTag(pos)) - linktgt = unicode_str(filename) - if idtext != '': - linktgt += '#' + idtext - pagemapxml += '\n' % (name, dir, linktgt) - pagemapxml += "\n" - return pagemapxml - - def generateAPNX(self, apnx_meta): - if apnx_meta['format'] == 'MOBI_8': - content_header = '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","format":"%(format)s","fileRevisionId":"1","acr":"%(acr)s"}' %apnx_meta - else: - content_header = '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","fileRevisionId":"1"}' % apnx_meta - content_header = content_header.encode('utf-8') - page_header = '{"asin":"%(asin)s","pageMap":"%(pageMap)s"}' % apnx_meta - page_header = page_header.encode('utf-8') - apnx = struct.pack(b'>H',1) + struct.pack(b'>H',1) - apnx += struct.pack(b'>I', 12 + len(content_header)) - apnx += struct.pack(b'>I', len(content_header)) - apnx += content_header - apnx += struct.pack(b'>H', 1) - apnx += struct.pack(b'>H', len(page_header)) - apnx += struct.pack(b'>H', self.pm_nn) - apnx += struct.pack(b'>H', 32) - apnx += page_header - for page in self.pageoffsets: - apnx += struct.pack(b'>L', page) - return apnx diff --git a/epy_extras/KindleUnpack/mobi_sectioner.py b/epy_extras/KindleUnpack/mobi_sectioner.py deleted file mode 100644 index 81f62bb..0000000 --- a/epy_extras/KindleUnpack/mobi_sectioner.py +++ /dev/null @@ -1,120 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab - -from __future__ import unicode_literals, division, absolute_import, print_function - -from .compatibility_utils import PY2, hexlify, bstr, bord, bchar - -import datetime - -if PY2: - range = xrange - -# note: struct pack, unpack, unpack_from all require bytestring format -# data all the way up to at least python 2.7.5, python 3 okay with bytestring -import struct - -from .unipath import pathof - -DUMP = False -""" Set to True to dump all possible information. 
""" - -class unpackException(Exception): - pass - - -def describe(data): - txtans = '' - hexans = hexlify(data) - for i in data: - if bord(i) < 32 or bord(i) > 127: - txtans += '?' - else: - txtans += bchar(i).decode('latin-1') - return '"' + txtans + '"' + ' 0x'+ hexans - -def datetimefrompalmtime(palmtime): - if palmtime > 0x7FFFFFFF: - pythondatetime = datetime.datetime(year=1904,month=1,day=1)+datetime.timedelta(seconds=palmtime) - else: - pythondatetime = datetime.datetime(year=1970,month=1,day=1)+datetime.timedelta(seconds=palmtime) - return pythondatetime - - -class Sectionizer: - - def __init__(self, filename): - self.data = b'' - with open(pathof(filename), 'rb') as f: - self.data = f.read() - self.palmheader = self.data[:78] - self.palmname = self.data[:32] - self.ident = self.palmheader[0x3C:0x3C+8] - self.num_sections, = struct.unpack_from(b'>H', self.palmheader, 76) - self.filelength = len(self.data) - sectionsdata = struct.unpack_from(bstr('>%dL' % (self.num_sections*2)), self.data, 78) + (self.filelength, 0) - self.sectionoffsets = sectionsdata[::2] - self.sectionattributes = sectionsdata[1::2] - self.sectiondescriptions = ["" for x in range(self.num_sections+1)] - self.sectiondescriptions[-1] = "File Length Only" - return - - def dumpsectionsinfo(self): - print("Section Offset Length UID Attribs Description") - for i in range(self.num_sections): - print("%3d %3X 0x%07X 0x%05X % 8d % 7d %s" % (i,i, self.sectionoffsets[i], self.sectionoffsets[ - i+1] - self.sectionoffsets[i], self.sectionattributes[i]&0xFFFFFF, (self.sectionattributes[i]>>24)&0xFF, self.sectiondescriptions[i])) - print("%3d %3X 0x%07X %s" % - (self.num_sections,self.num_sections, self.sectionoffsets[self.num_sections], self.sectiondescriptions[self.num_sections])) - - def setsectiondescription(self, section, description): - if section < len(self.sectiondescriptions): - self.sectiondescriptions[section] = description - else: - print("Section out of range: %d, description %s" % (section,description)) - - def dumppalmheader(self): - print("Palm Database Header") - print("Database name: " + repr(self.palmheader[:32])) - dbattributes, = struct.unpack_from(b'>H', self.palmheader, 32) - print("Bitfield attributes: 0x%0X" % dbattributes,) - if dbattributes != 0: - print(" (",) - if (dbattributes & 2): - print("Read-only; ",) - if (dbattributes & 4): - print("Dirty AppInfoArea; ",) - if (dbattributes & 8): - print("Needs to be backed up; ",) - if (dbattributes & 16): - print("OK to install over newer; ",) - if (dbattributes & 32): - print("Reset after installation; ",) - if (dbattributes & 64): - print("No copying by PalmPilot beaming; ",) - print(")") - else: - print("") - print("File version: %d" % struct.unpack_from(b'>H', self.palmheader, 34)[0]) - dbcreation, = struct.unpack_from(b'>L', self.palmheader, 36) - print("Creation Date: " + str(datetimefrompalmtime(dbcreation))+ (" (0x%0X)" % dbcreation)) - dbmodification, = struct.unpack_from(b'>L', self.palmheader, 40) - print("Modification Date: " + str(datetimefrompalmtime(dbmodification))+ (" (0x%0X)" % dbmodification)) - dbbackup, = struct.unpack_from(b'>L', self.palmheader, 44) - if dbbackup != 0: - print("Backup Date: " + str(datetimefrompalmtime(dbbackup))+ (" (0x%0X)" % dbbackup)) - print("Modification No.: %d" % struct.unpack_from(b'>L', self.palmheader, 48)[0]) - print("App Info offset: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 52)[0]) - print("Sort Info offset: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 56)[0]) - print("Type/Creator: 
%s/%s" % (repr(self.palmheader[60:64]), repr(self.palmheader[64:68]))) - print("Unique seed: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 68)[0]) - expectedzero, = struct.unpack_from(b'>L', self.palmheader, 72) - if expectedzero != 0: - print("Should be zero but isn't: %d" % struct.unpack_from(b'>L', self.palmheader, 72)[0]) - print("Number of sections: %d" % struct.unpack_from(b'>H', self.palmheader, 76)[0]) - return - - def loadSection(self, section): - before, after = self.sectionoffsets[section:section+2] - return self.data[before:after] diff --git a/epy_extras/KindleUnpack/mobi_split.py b/epy_extras/KindleUnpack/mobi_split.py deleted file mode 100755 index 3535029..0000000 --- a/epy_extras/KindleUnpack/mobi_split.py +++ /dev/null @@ -1,438 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab - -from __future__ import unicode_literals, division, absolute_import, print_function - -import struct -# note: struct pack, unpack, unpack_from all require bytestring format -# data all the way up to at least python 2.7.5, python 3 okay with bytestring - -from .unipath import pathof - - -# important pdb header offsets -unique_id_seed = 68 -number_of_pdb_records = 76 - -# important palmdoc header offsets -book_length = 4 -book_record_count = 8 -first_pdb_record = 78 - -# important rec0 offsets -length_of_book = 4 -mobi_header_base = 16 -mobi_header_length = 20 -mobi_type = 24 -mobi_version = 36 -first_non_text = 80 -title_offset = 84 -first_resc_record = 108 -first_content_index = 192 -last_content_index = 194 -kf8_fdst_index = 192 # for KF8 mobi headers -fcis_index = 200 -flis_index = 208 -srcs_index = 224 -srcs_count = 228 -primary_index = 244 -datp_index = 256 -huffoff = 112 -hufftbloff = 120 - -def getint(datain,ofs,sz=b'L'): - i, = struct.unpack_from(b'>'+sz,datain,ofs) - return i - -def writeint(datain,ofs,n,len=b'L'): - if len==b'L': - return datain[:ofs]+struct.pack(b'>L',n)+datain[ofs+4:] - else: - return datain[:ofs]+struct.pack(b'>H',n)+datain[ofs+2:] - -def getsecaddr(datain,secno): - nsec = getint(datain,number_of_pdb_records,b'H') - assert secno>=0 & secnoL',2*nsec+1)) - datalst.append(datain[unique_id_seed+4:number_of_pdb_records]) - datalst.append(struct.pack(b'>H',nsec)) - newstart = zerosecstart - for i in range(0,secno): - ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) - datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval)) - datalst.append(struct.pack(b'>L', secstart) + struct.pack(b'>L', (2*secno))) - for i in range(secno+1,nsec): - ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) - ofs = ofs + dif - datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval)) - lpad = newstart - (first_pdb_record + 8*nsec) - if lpad > 0: - datalst.append(b'\0' * lpad) - datalst.append(datain[zerosecstart:secstart]) - datalst.append(secdata) - datalst.append(datain[secend:]) - dataout = b''.join(datalst) - return dataout - -def nullsection(datain,secno): # make it zero-length without deleting it - datalst = [] - nsec = getint(datain,number_of_pdb_records,b'H') - secstart, secend = getsecaddr(datain,secno) - zerosecstart, zerosecend = getsecaddr(datain, 0) - dif = secend-secstart - datalst.append(datain[:first_pdb_record]) - for i in range(0,secno+1): - ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) - datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval)) - for i in range(secno+1, nsec): - ofs, flgval = 
struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) - ofs = ofs - dif - datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval)) - lpad = zerosecstart - (first_pdb_record + 8*nsec) - if lpad > 0: - datalst.append(b'\0' * lpad) - datalst.append(datain[zerosecstart: secstart]) - datalst.append(datain[secend:]) - dataout = b''.join(datalst) - return dataout - -def deletesectionrange(datain,firstsec,lastsec): # delete a range of sections - datalst = [] - firstsecstart,firstsecend = getsecaddr(datain,firstsec) - lastsecstart,lastsecend = getsecaddr(datain,lastsec) - zerosecstart, zerosecend = getsecaddr(datain, 0) - dif = lastsecend - firstsecstart + 8*(lastsec-firstsec+1) - nsec = getint(datain,number_of_pdb_records,b'H') - datalst.append(datain[:unique_id_seed]) - datalst.append(struct.pack(b'>L',2*(nsec-(lastsec-firstsec+1))+1)) - datalst.append(datain[unique_id_seed+4:number_of_pdb_records]) - datalst.append(struct.pack(b'>H',nsec-(lastsec-firstsec+1))) - newstart = zerosecstart - 8*(lastsec-firstsec+1) - for i in range(0,firstsec): - ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) - ofs = ofs-8*(lastsec-firstsec+1) - datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval)) - for i in range(lastsec+1,nsec): - ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) - ofs = ofs - dif - flgval = 2*(i-(lastsec-firstsec+1)) - datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval)) - lpad = newstart - (first_pdb_record + 8*(nsec - (lastsec - firstsec + 1))) - if lpad > 0: - datalst.append(b'\0' * lpad) - datalst.append(datain[zerosecstart:firstsecstart]) - datalst.append(datain[lastsecend:]) - dataout = b''.join(datalst) - return dataout - -def insertsection(datain,secno,secdata): # insert a new section - datalst = [] - nsec = getint(datain,number_of_pdb_records,b'H') - # print("inserting secno" , secno, "into" ,nsec, "sections") - secstart,secend = getsecaddr(datain,secno) - zerosecstart,zerosecend = getsecaddr(datain,0) - dif = len(secdata) - datalst.append(datain[:unique_id_seed]) - datalst.append(struct.pack(b'>L',2*(nsec+1)+1)) - datalst.append(datain[unique_id_seed+4:number_of_pdb_records]) - datalst.append(struct.pack(b'>H',nsec+1)) - newstart = zerosecstart + 8 - for i in range(0,secno): - ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) - ofs += 8 - datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval)) - datalst.append(struct.pack(b'>L', secstart + 8) + struct.pack(b'>L', (2*secno))) - for i in range(secno,nsec): - ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) - ofs = ofs + dif + 8 - flgval = 2*(i+1) - datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval)) - lpad = newstart - (first_pdb_record + 8*(nsec + 1)) - if lpad > 0: - datalst.append(b'\0' * lpad) - datalst.append(datain[zerosecstart:secstart]) - datalst.append(secdata) - datalst.append(datain[secstart:]) - dataout = b''.join(datalst) - return dataout - - -def insertsectionrange(sectionsource,firstsec,lastsec,sectiontarget,targetsec): # insert a range of sections - # print("inserting secno" , firstsec, "to", lastsec, "into" ,targetsec, "sections") - # dataout = sectiontarget - # for idx in range(lastsec,firstsec-1,-1): - # dataout = insertsection(dataout,targetsec,readsection(sectionsource,idx)) - # return dataout - datalst = [] - nsec = getint(sectiontarget,number_of_pdb_records,b'H') - zerosecstart, zerosecend = getsecaddr(sectiontarget,0) - insstart, nul = getsecaddr(sectiontarget,targetsec) 
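# Worked example with assumed numbers: for nins = 2 inserted sections the
# PDB record list grows by 2*8 = 16 bytes, so offsets of records before
# the insertion point shift by 16, while offsets of records at or after
# it shift by 16 plus the size of the copied payload (srcend - srcstart),
# matching the three loops below.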
- nins = lastsec - firstsec + 1 - srcstart, nul = getsecaddr(sectionsource,firstsec) - nul, srcend = getsecaddr(sectionsource,lastsec) - newstart = zerosecstart + 8*nins - - datalst.append(sectiontarget[:unique_id_seed]) - datalst.append(struct.pack(b'>L',2*(nsec+nins)+1)) - datalst.append(sectiontarget[unique_id_seed+4:number_of_pdb_records]) - datalst.append(struct.pack(b'>H',nsec+nins)) - for i in range(0,targetsec): - ofs, flgval = struct.unpack_from(b'>2L',sectiontarget,first_pdb_record+i*8) - ofsnew = ofs + 8*nins - flgvalnew = flgval - datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L', flgvalnew)) - # print(ofsnew, flgvalnew, ofs, flgval) - srcstart0, nul = getsecaddr(sectionsource,firstsec) - for i in range(nins): - isrcstart, nul = getsecaddr(sectionsource,firstsec+i) - ofsnew = insstart + (isrcstart-srcstart0) + 8*nins - flgvalnew = 2*(targetsec+i) - datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L', flgvalnew)) - # print(ofsnew, flgvalnew) - dif = srcend - srcstart - for i in range(targetsec,nsec): - ofs, flgval = struct.unpack_from(b'>2L',sectiontarget,first_pdb_record+i*8) - ofsnew = ofs + dif + 8*nins - flgvalnew = 2*(i+nins) - datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L',flgvalnew)) - # print(ofsnew, flgvalnew, ofs, flgval) - lpad = newstart - (first_pdb_record + 8*(nsec + nins)) - if lpad > 0: - datalst.append(b'\0' * lpad) - datalst.append(sectiontarget[zerosecstart:insstart]) - datalst.append(sectionsource[srcstart:srcend]) - datalst.append(sectiontarget[insstart:]) - dataout = b''.join(datalst) - return dataout - -def get_exth_params(rec0): - ebase = mobi_header_base + getint(rec0,mobi_header_length) - elen = getint(rec0,ebase+4) - enum = getint(rec0,ebase+8) - return ebase,elen,enum - -def add_exth(rec0,exth_num,exth_bytes): - ebase,elen,enum = get_exth_params(rec0) - newrecsize = 8+len(exth_bytes) - newrec0 = rec0[0:ebase+4]+struct.pack(b'>L',elen+newrecsize)+struct.pack(b'>L',enum+1)+\ - struct.pack(b'>L',exth_num)+struct.pack(b'>L',newrecsize)+exth_bytes+rec0[ebase+12:] - newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)+newrecsize) - return newrec0 - -def read_exth(rec0,exth_num): - exth_values = [] - ebase,elen,enum = get_exth_params(rec0) - ebase = ebase+12 - while enum>0: - exth_id = getint(rec0,ebase) - if exth_id == exth_num: - # We might have multiple exths, so build a list. 
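# Layout of a single EXTH record, as the arithmetic below assumes
# (offsets relative to the start of the record at ebase):
#   bytes 0-3  record id           -> getint(rec0, ebase)
#   bytes 4-7  total record length -> getint(rec0, ebase+4)
#   bytes 8-   payload of (length - 8) bytes -> rec0[ebase+8:ebase+length]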
- exth_values.append(rec0[ebase+8:ebase+getint(rec0,ebase+4)]) - enum = enum-1 - ebase = ebase+getint(rec0,ebase+4) - return exth_values - -def write_exth(rec0,exth_num,exth_bytes): - ebase,elen,enum = get_exth_params(rec0) - ebase_idx = ebase+12 - enum_idx = enum - while enum_idx>0: - exth_id = getint(rec0,ebase_idx) - if exth_id == exth_num: - dif = len(exth_bytes)+8-getint(rec0,ebase_idx+4) - newrec0 = rec0 - if dif != 0: - newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)+dif) - return newrec0[:ebase+4]+struct.pack(b'>L',elen+len(exth_bytes)+8-getint(rec0,ebase_idx+4))+\ - struct.pack(b'>L',enum)+rec0[ebase+12:ebase_idx+4]+\ - struct.pack(b'>L',len(exth_bytes)+8)+exth_bytes+\ - rec0[ebase_idx+getint(rec0,ebase_idx+4):] - enum_idx = enum_idx-1 - ebase_idx = ebase_idx+getint(rec0,ebase_idx+4) - return rec0 - -def del_exth(rec0,exth_num): - ebase,elen,enum = get_exth_params(rec0) - ebase_idx = ebase+12 - enum_idx = 0 - while enum_idx < enum: - exth_id = getint(rec0,ebase_idx) - exth_size = getint(rec0,ebase_idx+4) - if exth_id == exth_num: - newrec0 = rec0 - newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)-exth_size) - newrec0 = newrec0[:ebase_idx]+newrec0[ebase_idx+exth_size:] - newrec0 = newrec0[0:ebase+4]+struct.pack(b'>L',elen-exth_size)+struct.pack(b'>L',enum-1)+newrec0[ebase+12:] - return newrec0 - enum_idx += 1 - ebase_idx = ebase_idx+exth_size - return rec0 - - -class mobi_split: - - def __init__(self, infile): - datain = b'' - with open(pathof(infile), 'rb') as f: - datain = f.read() - datain_rec0 = readsection(datain,0) - ver = getint(datain_rec0,mobi_version) - self.combo = (ver!=8) - if not self.combo: - return - exth121 = read_exth(datain_rec0,121) - if len(exth121) == 0: - self.combo = False - return - else: - # only pay attention to first exth121 - # (there should only be one) - datain_kf8, = struct.unpack_from(b'>L',exth121[0],0) - if datain_kf8 == 0xffffffff: - self.combo = False - return - datain_kfrec0 =readsection(datain,datain_kf8) - - # create the standalone mobi7 - num_sec = getint(datain,number_of_pdb_records,b'H') - # remove BOUNDARY up to but not including ELF record - self.result_file7 = deletesectionrange(datain,datain_kf8-1,num_sec-2) - # check if there are SRCS records and delete them - srcs = getint(datain_rec0,srcs_index) - num_srcs = getint(datain_rec0,srcs_count) - if srcs != 0xffffffff and num_srcs > 0: - self.result_file7 = deletesectionrange(self.result_file7,srcs,srcs+num_srcs-1) - datain_rec0 = writeint(datain_rec0,srcs_index,0xffffffff) - datain_rec0 = writeint(datain_rec0,srcs_count,0) - # reset the EXTH 121 KF8 Boundary meta data to 0xffffffff - datain_rec0 = write_exth(datain_rec0,121, struct.pack(b'>L', 0xffffffff)) - # datain_rec0 = del_exth(datain_rec0,121) - # datain_rec0 = del_exth(datain_rec0,534) - # don't remove the EXTH 125 KF8 Count of Resources, seems to be present in mobi6 files as well - # set the EXTH 129 KF8 Masthead / Cover Image string to the null string - datain_rec0 = write_exth(datain_rec0,129, b'') - # don't remove the EXTH 131 KF8 Unidentified Count, seems to be present in mobi6 files as well - - # need to reset flags stored in 0x80-0x83 - # old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050 - # Bit Flags - # 0x1000 = Bit 12 indicates if embedded fonts are used or not - # 0x0800 = means this Header points to *shared* images/resource/fonts ?? - # 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8? 
- # 0x0040 = exth exists - # 0x0010 = Not sure but this is always set so far - fval, = struct.unpack_from(b'>L',datain_rec0, 0x80) - # need to remove flag 0x0800 for KindlePreviewer 2.8 and unset Bit 12 for embedded fonts - fval = fval & 0x07FF - datain_rec0 = datain_rec0[:0x80] + struct.pack(b'>L',fval) + datain_rec0[0x84:] - - self.result_file7 = writesection(self.result_file7,0,datain_rec0) - - # no need to replace kf8 style fcis with mobi 7 one - # fcis_secnum, = struct.unpack_from(b'>L',datain_rec0, 0xc8) - # if fcis_secnum != 0xffffffff: - # fcis_info = readsection(datain, fcis_secnum) - # text_len, = struct.unpack_from(b'>L', fcis_info, 0x14) - # new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00' - # new_fcis += struct.pack(b'>L',text_len) - # new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00' - # self.result_file7 = writesection(self.result_file7, fcis_secnum, new_fcis) - - firstimage = getint(datain_rec0,first_resc_record) - lastimage = getint(datain_rec0,last_content_index,b'H') - # print("Old First Image, last Image", firstimage,lastimage) - if lastimage == 0xffff: - # find the lowest of the next sections and copy up to that. - ofs_list = [(fcis_index,b'L'),(flis_index,b'L'),(datp_index,b'L'),(hufftbloff, b'L')] - for ofs,sz in ofs_list: - n = getint(datain_rec0,ofs,sz) - # print("n",n) - if n > 0 and n < lastimage: - lastimage = n-1 - print("First Image, last Image", firstimage,lastimage) - - # Try to null out FONT and RES, but leave the (empty) PDB record so image refs remain valid - for i in range(firstimage,lastimage): - imgsec = readsection(self.result_file7,i) - if imgsec[0:4] in [b'RESC',b'FONT']: - self.result_file7 = nullsection(self.result_file7,i) - - # mobi7 finished - - # create standalone mobi8 - self.result_file8 = deletesectionrange(datain,0,datain_kf8-1) - target = getint(datain_kfrec0,first_resc_record) - self.result_file8 = insertsectionrange(datain,firstimage,lastimage,self.result_file8,target) - datain_kfrec0 =readsection(self.result_file8,0) - - # Only keep the correct EXTH 116 StartOffset, KG 2.5 carries over the one from the mobi7 part, which then points at garbage in the mobi8 part, and confuses FW 3.4 - kf8starts = read_exth(datain_kfrec0,116) - # If we have multiple StartOffset, keep only the last one - kf8start_count = len(kf8starts) - while kf8start_count > 1: - kf8start_count -= 1 - datain_kfrec0 = del_exth(datain_kfrec0,116) - - # update the EXTH 125 KF8 Count of Images/Fonts/Resources - datain_kfrec0 = write_exth(datain_kfrec0,125,struct.pack(b'>L',lastimage-firstimage+1)) - - # need to reset flags stored in 0x80-0x83 - # old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050 - # standalone mobi8 with exth: 0x0050 - # Bit Flags - # 0x1000 = Bit 12 indicates if embedded fonts are used or not - # 0x0800 = means this Header points to *shared* images/resource/fonts ?? - # 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8? 
- # 0x0040 = exth exists - # 0x0010 = Not sure but this is always set so far - fval, = struct.unpack_from('>L',datain_kfrec0, 0x80) - fval = fval & 0x1FFF - fval |= 0x0800 - datain_kfrec0 = datain_kfrec0[:0x80] + struct.pack(b'>L',fval) + datain_kfrec0[0x84:] - - # properly update other index pointers that have been shifted by the insertion of images - ofs_list = [(kf8_fdst_index,b'L'),(fcis_index,b'L'),(flis_index,b'L'),(datp_index,b'L'),(hufftbloff, b'L')] - for ofs,sz in ofs_list: - n = getint(datain_kfrec0,ofs,sz) - if n != 0xffffffff: - datain_kfrec0 = writeint(datain_kfrec0,ofs,n+lastimage-firstimage+1,sz) - self.result_file8 = writesection(self.result_file8,0,datain_kfrec0) - - # no need to replace kf8 style fcis with mobi 7 one - # fcis_secnum, = struct.unpack_from(b'>L',datain_kfrec0, 0xc8) - # if fcis_secnum != 0xffffffff: - # fcis_info = readsection(self.result_file8, fcis_secnum) - # text_len, = struct.unpack_from(b'>L', fcis_info, 0x14) - # new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00' - # new_fcis += struct.pack(b'>L',text_len) - # new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00' - # self.result_file8 = writesection(self.result_file8, fcis_secnum, new_fcis) - - # mobi8 finished - - def getResult8(self): - return self.result_file8 - - def getResult7(self): - return self.result_file7 diff --git a/epy_extras/KindleUnpack/mobi_uncompress.py b/epy_extras/KindleUnpack/mobi_uncompress.py deleted file mode 100644 index c5fad85..0000000 --- a/epy_extras/KindleUnpack/mobi_uncompress.py +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab - -from __future__ import unicode_literals, division, absolute_import, print_function - -from .compatibility_utils import PY2, bchr, lmap, bstr - -if PY2: - range = xrange - -import struct -# note: struct pack, unpack, unpack_from all require bytestring format -# data all the way up to at least python 2.7.5, python 3 okay with bytestring - - -class unpackException(Exception): - pass - -class UncompressedReader: - - def unpack(self, data): - return data - -class PalmdocReader: - - def unpack(self, i): - o, p = b'', 0 - while p < len(i): - # for python 3 must use slice since i[p] returns int while slice returns character - c = ord(i[p:p+1]) - p += 1 - if (c >= 1 and c <= 8): - o += i[p:p+c] - p += c - elif (c < 128): - o += bchr(c) - elif (c >= 192): - o += b' ' + bchr(c ^ 128) - else: - if p < len(i): - c = (c << 8) | ord(i[p:p+1]) - p += 1 - m = (c >> 3) & 0x07ff - n = (c & 7) + 3 - if (m > n): - o += o[-m:n-m] - else: - for _ in range(n): - # because of completely ass-backwards decision by python mainters for python 3 - # we must use slice for bytes as i[p] returns int while slice returns character - if m == 1: - o += o[-m:] - else: - o += o[-m:-m+1] - return o - -class HuffcdicReader: - q = struct.Struct(b'>Q').unpack_from - - def loadHuff(self, huff): - if huff[0:8] != b'HUFF\x00\x00\x00\x18': - raise unpackException('invalid huff header') - off1, off2 = struct.unpack_from(b'>LL', huff, 8) - - def dict1_unpack(v): - codelen, term, maxcode = v&0x1f, v&0x80, v>>8 - assert codelen != 0 - if codelen <= 8: - assert term - maxcode = ((maxcode + 1) << (32 - codelen)) - 1 - return (codelen, term, maxcode) - self.dict1 = lmap(dict1_unpack, struct.unpack_from(b'>256L', huff, off1)) - - dict2 = struct.unpack_from(b'>64L', huff, off2) - self.mincode, self.maxcode = (), () - for codelen, mincode in 
enumerate((0,) + dict2[0::2]): - self.mincode += (mincode << (32 - codelen), ) - for codelen, maxcode in enumerate((0,) + dict2[1::2]): - self.maxcode += (((maxcode + 1) << (32 - codelen)) - 1, ) - - self.dictionary = [] - - def loadCdic(self, cdic): - if cdic[0:8] != b'CDIC\x00\x00\x00\x10': - raise unpackException('invalid cdic header') - phrases, bits = struct.unpack_from(b'>LL', cdic, 8) - n = min(1<H').unpack_from - def getslice(off): - blen, = h(cdic, 16+off) - slice = cdic[18+off:18+off+(blen&0x7fff)] - return (slice, blen&0x8000) - self.dictionary += lmap(getslice, struct.unpack_from(bstr('>%dH' % n), cdic, 16)) - - def unpack(self, data): - q = HuffcdicReader.q - - bitsleft = len(data) * 8 - data += b"\x00\x00\x00\x00\x00\x00\x00\x00" - pos = 0 - x, = q(data, pos) - n = 32 - - s = b'' - while True: - if n <= 0: - pos += 4 - x, = q(data, pos) - n += 32 - code = (x >> n) & ((1 << 32) - 1) - - codelen, term, maxcode = self.dict1[code >> 24] - if not term: - while code < self.mincode[codelen]: - codelen += 1 - maxcode = self.maxcode[codelen] - - n -= codelen - bitsleft -= codelen - if bitsleft < 0: - break - - r = (maxcode - code) >> (32 - codelen) - slice, flag = self.dictionary[r] - if not flag: - self.dictionary[r] = None - slice = self.unpack(slice) - self.dictionary[r] = (slice, 1) - s += slice - return s diff --git a/epy_extras/KindleUnpack/mobi_utils.py b/epy_extras/KindleUnpack/mobi_utils.py deleted file mode 100644 index 6791e0d..0000000 --- a/epy_extras/KindleUnpack/mobi_utils.py +++ /dev/null @@ -1,191 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab -# flake8: noqa - -from __future__ import unicode_literals, division, absolute_import, print_function - -from .compatibility_utils import PY2, text_type, bchr, bord - -import binascii - -if PY2: - range = xrange - -from itertools import cycle - -def getLanguage(langID, sublangID): - mobilangdict = { - 54 : {0 : 'af'}, # Afrikaans - 28 : {0 : 'sq'}, # Albanian - 1 : {0 : 'ar' , 5 : 'ar-dz' , 15 : 'ar-bh' , 3 : 'ar-eg' , 2 : 'ar-iq', 11 : 'ar-jo' , 13 : 'ar-kw' , 12 : 'ar-lb' , 4: 'ar-ly', - 6 : 'ar-ma' , 8 : 'ar-om' , 16 : 'ar-qa' , 1 : 'ar-sa' , 10 : 'ar-sy' , 7 : 'ar-tn' , 14 : 'ar-ae' , 9 : 'ar-ye'}, - # Arabic, Arabic (Algeria), Arabic (Bahrain), Arabic (Egypt), Arabic - # (Iraq), Arabic (Jordan), Arabic (Kuwait), Arabic (Lebanon), Arabic - # (Libya), Arabic (Morocco), Arabic (Oman), Arabic (Qatar), Arabic - # (Saudi Arabia), Arabic (Syria), Arabic (Tunisia), Arabic (United Arab - # Emirates), Arabic (Yemen) - 43 : {0 : 'hy'}, # Armenian - 77 : {0 : 'as'}, # Assamese - 44 : {0 : 'az'}, # "Azeri (IANA: Azerbaijani) - 45 : {0 : 'eu'}, # Basque - 35 : {0 : 'be'}, # Belarusian - 69 : {0 : 'bn'}, # Bengali - 2 : {0 : 'bg'}, # Bulgarian - 3 : {0 : 'ca'}, # Catalan - 4 : {0 : 'zh' , 3 : 'zh-hk' , 2 : 'zh-cn' , 4 : 'zh-sg' , 1 : 'zh-tw'}, - # Chinese, Chinese (Hong Kong), Chinese (PRC), Chinese (Singapore), Chinese (Taiwan) - 26 : {0 : 'hr', 3 : 'sr'}, # Croatian, Serbian - 5 : {0 : 'cs'}, # Czech - 6 : {0 : 'da'}, # Danish - 19 : {0: 'nl', 1 : 'nl' , 2 : 'nl-be'}, # Dutch / Flemish, Dutch (Belgium) - 9 : {0: 'en', 1 : 'en' , 3 : 'en-au' , 40 : 'en-bz' , 4 : 'en-ca' , 6 : 'en-ie' , 8 : 'en-jm' , 5 : 'en-nz' , 13 : 'en-ph' , - 7 : 'en-za' , 11 : 'en-tt' , 2 : 'en-gb', 1 : 'en-us' , 12 : 'en-zw'}, - # English, English (Australia), English (Belize), English (Canada), - # English (Ireland), English (Jamaica), English (New Zealand), English - # (Philippines), English (South 
Africa), English (Trinidad), English - # (United Kingdom), English (United States), English (Zimbabwe) - 37 : {0 : 'et'}, # Estonian - 56 : {0 : 'fo'}, # Faroese - 41 : {0 : 'fa'}, # Farsi / Persian - 11 : {0 : 'fi'}, # Finnish - 12 : {0 : 'fr', 1 : 'fr' , 2 : 'fr-be' , 3 : 'fr-ca' , 5 : 'fr-lu' , 6 : 'fr-mc' , 4 : 'fr-ch'}, - # French, French (Belgium), French (Canada), French (Luxembourg), French (Monaco), French (Switzerland) - 55 : {0 : 'ka'}, # Georgian - 7 : {0 : 'de', 1 : 'de' , 3 : 'de-at' , 5 : 'de-li' , 4 : 'de-lu' , 2 : 'de-ch'}, - # German, German (Austria), German (Liechtenstein), German (Luxembourg), German (Switzerland) - 8 : {0 : 'el'}, # Greek, Modern (1453-) - 71 : {0 : 'gu'}, # Gujarati - 13 : {0 : 'he'}, # Hebrew (also code 'iw'?) - 57 : {0 : 'hi'}, # Hindi - 14 : {0 : 'hu'}, # Hungarian - 15 : {0 : 'is'}, # Icelandic - 33 : {0 : 'id'}, # Indonesian - 16 : {0 : 'it', 1 : 'it' , 2 : 'it-ch'}, # Italian, Italian (Switzerland) - 17 : {0 : 'ja'}, # Japanese - 75 : {0 : 'kn'}, # Kannada - 63 : {0 : 'kk'}, # Kazakh - 87 : {0 : 'x-kok'}, # Konkani (real language code is 'kok'?) - 18 : {0 : 'ko'}, # Korean - 38 : {0 : 'lv'}, # Latvian - 39 : {0 : 'lt'}, # Lithuanian - 47 : {0 : 'mk'}, # Macedonian - 62 : {0 : 'ms'}, # Malay - 76 : {0 : 'ml'}, # Malayalam - 58 : {0 : 'mt'}, # Maltese - 78 : {0 : 'mr'}, # Marathi - 97 : {0 : 'ne'}, # Nepali - 20 : {0 : 'no'}, # Norwegian - 72 : {0 : 'or'}, # Oriya - 21 : {0 : 'pl'}, # Polish - 22 : {0 : 'pt', 2 : 'pt' , 1 : 'pt-br'}, # Portuguese, Portuguese (Brazil) - 70 : {0 : 'pa'}, # Punjabi - 23 : {0 : 'rm'}, # "Rhaeto-Romanic" (IANA: Romansh) - 24 : {0 : 'ro'}, # Romanian - 25 : {0 : 'ru'}, # Russian - 59 : {0 : 'sz'}, # "Sami (Lappish)" (not an IANA language code) - # IANA code for "Northern Sami" is 'se' - # 'SZ' is the IANA region code for Swaziland - 79 : {0 : 'sa'}, # Sanskrit - 27 : {0 : 'sk'}, # Slovak - 36 : {0 : 'sl'}, # Slovenian - 46 : {0 : 'sb'}, # "Sorbian" (not an IANA language code) - # 'SB' is IANA region code for 'Solomon Islands' - # Lower Sorbian = 'dsb' - # Upper Sorbian = 'hsb' - # Sorbian Languages = 'wen' - 10 : {0 : 'es' , 4 : 'es' , 44 : 'es-ar' , 64 : 'es-bo' , 52 : 'es-cl' , 36 : 'es-co' , 20 : 'es-cr' , 28 : 'es-do' , - 48 : 'es-ec' , 68 : 'es-sv' , 16 : 'es-gt' , 72 : 'es-hn' , 8 : 'es-mx' , 76 : 'es-ni' , 24 : 'es-pa' , - 60 : 'es-py' , 40 : 'es-pe' , 80 : 'es-pr' , 56 : 'es-uy' , 32 : 'es-ve'}, - # Spanish, Spanish (Mobipocket bug?), Spanish (Argentina), Spanish - # (Bolivia), Spanish (Chile), Spanish (Colombia), Spanish (Costa Rica), - # Spanish (Dominican Republic), Spanish (Ecuador), Spanish (El - # Salvador), Spanish (Guatemala), Spanish (Honduras), Spanish (Mexico), - # Spanish (Nicaragua), Spanish (Panama), Spanish (Paraguay), Spanish - # (Peru), Spanish (Puerto Rico), Spanish (Uruguay), Spanish (Venezuela) - 48 : {0 : 'sx'}, # "Sutu" (not an IANA language code) - # "Sutu" is another name for "Southern Sotho"? 
- # IANA code for "Southern Sotho" is 'st' - 65 : {0 : 'sw'}, # Swahili - 29 : {0 : 'sv' , 1 : 'sv' , 8 : 'sv-fi'}, # Swedish, Swedish (Finland) - 73 : {0 : 'ta'}, # Tamil - 68 : {0 : 'tt'}, # Tatar - 74 : {0 : 'te'}, # Telugu - 30 : {0 : 'th'}, # Thai - 49 : {0 : 'ts'}, # Tsonga - 50 : {0 : 'tn'}, # Tswana - 31 : {0 : 'tr'}, # Turkish - 34 : {0 : 'uk'}, # Ukrainian - 32 : {0 : 'ur'}, # Urdu - 67 : {0 : 'uz', 2 : 'uz'}, # Uzbek - 42 : {0 : 'vi'}, # Vietnamese - 52 : {0 : 'xh'}, # Xhosa - 53 : {0 : 'zu'}, # Zulu - } - lang = "en" - if langID in mobilangdict: - subdict = mobilangdict[langID] - lang = subdict[0] - if sublangID in subdict: - lang = subdict[sublangID] - return lang - - -def toHex(byteList): - return binascii.hexlify(byteList) - -# returns base32 bytestring -def toBase32(value, npad=4): - digits = b'0123456789ABCDEFGHIJKLMNOPQRSTUV' - num_string=b'' - current = value - while current != 0: - next, remainder = divmod(current, 32) - rem_string = digits[remainder:remainder+1] - num_string = rem_string + num_string - current=next - if num_string == b'': - num_string = b'0' - pad = npad - len(num_string) - if pad > 0: - num_string = b'0' * pad + num_string - return num_string - - -# converts base32 string to value -def fromBase32(str_num): - if isinstance(str_num, text_type): - str_num = str_num.encode('latin-1') - scalelst = [1,32,1024,32768,1048576,33554432,1073741824,34359738368] - value = 0 - j = 0 - n = len(str_num) - scale = 0 - for i in range(n): - c = str_num[n-i-1:n-i] - if c in b'0123456789': - v = ord(c) - ord(b'0') - else: - v = ord(c) - ord(b'A') + 10 - if j < len(scalelst): - scale = scalelst[j] - else: - scale = scale * 32 - j += 1 - if v != 0: - value = value + (v * scale) - return value - - -# note: if decode a bytestring using 'latin-1' (or any other 0-255 encoding) -# in place of ascii you will get a byte to half-word or integer -# one to one mapping of values from 0 - 255 - -def mangle_fonts(encryption_key, data): - if isinstance(encryption_key, text_type): - encryption_key = encryption_key.encode('latin-1') - crypt = data[:1024] - key = cycle(iter(map(bord, encryption_key))) - # encrypt = ''.join([chr(ord(x)^key.next()) for x in crypt]) - encrypt = b''.join([bchr(bord(x)^next(key)) for x in crypt]) - return encrypt + data[1024:] diff --git a/epy_extras/KindleUnpack/mobiml2xhtml.py b/epy_extras/KindleUnpack/mobiml2xhtml.py deleted file mode 100755 index 94fc671..0000000 --- a/epy_extras/KindleUnpack/mobiml2xhtml.py +++ /dev/null @@ -1,527 +0,0 @@ -#! 
/usr/bin/python -# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab - - -# this program works in concert with the output from KindleUnpack - -''' -Convert from Mobi ML to XHTML -''' - -from __future__ import division, absolute_import, print_function - -import os -import sys -import re - -SPECIAL_HANDLING_TAGS = { - '?xml' : ('xmlheader', -1), - '!--' : ('comment', -3), - '!DOCTYPE' : ('doctype', -1), -} - -SPECIAL_HANDLING_TYPES = ['xmlheader', 'doctype', 'comment'] - -SELF_CLOSING_TAGS = ['br' , 'hr', 'input', 'img', 'image', 'meta', 'spacer', 'link', 'frame', 'base', 'col', 'reference'] - -class MobiMLConverter(object): - - PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE) - IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex') - - def __init__(self, filename): - self.base_css_rules = 'blockquote { margin: 0em 0em 0em 1.25em }\n' - self.base_css_rules += 'p { margin: 0em }\n' - self.base_css_rules += '.bold { font-weight: bold }\n' - self.base_css_rules += '.italic { font-style: italic }\n' - self.base_css_rules += '.mbp_pagebreak { page-break-after: always; margin: 0; display: block }\n' - self.tag_css_rules = {} - self.tag_css_rule_cnt = 0 - self.path = [] - self.filename = filename - self.wipml = open(self.filename, 'r').read() - self.pos = 0 - self.opfname = self.filename.rsplit('.',1)[0] + '.opf' - self.opos = 0 - self.meta = '' - self.cssname = os.path.join(os.path.dirname(self.filename),'styles.css') - self.current_font_size = 3 - self.font_history = [] - - def cleanup_html(self): - self.wipml = re.sub(r'
', '', self.wipml) - self.wipml = self.wipml.replace('\r\n', '\n') - self.wipml = self.wipml.replace('> <', '>\n<') - self.wipml = self.wipml.replace(']*>', '', self.wipml) - self.wipml = self.wipml.replace('

','
') - - def replace_page_breaks(self): - self.wipml = self.PAGE_BREAK_PAT.sub( - '
<div class="mbp_pagebreak" />',
- self.wipml)
-
- # parse leading text of ml and tag
- def parseml(self):
- p = self.pos
- if p >= len(self.wipml):
- return None
- if self.wipml[p] != '<':
- res = self.wipml.find('<',p)
- if res == -1 :
- res = len(self.wipml)
- self.pos = res
- return self.wipml[p:res], None
- # handle comment as a special case to deal with multi-line comments
- if self.wipml[p:p+4] == '<!--':
- te = self.wipml.find('-->',p+1)
- if te != -1:
- te = te+2
- else :
- te = self.wipml.find('>',p+1)
- ntb = self.wipml.find('<',p+1)
- if ntb != -1 and ntb < te:
- self.pos = ntb
- return self.wipml[p:ntb], None
- self.pos = te + 1
- return None, self.wipml[p:te+1]
-
- # parses string version of tag to identify its name,
- # its type 'begin', 'end' or 'single',
- # plus builds a hashtable of its attributes
- # code is written to handle the possibility of very poor formatting
- def parsetag(self, s):
- p = 1
- # get the tag name
- tname = None
- ttype = None
- tattr = {}
- while s[p:p+1] == ' ' :
- p += 1
- if s[p:p+1] == '/':
- ttype = 'end'
- p += 1
- while s[p:p+1] == ' ' :
- p += 1
- b = p
- while s[p:p+1] not in ('>', '/', ' ', '"', "'", "\r", "\n") :
- p += 1
- tname=s[b:p].lower()
- if tname == '!doctype':
- tname = '!DOCTYPE'
- # special cases
- if tname in SPECIAL_HANDLING_TAGS:
- ttype, backstep = SPECIAL_HANDLING_TAGS[tname]
- tattr['special'] = s[p:backstep]
- if ttype is None:
- # parse any attributes
- while s.find('=',p) != -1 :
- while s[p:p+1] == ' ' :
- p += 1
- b = p
- while s[p:p+1] != '=' :
- p += 1
- aname = s[b:p].lower()
- aname = aname.rstrip(' ')
- p += 1
- while s[p:p+1] == ' ' :
- p += 1
- if s[p:p+1] in ('"', "'") :
- p = p + 1
- b = p
- while s[p:p+1] not in ('"', "'") :
- p += 1
- val = s[b:p]
- p += 1
- else :
- b = p
- while s[p:p+1] not in ('>', '/', ' ') :
- p += 1
- val = s[b:p]
- tattr[aname] = val
- # label beginning and single tags
- if ttype is None:
- ttype = 'begin'
- if s.find(' /',p) >= 0:
- ttype = 'single_ext'
- elif s.find('/',p) >= 0:
- ttype = 'single'
- return ttype, tname, tattr
-
- # main routine to convert from mobi markup language to html
- def processml(self):
-
- # are these really needed?
- html_done = False
- head_done = False
- body_done = False
-
- skip = False
-
- htmlstr = ''
- self.replace_page_breaks()
- self.cleanup_html()
-
- # now parse the cleaned up ml into standard xhtml
- while True:
-
- r = self.parseml()
- if not r:
- break
-
- text, tag = r
-
- if text:
- if not skip:
- htmlstr += text
-
- if tag:
- ttype, tname, tattr = self.parsetag(tag)
-
- # If we run into a DTD or xml declarations inside the body ... bail.
- if tname in SPECIAL_HANDLING_TAGS and tname != 'comment' and body_done:
- htmlstr += '\n'
- break
-
- # make sure self-closing tags actually self-close
- if ttype == 'begin' and tname in SELF_CLOSING_TAGS:
- ttype = 'single'
-
- # make sure any end tags of self-closing tags are discarded
- if ttype == 'end' and tname in SELF_CLOSING_TAGS:
- continue
-
- # remove embedded guide and references from old mobis
- if tname in ('guide', 'ncx', 'reference') and ttype in ('begin', 'single', 'single_ext'):
- tname = 'removeme:{0}'.format(tname)
- tattr = None
- if tname in ('guide', 'ncx', 'reference', 'font', 'span') and ttype == 'end':
- if self.path[-1] == 'removeme:{0}'.format(tname):
- tname = 'removeme:{0}'.format(tname)
- tattr = None
-
- # Get rid of font tags that only have a color attribute.
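# For example, a hypothetical '<font color="#cc0000">' start tag is
# dropped entirely here: a color-only font tag carries no face or size
# information worth converting to a styled span.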
- if tname == 'font' and ttype in ('begin', 'single', 'single_ext'): - if 'color' in tattr and len(tattr) == 1: - tname = 'removeme:{0}'.format(tname) - tattr = None - - # Get rid of empty spans in the markup. - if tname == 'span' and ttype in ('begin', 'single', 'single_ext') and not len(tattr): - tname = 'removeme:{0}'.format(tname) - - # need to handle fonts outside of the normal methods - # so fonts tags won't be added to the self.path since we keep track - # of font tags separately with self.font_history - if tname == 'font' and ttype == 'begin': - # check for nested font start tags - if len(self.font_history) > 0 : - # inject a font end tag - taginfo = ('end', 'font', None) - htmlstr += self.processtag(taginfo) - self.font_history.append((ttype, tname, tattr)) - # handle the current font start tag - taginfo = (ttype, tname, tattr) - htmlstr += self.processtag(taginfo) - continue - - # check for nested font tags and unnest them - if tname == 'font' and ttype == 'end': - self.font_history.pop() - # handle this font end tag - taginfo = ('end', 'font', None) - htmlstr += self.processtag(taginfo) - # check if we were nested - if len(self.font_history) > 0: - # inject a copy of the most recent font start tag from history - taginfo = self.font_history[-1] - htmlstr += self.processtag(taginfo) - continue - - # keep track of nesting path - if ttype == 'begin': - self.path.append(tname) - elif ttype == 'end': - if tname != self.path[-1]: - print('improper nesting: ', self.path, tname, ttype) - if tname not in self.path: - # handle case of end tag with no beginning by injecting empty begin tag - taginfo = ('begin', tname, None) - htmlstr += self.processtag(taginfo) - print(" - fixed by injecting empty start tag ", tname) - self.path.append(tname) - elif len(self.path) > 1 and tname == self.path[-2]: - # handle case of dangling missing end - taginfo = ('end', self.path[-1], None) - htmlstr += self.processtag(taginfo) - print(" - fixed by injecting end tag ", self.path[-1]) - self.path.pop() - self.path.pop() - - if tname == 'removeme:{0}'.format(tname): - if ttype in ('begin', 'single', 'single_ext'): - skip = True - else: - skip = False - else: - taginfo = (ttype, tname, tattr) - htmlstr += self.processtag(taginfo) - - # handle potential issue of multiple html, head, and body sections - if tname == 'html' and ttype == 'begin' and not html_done: - htmlstr += '\n' - html_done = True - - if tname == 'head' and ttype == 'begin' and not head_done: - htmlstr += '\n' - # also add in metadata and style link tags - htmlstr += self.meta - htmlstr += '\n' - head_done = True - - if tname == 'body' and ttype == 'begin' and not body_done: - htmlstr += '\n' - body_done = True - - # handle issue of possibly missing html, head, and body tags - # I have not seen this but the original did something like this so ... 
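# Rough sketch of the fallback wrapping for an assumed bare input 'Hello':
# it is wrapped in <body>, then a <head> holding self.meta and the
# stylesheet link, then <html>, with the DOCTYPE declaration prepended
# last.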
- if not body_done: - htmlstr = '\n' + htmlstr + '\n' - if not head_done: - headstr = '\n' - headstr += self.meta - headstr += '\n' - headstr += '\n' - htmlstr = headstr + htmlstr - if not html_done: - htmlstr = '\n' + htmlstr + '\n' - - # finally add DOCTYPE info - htmlstr = '\n\n' + htmlstr - - css = self.base_css_rules - for cls, rule in self.tag_css_rules.items(): - css += '.%s { %s }\n' % (cls, rule) - - return (htmlstr, css, self.cssname) - - def ensure_unit(self, raw, unit='px'): - if re.search(r'\d+$', raw) is not None: - raw += unit - return raw - - # flatten possibly modified tag back to string - def taginfo_tostring(self, taginfo): - (ttype, tname, tattr) = taginfo - if ttype is None or tname is None: - return '' - if ttype == 'end': - return '' % tname - if ttype in SPECIAL_HANDLING_TYPES and tattr is not None and 'special' in tattr: - info = tattr['special'] - if ttype == 'comment': - return '<%s %s-->' % (tname, info) - else: - return '<%s %s>' % (tname, info) - res = [] - res.append('<%s' % tname) - if tattr is not None: - for key in tattr: - res.append(' %s="%s"' % (key, tattr[key])) - if ttype == 'single': - res.append('/>') - elif ttype == 'single_ext': - res.append(' />') - else : - res.append('>') - return "".join(res) - - # routines to convert from mobi ml tags atributes to xhtml attributes and styles - def processtag(self, taginfo): - # Converting mobi font sizes to numerics - size_map = { - 'xx-small': '1', - 'x-small': '2', - 'small': '3', - 'medium': '4', - 'large': '5', - 'x-large': '6', - 'xx-large': '7', - } - - size_to_em_map = { - '1': '.65em', - '2': '.75em', - '3': '1em', - '4': '1.125em', - '5': '1.25em', - '6': '1.5em', - '7': '2em', - } - - # current tag to work on - (ttype, tname, tattr) = taginfo - if not tattr: - tattr = {} - - styles = [] - - if tname is None or tname.startswith('removeme'): - return '' - - # have not seen an example of this yet so keep it here to be safe - # until this is better understood - if tname in ('country-region', 'place', 'placetype', 'placename', - 'state', 'city', 'street', 'address', 'content'): - tname = 'div' if tname == 'content' else 'span' - for key in tattr: - tattr.pop(key) - - # handle general case of style, height, width, bgcolor in any tag - if 'style' in tattr: - style = tattr.pop('style').strip() - if style: - styles.append(style) - - if 'align' in tattr: - align = tattr.pop('align').strip() - if align: - if tname in ('table', 'td', 'tr'): - pass - else: - styles.append('text-align: %s' % align) - - if 'height' in tattr: - height = tattr.pop('height').strip() - if height and '<' not in height and '>' not in height and re.search(r'\d+', height): - if tname in ('table', 'td', 'tr'): - pass - elif tname == 'img': - tattr['height'] = height - else: - styles.append('margin-top: %s' % self.ensure_unit(height)) - - if 'width' in tattr: - width = tattr.pop('width').strip() - if width and re.search(r'\d+', width): - if tname in ('table', 'td', 'tr'): - pass - elif tname == 'img': - tattr['width'] = width - else: - styles.append('text-indent: %s' % self.ensure_unit(width)) - if width.startswith('-'): - styles.append('margin-left: %s' % self.ensure_unit(width[1:])) - - if 'bgcolor' in tattr: - # no proprietary html allowed - if tname == 'div': - del tattr['bgcolor'] - - elif tname == 'font': - # Change font tags to span tags - tname = 'span' - if ttype in ('begin', 'single', 'single_ext'): - # move the face attribute to css font-family - if 'face' in tattr: - face = tattr.pop('face').strip() - styles.append('font-family: 
"%s"' % face) - - # Monitor the constantly changing font sizes, change them to ems and move - # them to css. The following will work for 'flat' font tags, but nested font tags - # will cause things to go wonky. Need to revert to the parent font tag's size - # when a closing tag is encountered. - if 'size' in tattr: - sz = tattr.pop('size').strip().lower() - try: - float(sz) - except ValueError: - if sz in size_map: - sz = size_map[sz] - else: - if sz.startswith('-') or sz.startswith('+'): - sz = self.current_font_size + float(sz) - if sz > 7: - sz = 7 - elif sz < 1: - sz = 1 - sz = str(int(sz)) - styles.append('font-size: %s' % size_to_em_map[sz]) - self.current_font_size = int(sz) - - elif tname == 'img': - for attr in ('width', 'height'): - if attr in tattr: - val = tattr[attr] - if val.lower().endswith('em'): - try: - nval = float(val[:-2]) - nval *= 16 * (168.451/72) # Assume this was set using the Kindle profile - tattr[attr] = "%dpx"%int(nval) - except: - del tattr[attr] - elif val.lower().endswith('%'): - del tattr[attr] - - # convert the anchor tags - if 'filepos-id' in tattr: - tattr['id'] = tattr.pop('filepos-id') - if 'name' in tattr and tattr['name'] != tattr['id']: - tattr['name'] = tattr['id'] - - if 'filepos' in tattr: - filepos = tattr.pop('filepos') - try: - tattr['href'] = "#filepos%d" % int(filepos) - except ValueError: - pass - - if styles: - ncls = None - rule = '; '.join(styles) - for sel, srule in self.tag_css_rules.items(): - if srule == rule: - ncls = sel - break - if ncls is None: - self.tag_css_rule_cnt += 1 - ncls = 'rule_%d' % self.tag_css_rule_cnt - self.tag_css_rules[ncls] = rule - cls = tattr.get('class', '') - cls = cls + (' ' if cls else '') + ncls - tattr['class'] = cls - - # convert updated tag back to string representation - if len(tattr) == 0: - tattr = None - taginfo = (ttype, tname, tattr) - return self.taginfo_tostring(taginfo) - -''' main only left in for testing outside of plugin ''' - -def main(argv=sys.argv): - if len(argv) != 2: - return 1 - else: - infile = argv[1] - - try: - print('Converting Mobi Markup Language to XHTML') - mlc = MobiMLConverter(infile) - print('Processing ...') - htmlstr, css, cssname = mlc.processml() - outname = infile.rsplit('.',1)[0] + '_converted.html' - open(outname, 'w').write(htmlstr) - open(cssname, 'w').write(css) - print('Completed') - print('XHTML version of book can be found at: ' + outname) - - except ValueError as e: - print("Error: %s" % e) - return 1 - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/epy_extras/KindleUnpack/unipath.py b/epy_extras/KindleUnpack/unipath.py deleted file mode 100755 index 2416279..0000000 --- a/epy_extras/KindleUnpack/unipath.py +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab - -# Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this list of -# conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, this list -# of conditions and the following disclaimer in the documentation and/or other materials -# provided with the distribution. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES -# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT -# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED -# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from __future__ import unicode_literals, division, absolute_import, print_function -from .compatibility_utils import PY2, text_type, binary_type - -import sys -import os - -# utility routines to convert all paths to be full unicode - -# Under Python 2, if a bytestring, try to convert it to unicode using sys.getfilesystemencoding -# Under Python 3, if bytes, try to convert it to unicode using os.fsencode() to decode it - -# Mac OS X and Windows will happily support full unicode paths -# Linux can support full unicode paths but allows arbitrary byte paths which may be inconsistent with unicode - -fsencoding = sys.getfilesystemencoding() - -def pathof(s, enc=fsencoding): - if s is None: - return None - if isinstance(s, text_type): - return s - if isinstance(s, binary_type): - try: - return s.decode(enc) - except: - pass - return s - -def exists(s): - return os.path.exists(pathof(s)) - -def isfile(s): - return os.path.isfile(pathof(s)) - -def isdir(s): - return os.path.isdir(pathof(s)) - -def mkdir(s): - return os.mkdir(pathof(s)) - -def listdir(s): - rv = [] - for file in os.listdir(pathof(s)): - rv.append(pathof(file)) - return rv - -def getcwd(): - if PY2: - return os.getcwdu() - return os.getcwd() - -def walk(top): - top = pathof(top) - rv = [] - for base, dnames, names in os.walk(top): - base = pathof(base) - for name in names: - name = pathof(name) - rv.append(relpath(os.path.join(base, name), top)) - return rv - -def relpath(path, start=None): - return os.path.relpath(pathof(path) , pathof(start)) - -def abspath(path): - return os.path.abspath(pathof(path)) diff --git a/epy_extras/KindleUnpack/unpack_structure.py b/epy_extras/KindleUnpack/unpack_structure.py deleted file mode 100644 index 2e66eb8..0000000 --- a/epy_extras/KindleUnpack/unpack_structure.py +++ /dev/null @@ -1,167 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab - -from __future__ import unicode_literals, division, absolute_import, print_function - -from .compatibility_utils import text_type - -from . import unipath -from .unipath import pathof - -DUMP = False -""" Set to True to dump all possible information. 
""" - -import os - -import re -# note: re requites the pattern to be the exact same type as the data to be searched in python3 -# but u"" is not allowed for the pattern itself only b"" - -import zipfile -import binascii -from .mobi_utils import mangle_fonts - -class unpackException(Exception): - pass - -class ZipInfo(zipfile.ZipInfo): - - def __init__(self, *args, **kwargs): - if 'compress_type' in kwargs: - compress_type = kwargs.pop('compress_type') - super(ZipInfo, self).__init__(*args, **kwargs) - self.compress_type = compress_type - -class fileNames: - - def __init__(self, infile, outdir): - self.infile = infile - self.outdir = outdir - if not unipath.exists(self.outdir): - unipath.mkdir(self.outdir) - self.mobi7dir = os.path.join(self.outdir,'mobi7') - if not unipath.exists(self.mobi7dir): - unipath.mkdir(self.mobi7dir) - self.imgdir = os.path.join(self.mobi7dir, 'Images') - if not unipath.exists(self.imgdir): - unipath.mkdir(self.imgdir) - self.hdimgdir = os.path.join(self.outdir,'HDImages') - if not unipath.exists(self.hdimgdir): - unipath.mkdir(self.hdimgdir) - self.outbase = os.path.join(self.outdir, os.path.splitext(os.path.split(infile)[1])[0]) - - def getInputFileBasename(self): - return os.path.splitext(os.path.basename(self.infile))[0] - - def makeK8Struct(self): - self.k8dir = os.path.join(self.outdir,'mobi8') - if not unipath.exists(self.k8dir): - unipath.mkdir(self.k8dir) - self.k8metainf = os.path.join(self.k8dir,'META-INF') - if not unipath.exists(self.k8metainf): - unipath.mkdir(self.k8metainf) - self.k8oebps = os.path.join(self.k8dir,'OEBPS') - if not unipath.exists(self.k8oebps): - unipath.mkdir(self.k8oebps) - self.k8images = os.path.join(self.k8oebps,'Images') - if not unipath.exists(self.k8images): - unipath.mkdir(self.k8images) - self.k8fonts = os.path.join(self.k8oebps,'Fonts') - if not unipath.exists(self.k8fonts): - unipath.mkdir(self.k8fonts) - self.k8styles = os.path.join(self.k8oebps,'Styles') - if not unipath.exists(self.k8styles): - unipath.mkdir(self.k8styles) - self.k8text = os.path.join(self.k8oebps,'Text') - if not unipath.exists(self.k8text): - unipath.mkdir(self.k8text) - - # recursive zip creation support routine - def zipUpDir(self, myzip, tdir, localname): - currentdir = tdir - if localname != "": - currentdir = os.path.join(currentdir,localname) - list = unipath.listdir(currentdir) - for file in list: - afilename = file - localfilePath = os.path.join(localname, afilename) - realfilePath = os.path.join(currentdir,file) - if unipath.isfile(realfilePath): - myzip.write(pathof(realfilePath), pathof(localfilePath), zipfile.ZIP_DEFLATED) - elif unipath.isdir(realfilePath): - self.zipUpDir(myzip, tdir, localfilePath) - - def makeEPUB(self, usedmap, obfuscate_data, uid): - bname = os.path.join(self.k8dir, self.getInputFileBasename() + '.epub') - # Create an encryption key for Adobe font obfuscation - # based on the epub's uid - if isinstance(uid,text_type): - uid = uid.encode('ascii') - if obfuscate_data: - key = re.sub(br'[^a-fA-F0-9]', b'', uid) - key = binascii.unhexlify((key + key)[:32]) - - # copy over all images and fonts that are actually used in the ebook - # and remove all font files from mobi7 since not supported - imgnames = unipath.listdir(self.imgdir) - for name in imgnames: - if usedmap.get(name,'not used') == 'used': - filein = os.path.join(self.imgdir,name) - if name.endswith(".ttf"): - fileout = os.path.join(self.k8fonts,name) - elif name.endswith(".otf"): - fileout = os.path.join(self.k8fonts,name) - elif name.endswith(".failed"): - 
fileout = os.path.join(self.k8fonts,name)
-                else:
-                    fileout = os.path.join(self.k8images,name)
-                data = b''
-                with open(pathof(filein),'rb') as f:
-                    data = f.read()
-                if obfuscate_data:
-                    if name in obfuscate_data:
-                        data = mangle_fonts(key, data)
-                open(pathof(fileout),'wb').write(data)
-                if name.endswith(".ttf") or name.endswith(".otf"):
-                    os.remove(pathof(filein))
-
-        # opf file name hard coded to "content.opf"
-        container = '<?xml version="1.0" encoding="UTF-8"?>\n'
-        container += '<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">\n'
-        container += '    <rootfiles>\n'
-        container += '<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>'
-        container += '    </rootfiles>\n</container>\n'
-        fileout = os.path.join(self.k8metainf,'container.xml')
-        with open(pathof(fileout),'wb') as f:
-            f.write(container.encode('utf-8'))
-
-        if obfuscate_data:
-            encryption = '<encryption xmlns="urn:oasis:names:tc:opendocument:xmlns:container" xmlns:enc="http://www.w3.org/2001/04/xmlenc#" xmlns:deenc="http://ns.adobe.com/digitaleditions/enc">\n'
-            for font in obfuscate_data:
-                encryption += '  <enc:EncryptedData>\n'
-                encryption += '    <enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>\n'
-                encryption += '    <enc:CipherData>\n'
-                encryption += '      <enc:CipherReference URI="OEBPS/Fonts/' + font + '"/>\n'
-                encryption += '    </enc:CipherData>\n'
-                encryption += '  </enc:EncryptedData>\n'
-            encryption += '</encryption>\n'
-            fileout = os.path.join(self.k8metainf,'encryption.xml')
-            with open(pathof(fileout),'wb') as f:
-                f.write(encryption.encode('utf-8'))
-
-        # ready to build epub
-        self.outzip = zipfile.ZipFile(pathof(bname), 'w')
-
-        # add the mimetype file uncompressed
-        mimetype = b'application/epub+zip'
-        fileout = os.path.join(self.k8dir,'mimetype')
-        with open(pathof(fileout),'wb') as f:
-            f.write(mimetype)
-        nzinfo = ZipInfo('mimetype', compress_type=zipfile.ZIP_STORED)
-        nzinfo.external_attr = 0o600 << 16  # make this a normal file
-        self.outzip.writestr(nzinfo, mimetype)
-        self.zipUpDir(self.outzip,self.k8dir,'META-INF')
-        self.zipUpDir(self.outzip,self.k8dir,'OEBPS')
-        self.outzip.close()
diff --git a/epy_extras/__init__.py b/epy_extras/__init__.py
deleted file mode 100644
index c06e358..0000000
--- a/epy_extras/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-__all__ = ["unpackBook"]
-
-from .KindleUnpack.kindleunpack import unpackBook
diff --git a/poetry.lock b/poetry.lock
index 224fda9..0547abc 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -6,6 +6,20 @@ category = "dev"
 optional = false
 python-versions = "*"
 
+[[package]]
+name = "asttokens"
+version = "2.0.8"
+description = "Annotate AST trees with source code positions"
+category = "dev"
+optional = false
+python-versions = "*"
+
+[package.dependencies]
+six = "*"
+
+[package.extras]
+test = ["astroid (<=2.5.3)", "pytest"]
+
 [[package]]
 name = "attrs"
 version = "22.1.0"
@@ -42,7 +56,6 @@ mypy-extensions = ">=0.4.3"
 pathspec = ">=0.9.0"
 platformdirs = ">=2"
 tomli = {version = ">=1.1.0", markers = "python_full_version < \"3.11.0a7\""}
-typed-ast = {version = ">=1.4.2", markers = "python_version < \"3.8\" and implementation_name == \"cpython\""}
 typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""}
 
 [package.extras]
@@ -51,6 +64,72 @@ d = ["aiohttp (>=3.7.4)"]
 jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"]
 uvloop = ["uvloop (>=0.15.2)"]
 
+[[package]]
+name = "bleach"
+version = "5.0.1"
+description = "An easy safelist-based HTML-sanitizing tool."
+category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +six = ">=1.9.0" +webencodings = "*" + +[package.extras] +css = ["tinycss2 (>=1.1.0,<1.2)"] +dev = ["build (==0.8.0)", "flake8 (==4.0.1)", "hashin (==0.17.0)", "pip-tools (==6.6.2)", "pytest (==7.1.2)", "Sphinx (==4.3.2)", "tox (==3.25.0)", "twine (==4.0.1)", "wheel (==0.37.1)", "black (==22.3.0)", "mypy (==0.961)"] + +[[package]] +name = "build" +version = "0.8.0" +description = "A simple, correct PEP 517 build frontend" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +colorama = {version = "*", markers = "os_name == \"nt\""} +packaging = ">=19.0" +pep517 = ">=0.9.1" +tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} + +[package.extras] +virtualenv = ["virtualenv (>=20.0.35)"] +typing = ["typing-extensions (>=3.7.4.3)", "mypy (==0.950)", "importlib-metadata (>=4.6.4)"] +test = ["setuptools (>=56.0.0)", "setuptools (>=42.0.0)", "wheel (>=0.36.0)", "toml (>=0.10.0)", "pytest-xdist (>=1.34)", "pytest-rerunfailures (>=9.1)", "pytest-mock (>=2)", "pytest-cov (>=2.12)", "pytest (>=6.2.4)", "filelock (>=3)"] +docs = ["sphinx-autodoc-typehints (>=1.10)", "sphinx-argparse-cli (>=1.5)", "sphinx (>=4.0,<5.0)", "furo (>=2021.08.31)"] + +[[package]] +name = "certifi" +version = "2022.9.24" +description = "Python package for providing Mozilla's CA Bundle." +category = "dev" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "cffi" +version = "1.15.1" +description = "Foreign Function Interface for Python calling C code." +category = "dev" +optional = false +python-versions = "*" + +[package.dependencies] +pycparser = "*" + +[[package]] +name = "charset-normalizer" +version = "2.1.1" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." +category = "dev" +optional = false +python-versions = ">=3.6.0" + +[package.extras] +unicode_backport = ["unicodedata2"] + [[package]] name = "click" version = "8.1.3" @@ -61,7 +140,6 @@ python-versions = ">=3.7" [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} [[package]] name = "colorama" @@ -71,9 +149,20 @@ category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +[[package]] +name = "commonmark" +version = "0.9.1" +description = "Python parser for the CommonMark Markdown spec" +category = "dev" +optional = false +python-versions = "*" + +[package.extras] +test = ["flake8 (==3.7.8)", "hypothesis (==3.55.3)"] + [[package]] name = "coverage" -version = "6.4.4" +version = "6.5.0" description = "Code coverage measurement for Python" category = "dev" optional = false @@ -82,6 +171,25 @@ python-versions = ">=3.7" [package.extras] toml = ["tomli"] +[[package]] +name = "cryptography" +version = "38.0.1" +description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
+category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +cffi = ">=1.12" + +[package.extras] +docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx-rtd-theme"] +docstest = ["pyenchant (>=1.6.11)", "twine (>=1.12.0)", "sphinxcontrib-spelling (>=4.0.1)"] +pep8test = ["black", "flake8", "flake8-import-order", "pep8-naming"] +sdist = ["setuptools-rust (>=0.11.4)"] +ssh = ["bcrypt (>=3.1.5)"] +test = ["pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-subtests", "pytest-xdist", "pretend", "iso8601", "pytz", "hypothesis (>=1.11.4,!=3.79.2)"] + [[package]] name = "debugpy" version = "1.6.3" @@ -98,6 +206,25 @@ category = "dev" optional = false python-versions = ">=3.5" +[[package]] +name = "docutils" +version = "0.19" +description = "Docutils -- Python Documentation Utilities" +category = "dev" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "executing" +version = "1.1.0" +description = "Get the currently executing AST node of a frame, and other information" +category = "dev" +optional = false +python-versions = "*" + +[package.extras] +tests = ["rich", "littleutils", "pytest", "asttokens"] + [[package]] name = "greenlet" version = "1.1.3" @@ -109,22 +236,29 @@ python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*" [package.extras] docs = ["sphinx"] +[[package]] +name = "idna" +version = "3.4" +description = "Internationalized Domain Names in Applications (IDNA)" +category = "dev" +optional = false +python-versions = ">=3.5" + [[package]] name = "importlib-metadata" -version = "4.12.0" +version = "5.0.0" description = "Read metadata from Python packages" category = "dev" optional = false python-versions = ">=3.7" [package.dependencies] -typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} zipp = ">=0.5" [package.extras] -docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)"] +docs = ["sphinx (>=3.5)", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "furo", "jaraco.tidelift (>=1.4)"] perf = ["ipython"] -testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.3)", "packaging", "pyfakefs", "flufl.flake8", "pytest-perf (>=0.9.2)", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "importlib-resources (>=1.3)"] +testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "flake8 (<5)", "pytest-cov", "pytest-enabler (>=1.3)", "packaging", "pyfakefs", "flufl.flake8", "pytest-perf (>=0.9.2)", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "importlib-resources (>=1.3)"] [[package]] name = "iniconfig" @@ -136,11 +270,11 @@ python-versions = "*" [[package]] name = "ipython" -version = "7.34.0" +version = "8.5.0" description = "IPython: Productive Interactive Computing" category = "dev" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" [package.dependencies] appnope = {version = "*", markers = "sys_platform == \"darwin\""} @@ -151,20 +285,52 @@ jedi = ">=0.16" matplotlib-inline = "*" pexpect = {version = ">4.3", markers = "sys_platform != \"win32\""} pickleshare = "*" -prompt-toolkit = ">=2.0.0,<3.0.0 || >3.0.0,<3.0.1 || >3.0.1,<3.1.0" -pygments = "*" -traitlets = ">=4.2" +prompt-toolkit = ">3.0.1,<3.1.0" +pygments = ">=2.4.0" +stack-data = "*" +traitlets = ">=5" [package.extras] -all = ["Sphinx (>=1.3)", "ipykernel", "ipyparallel", "ipywidgets", "nbconvert", "nbformat", "nose (>=0.10.1)", "notebook", "numpy (>=1.17)", "pygments", "qtconsole", "requests", "testpath"] +all = ["black", "Sphinx (>=1.3)", 
"ipykernel", "nbconvert", "nbformat", "ipywidgets", "notebook", "ipyparallel", "qtconsole", "pytest (<7.1)", "pytest-asyncio", "testpath", "curio", "matplotlib (!=3.2.0)", "numpy (>=1.19)", "pandas", "trio"] +black = ["black"] doc = ["Sphinx (>=1.3)"] kernel = ["ipykernel"] nbconvert = ["nbconvert"] nbformat = ["nbformat"] -notebook = ["notebook", "ipywidgets"] +notebook = ["ipywidgets", "notebook"] parallel = ["ipyparallel"] qtconsole = ["qtconsole"] -test = ["nose (>=0.10.1)", "requests", "testpath", "pygments", "nbformat", "ipykernel", "numpy (>=1.17)"] +test = ["pytest (<7.1)", "pytest-asyncio", "testpath"] +test_extra = ["pytest (<7.1)", "pytest-asyncio", "testpath", "curio", "matplotlib (!=3.2.0)", "nbformat", "numpy (>=1.19)", "pandas", "trio"] + +[[package]] +name = "isort" +version = "5.10.1" +description = "A Python utility / library to sort Python imports." +category = "dev" +optional = false +python-versions = ">=3.6.1,<4.0" + +[package.extras] +pipfile_deprecated_finder = ["pipreqs", "requirementslib"] +requirements_deprecated_finder = ["pipreqs", "pip-api"] +colors = ["colorama (>=0.4.3,<0.5.0)"] +plugins = ["setuptools"] + +[[package]] +name = "jaraco.classes" +version = "3.2.3" +description = "Utility functions for Python class constructs" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +more-itertools = "*" + +[package.extras] +docs = ["sphinx (>=3.5)", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "jaraco.tidelift (>=1.4)"] +testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "flake8 (<5)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)"] [[package]] name = "jedi" @@ -181,6 +347,37 @@ parso = ">=0.8.0,<0.9.0" qa = ["flake8 (==3.8.3)", "mypy (==0.782)"] testing = ["Django (<3.1)", "colorama", "docopt", "pytest (<7.0.0)"] +[[package]] +name = "jeepney" +version = "0.8.0" +description = "Low-level, pure Python DBus protocol wrapper." +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.extras] +trio = ["async-generator", "trio"] +test = ["async-timeout", "trio", "testpath", "pytest-asyncio (>=0.17)", "pytest-trio", "pytest"] + +[[package]] +name = "keyring" +version = "23.9.3" +description = "Store and access your passwords safely." 
+category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +importlib-metadata = {version = ">=3.6", markers = "python_version < \"3.10\""} +"jaraco.classes" = "*" +jeepney = {version = ">=0.4.2", markers = "sys_platform == \"linux\""} +pywin32-ctypes = {version = "<0.1.0 || >0.1.0,<0.1.1 || >0.1.1", markers = "sys_platform == \"win32\""} +SecretStorage = {version = ">=3.2", markers = "sys_platform == \"linux\""} + +[package.extras] +docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "jaraco.tidelift (>=1.4)"] +testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "flake8 (<5)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)"] + [[package]] name = "matplotlib-inline" version = "0.1.6" @@ -192,6 +389,14 @@ python-versions = ">=3.5" [package.dependencies] traitlets = "*" +[[package]] +name = "more-itertools" +version = "8.14.0" +description = "More routines for operating on iterables, beyond itertools" +category = "dev" +optional = false +python-versions = ">=3.5" + [[package]] name = "msgpack" version = "1.0.4" @@ -202,16 +407,15 @@ python-versions = "*" [[package]] name = "mypy" -version = "0.971" +version = "0.981" description = "Optional static typing for Python" category = "dev" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" [package.dependencies] mypy-extensions = ">=0.4.3" tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} -typed-ast = {version = ">=1.4.0,<2", markers = "python_version < \"3.8\""} typing-extensions = ">=3.10" [package.extras] @@ -258,6 +462,17 @@ category = "dev" optional = false python-versions = ">=3.7" +[[package]] +name = "pep517" +version = "0.13.0" +description = "Wrappers to build Python packages using PEP 517 hooks" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} + [[package]] name = "pexpect" version = "4.8.0" @@ -277,6 +492,17 @@ category = "dev" optional = false python-versions = "*" +[[package]] +name = "pkginfo" +version = "1.8.3" +description = "Query metadatdata from sdists / bdists / installed packages." 
+category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" + +[package.extras] +testing = ["nose", "coverage"] + [[package]] name = "platformdirs" version = "2.5.2" @@ -297,9 +523,6 @@ category = "dev" optional = false python-versions = ">=3.6" -[package.dependencies] -importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} - [package.extras] dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] @@ -323,6 +546,17 @@ category = "dev" optional = false python-versions = "*" +[[package]] +name = "pure-eval" +version = "0.2.2" +description = "Safely evaluate AST nodes without side effects" +category = "dev" +optional = false +python-versions = "*" + +[package.extras] +tests = ["pytest"] + [[package]] name = "py" version = "1.11.0" @@ -331,6 +565,14 @@ category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +[[package]] +name = "pycparser" +version = "2.21" +description = "C parser in Python" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + [[package]] name = "pygments" version = "2.13.0" @@ -380,7 +622,6 @@ python-versions = ">=3.7" [package.dependencies] attrs = ">=19.2.0" colorama = {version = "*", markers = "sys_platform == \"win32\""} -importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} iniconfig = "*" packaging = "*" pluggy = ">=0.12,<2.0" @@ -390,6 +631,122 @@ tomli = ">=1.0.0" [package.extras] testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] +[[package]] +name = "pywin32-ctypes" +version = "0.2.0" +description = "" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "readme-renderer" +version = "37.2" +description = "readme_renderer is a library for rendering \"readme\" descriptions for Warehouse" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +bleach = ">=2.1.0" +docutils = ">=0.13.1" +Pygments = ">=2.5.1" + +[package.extras] +md = ["cmarkgfm (>=0.8.0)"] + +[[package]] +name = "requests" +version = "2.28.1" +description = "Python HTTP for Humans." 
+category = "dev" +optional = false +python-versions = ">=3.7, <4" + +[package.dependencies] +certifi = ">=2017.4.17" +charset-normalizer = ">=2,<3" +idna = ">=2.5,<4" +urllib3 = ">=1.21.1,<1.27" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)"] +use_chardet_on_py3 = ["chardet (>=3.0.2,<6)"] + +[[package]] +name = "requests-toolbelt" +version = "0.9.1" +description = "A utility belt for advanced users of python-requests" +category = "dev" +optional = false +python-versions = "*" + +[package.dependencies] +requests = ">=2.0.1,<3.0.0" + +[[package]] +name = "rfc3986" +version = "2.0.0" +description = "Validating URI References per RFC 3986" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.extras] +idna2008 = ["idna"] + +[[package]] +name = "rich" +version = "12.5.1" +description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" +category = "dev" +optional = false +python-versions = ">=3.6.3,<4.0.0" + +[package.dependencies] +commonmark = ">=0.9.0,<0.10.0" +pygments = ">=2.6.0,<3.0.0" +typing-extensions = {version = ">=4.0.0,<5.0", markers = "python_version < \"3.9\""} + +[package.extras] +jupyter = ["ipywidgets (>=7.5.1,<8.0.0)"] + +[[package]] +name = "secretstorage" +version = "3.3.3" +description = "Python bindings to FreeDesktop.org Secret Service API" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +cryptography = ">=2.0" +jeepney = ">=0.6" + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" + +[[package]] +name = "stack-data" +version = "0.5.1" +description = "Extract data from python stack frames and tracebacks for informative displays" +category = "dev" +optional = false +python-versions = "*" + +[package.dependencies] +asttokens = "*" +executing = "*" +pure-eval = "*" + +[package.extras] +tests = ["cython", "littleutils", "pygments", "typeguard", "pytest"] + [[package]] name = "tomli" version = "2.0.1" @@ -410,12 +767,23 @@ python-versions = ">=3.7" test = ["pre-commit", "pytest"] [[package]] -name = "typed-ast" -version = "1.5.4" -description = "a fork of Python 2 and 3 ast modules with type comment support" +name = "twine" +version = "4.0.1" +description = "Collection of utilities for publishing packages on PyPI" category = "dev" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" + +[package.dependencies] +importlib-metadata = ">=3.6" +keyring = ">=15.1" +pkginfo = ">=1.8.1" +readme-renderer = ">=35.0" +requests = ">=2.20" +requests-toolbelt = ">=0.8.0,<0.9.0 || >0.9.0" +rfc3986 = ">=1.4.0" +rich = ">=12.0.0" +urllib3 = ">=1.26.0" [[package]] name = "typing-extensions" @@ -425,6 +793,19 @@ category = "dev" optional = false python-versions = ">=3.7" +[[package]] +name = "urllib3" +version = "1.26.12" +description = "HTTP library with thread-safe connection pooling, file post, and more." 
+category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, <4" + +[package.extras] +brotli = ["brotlicffi (>=0.8.0)", "brotli (>=1.0.9)", "brotlipy (>=0.6.0)"] +secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "urllib3-secure-extra", "ipaddress"] +socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] + [[package]] name = "wcwidth" version = "0.2.5" @@ -433,6 +814,14 @@ category = "dev" optional = false python-versions = "*" +[[package]] +name = "webencodings" +version = "0.5.1" +description = "Character encoding aliases for legacy web content" +category = "dev" +optional = false +python-versions = "*" + [[package]] name = "windows-curses" version = "2.3.0" @@ -455,40 +844,62 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest- [metadata] lock-version = "1.1" -python-versions = "^3.7" -content-hash = "688a03f5baf4d8cc6850f78d49f42245b7d3ae2eaf21ea326732af64617805ff" +python-versions = "^3.8" +content-hash = "936a54c993790e03f357a092c82f5d0fc8fdeba780d7c66be761b86ff4d00760" [metadata.files] appnope = [ {file = "appnope-0.1.3-py2.py3-none-any.whl", hash = "sha256:265a455292d0bd8a72453494fa24df5a11eb18373a60c7c0430889f22548605e"}, {file = "appnope-0.1.3.tar.gz", hash = "sha256:02bd91c4de869fbb1e1c50aafc4098827a7a54ab2f39d9dcba6c9547ed920e24"}, ] +asttokens = [] attrs = [] backcall = [ {file = "backcall-0.2.0-py2.py3-none-any.whl", hash = "sha256:fbbce6a29f263178a1f7915c1940bde0ec2b2a967566fe1c65c1dfb7422bd255"}, {file = "backcall-0.2.0.tar.gz", hash = "sha256:5cbdbf27be5e7cfadb448baf0aa95508f91f2bbc6c6437cd9cd06e2a4c215e1e"}, ] black = [] +bleach = [] +build = [] +certifi = [] +cffi = [] +charset-normalizer = [] click = [] colorama = [] +commonmark = [ + {file = "commonmark-0.9.1-py2.py3-none-any.whl", hash = "sha256:da2f38c92590f83de410ba1a3cbceafbc74fee9def35f9251ba9a971d6d66fd9"}, + {file = "commonmark-0.9.1.tar.gz", hash = "sha256:452f9dc859be7f06631ddcb328b6919c67984aca654e5fefb3914d54691aed60"}, +] coverage = [] +cryptography = [] debugpy = [] decorator = [ {file = "decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"}, {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, ] +docutils = [] +executing = [] greenlet = [] +idna = [] importlib-metadata = [] iniconfig = [ {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, ] ipython = [] +isort = [ + {file = "isort-5.10.1-py3-none-any.whl", hash = "sha256:6f62d78e2f89b4500b080fe3a81690850cd254227f27f75c3a0c491a1f351ba7"}, + {file = "isort-5.10.1.tar.gz", hash = "sha256:e8443a5e7a020e9d7f97f1d7d9cd17c88bcb3bc7e218bf9cf5095fe550be2951"}, +] +"jaraco.classes" = [] jedi = [ {file = "jedi-0.18.1-py2.py3-none-any.whl", hash = "sha256:637c9635fcf47945ceb91cd7f320234a7be540ded6f3e99a50cb6febdfd1ba8d"}, {file = "jedi-0.18.1.tar.gz", hash = "sha256:74137626a64a99c8eb6ae5832d99b3bdd7d29a3850fe2aa80a4126b2a7d949ab"}, ] +jeepney = [] +keyring = [] matplotlib-inline = [] +more-itertools = [] msgpack = [] mypy = [] mypy-extensions = [ @@ -504,6 +915,7 @@ parso = [ {file = "parso-0.8.3.tar.gz", hash = "sha256:8c07be290bb59f03588915921e29e8a50002acaf2cdc5fa0e0114f91709fafa0"}, ] pathspec = [] +pep517 = [] pexpect = [ {file = 
"pexpect-4.8.0-py2.py3-none-any.whl", hash = "sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937"}, {file = "pexpect-4.8.0.tar.gz", hash = "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c"}, @@ -512,6 +924,7 @@ pickleshare = [ {file = "pickleshare-0.7.5-py2.py3-none-any.whl", hash = "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56"}, {file = "pickleshare-0.7.5.tar.gz", hash = "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca"}, ] +pkginfo = [] platformdirs = [ {file = "platformdirs-2.5.2-py3-none-any.whl", hash = "sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788"}, {file = "platformdirs-2.5.2.tar.gz", hash = "sha256:58c8abb07dcb441e6ee4b11d8df0ac856038f944ab98b7be6b27b2a3c7feef19"}, @@ -525,27 +938,58 @@ ptyprocess = [ {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, ] +pure-eval = [ + {file = "pure_eval-0.2.2-py3-none-any.whl", hash = "sha256:01eaab343580944bc56080ebe0a674b39ec44a945e6d09ba7db3cb8cec289350"}, + {file = "pure_eval-0.2.2.tar.gz", hash = "sha256:2b45320af6dfaa1750f543d714b6d1c520a1688dec6fd24d339063ce0aaa9ac3"}, +] py = [ {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"}, {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, ] +pycparser = [ + {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, + {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, +] pygments = [] pynvim = [ {file = "pynvim-0.4.3.tar.gz", hash = "sha256:3a795378bde5e8092fbeb3a1a99be9c613d2685542f1db0e5c6fd467eed56dff"}, ] pyparsing = [] pytest = [] +pywin32-ctypes = [ + {file = "pywin32-ctypes-0.2.0.tar.gz", hash = "sha256:24ffc3b341d457d48e8922352130cf2644024a4ff09762a2261fd34c36ee5942"}, + {file = "pywin32_ctypes-0.2.0-py2.py3-none-any.whl", hash = "sha256:9dc2d991b3479cc2df15930958b674a48a227d5361d413827a4cfd0b5876fc98"}, +] +readme-renderer = [] +requests = [] +requests-toolbelt = [ + {file = "requests-toolbelt-0.9.1.tar.gz", hash = "sha256:968089d4584ad4ad7c171454f0a5c6dac23971e9472521ea3b6d49d610aa6fc0"}, + {file = "requests_toolbelt-0.9.1-py2.py3-none-any.whl", hash = "sha256:380606e1d10dc85c3bd47bf5a6095f815ec007be7a8b69c878507068df059e6f"}, +] +rfc3986 = [] +rich = [] +secretstorage = [] +six = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] +stack-data = [] tomli = [ {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] traitlets = [] -typed-ast = [] +twine = [] typing-extensions = [] +urllib3 = [] wcwidth = [ {file = "wcwidth-0.2.5-py2.py3-none-any.whl", hash = "sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784"}, {file = "wcwidth-0.2.5.tar.gz", hash = 
"sha256:c4d647b99872929fdb7bdcaa4fbe7f01413ed3d98077df798530e5b04f116c83"}, ] +webencodings = [ + {file = "webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78"}, + {file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"}, +] windows-curses = [ {file = "windows_curses-2.3.0-cp310-cp310-win32.whl", hash = "sha256:a3a63a0597729e10f923724c2cf972a23ea677b400d2387dee1d668cf7116177"}, {file = "windows_curses-2.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:7a35eda4cb120b9e1a5ae795f3bc06c55b92c9d391baba6be1903285a05f3551"}, diff --git a/pyproject.toml b/pyproject.toml index 41b3483..e567d3e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,37 +1,48 @@ [tool.poetry] name = "epy-reader" -version = "2022.9.24" -description = "CLI Ebook Reader" +version = "2022.10.2" +description = "TUI Ebook Reader" authors = ["Benawi Adha "] license = "GPL-3.0" +readme = "README.md" +repository = "https://github.com/wustho/epy" +keywords = ["ebook", "epub", "epub3", "fb2", "mobi", "azw3", "TUI", "ebook reader"] packages = [ - { include = "epy.py" }, - { include = "epy_extras" }, + { include = "epy_reader", from = "src" } ] [tool.poetry.scripts] -epy = "epy:main" +epy = "epy_reader.__main__:main" [tool.poetry.dependencies] -python = "^3.7" +python = "^3.8" windows-curses = { version = "*", markers = "platform_system == 'Windows'" } [tool.poetry.dev-dependencies] -black = "*" -coverage = "*" -debugpy = "*" -ipython = "*" -mypy = "*" -pynvim = "*" -pytest = "*" +pynvim = "^0.4.3" +black = "^22.8.0" +coverage = "^6.5.0" +debugpy = "^1.6.3" +ipython = "^8.5.0" +mypy = "^0.981" +pytest = "^7.1.3" +isort = "^5.10.1" +build = "^0.8.0" +twine = "^4.0.1" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" [tool.mypy] strict_optional = true +follow_imports = "silent" +exclude = ["src/epy_reader/tools/"] [tool.black] line-length = 100 target-version = ['py38'] +exclude = "src/epy_reader/tools/" -[build-system] -requires = ["poetry-core>=1.0.0"] -build-backend = "poetry.core.masonry.api" +[tool.isort] +skip = "src/epy_reader/tools/" diff --git a/setup.py b/setup.py deleted file mode 100644 index 1cade5b..0000000 --- a/setup.py +++ /dev/null @@ -1,28 +0,0 @@ -import sys -from setuptools import setup - -with open("README.md", "r") as fh: - long_description = fh.read() - -setup( - name="epy-reader", - version="2022.9.24", - description="Terminal/CLI Ebook (epub, fb2, mobi, azw3) Reader", - long_description=long_description, - long_description_content_type="text/markdown", - url="https://github.com/wustho/epy", - author="Benawi Adha", - author_email="benawiadha@gmail.com", - license="GPL-3.0", - keywords=["epub", "epub3", "fb2", "mobi", "azw3", "CLI", "Terminal", "Reader"], - python_requires="~=3.7", - py_modules=["epy"], - packages=["epy_extras", "epy_extras.KindleUnpack"], - entry_points={"console_scripts": ["epy = epy:main"]}, - install_requires=["windows-curses;platform_system=='Windows'"], - classifiers=[ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", - "Operating System :: OS Independent", - ], -) diff --git a/src/epy_reader/__init__.py b/src/epy_reader/__init__.py new file mode 100644 index 0000000..97e99a2 --- /dev/null +++ b/src/epy_reader/__init__.py @@ -0,0 +1,5 @@ +__version__ = "2022.10.2" +__license__ = "GPL-3.0" +__author__ = "Benawi Adha" +__email__ = "benawiadha@gmail.com" 
+__url__ = "https://github.com/wustho/epy" diff --git a/src/epy_reader/__main__.py b/src/epy_reader/__main__.py new file mode 100644 index 0000000..ce7d1b2 --- /dev/null +++ b/src/epy_reader/__main__.py @@ -0,0 +1,23 @@ +import curses +import multiprocessing +import sys + +import epy_reader.cli as cli +import epy_reader.reader as reader + + +def main(): + # On Windows, calling this method is necessary + # On Linux/OSX, this method does nothing + multiprocessing.freeze_support() + filepath, dump_only = cli.find_file() + if dump_only: + sys.exit(cli.dump_ebook_content(filepath)) + + while True: + filepath = curses.wrapper(reader.start_reading, filepath) + + +# https://setuptools.pypa.io/en/latest/userguide/entry_point.html +if __name__ == "__main__": + main() diff --git a/src/epy_reader/board.py b/src/epy_reader/board.py new file mode 100644 index 0000000..0562d3f --- /dev/null +++ b/src/epy_reader/board.py @@ -0,0 +1,148 @@ +import curses +import re +from typing import Optional, Tuple, Union + +from epy_reader.models import Direction, InlineStyle, Key, NoUpdate +from epy_reader.settings import DoubleSpreadPadding + + +class InfiniBoard: + """ + Wrapper for curses screen to render infinite texts. + The idea is instead of pre render all the text before reading, + this will only renders part of text on demand by which available + page on screen. + + And what this does is only drawing text/string on curses screen + without .clear() or .refresh() to optimize performance. + """ + + def __init__( + self, + screen, + text: Tuple[str, ...], + textwidth: int = 80, + default_style: Tuple[InlineStyle, ...] = tuple(), + spread: int = 1, + ): + self.screen = screen + self.screen_rows, self.screen_cols = self.screen.getmaxyx() + self.textwidth = textwidth + self.x = ((self.screen_cols - self.textwidth) // 2) + 1 + self.text = text + self.total_lines = len(text) + self.default_style: Tuple[InlineStyle, ...] = default_style + self.temporary_style: Tuple[InlineStyle, ...] = () + self.spread = spread + + if self.spread == 2: + self.x = DoubleSpreadPadding.LEFT.value + self.x_alt = ( + DoubleSpreadPadding.LEFT.value + self.textwidth + DoubleSpreadPadding.MIDDLE.value + ) + + def feed_temporary_style(self, styles: Optional[Tuple[InlineStyle, ...]] = None) -> None: + """Reset styling if `styles` is None""" + self.temporary_style = styles if styles else () + + def render_styles( + self, row: int, styles: Tuple[InlineStyle, ...] 
= (), bottom_padding: int = 0 + ) -> None: + for i in styles: + if i.row in range(row, row + self.screen_rows - bottom_padding): + self.chgat(row, i.row, i.col, i.n_letters, self.screen.getbkgd() | i.attr) + + if self.spread == 2 and i.row in range( + row + self.screen_rows - bottom_padding, + row + 2 * (self.screen_rows - bottom_padding), + ): + self.chgat( + row, + i.row - (self.screen_rows - bottom_padding), + -self.x + self.x_alt + i.col, + i.n_letters, + self.screen.getbkgd() | i.attr, + ) + + def getch(self) -> Union[NoUpdate, Key]: + input = self.screen.getch() + if input == -1: + return NoUpdate() + return Key(input) + + def getbkgd(self): + return self.screen.getbkgd() + + def chgat(self, row: int, y: int, x: int, n: int, attr: int) -> None: + self.screen.chgat(y - row, self.x + x, n, attr) + + def write(self, row: int, bottom_padding: int = 0) -> None: + for n_row in range(min(self.screen_rows - bottom_padding, self.total_lines - row)): + text_line = self.text[row + n_row] + self.screen.addstr(n_row, self.x, text_line) + + if ( + self.spread == 2 + and row + self.screen_rows - bottom_padding + n_row < self.total_lines + ): + text_line = self.text[row + self.screen_rows - bottom_padding + n_row] + # TODO: clean this up + if re.search("\\[IMG:[0-9]+\\]", text_line): + self.screen.addstr( + n_row, self.x_alt, text_line.center(self.textwidth), curses.A_BOLD + ) + else: + self.screen.addstr(n_row, self.x_alt, text_line) + + self.render_styles(row, self.default_style, bottom_padding) + self.render_styles(row, self.temporary_style, bottom_padding) + # self.screen.refresh() + + def write_n( + self, + row: int, + n: int = 1, + direction: Direction = Direction.FORWARD, + bottom_padding: int = 0, + ) -> None: + assert n > 0 + for n_row in range(min(self.screen_rows - bottom_padding, self.total_lines - row)): + text_line = self.text[row + n_row] + if direction == Direction.FORWARD: + # self.screen.addnstr(n_row, self.x + self.textwidth - n, self.text[row+n_row], n) + # `+ " " * (self.textwidth - len(self.text[row + n_row]))` is workaround to + # to prevent curses trace because not calling screen.clear() + self.screen.addnstr( + n_row, + self.x + self.textwidth - n, + text_line + " " * (self.textwidth - len(text_line)), + n, + ) + + if ( + self.spread == 2 + and row + self.screen_rows - bottom_padding + n_row < self.total_lines + ): + text_line_alt = self.text[row + n_row + self.screen_rows - bottom_padding] + self.screen.addnstr( + n_row, + self.x_alt + self.textwidth - n, + text_line_alt + " " * (self.textwidth - len(text_line_alt)), + n, + ) + + else: + if text_line[self.textwidth - n :]: + self.screen.addnstr(n_row, self.x, text_line[self.textwidth - n :], n) + + if ( + self.spread == 2 + and row + self.screen_rows - bottom_padding + n_row < self.total_lines + ): + text_line_alt = self.text[row + n_row + self.screen_rows - bottom_padding] + self.screen.addnstr( + n_row, + self.x_alt, + text_line_alt[self.textwidth - n :], + n, + ) diff --git a/src/epy_reader/cli.py b/src/epy_reader/cli.py new file mode 100644 index 0000000..e43b51c --- /dev/null +++ b/src/epy_reader/cli.py @@ -0,0 +1,171 @@ +import argparse +import os +import shutil +import sys +import textwrap +from difflib import SequenceMatcher as SM +from typing import List, Optional, Tuple + +from epy_reader import __version__ +from epy_reader.lib import coerce_to_int, is_url, truncate +from epy_reader.models import LibraryItem +from epy_reader.parser import parse_html +from epy_reader.state import State +from epy_reader.utils import 
get_ebook_obj + + +def cleanup_library(state: State) -> None: + """Cleanup non-existent file from library""" + library_items = state.get_from_history() + for item in library_items: + if not os.path.isfile(item.filepath) and not is_url(item.filepath): + state.delete_from_library(item.filepath) + + +def get_nth_file_from_library(state: State, n) -> Optional[LibraryItem]: + library_items = state.get_from_history() + try: + return library_items[n - 1] + except IndexError: + return None + + +def get_matching_library_item( + state: State, pattern: str, threshold: float = 0.5 +) -> Optional[LibraryItem]: + matches: List[Tuple[LibraryItem, float]] = [] # [(library_item, match_value), ...] + library_items = state.get_from_history() + if not library_items: + return None + + for item in library_items: + tomatch = f"{item.title} - {item.author}" # item.filepath + match_value = sum( + [i.size for i in SM(None, tomatch.lower(), pattern.lower()).get_matching_blocks()] + ) / float(len(pattern)) + matches.append( + ( + item, + match_value, + ) + ) + + sorted_matches = sorted(matches, key=lambda x: -x[1]) + first_match_item, first_match_value = sorted_matches[0] + if first_match_item and first_match_value >= threshold: + return first_match_item + else: + return None + + +def print_reading_history(state: State) -> None: + termc, _ = shutil.get_terminal_size() + library_items = state.get_from_history() + if not library_items: + print("No Reading History.") + return + + print("Reading History:") + dig = len(str(len(library_items) + 1)) + tcols = termc - dig - 2 + for n, item in enumerate(library_items): + print( + "{} {}".format( + str(n + 1).rjust(dig), + truncate(str(item), "...", tcols, tcols - 3), + ) + ) + + +def parse_cli_args() -> argparse.Namespace: + prog = "epy" + positional_arg_help_str = "[PATH | # | PATTERN | URL]" + args_parser = argparse.ArgumentParser( + prog=prog, + usage=f"%(prog)s [-h] [-r] [-d] [-v] {positional_arg_help_str}", + formatter_class=argparse.RawDescriptionHelpFormatter, + description="Read ebook in terminal", + epilog=textwrap.dedent( + f"""\ + examples: + {prog} /path/to/ebook read /path/to/ebook file + {prog} 3 read #3 file from reading history + {prog} count monte read file matching 'count monte' + from reading history + """ + ), + ) + args_parser.add_argument("-r", "--history", action="store_true", help="print reading history") + args_parser.add_argument("-d", "--dump", action="store_true", help="dump the content of ebook") + args_parser.add_argument( + "-v", + "--version", + action="version", + version=f"v{__version__}", + help="print version and exit", + ) + args_parser.add_argument( + "ebook", + action="store", + nargs="*", + metavar=positional_arg_help_str, + help="ebook path, history number, pattern or URL", + ) + return args_parser.parse_args() + + +def find_file() -> Tuple[str, bool]: + args = parse_cli_args() + state = State() + cleanup_library(state) + + if args.history: + print_reading_history(state) + sys.exit() + + if len(args.ebook) == 0: + last_read = state.get_last_read() + if last_read: + return last_read, args.dump + else: + sys.exit("ERROR: Found no last read ebook file.") + + elif len(args.ebook) == 1: + nth = coerce_to_int(args.ebook[0]) + if nth is not None: + file = get_nth_file_from_library(state, nth) + if file: + return file.filepath, args.dump + else: + print(f"ERROR: #{nth} file not found.") + print_reading_history(state) + sys.exit(1) + elif is_url(args.ebook[0]): + return args.ebook[0], args.dump + elif os.path.isfile(args.ebook[0]): + return 
args.ebook[0], args.dump + + pattern = " ".join(args.ebook) + match = get_matching_library_item(state, pattern) + if match: + return match.filepath, args.dump + else: + sys.exit("ERROR: Found no matching ebook from history.") + + +def dump_ebook_content(filepath: str) -> None: + ebook = get_ebook_obj(filepath) + try: + try: + ebook.initialize() + except Exception as e: + sys.exit("ERROR: Badly-structured ebook.\n" + str(e)) + for i in ebook.contents: + content = ebook.get_raw_text(i) + src_lines = parse_html(content) + assert isinstance(src_lines, tuple) + # sys.stdout.reconfigure(encoding="utf-8") # Python>=3.7 + for j in src_lines: + sys.stdout.buffer.write((j + "\n\n").encode("utf-8")) + finally: + ebook.cleanup() diff --git a/src/epy_reader/config.py b/src/epy_reader/config.py new file mode 100644 index 0000000..db70a98 --- /dev/null +++ b/src/epy_reader/config.py @@ -0,0 +1,80 @@ +import dataclasses +import json +import os +import sys +from typing import Mapping, Tuple, Union + +import epy_reader.settings as settings +from epy_reader.models import AppData, Key + + +class Config(AppData): + def __init__(self): + setting_dict = dataclasses.asdict(settings.Settings()) + keymap_dict = dataclasses.asdict(settings.CfgDefaultKeymaps()) + keymap_builtin_dict = dataclasses.asdict(settings.CfgBuiltinKeymaps()) + + if os.path.isfile(self.filepath): + with open(self.filepath) as f: + cfg_user = json.load(f) + setting_dict = Config.update_dict(setting_dict, cfg_user["Setting"]) + keymap_dict = Config.update_dict(keymap_dict, cfg_user["Keymap"]) + else: + self.save({"Setting": setting_dict, "Keymap": keymap_dict}) + + keymap_dict_tuple = {k: tuple(v) for k, v in keymap_dict.items()} + keymap_updated = { + k: tuple([Key(i) for i in v]) + for k, v in Config.update_keys_tuple(keymap_dict_tuple, keymap_builtin_dict).items() + } + + if sys.platform == "win32": + setting_dict["PageScrollAnimation"] = False + + self.setting = settings.Settings(**setting_dict) + self.keymap = settings.Keymap(**keymap_updated) + # to build help menu text + self.keymap_user_dict = keymap_dict + + @property + def filepath(self) -> str: + return os.path.join(self.prefix, "configuration.json") if self.prefix else os.devnull + + def save(self, cfg_dict): + with open(self.filepath, "w") as file: + json.dump(cfg_dict, file, indent=2) + + @staticmethod + def update_dict( + old_dict: Mapping[str, Union[str, int, bool]], + new_dict: Mapping[str, Union[str, int, bool]], + place_new=False, + ) -> Mapping[str, Union[str, int, bool]]: + """Returns a copy of `old_dict` after updating it with `new_dict`""" + + result = {**old_dict} + for k, _ in new_dict.items(): + if k in result: + result[k] = new_dict[k] + elif place_new: + result[k] = new_dict[k] + + return result + + @staticmethod + def update_keys_tuple( + old_keys: Mapping[str, Tuple[str, ...]], + new_keys: Mapping[str, Tuple[str, ...]], + place_new: bool = False, + ) -> Mapping[str, Tuple[str, ...]]: + """Returns a copy of `old_keys` after updating it with `new_keys` + by appending the tuple value and removes duplicate""" + + result = {**old_keys} + for k, _ in new_keys.items(): + if k in result: + result[k] = tuple(set(result[k] + new_keys[k])) + elif place_new: + result[k] = tuple(set(new_keys[k])) + + return result diff --git a/src/epy_reader/ebooks/__init__.py b/src/epy_reader/ebooks/__init__.py new file mode 100644 index 0000000..da5cfc0 --- /dev/null +++ b/src/epy_reader/ebooks/__init__.py @@ -0,0 +1,15 @@ +__all__ = [ + "Ebook", + "Epub", + "FictionBook", + "Mobi", + "Azw", + 
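Config.update_dict and Config.update_keys_tuple above define how a user's configuration.json overlays the defaults: with place_new left False, unknown user keys are silently dropped, and keymap tuples are merged with duplicates removed. A standalone sketch of those semantics (plain dicts stand in for the real Settings/Keymap dataclasses; the "DefaultViewer" and "ScrollUp" names are assumed for illustration, while "PageScrollAnimation" appears in config.py above):

    # Standalone illustration of Config's merge semantics; not the real dataclasses.
    defaults = {"DefaultViewer": "auto", "PageScrollAnimation": True}
    user_cfg = {"PageScrollAnimation": False, "UnknownOption": 1}

    merged = {**defaults}
    for k, v in user_cfg.items():
        if k in merged:  # place_new=False: unknown keys are ignored
            merged[k] = v
    assert merged == {"DefaultViewer": "auto", "PageScrollAnimation": False}

    # Keymap values are tuples; merging unions them and drops duplicates.
    user_keys = {"ScrollUp": ("k",)}
    builtin_keys = {"ScrollUp": ("k", "KEY_UP")}
    merged_keys = {k: tuple(set(user_keys.get(k, ()) + v)) for k, v in builtin_keys.items()}
    assert sorted(merged_keys["ScrollUp"]) == ["KEY_UP", "k"]

In the real class each resulting string is additionally wrapped in Key(...) before being fed to the Keymap dataclass.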
"URL", +] + +from epy_reader.ebooks.azw import Azw +from epy_reader.ebooks.base import Ebook +from epy_reader.ebooks.epub import Epub +from epy_reader.ebooks.fictionbook import FictionBook +from epy_reader.ebooks.mobi import Mobi +from epy_reader.ebooks.url import URL diff --git a/src/epy_reader/ebooks/azw.py b/src/epy_reader/ebooks/azw.py new file mode 100644 index 0000000..139fcc5 --- /dev/null +++ b/src/epy_reader/ebooks/azw.py @@ -0,0 +1,26 @@ +import contextlib +import os +import shutil +import tempfile +import zipfile + +from epy_reader.ebooks.epub import Epub +from epy_reader.tools import unpack_kindle_book + + +class Azw(Epub): + def __init__(self, fileepub): + self.path = os.path.abspath(fileepub) + self.tmpdir = tempfile.mkdtemp(prefix="epy-") + basename, _ = os.path.splitext(os.path.basename(self.path)) + self.tmpepub = os.path.join(self.tmpdir, "mobi8", basename + ".epub") + + def initialize(self): + with contextlib.redirect_stdout(None): + unpack_kindle_book(self.path, self.tmpdir, epubver="A", use_hd=True) + self.file = zipfile.ZipFile(self.tmpepub, "r") + Epub.initialize(self) + + def cleanup(self) -> None: + shutil.rmtree(self.tmpdir) + return diff --git a/src/epy_reader/ebooks/base.py b/src/epy_reader/ebooks/base.py new file mode 100644 index 0000000..0869db9 --- /dev/null +++ b/src/epy_reader/ebooks/base.py @@ -0,0 +1,48 @@ +import xml.etree.ElementTree as ET +from typing import Tuple, Union + +from epy_reader.models import BookMetadata, TocEntry + + +class Ebook: + def __init__(self, fileepub: str): + raise NotImplementedError("Ebook.__init__() not implemented") + + @property + def path(self) -> str: + return self._path + + @path.setter + def path(self, value: str) -> None: + self._path = value + + @property + def contents(self) -> Union[Tuple[str, ...], Tuple[ET.Element, ...]]: + return self._contents + + @contents.setter + def contents(self, value: Union[Tuple[str, ...], Tuple[ET.Element, ...]]) -> None: + self._contents = value + + @property + def toc_entries(self) -> Tuple[TocEntry, ...]: + return self._toc_entries + + @toc_entries.setter + def toc_entries(self, value: Tuple[TocEntry, ...]) -> None: + self._toc_entries = value + + def get_meta(self) -> BookMetadata: + raise NotImplementedError("Ebook.get_meta() not implemented") + + def initialize(self) -> None: + raise NotImplementedError("Ebook.initialize() not implemented") + + def get_raw_text(self, content: Union[str, ET.Element]) -> str: + raise NotImplementedError("Ebook.get_raw_text() not implemented") + + def get_img_bytestr(self, impath: str) -> Tuple[str, bytes]: + raise NotImplementedError("Ebook.get_img_bytestr() not implemented") + + def cleanup(self) -> None: + raise NotImplementedError("Ebook.cleanup() not implemented") diff --git a/src/epy_reader/ebooks/epub.py b/src/epy_reader/ebooks/epub.py new file mode 100644 index 0000000..a8cf0fa --- /dev/null +++ b/src/epy_reader/ebooks/epub.py @@ -0,0 +1,202 @@ +import dataclasses +import os +import xml.etree.ElementTree as ET +import zipfile +import zlib +from typing import Dict, List, Optional, Sequence, Tuple, Union +from urllib.parse import unquote, urljoin + +from epy_reader.ebooks.base import Ebook +from epy_reader.models import BookMetadata, TocEntry + + +# TODO: to be deprecated +DEBUG = False + + +class Epub(Ebook): + NAMESPACE = { + "DAISY": "http://www.daisy.org/z3986/2005/ncx/", + "OPF": "http://www.idpf.org/2007/opf", + "CONT": "urn:oasis:names:tc:opendocument:xmlns:container", + "XHTML": "http://www.w3.org/1999/xhtml", + "EPUB": 
"http://www.idpf.org/2007/ops", + # Dublin Core + "DC": "http://purl.org/dc/elements/1.1/", + } + + def __init__(self, fileepub: str): + self.path: str = os.path.abspath(fileepub) + self.file: Union[zipfile.ZipFile, str] = zipfile.ZipFile(fileepub, "r") + + # populate these attributes + # by calling self.initialize() + self.root_filepath: str + self.root_dirpath: str + + def get_meta(self) -> BookMetadata: + assert isinstance(self.file, zipfile.ZipFile) + # why self.file.read(self.root_filepath) problematic + # content_opf = ET.fromstring(self.file.open(self.root_filepath).read()) + content_opf = ET.parse(self.file.open(self.root_filepath)) + return Epub._get_metadata(content_opf) + + @staticmethod + def _get_metadata(content_opf: ET.ElementTree) -> BookMetadata: + metadata: Dict[str, Optional[str]] = {} + for field in dataclasses.fields(BookMetadata): + element = content_opf.find(f".//DC:{field.name}", Epub.NAMESPACE) + if element is not None: + metadata[field.name] = element.text + + return BookMetadata(**metadata) + + @staticmethod + def _get_contents(content_opf: ET.ElementTree) -> Tuple[str, ...]: + # cont = ET.parse(self.file.open(self.root_filepath)).getroot() + manifests: List[Tuple[str, str]] = [] + for manifest_elem in content_opf.findall("OPF:manifest/*", Epub.NAMESPACE): + # EPUB3 + # if manifest_elem.get("id") != "ncx" and manifest_elem.get("properties") != "nav": + if ( + manifest_elem.get("media-type") != "application/x-dtbncx+xml" + and manifest_elem.get("properties") != "nav" + ): + manifest_id = manifest_elem.get("id") + assert manifest_id is not None + manifest_href = manifest_elem.get("href") + assert manifest_href is not None + manifests.append((manifest_id, manifest_href)) + + spines: List[str] = [] + contents: List[str] = [] + for spine_elem in content_opf.findall("OPF:spine/*", Epub.NAMESPACE): + idref = spine_elem.get("idref") + assert idref is not None + spines.append(idref) + for spine in spines: + for manifest in manifests: + if spine == manifest[0]: + # book_contents.append(root_dirpath + unquote(manifest[1])) + contents.append(unquote(manifest[1])) + manifests.remove(manifest) + # TODO: test is break necessary + break + + return tuple(contents) + + @staticmethod + def _get_tocs(toc: ET.Element, version: str, contents: Sequence[str]) -> Tuple[TocEntry, ...]: + try: + # EPUB3 + if version in {"1.0", "2.0"}: + navPoints = toc.findall("DAISY:navMap//DAISY:navPoint", Epub.NAMESPACE) + elif version == "3.0": + navPoints = toc.findall( + "XHTML:body//XHTML:nav[@EPUB:type='toc']//XHTML:a", Epub.NAMESPACE + ) + + toc_entries: List[TocEntry] = [] + for navPoint in navPoints: + if version in {"1.0", "2.0"}: + src_elem = navPoint.find("DAISY:content", Epub.NAMESPACE) + assert src_elem is not None + src = src_elem.get("src") + + name_elem = navPoint.find("DAISY:navLabel/DAISY:text", Epub.NAMESPACE) + assert name_elem is not None + name = name_elem.text + elif version == "3.0": + src_elem = navPoint + assert src_elem is not None + src = src_elem.get("href") + + name = "".join(list(navPoint.itertext())) + + assert src is not None + src_id = src.split("#") + + try: + idx = contents.index(unquote(src_id[0])) + except ValueError: + continue + + # assert name is not None + # NOTE: skip empty label + if name is not None: + toc_entries.append( + TocEntry( + label=name, + content_index=idx, + section=src_id[1] if len(src_id) == 2 else None, + ) + ) + except AttributeError as e: + # TODO: + if DEBUG: + raise e + + return tuple(toc_entries) + + def initialize(self) -> None: + assert 
isinstance(self.file, zipfile.ZipFile) + + container = ET.parse(self.file.open("META-INF/container.xml")) + rootfile_elem = container.find("CONT:rootfiles/CONT:rootfile", Epub.NAMESPACE) + assert rootfile_elem is not None + self.root_filepath = rootfile_elem.attrib["full-path"] + self.root_dirpath = ( + os.path.dirname(self.root_filepath) + "/" + if os.path.dirname(self.root_filepath) != "" + else "" + ) + + content_opf = ET.parse(self.file.open(self.root_filepath)) + version = content_opf.getroot().get("version") + + contents = Epub._get_contents(content_opf) + self.contents = tuple(urljoin(self.root_dirpath, content) for content in contents) + + if version in {"1.0", "2.0"}: + # "OPF:manifest/*[@id='ncx']" + relative_toc = content_opf.find( + "OPF:manifest/*[@media-type='application/x-dtbncx+xml']", Epub.NAMESPACE + ) + elif version == "3.0": + relative_toc = content_opf.find("OPF:manifest/*[@properties='nav']", Epub.NAMESPACE) + else: + raise RuntimeError(f"Unsupported Epub version: {version}") + assert relative_toc is not None + relative_toc_path = relative_toc.get("href") + assert relative_toc_path is not None + toc_path = self.root_dirpath + relative_toc_path + toc = ET.parse(self.file.open(toc_path)).getroot() + self.toc_entries = Epub._get_tocs(toc, version, contents) # *self.contents (absolute path) + + def get_raw_text(self, content_path: Union[str, ET.Element]) -> str: + assert isinstance(self.file, zipfile.ZipFile) + assert isinstance(content_path, str) + + max_tries: Optional[int] = None # 1 if DEBUG else None + + # use try-except block to catch + # zlib.error: Error -3 while decompressing data: invalid distance too far back + # seems like caused by multiprocessing + tries = 0 + while True: + try: + content = self.file.open(content_path).read() + break + except zlib.error as e: + tries += 1 + if max_tries is not None and tries >= max_tries: + raise e + + return content.decode("utf-8") + + def get_img_bytestr(self, impath: str) -> Tuple[str, bytes]: + assert isinstance(self.file, zipfile.ZipFile) + return impath, self.file.read(impath) + + def cleanup(self) -> None: + pass diff --git a/src/epy_reader/ebooks/fictionbook.py b/src/epy_reader/ebooks/fictionbook.py new file mode 100644 index 0000000..35611b2 --- /dev/null +++ b/src/epy_reader/ebooks/fictionbook.py @@ -0,0 +1,76 @@ +import base64 +import os +import xml.etree.ElementTree as ET +from typing import List, Tuple, Union + +from epy_reader.ebooks import Ebook +from epy_reader.models import BookMetadata, TocEntry + + +class FictionBook(Ebook): + NAMESPACE = {"FB2": "http://www.gribuser.ru/xml/fictionbook/2.0"} + + def __init__(self, filefb: str): + self.path = os.path.abspath(filefb) + self.file = filefb + + # populate these attribute + # by calling self.initialize() + self.root: ET.Element + + def get_meta(self) -> BookMetadata: + title_elem = self.root.find(".//FB2:book-title", FictionBook.NAMESPACE) + first_name_elem = self.root.find(".//FB2:first-name", FictionBook.NAMESPACE) + last_name_elem = self.root.find(".//FB2:last-name", FictionBook.NAMESPACE) + date_elem = self.root.find(".//FB2:date", FictionBook.NAMESPACE) + identifier_elem = self.root.find(".//FB2:id", FictionBook.NAMESPACE) + + author = first_name_elem.text if first_name_elem is not None else None + if last_name_elem is not None: + if author is not None and author != "": + author += f" {last_name_elem.text}" + else: + author = last_name_elem.text + + return BookMetadata( + title=title_elem.text if title_elem is not None else None, + creator=author, + 
date=date_elem.text if date_elem is not None else None, + identifier=identifier_elem.text if identifier_elem is not None else None, + ) + + def initialize(self) -> None: + cont = ET.parse(self.file) + self.root = cont.getroot() + + self.contents = tuple(self.root.findall("FB2:body/*", FictionBook.NAMESPACE)) + + # TODO + toc_entries: List[TocEntry] = [] + for n, i in enumerate(self.contents): + title = i.find("FB2:title", FictionBook.NAMESPACE) + if title is not None: + toc_entries.append( + TocEntry(label="".join(title.itertext()), content_index=n, section=None) + ) + self.toc_entries = tuple(toc_entries) + + def get_raw_text(self, node: Union[str, ET.Element]) -> str: + assert isinstance(node, ET.Element) + ET.register_namespace("", "http://www.gribuser.ru/xml/fictionbook/2.0") + # sys.exit(ET.tostring(node, encoding="utf8", method="html").decode("utf-8").replace("ns1:","")) + return ET.tostring(node, encoding="utf8", method="html").decode("utf-8").replace("ns1:", "") + + def get_img_bytestr(self, imgid: str) -> Tuple[str, bytes]: + # TODO: test if image works + imgid = imgid.replace("#", "") + img_elem = self.root.find("*[@id='{}']".format(imgid)) + assert img_elem is not None + imgtype = img_elem.get("content-type") + img_elem_text = img_elem.text + assert imgtype is not None + assert img_elem_text is not None + return imgid + "." + imgtype.split("/")[1], base64.b64decode(img_elem_text) + + def cleanup(self) -> None: + return diff --git a/src/epy_reader/ebooks/mobi.py b/src/epy_reader/ebooks/mobi.py new file mode 100644 index 0000000..39f3be4 --- /dev/null +++ b/src/epy_reader/ebooks/mobi.py @@ -0,0 +1,69 @@ +import contextlib +import os +import shutil +import tempfile +import xml.etree.ElementTree as ET +from typing import Tuple, Union + +from epy_reader.ebooks.epub import Epub +from epy_reader.models import BookMetadata +from epy_reader.tools import unpack_kindle_book + + +class Mobi(Epub): + def __init__(self, filemobi: str): + self.path = os.path.abspath(filemobi) + self.file = tempfile.mkdtemp(prefix="epy-") + + # populate these attribute + # by calling self.initialize() + self.root_filepath: str + self.root_dirpath: str + + def get_meta(self) -> BookMetadata: + # why self.file.read(self.root_filepath) problematic + with open(os.path.join(self.root_dirpath, "content.opf")) as f: + content_opf = ET.parse(f) # .getroot() + return Epub._get_metadata(content_opf) + + def initialize(self) -> None: + assert isinstance(self.file, str) + + with contextlib.redirect_stdout(None): + unpack_kindle_book(self.path, self.file, epubver="A", use_hd=True) + # TODO: add cleanup here + + self.root_dirpath = os.path.join(self.file, "mobi7") + self.toc_path = os.path.join(self.root_dirpath, "toc.ncx") + version = "2.0" + + with open(os.path.join(self.root_dirpath, "content.opf")) as f: + content_opf = ET.parse(f) # .getroot() + + contents = Epub._get_contents(content_opf) + self.contents = tuple(os.path.join(self.root_dirpath, content) for content in contents) + + with open(self.toc_path) as f: + toc = ET.parse(f).getroot() + self.toc_entries = Epub._get_tocs(toc, version, contents) # *self.contents (absolute path) + + def get_raw_text(self, content_path: Union[str, ET.Element]) -> str: + assert isinstance(content_path, str) + with open(content_path, encoding="utf8") as f: + content = f.read() + # return content.decode("utf-8") + return content + + def get_img_bytestr(self, impath: str) -> Tuple[str, bytes]: + # TODO: test on windows + # if impath "Images/asdf.png" is problematic + image_abspath = 
os.path.join(self.root_dirpath, impath)
+        image_abspath = os.path.normpath(image_abspath)  # handle cross-platform path
+        with open(image_abspath, "rb") as f:
+            src = f.read()
+        return impath, src
+
+    def cleanup(self) -> None:
+        assert isinstance(self.file, str)
+        shutil.rmtree(self.file)
+        return
diff --git a/src/epy_reader/ebooks/url.py b/src/epy_reader/ebooks/url.py
new file mode 100644
index 0000000..4356fa7
--- /dev/null
+++ b/src/epy_reader/ebooks/url.py
@@ -0,0 +1,49 @@
+from pathlib import PurePosixPath
+from typing import Tuple
+from urllib.error import HTTPError, URLError
+from urllib.parse import urljoin, urlparse
+from urllib.request import Request, urlopen
+
+from epy_reader import __version__
+from epy_reader.ebooks import Ebook
+from epy_reader.lib import is_url
+from epy_reader.models import BookMetadata
+
+
+class URL(Ebook):
+    _header = {
+        "User-Agent": f"epy/v{__version__}",
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+        "Accept-Language": "en-US,en;q=0.8",
+    }
+
+    def __init__(self, url: str):
+        self.path = url
+        self.file = url
+        self.contents = ("_",)
+        self.toc_entries = tuple()
+
+    def get_meta(self) -> BookMetadata:
+        return BookMetadata()
+
+    def initialize(self) -> None:
+        try:
+            with urlopen(Request(self.path, headers=URL._header)) as response:
+                self.html = response.read().decode()
+        except HTTPError as e:
+            raise e
+        except URLError as e:
+            raise e
+
+    def get_raw_text(self, _) -> str:
+        return self.html
+
+    def get_img_bytestr(self, src: str) -> Tuple[str, bytes]:
+        image_url = src if is_url(src) else urljoin(self.path, src)
+        # TODO: catch error on request
+        with urlopen(Request(image_url, headers=URL._header)) as response:
+            byte_str = response.read()
+        return PurePosixPath(urlparse(src).path).name, byte_str
+
+    def cleanup(self) -> None:
+        return
diff --git a/src/epy_reader/lib.py b/src/epy_reader/lib.py
new file mode 100644
index 0000000..b010323
--- /dev/null
+++ b/src/epy_reader/lib.py
@@ -0,0 +1,63 @@
+from typing import Any, Optional, Tuple
+from urllib.parse import urljoin, urlparse
+
+
+def is_url(string: str) -> bool:
+    try:
+        tmp = urlparse(string)
+        return all([tmp.scheme, tmp.netloc])
+    except ValueError:
+        return False
+
+
+def coerce_to_int(string: str) -> Optional[int]:
+    try:
+        return int(string)
+    except ValueError:
+        return None
+
+
+def truncate(teks: str, subtitution_text: str, maxlen: int, startsub: int = 0) -> str:
+    """
+    Truncate text
+
+    eg.
+    :param teks: 'This is long silly dummy text'
+    :param subtitution_text: '...'
+    :param maxlen: 12
+    :param startsub: 3
+    :return: 'Thi...y text'
+    """
+    if startsub > maxlen:
+        raise ValueError("Var startsub cannot be bigger than maxlen.")
+    elif len(teks) <= maxlen:
+        return teks
+    else:
+        lensu = len(subtitution_text)
+        beg = teks[:startsub]
+        mid = (
+            subtitution_text
+            if lensu <= maxlen - startsub
+            else subtitution_text[: maxlen - startsub]
+        )
+        end = teks[startsub + lensu - maxlen :] if lensu < maxlen - startsub else ""
+        return beg + mid + end
+
+
+def tuple_subtract(tuple_one: Tuple[Any, ...], tuple_two: Tuple[Any, ...]) -> Tuple[Any, ...]:
+    """
+    Returns tuple with members in tuple_one
+    but not in tuple_two
+    """
+    return tuple(i for i in tuple_one if i not in tuple_two)
+
+
+def resolve_path(current_dir: str, relative_path: str) -> str:
+    """
+    Resolve path containing dots
+    eg. '/foo/bar/book.html' + '../img.png' = '/foo/img.png'
+    NOTE: the '/' suffix is important to indicate that the current dir is 'bar'
+    """
+    # could also use os.path.normpath() here,
+    # but if the image is inside a zipfile then a posix path is mandatory
+    return urljoin(current_dir, relative_path)
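+
+
+# Illustrative usage of the helpers above (values taken from the
+# docstrings, results verified against the implementations):
+#   truncate("This is long silly dummy text", "...", 12, 3) -> 'Thi...y text'
+#   resolve_path("/foo/bar/book.html", "../img.png") -> '/foo/img.png'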
diff --git a/src/epy_reader/models.py b/src/epy_reader/models.py
new file mode 100644
index 0000000..db4701b
--- /dev/null
+++ b/src/epy_reader/models.py
@@ -0,0 +1,232 @@
+import os
+from dataclasses import dataclass
+from datetime import datetime
+from enum import Enum
+from typing import Any, Mapping, Optional, Tuple, Union
+
+
+class Direction(Enum):
+    FORWARD = "forward"
+    BACKWARD = "backward"
+
+
+@dataclass(frozen=True)
+class BookMetadata:
+    title: Optional[str] = None
+    creator: Optional[str] = None
+    description: Optional[str] = None
+    publisher: Optional[str] = None
+    date: Optional[str] = None
+    language: Optional[str] = None
+    format: Optional[str] = None
+    identifier: Optional[str] = None
+    source: Optional[str] = None
+
+
+@dataclass(frozen=True)
+class LibraryItem:
+    last_read: datetime
+    filepath: str
+    title: Optional[str] = None
+    author: Optional[str] = None
+    reading_progress: Optional[float] = None
+
+    def __str__(self) -> str:
+        if self.reading_progress is None:
+            reading_progress_str = "N/A"
+        else:
+            reading_progress_str = f"{int(self.reading_progress * 100)}%"
+        reading_progress_str = reading_progress_str.rjust(4)
+
+        book_name: str
+        filename = self.filepath.replace(os.path.expanduser("~"), "~", 1)
+        if self.title is not None and self.author is not None:
+            book_name = f"{self.title} - {self.author} ({filename})"
+        elif self.title is None and self.author:
+            book_name = f"{filename} - {self.author}"
+        else:
+            book_name = filename
+
+        last_read_str = self.last_read.strftime("%I:%M%p %b %d")
+
+        return f"{reading_progress_str} {last_read_str}: {book_name}"
+
+
+@dataclass(frozen=True)
+class ReadingState:
+    """
+    Data model for reading state.
+
+    `row` has to be assigned explicitly because the Seamless
+    feature needs it to adjust between a row relative to the
+    book's current content index and a row absolute to the
+    book's entire content.
+
+    `rel_pctg` and `section` default to None; if either of
+    them is assigned a value, it overrides the `row` value.
+    """
+
+    content_index: int
+    textwidth: int
+    row: int
+    rel_pctg: Optional[float] = None
+    section: Optional[str] = None
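+
+
+# eg. (illustrative, values hypothetical) when the reader loads
+# ReadingState(content_index=2, textwidth=80, row=150, rel_pctg=0.35),
+# row is recomputed as round(0.35 * total lines) rather than taken
+# from 150 directly, since rel_pctg overrides row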
+
+
+@dataclass(frozen=True)
+class SearchData:
+    direction: Direction = Direction.FORWARD
+    value: str = ""
+
+
+@dataclass(frozen=True)
+class LettersCount:
+    """
+    all: total letters in the book
+    cumulative: tuple of running totals of letters before each content
+    eg. cumulative = (0, 50, 89, ...) means
+    0 letters come before contents[0],
+    50 letters come before contents[1] (ie. contents[0] has 50 letters),
+    89 letters come before contents[2] (contents[0] + contents[1])
+    """
+
+    all: int
+    cumulative: Tuple[int, ...]
+
+
+@dataclass(frozen=True)
+class CharPos:
+    """
+    Describes character position in text.
+    eg. ["Lorem ipsum dolor sit amet,",   # row=0
+         "consectetur adipiscing elit."]  # row=1
+            ^CharPos(row=1, col=3)
+    """
+
+    row: int
+    col: int
+
+
+@dataclass(frozen=True)
+class TextMark:
+    """
+    Describes marking in text.
+    eg. Interval [CharPos(row=0, col=3), CharPos(row=1, col=4)];
+    note that the marking is inclusive [] on both sides
+    instead of right-exclusive [)
+    """
+
+    start: CharPos
+    end: Optional[CharPos] = None
+
+    def is_valid(self) -> bool:
+        """
+        Assert validity and check if the mark is unterminated
+        eg. <i> This is italic text
+            Missing </i> tag
+        """
+        if self.end is not None:
+            if self.start.row == self.end.row:
+                return self.start.col <= self.end.col
+            else:
+                return self.start.row < self.end.row
+
+        return False
+
+
+@dataclass(frozen=True)
+class TextSpan:
+    """
+    Like TextMark but using a span of letters (n_letters)
+    """
+
+    start: CharPos
+    n_letters: int
+
+
+@dataclass(frozen=True)
+class InlineStyle:
+    """
+    eg. InlineStyle(attr=curses.A_BOLD, row=3, col=4, n_letters=3)
+    """
+
+    row: int
+    col: int
+    n_letters: int
+    attr: int
+
+
+@dataclass(frozen=True)
+class TocEntry:
+    label: str
+    content_index: int
+    section: Optional[str]
+
+
+@dataclass(frozen=True)
+class TextStructure:
+    """
+    Object that describes how the text
+    should be displayed on screen.
+
+    text_lines: ("list of lines", "of text", ...)
+    image_maps: {line_num: path/to/image/in/ebook/zip}
+    section_rows: {section_id: line_num}
+    formatting: (InlineStyle, ...)
+    """
+
+    text_lines: Tuple[str, ...]
+    image_maps: Mapping[int, str]
+    section_rows: Mapping[str, int]
+    formatting: Tuple[InlineStyle, ...]
+
+
+@dataclass(frozen=True)
+class NoUpdate:
+    pass
+
+
+class Key:
+    """
+    Wrapper for a key, because raw ord("k") and chr(34) are confusing to read
+    """
+
+    def __init__(self, char_or_int: Union[str, int]):
+        self.value: int = char_or_int if isinstance(char_or_int, int) else ord(char_or_int)
+        self.char: str = char_or_int if isinstance(char_or_int, str) else chr(char_or_int)
+
+    def __eq__(self, other: Any) -> bool:
+        if isinstance(other, Key):
+            return self.value == other.value
+        return False
+
+    def __ne__(self, other: Any) -> bool:
+        # negate __eq__; returning self.__eq__(other) here would make
+        # != behave exactly like ==
+        return not self.__eq__(other)
+
+    def __hash__(self) -> int:
+        return hash(self.value)
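+
+
+# eg. (illustrative) Key("q") == Key(113) since ord("q") == 113, so raw
+# int keycodes returned by curses can be compared directly against Key("q")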
+
+
+class AppData:
+    @property
+    def prefix(self) -> Optional[str]:
+        """Return None if neither home dir nor user dir exists"""
+        prefix: Optional[str] = None
+
+        # UNIX filesystem
+        homedir = os.getenv("HOME")
+        # WIN filesystem
+        userdir = os.getenv("USERPROFILE")
+
+        if homedir:
+            if os.path.isdir(os.path.join(homedir, ".config")):
+                prefix = os.path.join(homedir, ".config", "epy")
+            else:
+                prefix = os.path.join(homedir, ".epy")
+        elif userdir:
+            prefix = os.path.join(userdir, ".epy")
+
+        if prefix:
+            os.makedirs(prefix, exist_ok=True)
+
+        return prefix
diff --git a/src/epy_reader/parser.py b/src/epy_reader/parser.py
new file mode 100644
index 0000000..6eced00
--- /dev/null
+++ b/src/epy_reader/parser.py
@@ -0,0 +1,421 @@
+import curses
+import dataclasses
+import re
+import textwrap
+from html import unescape
+from html.parser import HTMLParser
+from typing import Dict, List, Mapping, Optional, Sequence, Set, Tuple, Union
+from urllib.parse import unquote
+
+from epy_reader.models import CharPos, InlineStyle, TextMark, TextSpan, TextStructure
+
+
+class HTMLtoLines(HTMLParser):
+    para = {"p", "div"}
+    inde = {"q", "dt", "dd", "blockquote"}
+    pref = {"pre"}
+    bull = {"li"}
+    hide = {"script", "style", "head"}
+    ital = {"i", "em"}
+    bold = {"b", "strong"}
+    # hide = {"script", "style", "head", "sub"}
+    # sup_lookup = "⁰¹²³⁴⁵⁶⁷⁸⁹"
+    # sub_lookup = "₀₁₂₃₄₅₆₇₈₉"
+
+    attr_bold = curses.A_BOLD
+    try:
+        attr_italic = curses.A_ITALIC
+    except AttributeError:
+        try:
+            attr_italic = curses.A_UNDERLINE
+        except AttributeError:
+            attr_italic = curses.A_NORMAL
+
+    @staticmethod
+    def _mark_to_spans(text: Sequence[str], marks: Sequence[TextMark]) -> List[TextSpan]:
+        """
+        Convert text marks in lines of text into per-line text spans,
+        keeping duplicate spans.
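+
+        eg. (illustrative) with text = ["Lorem ipsum", "dolor sit amet"],
+        a mark from CharPos(row=0, col=6) to CharPos(row=1, col=4) yields
+        two spans: TextSpan(start=CharPos(row=0, col=6), n_letters=5)
+        and TextSpan(start=CharPos(row=1, col=0), n_letters=4)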
+ """ + spans: List[TextSpan] = [] + for mark in marks: + if mark.is_valid(): + # mypy issue, should be handled by mark.is_valid() + assert mark.end is not None + if mark.start.row == mark.end.row: + spans.append( + TextSpan(start=mark.start, n_letters=mark.end.col - mark.start.col) + ) + else: + spans.append( + TextSpan( + start=mark.start, n_letters=len(text[mark.start.row]) - mark.start.col + ) + ) + for nth_line in range(mark.start.row + 1, mark.end.row): + spans.append( + TextSpan( + start=CharPos(row=nth_line, col=0), n_letters=len(text[nth_line]) + ) + ) + spans.append( + TextSpan(start=CharPos(row=mark.end.row, col=0), n_letters=mark.end.col) + ) + + return spans # list(set(spans)) + + @staticmethod + def _adjust_wrapped_spans( + wrapped_lines: Sequence[str], + span: TextSpan, + *, + line_adjustment: int = 0, + left_adjustment: int = 0, + ) -> List[TextSpan]: + """ + Adjust text span to wrapped lines. + Not perfect, but should be good enough considering + the limitation on commandline interface. + """ + + # current_row = span.start.row + line_adjustment + current_row = line_adjustment + start_col = span.start.col + end_col = start_col + span.n_letters + + prev = 0 # chars length before current line + spans: List[TextSpan] = [] + for n, line in enumerate(wrapped_lines): + # + 1 compensates textwrap.wrap(*args, replace_whitespace=True, drop_whitespace=True) + line_len = len(line) + 1 + current = prev + line_len # chars length before next line + + # -:unmarked *:marked + # |------*****--------| + if start_col in range(prev, current) and end_col in range(prev, current): + spans.append( + TextSpan( + start=CharPos(row=current_row + n, col=start_col - prev + left_adjustment), + n_letters=span.n_letters, + ) + ) + + # |----------*********| + elif start_col in range(prev, current): + spans.append( + TextSpan( + start=CharPos(row=current_row + n, col=start_col - prev + left_adjustment), + n_letters=current - start_col - 1, # -1: dropped whitespace + ) + ) + + # |********-----------| + elif end_col in range(prev, current): + spans.append( + TextSpan( + start=CharPos(row=current_row + n, col=0 + left_adjustment), + n_letters=end_col - prev + 1, # +1: dropped whitespace + ) + ) + + # |*******************| + elif prev in range(start_col, end_col) and current in range(start_col, end_col): + spans.append( + TextSpan( + start=CharPos(row=current_row + n, col=0 + left_adjustment), + n_letters=line_len - 1, # -1: dropped whitespace + ) + ) + + elif prev > end_col: + break + + prev = current + + return spans + + @staticmethod + def _group_spans_by_row(blocks: Sequence[TextSpan]) -> Mapping[int, List[TextSpan]]: + groups: Dict[int, List[TextSpan]] = {} + for block in blocks: + row = block.start.row + if row in groups: + groups[row].append(block) + else: + groups[row] = [block] + return groups + + def __init__(self, sects={""}): + HTMLParser.__init__(self) + self.text = [""] + self.ishead = False + self.isinde = False + self.isbull = False + self.ispref = False + self.ishidden = False + self.idhead = set() + self.idinde = set() + self.idbull = set() + self.idpref = set() + self.idimgs = set() + self.sects = sects + self.sectsindex = {} + self.italic_marks: List[TextMark] = [] + self.bold_marks: List[TextMark] = [] + self.imgs: Dict[int, str] = dict() + + def handle_starttag(self, tag, attrs): + if re.match("h[1-6]", tag) is not None: + self.ishead = True + elif tag in self.inde: + self.isinde = True + elif tag in self.pref: + self.ispref = True + elif tag in self.bull: + self.isbull = True + elif tag 
in self.hide: + self.ishidden = True + elif tag == "sup": + self.text[-1] += "^{" + elif tag == "sub": + self.text[-1] += "_{" + # NOTE: "img" and "image" + # In HTML, both are startendtag (no need endtag) + # but in XHTML both need endtag + elif tag in {"img", "image"}: + for i in attrs: + if (tag == "img" and i[0] == "src") or (tag == "image" and i[0].endswith("href")): + this_line = len(self.text) + self.idimgs.add(this_line) + self.imgs[this_line] = unquote(i[1]) + self.text.append("[IMAGE]") + # formatting + elif tag in self.ital: + if len(self.italic_marks) == 0 or self.italic_marks[-1].is_valid(): + char_pos = CharPos(row=len(self.text) - 1, col=len(self.text[-1])) + self.italic_marks.append(TextMark(start=char_pos)) + elif tag in self.bold: + if len(self.bold_marks) == 0 or self.bold_marks[-1].is_valid(): + char_pos = CharPos(row=len(self.text) - 1, col=len(self.text[-1])) + self.bold_marks.append(TextMark(start=char_pos)) + if self.sects != {""}: + for i in attrs: + if i[0] == "id" and i[1] in self.sects: + # self.text[-1] += " (#" + i[1] + ") " + # self.sectsindex.append([len(self.text), i[1]]) + self.sectsindex[len(self.text) - 1] = i[1] + + def handle_startendtag(self, tag, attrs): + if tag == "br": + self.text += [""] + elif tag in {"img", "image"}: + for i in attrs: + # if (tag == "img" and i[0] == "src")\ + # or (tag == "image" and i[0] == "xlink:href"): + if (tag == "img" and i[0] == "src") or (tag == "image" and i[0].endswith("href")): + this_line = len(self.text) + self.idimgs.add(this_line) + self.imgs[this_line] = unquote(i[1]) + self.text.append("[IMAGE]") + self.text.append("") + # sometimes attribute "id" is inside "startendtag" + # especially html from mobi module (kindleunpack fork) + if self.sects != {""}: + for i in attrs: + if i[0] == "id" and i[1] in self.sects: + # self.text[-1] += " (#" + i[1] + ") " + self.sectsindex[len(self.text) - 1] = i[1] + + def handle_endtag(self, tag): + if re.match("h[1-6]", tag) is not None: + self.text.append("") + self.text.append("") + self.ishead = False + elif tag in self.para: + self.text.append("") + elif tag in self.hide: + self.ishidden = False + elif tag in self.inde: + if self.text[-1] != "": + self.text.append("") + self.isinde = False + elif tag in self.pref: + if self.text[-1] != "": + self.text.append("") + self.ispref = False + elif tag in self.bull: + if self.text[-1] != "": + self.text.append("") + self.isbull = False + elif tag in {"sub", "sup"}: + self.text[-1] += "}" + elif tag in {"img", "image"}: + self.text.append("") + # formatting + elif tag in self.ital: + char_pos = CharPos(row=len(self.text) - 1, col=len(self.text[-1])) + last_mark = self.italic_marks[-1] + self.italic_marks[-1] = dataclasses.replace(last_mark, end=char_pos) + elif tag in self.bold: + char_pos = CharPos(row=len(self.text) - 1, col=len(self.text[-1])) + last_mark = self.bold_marks[-1] + self.bold_marks[-1] = dataclasses.replace(last_mark, end=char_pos) + + def handle_data(self, raw): + if raw and not self.ishidden: + if self.text[-1] == "": + tmp = raw.lstrip() + else: + tmp = raw + if self.ispref: + line = unescape(tmp) + else: + line = unescape(re.sub(r"\s+", " ", tmp)) + self.text[-1] += line + if self.ishead: + self.idhead.add(len(self.text) - 1) + elif self.isbull: + self.idbull.add(len(self.text) - 1) + elif self.isinde: + self.idinde.add(len(self.text) - 1) + elif self.ispref: + self.idpref.add(len(self.text) - 1) + + def get_structured_text( + self, textwidth: Optional[int] = 0, starting_line: int = 0 + ) -> Union[Tuple[str, 
...], TextStructure]: + + if not textwidth: + return tuple(self.text) + + text: List[str] = [] + images: Dict[int, str] = dict() # {line_num: path/in/zip} + sect: Dict[str, int] = dict() # {section_id: line_num} + formatting: List[InlineStyle] = [] + + italic_spans: List[TextSpan] = HTMLtoLines._mark_to_spans(self.text, self.italic_marks) + bold_spans: List[TextSpan] = HTMLtoLines._mark_to_spans(self.text, self.bold_marks) + italic_groups = HTMLtoLines._group_spans_by_row(italic_spans) + bold_groups = HTMLtoLines._group_spans_by_row(bold_spans) + + for n, line in enumerate(self.text): + + startline = len(text) + # findsect = re.search(r"(?<= \(#).*?(?=\) )", line) + # if findsect is not None and findsect.group() in self.sects: + # line = line.replace(" (#" + findsect.group() + ") ", "") + # # line = line.replace(" (#" + findsect.group() + ") ", " "*(5+len(findsect.group()))) + # sect[findsect.group()] = len(text) + if n in self.sectsindex.keys(): + sect[self.sectsindex[n]] = starting_line + len(text) + if n in self.idhead: + # text += [line.rjust(textwidth // 2 + len(line) // 2)] + [""] + text += [line.center(textwidth)] + [""] + formatting += [ + InlineStyle( + row=starting_line + i, col=0, n_letters=len(text[i]), attr=self.attr_bold + ) + for i in range(startline, len(text)) + ] + elif n in self.idinde: + text += [" " + i for i in textwrap.wrap(line, textwidth - 3)] + [""] + elif n in self.idbull: + tmp = textwrap.wrap(line, textwidth - 3) + text += [" - " + i if i == tmp[0] else " " + i for i in tmp] + [""] + elif n in self.idpref: + tmp = line.splitlines() + wraptmp = [] + for tmp_line in tmp: + wraptmp += [i for i in textwrap.wrap(tmp_line, textwidth - 6)] + text += [" " + i for i in wraptmp] + [""] + elif n in self.idimgs: + images[starting_line + len(text)] = self.imgs[n] + text += [line.center(textwidth)] + formatting += [ + InlineStyle( + row=starting_line + len(text) - 1, + col=0, + n_letters=len(text[-1]), + attr=self.attr_bold, + ) + ] + text += [""] + else: + text += textwrap.wrap(line, textwidth) + [""] + + endline = len(text) # -1 + + left_adjustment = 3 if n in self.idbull | self.idinde else 0 + + for spans in italic_groups.get(n, []): + italics = HTMLtoLines._adjust_wrapped_spans( + text[startline:endline], + spans, + line_adjustment=startline, + left_adjustment=left_adjustment, + ) + for span in italics: + formatting.append( + InlineStyle( + row=starting_line + span.start.row, + col=span.start.col, + n_letters=span.n_letters, + attr=self.attr_italic, + ) + ) + + for spans in bold_groups.get(n, []): + bolds = HTMLtoLines._adjust_wrapped_spans( + text[startline:endline], + spans, + line_adjustment=startline, + left_adjustment=left_adjustment, + ) + for span in bolds: + formatting.append( + InlineStyle( + row=starting_line + span.start.row, + col=span.start.col, + n_letters=span.n_letters, + attr=self.attr_bold, + ) + ) + + # chapter suffix + text += ["***".center(textwidth)] + + return TextStructure( + text_lines=tuple(text), + image_maps=images, + section_rows=sect, + formatting=tuple(formatting), + ) + + +def parse_html( + html_src: str, + *, + textwidth: Optional[int] = None, + section_ids: Optional[Set[str]] = None, + starting_line: int = 0, +) -> Union[Tuple[str, ...], TextStructure]: + """ + Parse html string into TextStructure + + :param html_src: html str to parse + :param textwidth: textwidth to count max length of returned TextStructure + if None given, sequence of text as paragraph is returned + :param section_ids: set of section ids to look for inside html tag 
attr + :return: Tuple[str, ...] if textwidth not given else TextStructure + """ + if not section_ids: + section_ids = set() + + parser = HTMLtoLines(section_ids) + # try: + parser.feed(html_src) + parser.close() + # except: + # pass + + return parser.get_structured_text(textwidth, starting_line) diff --git a/src/epy_reader/reader.py b/src/epy_reader/reader.py new file mode 100644 index 0000000..a903b62 --- /dev/null +++ b/src/epy_reader/reader.py @@ -0,0 +1,1610 @@ +import curses +import dataclasses +import multiprocessing +import os +import re +import shutil +import signal +import sqlite3 +import subprocess +import sys +import tempfile +import uuid +import xml.etree.ElementTree as ET +from html import unescape +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union + +import epy_reader.settings as settings +from epy_reader.board import InfiniBoard +from epy_reader.config import Config +from epy_reader.ebooks import Azw, Ebook, Epub, Mobi +from epy_reader.lib import resolve_path +from epy_reader.models import ( + Direction, + InlineStyle, + Key, + LettersCount, + NoUpdate, + ReadingState, + SearchData, + TextStructure, + TocEntry, +) +from epy_reader.parser import parse_html +from epy_reader.settings import DoubleSpreadPadding +from epy_reader.speakers import SpeakerBaseModel +from epy_reader.state import State +from epy_reader.utils import ( + choice_win, + construct_relative_reading_state, + construct_speaker, + count_letters, + count_letters_parallel, + find_current_content_index, + get_ebook_obj, + merge_text_structures, + pgend, + safe_curs_set, + text_win, +) + + +# TODO: to be deprecated +DEBUG = False + + +class Reader: + def __init__(self, screen, ebook: Ebook, config: Config, state: State): + + self.setting = config.setting + self.keymap = config.keymap + # to build help menu text + self.keymap_user_dict = config.keymap_user_dict + + self.seamless = self.setting.SeamlessBetweenChapters + + # keys that will make + # windows exit and return the said key + self._win_keys = ( + # curses.KEY_RESIZE is a must + (Key(curses.KEY_RESIZE),) + + self.keymap.TableOfContents + + self.keymap.Metadata + + self.keymap.Help + ) + + # screen initialization + self.screen = screen + self.screen.keypad(True) + safe_curs_set(0) + if self.setting.MouseSupport: + curses.mousemask(-1) + # curses.mouseinterval(0) + self.screen.clear() + + # screen color + self.is_color_supported: bool = False + try: + curses.use_default_colors() + curses.init_pair(1, self.setting.DefaultColorFG, self.setting.DefaultColorBG) + curses.init_pair(2, self.setting.DarkColorFG, self.setting.DarkColorBG) + curses.init_pair(3, self.setting.LightColorFG, self.setting.LightColorBG) + self.screen.bkgd(curses.color_pair(1)) + self.is_color_supported = True + except: + self.is_color_supported = False + + # show loader and start heavy resources processes + self.show_loader(subtext="initalizing ebook") + + # main ebook object + self.ebook = ebook + try: + self.ebook.initialize() + except (KeyboardInterrupt, Exception) as e: + self.ebook.cleanup() + if DEBUG: + raise e + else: + sys.exit("ERROR: Badly-structured ebook.\n" + str(e)) + + # state + self.state = state + + # page scroll animation + self.page_animation: Optional[Direction] = None + + # show reading progress + self.show_reading_progress: bool = self.setting.ShowProgressIndicator + self.reading_progress: Optional[float] = None # calculate after count_letters() + + # search storage + self.search_data: Optional[SearchData] = None + + # double spread + self.spread = 2 
if self.setting.StartWithDoubleSpread else 1 + + # jumps marker container + self.jump_list: Dict[str, ReadingState] = dict() + + # TTS speaker utils + self._tts_speaker: Optional[SpeakerBaseModel] = construct_speaker( + self.setting.PreferredTTSEngine, self.setting.TTSEngineArgs + ) + self.tts_support: bool = bool(self._tts_speaker) + self.is_speaking: bool = False + + # multi process & progress percentage + self._multiprocess_support: bool = False if multiprocessing.cpu_count() == 1 else True + self._process_counting_letter: Optional[multiprocessing.Process] = None + self.letters_count: Optional[LettersCount] = None + + def run_counting_letters(self): + if self._multiprocess_support: + try: + self._proc_parent, self._proc_child = multiprocessing.Pipe() + self._process_counting_letter = multiprocessing.Process( + name="epy-subprocess-counting-letters", + target=count_letters_parallel, + args=(self.ebook, self._proc_child), + ) + # forking will raise + # zlib.error: Error -3 while decompressing data: invalid distance too far back + self._process_counting_letter.start() + except Exception as e: + if DEBUG: + raise e + self._multiprocess_support = False + if not self._multiprocess_support: + self.letters_count = count_letters(self.ebook) + + def try_assign_letters_count(self, *, force_wait=False) -> None: + if isinstance(self._process_counting_letter, multiprocessing.Process): + if force_wait and self._process_counting_letter.is_alive(): + self._process_counting_letter.join() + + if self._process_counting_letter.exitcode == 0: + self.letters_count = self._proc_parent.recv() + self._proc_parent.close() + self._process_counting_letter.terminate() + self._process_counting_letter.close() + self._process_counting_letter = None + + def calculate_reading_progress( + self, letters_per_content: List[int], reading_state: ReadingState + ) -> None: + if self.letters_count: + self.reading_progress = ( + self.letters_count.cumulative[reading_state.content_index] + + sum( + letters_per_content[: reading_state.row + (self.screen_rows * self.spread) - 1] + ) + ) / self.letters_count.all + + @property + def screen_rows(self) -> int: + return self.screen.getmaxyx()[0] + + @property + def screen_cols(self) -> int: + return self.screen.getmaxyx()[1] + + @property + def ext_dict_app(self) -> Optional[str]: + self._ext_dict_app: Optional[str] = None + + if shutil.which(self.setting.DictionaryClient.split()[0]): + self._ext_dict_app = self.setting.DictionaryClient + else: + for i in settings.DICT_PRESET_LIST: + if shutil.which(i) is not None: + self._ext_dict_app = i + break + if self._ext_dict_app in {"sdcv"}: + self._ext_dict_app += " -n" + + return self._ext_dict_app + + @property + def image_viewer(self) -> Optional[str]: + self._image_viewer: Optional[str] = None + + if shutil.which(self.setting.DefaultViewer.split()[0]) is not None: + self._image_viewer = self.setting.DefaultViewer + elif sys.platform == "win32": + self._image_viewer = "start" + elif sys.platform == "darwin": + self._image_viewer = "open" + else: + for i in settings.VIEWER_PRESET_LIST: + if shutil.which(i) is not None: + self._image_viewer = i + break + + if self._image_viewer in {"gio"}: + self._image_viewer += " open" + + return self._image_viewer + + def open_image(self, pad, name, bstr): + sfx = os.path.splitext(name)[1] + fd, path = tempfile.mkstemp(suffix=sfx) + try: + with os.fdopen(fd, "wb") as tmp: + # tmp.write(epub.file.read(src)) + tmp.write(bstr) + # run(VWR + " " + path, shell=True) + subprocess.call( + self.image_viewer + " " + 
path, + shell=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + k = pad.getch() + finally: + os.remove(path) + return k + + def show_loader(self, *, loader_str: str = "\u231B", subtext: Optional[str] = None): + self.screen.clear() + rows, cols = self.screen.getmaxyx() + middle_row = (rows - 1) // 2 + self.screen.addstr(middle_row, 0, loader_str.center(cols)) + if subtext: + self.screen.addstr(middle_row + 1, 0, subtext.center(cols)) + # self.screen.addstr(((rows-2)//2)+1, (cols-len(msg))//2, msg) + self.screen.refresh() + + @choice_win(True) + def show_win_options(self, title, options, active_index, key_set): + return title, options, active_index, key_set + + @text_win + def show_win_error(self, title, msg, key): + return title, msg, key + + @choice_win() + def toc(self, toc_entries: Tuple[TocEntry, ...], index: int): + return ( + "Table of Contents", + [i.label for i in toc_entries], + index, + self.keymap.TableOfContents, + ) + + @text_win + def show_win_metadata(self): + if os.path.isfile(self.ebook.path): + mdata = "[File Info]\nPATH: {}\nSIZE: {} MB\n \n[Book Info]\n".format( + self.ebook.path, round(os.path.getsize(self.ebook.path) / 1024**2, 2) + ) + else: + mdata = "[File Info]\nPATH: {}\n \n[Book Info]\n".format(self.ebook.path) + + book_metadata = self.ebook.get_meta() + for field in dataclasses.fields(book_metadata): + value = getattr(book_metadata, field.name) + if value: + value = unescape(re.sub("<[^>]*>", "", value)) + mdata += f"{field.name.title()}: {value}\n" + + return "Metadata", mdata, self.keymap.Metadata + + @text_win + def show_win_help(self): + src = "Key Bindings:\n" + dig = max([len(i) for i in self.keymap_user_dict.values()]) + 2 + for i in self.keymap_user_dict.keys(): + src += "{} {}\n".format( + self.keymap_user_dict[i].rjust(dig), " ".join(re.findall("[A-Z][^A-Z]*", i)) + ) + return "Help", src, self.keymap.Help + + @text_win + def define_word(self, word): + rows, cols = self.screen.getmaxyx() + hi, wi = 5, 16 + Y, X = (rows - hi) // 2, (cols - wi) // 2 + + p = subprocess.Popen( + "{} {}".format(self.ext_dict_app, word), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True, + ) + + dictwin = curses.newwin(hi, wi, Y, X) + dictwin.box() + dictwin.addstr((hi - 1) // 2, (wi - 10) // 2, "Loading...") + dictwin.refresh() + + out, err = p.communicate() + + dictwin.clear() + dictwin.refresh() + + if err == b"": + return "Definition: " + word.upper(), out.decode(), self.keymap.DefineWord + else: + return "Error: " + self.ext_dict_app, err.decode(), self.keymap.DefineWord + + def show_win_choices_bookmarks(self): + idx = 0 + while True: + bookmarks = [i[0] for i in self.state.get_bookmarks(self.ebook)] + if not bookmarks: + return self.keymap.ShowBookmarks[0], None + + retk, idx, todel = self.show_win_options( + "Bookmarks", bookmarks, idx, self.keymap.ShowBookmarks + ) + if todel is not None: + self.state.delete_bookmark(self.ebook, bookmarks[todel]) + else: + return retk, idx + + def show_win_library(self): + while True: + library_items = self.state.get_from_history() + if not library_items: + return self.keymap.Library[0], None + + retk, choice_index, todel_index = self.show_win_options( + "Library", [str(item) for item in library_items], 0, self.keymap.Library + ) + if todel_index is not None: + self.state.delete_from_library(library_items[todel_index].filepath) + else: + return retk, choice_index + + def input_prompt(self, prompt: str) -> Union[NoUpdate, Key, str]: + """ + :param prompt: prompt text + :return: NoUpdate if cancelled 
or interrupted + Key if curses.KEY_RESIZE triggered + str for successful input + """ + # prevent pad hole when prompting for input while + # other window is active + # pad.refresh(y, 0, 0, x, rows-2, x+width) + rows, cols = self.screen.getmaxyx() + stat = curses.newwin(1, cols, rows - 1, 0) + if self.is_color_supported: + stat.bkgd(self.screen.getbkgd()) + stat.keypad(True) + curses.echo(True) + safe_curs_set(2) + + init_text = "" + + stat.addstr(0, 0, prompt, curses.A_REVERSE) + stat.addstr(0, len(prompt), init_text) + stat.refresh() + + try: + while True: + # NOTE: getch() only handles ascii + # to handle wide char like: é, use get_wch() + ipt = Key(stat.get_wch()) + # get_wch() return ambiguous type + # str for string input but int for function or special keys + # if type(ipt) == str: + # ipt = ord(ipt) + + if ipt == Key(27): + stat.clear() + stat.refresh() + curses.echo(False) + safe_curs_set(0) + return NoUpdate() + elif ipt == Key(10): + stat.clear() + stat.refresh() + curses.echo(False) + safe_curs_set(0) + return init_text + elif ipt in (Key(8), Key(127), Key(curses.KEY_BACKSPACE)): + init_text = init_text[:-1] + elif ipt == Key(curses.KEY_RESIZE): + stat.clear() + stat.refresh() + curses.echo(False) + safe_curs_set(0) + return Key(curses.KEY_RESIZE) + # elif len(init_text) <= maxlen: + else: + init_text += ipt.char + + stat.clear() + stat.addstr(0, 0, prompt, curses.A_REVERSE) + stat.addstr( + 0, + len(prompt), + init_text + if len(prompt + init_text) < cols + else "..." + init_text[len(prompt) - cols + 4 :], + ) + stat.refresh() + except KeyboardInterrupt: + stat.clear() + stat.refresh() + curses.echo(False) + safe_curs_set(0) + return NoUpdate() + + def searching( + self, board: InfiniBoard, src: Sequence[str], reading_state: ReadingState, tot + ) -> Union[NoUpdate, ReadingState, Key]: + # reusable loop indices + i: Any + j: Any + + rows, cols = self.screen.getmaxyx() + # unnecessary + # if self.spread == 2: + # reading_state = dataclasses.replace(reading_state, textwidth=(cols - 7) // 2) + + x = (cols - reading_state.textwidth) // 2 + if self.spread == 1: + x = (cols - reading_state.textwidth) // 2 + else: + x = 2 + + if not self.search_data: + candidate_text = self.input_prompt(" Regex:") + # if isinstance(candidate_text, str) and candidate_text != "": + if isinstance(candidate_text, str) and candidate_text: + self.search_data = SearchData(value=candidate_text) + else: + assert isinstance(candidate_text, NoUpdate) or isinstance(candidate_text, Key) + return candidate_text + + found = [] + try: + pattern = re.compile(self.search_data.value, re.IGNORECASE) + except re.error as reerrmsg: + self.search_data = None + tmpk = self.show_win_error("!Regex Error", str(reerrmsg), tuple()) + return tmpk + + for n, i in enumerate(src): + for j in pattern.finditer(i): + found.append([n, j.span()[0], j.span()[1] - j.span()[0]]) + + if not found: + if ( + self.search_data.direction == Direction.FORWARD + and reading_state.content_index + 1 < tot + ): + return ReadingState( + content_index=reading_state.content_index + 1, + textwidth=reading_state.textwidth, + row=0, + ) + elif ( + self.search_data.direction == Direction.BACKWARD and reading_state.content_index > 0 + ): + return ReadingState( + content_index=reading_state.content_index - 1, + textwidth=reading_state.textwidth, + row=0, + ) + else: + s: Union[NoUpdate, Key] = NoUpdate() + while True: + if s in self.keymap.Quit: + self.search_data = None + self.screen.clear() + self.screen.refresh() + return reading_state + # TODO: maybe >= 0? 
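+                    # ie. when nothing matched in this chapter, "n"/"N" below
+                    # hand the search over to the next/previous chapter by
+                    # returning a fresh ReadingState with row=0; the TODO
+                    # above questions whether the content_index == 0 guard
+                    # should instead be content_index >= 0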
+ elif s == Key("n") and reading_state.content_index == 0: + self.search_data = dataclasses.replace( + self.search_data, direction=Direction.FORWARD + ) + return ReadingState( + content_index=reading_state.content_index + 1, + textwidth=reading_state.textwidth, + row=0, + ) + elif s == Key("N") and reading_state.content_index + 1 == tot: + self.search_data = dataclasses.replace( + self.search_data, direction=Direction.BACKWARD + ) + return ReadingState( + content_index=reading_state.content_index - 1, + textwidth=reading_state.textwidth, + row=0, + ) + + self.screen.clear() + self.screen.addstr( + rows - 1, + 0, + " Finished searching: " + self.search_data.value[: cols - 22] + " ", + curses.A_REVERSE, + ) + board.write(reading_state.row, 1) + self.screen.refresh() + s = board.getch() + + sidx = len(found) - 1 + if self.search_data.direction == Direction.FORWARD: + if reading_state.row > found[-1][0]: + return ReadingState( + content_index=reading_state.content_index + 1, + textwidth=reading_state.textwidth, + row=0, + ) + for n, i in enumerate(found): + if i[0] >= reading_state.row: + sidx = n + break + + s = NoUpdate() + msg = ( + " Searching: " + + self.search_data.value + + " --- Res {}/{} Ch {}/{} ".format( + sidx + 1, len(found), reading_state.content_index + 1, tot + ) + ) + while True: + if s in self.keymap.Quit: + self.search_data = None + # for i in found: + # pad.chgat(i[0], i[1], i[2], pad.getbkgd()) + board.feed_temporary_style() + # pad.format() + # self.screen.clear() + # self.screen.refresh() + return reading_state + elif s == Key("n"): + self.search_data = dataclasses.replace( + self.search_data, direction=Direction.FORWARD + ) + if sidx == len(found) - 1: + if reading_state.content_index + 1 < tot: + return ReadingState( + content_index=reading_state.content_index + 1, + textwidth=reading_state.textwidth, + row=0, + ) + else: + s = NoUpdate() + msg = " Finished searching: " + self.search_data.value + " " + continue + else: + sidx += 1 + msg = ( + " Searching: " + + self.search_data.value + + " --- Res {}/{} Ch {}/{} ".format( + sidx + 1, len(found), reading_state.content_index + 1, tot + ) + ) + elif s == Key("N"): + self.search_data = dataclasses.replace( + self.search_data, direction=Direction.BACKWARD + ) + if sidx == 0: + if reading_state.content_index > 0: + return ReadingState( + content_index=reading_state.content_index - 1, + textwidth=reading_state.textwidth, + row=0, + ) + else: + s = NoUpdate() + msg = " Finished searching: " + self.search_data.value + " " + continue + else: + sidx -= 1 + msg = ( + " Searching: " + + self.search_data.value + + " --- Res {}/{} Ch {}/{} ".format( + sidx + 1, len(found), reading_state.content_index + 1, tot + ) + ) + elif s == Key(curses.KEY_RESIZE): + return Key(curses.KEY_RESIZE) + + # if reading_state.row + rows - 1 > pad.chunks[pad.find_chunkidx(reading_state.row)]: + # reading_state = dataclasses.replace( + # reading_state, row=pad.chunks[pad.find_chunkidx(reading_state.row)] + 1 + # ) + + while found[sidx][0] not in list( + range(reading_state.row, reading_state.row + (rows - 1) * self.spread) + ): + if found[sidx][0] > reading_state.row: + reading_state = dataclasses.replace( + reading_state, row=reading_state.row + ((rows - 1) * self.spread) + ) + else: + reading_state = dataclasses.replace( + reading_state, row=reading_state.row - ((rows - 1) * self.spread) + ) + if reading_state.row < 0: + reading_state = dataclasses.replace(reading_state, row=0) + + # formats = [InlineStyle(row=i[0], col=i[1], n_letters=i[2], 
attr=curses.A_REVERSE) for i in found] + # pad.feed_style(formats) + styles: List[InlineStyle] = [] + for n, i in enumerate(found): + attr = curses.A_REVERSE if n == sidx else curses.A_NORMAL + # pad.chgat(i[0], i[1], i[2], pad.getbkgd() | attr) + styles.append( + InlineStyle(row=i[0], col=i[1], n_letters=i[2], attr=board.getbkgd() | attr) + ) + board.feed_temporary_style(tuple(styles)) + + self.screen.clear() + self.screen.addstr(rows - 1, 0, msg, curses.A_REVERSE) + self.screen.refresh() + # pad.refresh(reading_state.row, 0, 0, x, rows - 2, x + reading_state.textwidth) + board.write(reading_state.row, 1) + s = board.getch() + + def speaking(self, text): + self.is_speaking = True + self.screen.addstr(self.screen_rows - 1, 0, " Speaking! ", curses.A_REVERSE) + self.screen.refresh() + self.screen.timeout(1) + try: + self._tts_speaker.speak(text) + + while True: + if self._tts_speaker.is_done(): + k = self.keymap.PageDown[0] + break + tmp = self.screen.getch() + k = NoUpdate() if tmp == -1 else Key(tmp) + if k == Key(curses.KEY_MOUSE): + mouse_event = curses.getmouse() + if mouse_event[4] == curses.BUTTON2_CLICKED: + k = self.keymap.Quit[0] + elif mouse_event[4] == curses.BUTTON1_CLICKED: + if mouse_event[1] < self.screen_cols // 2: + k = self.keymap.PageUp[0] + else: + k = self.keymap.PageDown[0] + elif mouse_event[4] == curses.BUTTON4_PRESSED: + k = self.keymap.ScrollUp[0] + elif mouse_event[4] == 2097152: + k = self.keymap.ScrollDown[0] + if ( + k + in self.keymap.Quit + + self.keymap.PageUp + + self.keymap.PageDown + + self.keymap.ScrollUp + + self.keymap.ScrollDown + + (curses.KEY_RESIZE,) + ): + self._tts_speaker.stop() + break + finally: + self.screen.timeout(-1) + self._tts_speaker.cleanup() + + if k in self.keymap.Quit: + self.is_speaking = False + k = NoUpdate() + return k + + def savestate(self, reading_state: ReadingState) -> None: + if self.seamless: + reading_state = self.convert_absolute_reading_state_to_relative(reading_state) + self.state.set_last_reading_state(self.ebook, reading_state) + self.state.update_library(self.ebook, self.reading_progress) + + def cleanup(self) -> None: + self.ebook.cleanup() + + if isinstance(self._process_counting_letter, multiprocessing.Process): + if self._process_counting_letter.is_alive(): + self._process_counting_letter.terminate() + # weird python multiprocessing issue, need to call .join() before .close() + # ValueError: Cannot close a process while it is still running. + # You should first call join() or terminate(). 
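+            # in short: for a still-running process the safe teardown
+            # order is terminate() -> join() -> close()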
+ self._process_counting_letter.join() + self._process_counting_letter.close() + + def convert_absolute_reading_state_to_relative(self, reading_state) -> ReadingState: + if not self.seamless: + raise RuntimeError( + "Reader.convert_absolute_reading_state_to_relative() only implemented when Seamless=True" + ) + return construct_relative_reading_state(reading_state, self.totlines_per_content) + + def convert_relative_reading_state_to_absolute( + self, reading_state: ReadingState + ) -> ReadingState: + if not self.seamless: + raise RuntimeError( + "Reader.convert_relative_reading_state_to_absolute() only implemented when Seamless=True" + ) + + absolute_row = reading_state.row + sum( + self.totlines_per_content[: reading_state.content_index] + ) + absolute_pctg = ( + absolute_row / sum(self.totlines_per_content) if reading_state.rel_pctg else None + ) + + return dataclasses.replace( + reading_state, content_index=0, row=absolute_row, rel_pctg=absolute_pctg + ) + + def get_all_book_contents( + self, reading_state: ReadingState + ) -> Tuple[TextStructure, Tuple[TocEntry, ...], Union[Tuple[str, ...], Tuple[ET.Element, ...]]]: + if not self.seamless: + raise RuntimeError("Reader.get_all_book_contents() only implemented when Seamless=True") + + contents = self.ebook.contents + toc_entries = self.ebook.toc_entries + + text_structure: TextStructure = TextStructure( + text_lines=tuple(), image_maps=dict(), section_rows=dict(), formatting=tuple() + ) + toc_entries_tmp: List[TocEntry] = [] + section_rows_tmp: Dict[str, int] = dict() + + # self.totlines_per_content only defined when Seamless=True + self.totlines_per_content: Tuple[int, ...] = tuple() + + for n, content in enumerate(contents): + self.show_loader(subtext=f"loading contents ({n+1}/{len(contents)})") + starting_line = sum(self.totlines_per_content) + assert isinstance(content, str) or isinstance(content, ET.Element) + text_structure_tmp = parse_html( + self.ebook.get_raw_text(content), + textwidth=reading_state.textwidth, + section_ids=set(toc_entry.section for toc_entry in toc_entries), # type: ignore + starting_line=starting_line, + ) + assert isinstance(text_structure_tmp, TextStructure) + # self.totlines_per_content.append(len(text_structure_tmp.text_lines)) + self.totlines_per_content += (len(text_structure_tmp.text_lines),) + + for toc_entry in toc_entries: + if toc_entry.content_index == n: + if toc_entry.section: + toc_entries_tmp.append(dataclasses.replace(toc_entry, content_index=0)) + else: + section_id_tmp = str(uuid.uuid4()) + toc_entries_tmp.append( + TocEntry(label=toc_entry.label, content_index=0, section=section_id_tmp) + ) + section_rows_tmp[section_id_tmp] = starting_line + + text_structure = merge_text_structures(text_structure, text_structure_tmp) + + text_structure = dataclasses.replace( + text_structure, section_rows={**text_structure.section_rows, **section_rows_tmp} + ) + + return text_structure, tuple(toc_entries_tmp), (self.ebook.contents[0],) + + def get_current_book_content( + self, reading_state: ReadingState + ) -> Tuple[TextStructure, Tuple[TocEntry, ...], Union[Tuple[str, ...], Tuple[ET.Element, ...]]]: + contents = self.ebook.contents + toc_entries = self.ebook.toc_entries + content_path = contents[reading_state.content_index] + content = self.ebook.get_raw_text(content_path) + text_structure = parse_html( # type: ignore + content, + textwidth=reading_state.textwidth, + section_ids=set(toc_entry.section for toc_entry in toc_entries), # type: ignore + ) + return text_structure, toc_entries, contents + + def 
read(self, reading_state: ReadingState) -> Union[ReadingState, Ebook]: + # reusable loop indices + i: Any + + k = self.keymap.RegexSearch[0] if self.search_data else NoUpdate() + rows, cols = self.screen.getmaxyx() + + mincols_doublespr = ( + DoubleSpreadPadding.LEFT.value + + 22 + + DoubleSpreadPadding.MIDDLE.value + + 22 + + DoubleSpreadPadding.RIGHT.value + ) + if cols < mincols_doublespr: + self.spread = 1 + if self.spread == 2: + reading_state = dataclasses.replace( + reading_state, + textwidth=( + cols + - sum( + [ + DoubleSpreadPadding.LEFT.value, + DoubleSpreadPadding.MIDDLE.value, + DoubleSpreadPadding.RIGHT.value, + ] + ) + ) + // 2, + ) + x = (cols - reading_state.textwidth) // 2 + if self.spread == 2: + x = DoubleSpreadPadding.LEFT.value + + self.show_loader(subtext="loading contents") + # get text structure, toc entries and contents of the book + if self.seamless: + text_structure, toc_entries, contents = self.get_all_book_contents(reading_state) + # adjustment + reading_state = self.convert_relative_reading_state_to_absolute(reading_state) + else: + text_structure, toc_entries, contents = self.get_current_book_content(reading_state) + + totlines = len(text_structure.text_lines) + + if reading_state.row < 0 and totlines <= rows * self.spread: + reading_state = dataclasses.replace(reading_state, row=0) + elif reading_state.rel_pctg is not None: + reading_state = dataclasses.replace( + reading_state, row=round(reading_state.rel_pctg * totlines) + ) + else: + reading_state = dataclasses.replace(reading_state, row=reading_state.row % totlines) + + board = InfiniBoard( + screen=self.screen, + text=text_structure.text_lines, + textwidth=reading_state.textwidth, + default_style=text_structure.formatting, + spread=self.spread, + ) + + letters_per_content: List[int] = [] + for i in text_structure.text_lines: + letters_per_content.append(len(re.sub(r"\s", "", i))) + + self.screen.clear() + self.screen.refresh() + # try-except clause if there is issue + # with curses resize event + board.write(reading_state.row) + + # if reading_state.section is not None + # then override reading_state.row to follow the section + if reading_state.section: + reading_state = dataclasses.replace( + reading_state, row=text_structure.section_rows.get(reading_state.section, 0) + ) + + checkpoint_row: Optional[int] = None + countstring = "" + + try: + while True: + if countstring == "": + count = 1 + else: + count = int(countstring) + if k in tuple(Key(i) for i in range(48, 58)): # i.e., k is a numeral + countstring = countstring + k.char + else: + if k in self.keymap.Quit: + if k == Key(27) and countstring != "": + countstring = "" + else: + self.try_assign_letters_count(force_wait=True) + self.calculate_reading_progress(letters_per_content, reading_state) + + self.savestate( + dataclasses.replace( + reading_state, rel_pctg=reading_state.row / totlines + ) + ) + sys.exit() + + elif k in self.keymap.TTSToggle and self.tts_support: + tospeak = "" + for i in text_structure.text_lines[ + reading_state.row : reading_state.row + (rows * self.spread) + ]: + if re.match(r"^\s*$", i) is not None: + tospeak += "\n. 
\n" + else: + tospeak += i + " " + k = self.speaking(tospeak) + if ( + totlines - reading_state.row <= rows + and reading_state.content_index == len(contents) - 1 + ): + self.is_speaking = False + continue + + elif k in self.keymap.DoubleSpreadToggle: + if cols < mincols_doublespr: + k = self.show_win_error( + "Screen is too small", + "Min: {} cols x {} rows".format(mincols_doublespr, 12), + (Key("D"),), + ) + self.spread = (self.spread % 2) + 1 + return ReadingState( + content_index=reading_state.content_index, + textwidth=reading_state.textwidth, + row=reading_state.row, + rel_pctg=reading_state.row / totlines, + ) + + elif k in self.keymap.ScrollUp: + if self.spread == 2: + k = self.keymap.PageUp[0] + continue + if count > 1: + checkpoint_row = reading_state.row - 1 + if reading_state.row >= count: + reading_state = dataclasses.replace( + reading_state, row=reading_state.row - count + ) + elif reading_state.row == 0 and reading_state.content_index != 0: + self.page_animation = Direction.BACKWARD + # return -1, width, -rows, None, "" + return ReadingState( + content_index=reading_state.content_index - 1, + textwidth=reading_state.textwidth, + row=-rows, + ) + else: + reading_state = dataclasses.replace(reading_state, row=0) + + elif k in self.keymap.PageUp: + if reading_state.row == 0 and reading_state.content_index != 0: + self.page_animation = Direction.BACKWARD + text_structure_content_before = parse_html( + self.ebook.get_raw_text(contents[reading_state.content_index - 1]), + textwidth=reading_state.textwidth, + ) + assert isinstance(text_structure_content_before, TextStructure) + return ReadingState( + content_index=reading_state.content_index - 1, + textwidth=reading_state.textwidth, + row=rows + * self.spread + * ( + len(text_structure_content_before.text_lines) + // (rows * self.spread) + ), + ) + else: + if reading_state.row >= rows * self.spread * count: + self.page_animation = Direction.BACKWARD + reading_state = dataclasses.replace( + reading_state, + row=reading_state.row - (rows * self.spread * count), + ) + else: + reading_state = dataclasses.replace(reading_state, row=0) + + elif k in self.keymap.ScrollDown: + if self.spread == 2: + k = self.keymap.PageDown[0] + continue + if count > 1: + checkpoint_row = reading_state.row + rows - 1 + if reading_state.row + count <= totlines - rows: + reading_state = dataclasses.replace( + reading_state, row=reading_state.row + count + ) + elif ( + reading_state.row >= totlines - rows + and reading_state.content_index != len(contents) - 1 + ): + self.page_animation = Direction.FORWARD + return ReadingState( + content_index=reading_state.content_index + 1, + textwidth=reading_state.textwidth, + row=0, + ) + + elif k in self.keymap.PageDown: + if totlines - reading_state.row > rows * self.spread: + self.page_animation = Direction.FORWARD + reading_state = dataclasses.replace( + reading_state, row=reading_state.row + (rows * self.spread) + ) + elif reading_state.content_index != len(contents) - 1: + self.page_animation = Direction.FORWARD + return ReadingState( + content_index=reading_state.content_index + 1, + textwidth=reading_state.textwidth, + row=0, + ) + + # elif k in K["HalfScreenUp"] | K["HalfScreenDown"]: + # countstring = str(rows // 2) + # k = list(K["ScrollUp" if k in K["HalfScreenUp"] else "ScrollDown"])[0] + # continue + + elif k in self.keymap.NextChapter: + ntoc = find_current_content_index( + toc_entries, + text_structure.section_rows, + reading_state.content_index, + reading_state.row, + ) + if ntoc < len(toc_entries) - 1: 
+ if reading_state.content_index == toc_entries[ntoc + 1].content_index: + try: + reading_state = dataclasses.replace( + reading_state, + row=text_structure.section_rows[ + toc_entries[ntoc + 1].section # type: ignore + ], + ) + except KeyError: + pass + else: + return ReadingState( + content_index=toc_entries[ntoc + 1].content_index, + textwidth=reading_state.textwidth, + row=0, + section=toc_entries[ntoc + 1].section, + ) + + elif k in self.keymap.PrevChapter: + ntoc = find_current_content_index( + toc_entries, + text_structure.section_rows, + reading_state.content_index, + reading_state.row, + ) + if ntoc > 0: + if reading_state.content_index == toc_entries[ntoc - 1].content_index: + reading_state = dataclasses.replace( + reading_state, + row=text_structure.section_rows.get( + toc_entries[ntoc - 1].section, 0 # type: ignore + ), + ) + else: + return ReadingState( + content_index=toc_entries[ntoc - 1].content_index, + textwidth=reading_state.textwidth, + row=0, + section=toc_entries[ntoc - 1].section, + ) + + elif k in self.keymap.BeginningOfCh: + ntoc = find_current_content_index( + toc_entries, + text_structure.section_rows, + reading_state.content_index, + reading_state.row, + ) + try: + reading_state = dataclasses.replace( + reading_state, + row=text_structure.section_rows[toc_entries[ntoc].section], # type: ignore + ) + except (KeyError, IndexError): + reading_state = dataclasses.replace(reading_state, row=0) + + elif k in self.keymap.EndOfCh: + ntoc = find_current_content_index( + toc_entries, + text_structure.section_rows, + reading_state.content_index, + reading_state.row, + ) + try: + if ( + text_structure.section_rows[toc_entries[ntoc + 1].section] - rows # type: ignore + >= 0 + ): + reading_state = dataclasses.replace( + reading_state, + row=text_structure.section_rows[toc_entries[ntoc + 1].section] # type: ignore + - rows, + ) + else: + reading_state = dataclasses.replace( + reading_state, + row=text_structure.section_rows[toc_entries[ntoc].section], # type: ignore + ) + except (KeyError, IndexError): + reading_state = dataclasses.replace( + reading_state, row=pgend(totlines, rows) + ) + + elif k in self.keymap.TableOfContents: + if not toc_entries: + k = self.show_win_error( + "Table of Contents", + "N/A: TableOfContents is unavailable for this book.", + self.keymap.TableOfContents, + ) + continue + ntoc = find_current_content_index( + toc_entries, + text_structure.section_rows, + reading_state.content_index, + reading_state.row, + ) + rettock, fllwd, _ = self.toc(toc_entries, ntoc) + if rettock is not None: # and rettock in WINKEYS: + k = rettock + continue + elif fllwd is not None: + if reading_state.content_index == toc_entries[fllwd].content_index: + try: + reading_state = dataclasses.replace( + reading_state, + row=text_structure.section_rows[toc_entries[fllwd].section], + ) + except KeyError: + reading_state = dataclasses.replace(reading_state, row=0) + else: + return ReadingState( + content_index=toc_entries[fllwd].content_index, + textwidth=reading_state.textwidth, + row=0, + section=toc_entries[fllwd].section, + ) + + elif k in self.keymap.Metadata: + k = self.show_win_metadata() + if k in self._win_keys: + continue + + elif k in self.keymap.Help: + k = self.show_win_help() + if k in self._win_keys: + continue + + elif ( + k in self.keymap.Enlarge + and (reading_state.textwidth + count) < cols - 4 + and self.spread == 1 + ): + return dataclasses.replace( + reading_state, + textwidth=reading_state.textwidth + count, + rel_pctg=reading_state.row / totlines, + ) + + 
elif ( + k in self.keymap.Shrink + and reading_state.textwidth >= 22 + and self.spread == 1 + ): + return dataclasses.replace( + reading_state, + textwidth=reading_state.textwidth - count, + rel_pctg=reading_state.row / totlines, + ) + + elif k in self.keymap.SetWidth and self.spread == 1: + if countstring == "": + # if called without a count, toggle between 80 cols and full width + if reading_state.textwidth != 80 and cols - 4 >= 80: + return ReadingState( + content_index=reading_state.content_index, + textwidth=80, + row=reading_state.row, + rel_pctg=reading_state.row / totlines, + ) + else: + return ReadingState( + content_index=reading_state.content_index, + textwidth=cols - 4, + row=reading_state.row, + rel_pctg=reading_state.row / totlines, + ) + else: + reading_state = dataclasses.replace(reading_state, textwidth=count) + if reading_state.textwidth < 20: + reading_state = dataclasses.replace(reading_state, textwidth=20) + elif reading_state.textwidth >= cols - 4: + reading_state = dataclasses.replace(reading_state, textwidth=cols - 4) + + return ReadingState( + content_index=reading_state.content_index, + textwidth=reading_state.textwidth, + row=reading_state.row, + rel_pctg=reading_state.row / totlines, + ) + + elif k in self.keymap.RegexSearch: + ret_object = self.searching( + board, + text_structure.text_lines, + reading_state, + len(contents), + ) + if isinstance(ret_object, Key) or isinstance(ret_object, NoUpdate): + k = ret_object + # k = ret_object.value + continue + elif isinstance(ret_object, ReadingState) and self.search_data: + return ret_object + # else: + elif isinstance(ret_object, ReadingState): + # y = ret_object + reading_state = ret_object + + elif k in self.keymap.OpenImage and self.image_viewer: + imgs_in_screen = list( + set( + range(reading_state.row, reading_state.row + rows * self.spread + 1) + ) + & set(text_structure.image_maps.keys()) + ) + if not imgs_in_screen: + k = NoUpdate() + continue + + imgs_in_screen.sort() + image_path: Optional[str] = None + if len(imgs_in_screen) == 1: + image_path = text_structure.image_maps[imgs_in_screen[0]] + elif len(imgs_in_screen) > 1: + imgs_rel_to_row = [i - reading_state.row for i in imgs_in_screen] + p: Union[NoUpdate, Key] = NoUpdate() + i = 0 + while p not in self.keymap.Quit and p not in self.keymap.Follow: + self.screen.move( + imgs_rel_to_row[i] % rows, + ( + x + if imgs_rel_to_row[i] // rows == 0 + else cols + - DoubleSpreadPadding.RIGHT.value + - reading_state.textwidth + ) + + reading_state.textwidth // 2, + ) + self.screen.refresh() + safe_curs_set(2) + p = board.getch() + if p in self.keymap.ScrollDown: + i += 1 + elif p in self.keymap.ScrollUp: + i -= 1 + i = i % len(imgs_rel_to_row) + + safe_curs_set(0) + if p in self.keymap.Follow: + image_path = text_structure.image_maps[imgs_in_screen[i]] + + if image_path: + try: + # if self.ebook.__class__.__name__ in {"Epub", "Mobi", "Azw"}: + if isinstance(self.ebook, (Epub, Mobi, Azw)): + # self.seamless adjustment + if self.seamless: + current_content_index = ( + self.convert_absolute_reading_state_to_relative( + reading_state + ).content_index + ) + else: + current_content_index = reading_state.content_index + # for n, content in enumerate(self.ebook.contents): + # content_path = content + # if reading_state.row < sum(totlines_per_content[:n]): + # break + + content_path = self.ebook.contents[current_content_index] + assert isinstance(content_path, str) + image_path = resolve_path(content_path, image_path) + imgnm, imgbstr = 
self.ebook.get_img_bytestr(image_path) + k = self.open_image(board, imgnm, imgbstr) + continue + except Exception as e: + self.show_win_error("Error Opening Image", str(e), tuple()) + if DEBUG: + raise e + + elif ( + k in self.keymap.SwitchColor + and self.is_color_supported + and countstring in {"", "0", "1", "2"} + ): + if countstring == "": + count_color = curses.pair_number(self.screen.getbkgd()) + if count_color not in {2, 3}: + count_color = 1 + count_color = count_color % 3 + else: + count_color = count + self.screen.bkgd(curses.color_pair(count_color + 1)) + # pad.format() + return ReadingState( + content_index=reading_state.content_index, + textwidth=reading_state.textwidth, + row=reading_state.row, + ) + + elif k in self.keymap.AddBookmark: + bmname = self.input_prompt(" Add bookmark:") + if isinstance(bmname, str) and bmname: + try: + self.state.insert_bookmark( + self.ebook, + bmname, + dataclasses.replace( + reading_state, rel_pctg=reading_state.row / totlines + ), + ) + except sqlite3.IntegrityError: + k = self.show_win_error( + "Error: Add Bookmarks", + f"Bookmark with name '{bmname}' already exists.", + (Key("B"),), + ) + continue + else: + k = bmname + continue + + elif k in self.keymap.ShowBookmarks: + bookmarks = self.state.get_bookmarks(self.ebook) + if not bookmarks: + k = self.show_win_error( + "Bookmarks", + "N/A: Bookmarks are not found in this book.", + self.keymap.ShowBookmarks, + ) + continue + else: + retk, idxchoice = self.show_win_choices_bookmarks() + if retk is not None: + k = retk + continue + elif idxchoice is not None: + bookmark_to_jump = self.state.get_bookmarks(self.ebook)[idxchoice][ + 1 + ] + if ( + bookmark_to_jump.content_index == reading_state.content_index + and bookmark_to_jump.textwidth == reading_state.textwidth + ): + reading_state = bookmark_to_jump + else: + return ReadingState( + content_index=bookmark_to_jump.content_index, + textwidth=reading_state.textwidth, + row=bookmark_to_jump.row, + rel_pctg=bookmark_to_jump.rel_pctg, + ) + + elif k in self.keymap.DefineWord and self.ext_dict_app: + word = self.input_prompt(" Define:") + if isinstance(word, str) and word: + defin = self.define_word(word) + if defin in self._win_keys: + k = defin + continue + else: + k = word + continue + + elif k in self.keymap.MarkPosition: + jumnum = board.getch() + if isinstance(jumnum, Key) and jumnum in tuple( + Key(i) for i in range(48, 58) + ): + self.jump_list[jumnum.char] = reading_state + else: + k = NoUpdate() + continue + + elif k in self.keymap.JumpToPosition: + jumnum = board.getch() + if ( + isinstance(jumnum, Key) + and jumnum in tuple(Key(i) for i in range(48, 58)) + and jumnum.char in self.jump_list + ): + marked_reading_state = self.jump_list[jumnum.char] + return dataclasses.replace( + marked_reading_state, + textwidth=reading_state.textwidth, + rel_pctg=None + if marked_reading_state.textwidth == reading_state.textwidth + else marked_reading_state.rel_pctg, + section="", + ) + else: + k = NoUpdate() + continue + + elif k in self.keymap.ShowHideProgress: + self.show_reading_progress = not self.show_reading_progress + + elif k in self.keymap.Library: + self.try_assign_letters_count(force_wait=True) + self.calculate_reading_progress(letters_per_content, reading_state) + + self.savestate( + dataclasses.replace( + reading_state, rel_pctg=reading_state.row / totlines + ) + ) + library_items = self.state.get_from_history() + if not library_items: + k = self.show_win_error( + "Library", + "N/A: No reading history.", + self.keymap.Library, + ) + 
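# nothing to list; re-handle whatever key closed the error window
+                    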
continue + else: + retk, choice_index = self.show_win_library() + if retk is not None: + k = retk + continue + elif choice_index is not None: + return get_ebook_obj(library_items[choice_index].filepath) + + elif k == Key(curses.KEY_RESIZE): + self.savestate( + dataclasses.replace( + reading_state, rel_pctg=reading_state.row / totlines + ) + ) + # stated in pypi windows-curses page: + # to call resize_term right after KEY_RESIZE + if sys.platform == "win32": + curses.resize_term(rows, cols) + rows, cols = self.screen.getmaxyx() + else: + rows, cols = self.screen.getmaxyx() + curses.resize_term(rows, cols) + if cols < 22 or rows < 12: + sys.exit("ERROR: Screen was too small (min 22cols x 12rows).") + if cols <= reading_state.textwidth + 4: + return ReadingState( + content_index=reading_state.content_index, + textwidth=cols - 4, + row=reading_state.row, + rel_pctg=reading_state.row / totlines, + ) + else: + return ReadingState( + content_index=reading_state.content_index, + textwidth=reading_state.textwidth, + row=reading_state.row, + ) + + countstring = "" + + if checkpoint_row: + board.feed_temporary_style( + ( + InlineStyle( + row=checkpoint_row, + col=0, + n_letters=reading_state.textwidth, + attr=curses.A_UNDERLINE, + ), + ) + ) + + try: + if self.setting.PageScrollAnimation and self.page_animation: + self.screen.clear() + for i in range(1, reading_state.textwidth + 1): + curses.napms(1) + # self.screen.clear() + board.write_n(reading_state.row, i, self.page_animation) + self.screen.refresh() + self.page_animation = None + + self.screen.clear() + self.screen.addstr(0, 0, countstring) + board.write(reading_state.row) + + # check if letters counting process is done + self.try_assign_letters_count() + + # reading progress + self.calculate_reading_progress(letters_per_content, reading_state) + + # display reading progress + if ( + self.reading_progress + and self.show_reading_progress + and (cols - reading_state.textwidth - 2) // 2 > 3 + ): + reading_progress_str = "{}%".format(int(self.reading_progress * 100)) + self.screen.addstr( + 0, cols - len(reading_progress_str), reading_progress_str + ) + + self.screen.refresh() + except curses.error: + pass + + if self.is_speaking: + k = self.keymap.TTSToggle[0] + continue + + k = board.getch() + if k == Key(curses.KEY_MOUSE): + mouse_event = curses.getmouse() + if mouse_event[4] == curses.BUTTON1_CLICKED: + if mouse_event[1] < cols // 2: + k = self.keymap.PageUp[0] + else: + k = self.keymap.PageDown[0] + elif mouse_event[4] == curses.BUTTON3_CLICKED: + k = self.keymap.TableOfContents[0] + elif mouse_event[4] == curses.BUTTON4_PRESSED: + k = self.keymap.ScrollUp[0] + elif mouse_event[4] == 2097152: + k = self.keymap.ScrollDown[0] + elif mouse_event[4] == curses.BUTTON4_PRESSED + curses.BUTTON_CTRL: + k = self.keymap.Enlarge[0] + elif mouse_event[4] == 2097152 + curses.BUTTON_CTRL: + k = self.keymap.Shrink[0] + elif mouse_event[4] == curses.BUTTON2_CLICKED: + k = self.keymap.TTSToggle[0] + + if checkpoint_row: + board.feed_temporary_style() + checkpoint_row = None + + except KeyboardInterrupt: + self.savestate( + dataclasses.replace(reading_state, rel_pctg=reading_state.row / totlines) + ) + sys.exit() + + +def start_reading(stdscr, filepath: str): + + ebook = get_ebook_obj(filepath) + state = State() + config = Config() + + reader = Reader(screen=stdscr, ebook=ebook, config=config, state=state) + + def handle_signal(signum, _): + """ + Method to raise SystemExit based on signal received + to trigger `try-finally` clause + """ + msg = 
f"[{os.getpid()}] killed" + if signal.Signals(signum) == signal.SIGTERM: + msg = f"[{os.getpid()}] terminated" + sys.exit(msg) + + signal.signal(signal.SIGTERM, handle_signal) + + try: + reader.run_counting_letters() + + reading_state = state.get_last_reading_state(reader.ebook) + if reader.screen_cols <= reading_state.textwidth + 4: + reading_state = dataclasses.replace(reading_state, textwidth=reader.screen_cols - 4) + else: + reading_state = dataclasses.replace(reading_state, rel_pctg=None) + + while True: + reading_state_or_ebook = reader.read(reading_state) + + if isinstance(reading_state_or_ebook, Ebook): + return reading_state_or_ebook.path + else: + reading_state = reading_state_or_ebook + if reader.seamless: + reading_state = reader.convert_absolute_reading_state_to_relative(reading_state) + + finally: + reader.cleanup() diff --git a/src/epy_reader/settings.py b/src/epy_reader/settings.py new file mode 100644 index 0000000..f09bc98 --- /dev/null +++ b/src/epy_reader/settings.py @@ -0,0 +1,133 @@ +import curses +from dataclasses import dataclass, field +from enum import Enum +from typing import List, Optional, Tuple + +from epy_reader.models import Key + + +class DoubleSpreadPadding(Enum): + LEFT = 10 + MIDDLE = 7 + RIGHT = 10 + + +# add image viewers here +# sorted by most widely used +VIEWER_PRESET_LIST = ( + "feh", + "imv", + "gio", + "gnome-open", + "gvfs-open", + "xdg-open", + "kde-open", + "firefox", +) + +DICT_PRESET_LIST = ( + "wkdict", + "sdcv", + "dict", +) + + +@dataclass(frozen=True) +class Settings: + DefaultViewer: str = "auto" + DictionaryClient: str = "auto" + ShowProgressIndicator: bool = True + PageScrollAnimation: bool = True + MouseSupport: bool = False + StartWithDoubleSpread: bool = False + # -1 is default terminal fg/bg colors + DefaultColorFG: int = -1 + DefaultColorBG: int = -1 + DarkColorFG: int = 252 + DarkColorBG: int = 235 + LightColorFG: int = 238 + LightColorBG: int = 253 + SeamlessBetweenChapters: bool = False + PreferredTTSEngine: Optional[str] = None + TTSEngineArgs: List[str] = field(default_factory=list) + + +@dataclass(frozen=True) +class CfgDefaultKeymaps: + ScrollUp: str = "k" + ScrollDown: str = "j" + PageUp: str = "h" + PageDown: str = "l" + # HalfScreenUp: str = "h" + # HalfScreenDown: str + NextChapter: str = "L" + PrevChapter: str = "H" + BeginningOfCh: str = "g" + EndOfCh: str = "G" + Shrink: str = "-" + Enlarge: str = "+" + SetWidth: str = "=" + Metadata: str = "M" + DefineWord: str = "d" + TableOfContents: str = "t" + Follow: str = "f" + OpenImage: str = "o" + RegexSearch: str = "/" + ShowHideProgress: str = "s" + MarkPosition: str = "m" + JumpToPosition: str = "`" + AddBookmark: str = "b" + ShowBookmarks: str = "B" + Quit: str = "q" + Help: str = "?" + SwitchColor: str = "c" + TTSToggle: str = "!" + DoubleSpreadToggle: str = "D" + Library: str = "R" + + +@dataclass(frozen=True) +class CfgBuiltinKeymaps: + ScrollUp: Tuple[int, ...] = (curses.KEY_UP,) + ScrollDown: Tuple[int, ...] = (curses.KEY_DOWN,) + PageUp: Tuple[int, ...] = (curses.KEY_PPAGE, curses.KEY_LEFT) + PageDown: Tuple[int, ...] = (curses.KEY_NPAGE, ord(" "), curses.KEY_RIGHT) + BeginningOfCh: Tuple[int, ...] = (curses.KEY_HOME,) + EndOfCh: Tuple[int, ...] = (curses.KEY_END,) + TableOfContents: Tuple[int, ...] = (9, ord("\t")) + Follow: Tuple[int, ...] = (10,) + Quit: Tuple[int, ...] = (3, 27, 304) + + +@dataclass(frozen=True) +class Keymap: + # HalfScreenDown: Tuple[Key, ...] + # HalfScreenUp: Tuple[Key, ...] + AddBookmark: Tuple[Key, ...] + BeginningOfCh: Tuple[Key, ...] 
+ DefineWord: Tuple[Key, ...] + DoubleSpreadToggle: Tuple[Key, ...] + EndOfCh: Tuple[Key, ...] + Enlarge: Tuple[Key, ...] + Follow: Tuple[Key, ...] + Help: Tuple[Key, ...] + JumpToPosition: Tuple[Key, ...] + Library: Tuple[Key, ...] + MarkPosition: Tuple[Key, ...] + Metadata: Tuple[Key, ...] + NextChapter: Tuple[Key, ...] + OpenImage: Tuple[Key, ...] + PageDown: Tuple[Key, ...] + PageUp: Tuple[Key, ...] + PrevChapter: Tuple[Key, ...] + Quit: Tuple[Key, ...] + RegexSearch: Tuple[Key, ...] + ScrollDown: Tuple[Key, ...] + ScrollUp: Tuple[Key, ...] + SetWidth: Tuple[Key, ...] + ShowBookmarks: Tuple[Key, ...] + ShowHideProgress: Tuple[Key, ...] + Shrink: Tuple[Key, ...] + SwitchColor: Tuple[Key, ...] + TTSToggle: Tuple[Key, ...] + TableOfContents: Tuple[Key, ...] diff --git a/src/epy_reader/speakers/__init__.py b/src/epy_reader/speakers/__init__.py new file mode 100644 index 0000000..078be31 --- /dev/null +++ b/src/epy_reader/speakers/__init__.py @@ -0,0 +1,9 @@ +__all__ = [ + "SpeakerBaseModel", + "SpeakerMimic", + "SpeakerPico", +] + +from epy_reader.speakers.base import SpeakerBaseModel +from epy_reader.speakers.mimic import SpeakerMimic +from epy_reader.speakers.pico import SpeakerPico diff --git a/src/epy_reader/speakers/base.py b/src/epy_reader/speakers/base.py new file mode 100644 index 0000000..7c1a8d5 --- /dev/null +++ b/src/epy_reader/speakers/base.py @@ -0,0 +1,21 @@ +from typing import List + + +class SpeakerBaseModel: + cmd: str = "tts_engine_binary" + available: bool = False + + def __init__(self, args: List[str] = []): + self.args = args + + def speak(self, text: str) -> None: + raise NotImplementedError("Speaker.speak() not implemented") + + def is_done(self) -> bool: + raise NotImplementedError("Speaker.is_done() not implemented") + + def stop(self) -> None: + raise NotImplementedError("Speaker.stop() not implemented") + + def cleanup(self) -> None: + raise NotImplementedError("Speaker.cleanup() not implemented") diff --git a/src/epy_reader/speakers/mimic.py b/src/epy_reader/speakers/mimic.py new file mode 100644 index 0000000..0db4ed8 --- /dev/null +++ b/src/epy_reader/speakers/mimic.py @@ -0,0 +1,31 @@ +import shutil +import subprocess + +from epy_reader.speakers.base import SpeakerBaseModel + + +class SpeakerMimic(SpeakerBaseModel): + cmd = "mimic" + available = bool(shutil.which("mimic")) + + def speak(self, text: str) -> None: + self.process = subprocess.Popen( + [self.cmd, *self.args], + text=True, + stdin=subprocess.PIPE, + stdout=subprocess.DEVNULL, + stderr=subprocess.STDOUT, + ) + assert self.process.stdin + self.process.stdin.write(text) + self.process.stdin.close() + + def is_done(self) -> bool: + return self.process.poll() is not None + + def stop(self) -> None: + self.process.terminate() + # self.process.kill() + + def cleanup(self) -> None: + pass diff --git a/src/epy_reader/speakers/pico.py b/src/epy_reader/speakers/pico.py new file mode 100644 index 0000000..95065f1 --- /dev/null +++ b/src/epy_reader/speakers/pico.py @@ -0,0 +1,43 @@ +import os +import shutil +import subprocess +import sys +import tempfile + +from epy_reader.speakers.base import SpeakerBaseModel + + +class SpeakerPico(SpeakerBaseModel): + cmd = "pico2wave" + available = all([shutil.which(dep) for dep in ["pico2wave", "play"]]) + + def speak(self, text: str) -> None: + _, self.tmp_path = tempfile.mkstemp(suffix=".wav") + + try: + subprocess.run( + [self.cmd, *self.args, "-w", self.tmp_path, text], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + check=True, + ) + except 
subprocess.CalledProcessError as e: + if "invalid pointer" not in e.output: + sys.exit(e.output) + + self.process = subprocess.Popen( + ["play", self.tmp_path], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + + def is_done(self) -> bool: + return self.process.poll() is not None + + def stop(self) -> None: + self.process.terminate() + # self.process.kill() + + def cleanup(self) -> None: + os.remove(self.tmp_path) diff --git a/src/epy_reader/state.py b/src/epy_reader/state.py new file mode 100644 index 0000000..5129394 --- /dev/null +++ b/src/epy_reader/state.py @@ -0,0 +1,195 @@ +import dataclasses +import hashlib +import os +import sqlite3 +from datetime import datetime +from typing import List, Tuple + +from epy_reader.ebooks import Ebook +from epy_reader.models import AppData, LibraryItem, Optional, ReadingState + + +class State(AppData): + """ + Use sqlite3 instead of JSON (in older version) + to shift the weight from memory to process + """ + + def __init__(self): + if not os.path.isfile(self.filepath): + self.init_db() + + @property + def filepath(self) -> str: + return os.path.join(self.prefix, "states.db") if self.prefix else os.devnull + + def get_from_history(self) -> List[LibraryItem]: + try: + conn = sqlite3.connect(self.filepath) + cur = conn.cursor() + cur.execute( + """ + SELECT last_read, filepath, title, author, reading_progress + FROM library ORDER BY last_read DESC + """ + ) + results = cur.fetchall() + library_items: List[LibraryItem] = [] + for result in results: + library_items.append( + LibraryItem( + last_read=datetime.fromisoformat(result[0]), + filepath=result[1], + title=result[2], + author=result[3], + reading_progress=result[4], + ) + ) + return library_items + finally: + conn.close() + + def delete_from_library(self, filepath: str) -> None: + try: + conn = sqlite3.connect(self.filepath) + conn.execute("PRAGMA foreign_keys = ON") + conn.execute("DELETE FROM reading_states WHERE filepath=?", (filepath,)) + conn.commit() + finally: + conn.close() + + def get_last_read(self) -> Optional[str]: + library = self.get_from_history() + return library[0].filepath if library else None + + def update_library(self, ebook: Ebook, reading_progress: Optional[float]) -> None: + try: + metadata = ebook.get_meta() + conn = sqlite3.connect(self.filepath) + conn.execute( + """ + INSERT OR REPLACE INTO library (filepath, title, author, reading_progress) + VALUES (?, ?, ?, ?) 
+ """, + (ebook.path, metadata.title, metadata.creator, reading_progress), + ) + conn.commit() + finally: + conn.close() + + def get_last_reading_state(self, ebook: Ebook) -> ReadingState: + try: + conn = sqlite3.connect(self.filepath) + conn.row_factory = sqlite3.Row + cur = conn.cursor() + cur.execute("SELECT * FROM reading_states WHERE filepath=?", (ebook.path,)) + result = cur.fetchone() + if result: + result = dict(result) + del result["filepath"] + return ReadingState(**result, section=None) + return ReadingState(content_index=0, textwidth=80, row=0, rel_pctg=None, section=None) + finally: + conn.close() + + def set_last_reading_state(self, ebook: Ebook, reading_state: ReadingState) -> None: + try: + conn = sqlite3.connect(self.filepath) + conn.execute( + """ + INSERT OR REPLACE INTO reading_states + VALUES (:filepath, :content_index, :textwidth, :row, :rel_pctg) + """, + {"filepath": ebook.path, **dataclasses.asdict(reading_state)}, + ) + conn.commit() + finally: + conn.close() + + def insert_bookmark(self, ebook: Ebook, name: str, reading_state: ReadingState) -> None: + try: + conn = sqlite3.connect(self.filepath) + conn.execute( + """ + INSERT INTO bookmarks + VALUES (:id, :filepath, :name, :content_index, :textwidth, :row, :rel_pctg) + """, + { + "id": hashlib.sha1(f"{ebook.path}{name}".encode()).hexdigest()[:10], + "filepath": ebook.path, + "name": name, + **dataclasses.asdict(reading_state), + }, + ) + conn.commit() + finally: + conn.close() + + def delete_bookmark(self, ebook: Ebook, name: str) -> None: + try: + conn = sqlite3.connect(self.filepath) + conn.execute("DELETE FROM bookmarks WHERE filepath=? AND name=?", (ebook.path, name)) + conn.commit() + finally: + conn.close() + + def get_bookmarks(self, ebook: Ebook) -> List[Tuple[str, ReadingState]]: + try: + conn = sqlite3.connect(self.filepath) + conn.row_factory = sqlite3.Row + cur = conn.cursor() + cur.execute("SELECT * FROM bookmarks WHERE filepath=?", (ebook.path,)) + results = cur.fetchall() + bookmarks: List[Tuple[str, ReadingState]] = [] + for result in results: + tmp_dict = dict(result) + name = tmp_dict["name"] + tmp_dict = { + k: v + for k, v in tmp_dict.items() + if k in ("content_index", "textwidth", "row", "rel_pctg") + } + bookmarks.append((name, ReadingState(**tmp_dict))) + return bookmarks + finally: + conn.close() + + def init_db(self) -> None: + try: + conn = sqlite3.connect(self.filepath) + conn.executescript( + """ + CREATE TABLE reading_states ( + filepath TEXT PRIMARY KEY, + content_index INTEGER, + textwidth INTEGER, + row INTEGER, + rel_pctg REAL + ); + + CREATE TABLE library ( + last_read DATETIME DEFAULT (datetime('now','localtime')), + filepath TEXT PRIMARY KEY, + title TEXT, + author TEXT, + reading_progress REAL, + FOREIGN KEY (filepath) REFERENCES reading_states(filepath) + ON DELETE CASCADE + ); + + CREATE TABLE bookmarks ( + id TEXT PRIMARY KEY, + filepath TEXT, + name TEXT, + content_index INTEGER, + textwidth INTEGER, + row INTEGER, + rel_pctg REAL, + FOREIGN KEY (filepath) REFERENCES reading_states(filepath) + ON DELETE CASCADE + ); + """ + ) + conn.commit() + finally: + conn.close() diff --git a/src/epy_reader/tools/KindleUnpack/__init__.py b/src/epy_reader/tools/KindleUnpack/__init__.py new file mode 100644 index 0000000..0077258 --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/__init__.py @@ -0,0 +1,2 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai diff --git a/src/epy_reader/tools/KindleUnpack/compatibility_utils.py 
b/src/epy_reader/tools/KindleUnpack/compatibility_utils.py new file mode 100755 index 0000000..c46c0bb --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/compatibility_utils.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +# Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this list of +# conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, this list +# of conditions and the following disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from __future__ import unicode_literals, division, absolute_import, print_function + +import sys +import codecs + +PY2 = sys.version_info[0] == 2 +PY3 = sys.version_info[0] == 3 + +iswindows = sys.platform.startswith('win') + +try: + from urllib.parse import unquote +except ImportError: + from urllib import unquote + +if PY2: + from HTMLParser import HTMLParser + _h = HTMLParser() +elif sys.version_info[1] < 4: + import html.parser + _h = html.parser.HTMLParser() +else: + import html as _h + +if PY3: + text_type = str + binary_type = bytes + # if will be printing arbitraty binary data to stdout on python 3 + # sys.stdin = sys.stdin.detach() + # sys.stdout = sys.stdout.detach() + # sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach()) +else: + range = xrange + text_type = unicode + binary_type = str + # if will be printing unicode under python 2 need to protect + # against sys.stdout.encoding being None stupidly forcing forcing ascii encoding of unicode + # sys.stdout = codecs.getwriter("utf-8")(sys.stdout) + # alternatively set environment variable as follows **before** launching python: export PYTHONIOENCODING=UTF-8 + +# NOTE: Python 3 is completely broken when accessing single bytes in bytes strings +# (and they amazingly claim by design and no bug!) + +# To illustrate: this works for unicode in Python 3 and for all Python 2.X for both bytestrings and unicode +# >>> o = '123456789' +# >>> o[-3] +# '7' +# >>> type(o[-3]) +# +# >>> type(o) +# + +# Unfortunately, this is what Python 3 does for no sane reason and only for bytestrings +# >>> o = b'123456789' +# >>> o[-3] +# 55 +# >>> type(o[-3]) +# +# >>> type(o) +# + +# This mind boggling behaviour also happens when indexing a bytestring and/or +# iteratoring over a bytestring. In other words it will return an int but not +# the byte itself!!!!!!! 
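+
+# For example (Python 3):
+# >>> data = b'123456789'
+# >>> data[-3]           # indexing a bytestring yields an int
+# 55
+# >>> data[-3:-2]        # slicing yields a one-byte bytestring
+# b'7'
+# The bord() and bchar() helpers defined below paper over this difference.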
+ +# The only way to access a single byte as a byte in bytestring and get the byte in both +# Python 2 and Python 3 is to use a slice + +# This problem is so common there are horrible hacks floating around the net to **try** +# to work around it, so that code that works on both Python 2 and Python 3 is possible. + +# So in order to write code that works on both Python 2 and Python 3 +# if you index or access a single byte and want its ord() then use the bord() function. +# If instead you want it as a single character byte use the bchar() function +# both of which are defined below. + +if PY3: + # Also Note: if decode a bytestring using 'latin-1' (or any other full range 0-255 encoding) + # in place of ascii you will get a byte value to half-word or integer value + # one-to-one mapping (in the 0 - 255 range) + + def bchr(s): + return bytes([s]) + + def bstr(s): + if isinstance(s, str): + return bytes(s, 'latin-1') + else: + return bytes(s) + + def bord(s): + return s + + def bchar(s): + return bytes([s]) + +else: + def bchr(s): + return chr(s) + + def bstr(s): + return str(s) + + def bord(s): + return ord(s) + + def bchar(s): + return s + +if PY3: + # list-producing versions of the major Python iterating functions + def lrange(*args, **kwargs): + return list(range(*args, **kwargs)) + + def lzip(*args, **kwargs): + return list(zip(*args, **kwargs)) + + def lmap(*args, **kwargs): + return list(map(*args, **kwargs)) + + def lfilter(*args, **kwargs): + return list(filter(*args, **kwargs)) +else: + import __builtin__ + # Python 2-builtin ranges produce lists + lrange = __builtin__.range + lzip = __builtin__.zip + lmap = __builtin__.map + lfilter = __builtin__.filter + +# In Python 3 you can no longer use .encode('hex') on a bytestring +# instead use the following on both platforms +import binascii +def hexlify(bdata): + return (binascii.hexlify(bdata)).decode('ascii') + +# If you: import struct +# Note: struct pack, unpack, unpack_from all *require* bytestring format +# data all the way up to at least Python 2.7.5, Python 3 is okay with either + +# If you: import re +# note: Python 3 "re" requires the pattern to be the exact same type as the data to be +# searched ... but u"" is not allowed for the pattern itself only b"" +# Python 2.X allows the pattern to be any type and converts it to match the data +# and returns the same type as the data + +# convert string to be utf-8 encoded +def utf8_str(p, enc='utf-8'): + if p is None: + return None + if isinstance(p, text_type): + return p.encode('utf-8') + if enc != 'utf-8': + return p.decode(enc).encode('utf-8') + return p + +# convert string to be unicode encoded +def unicode_str(p, enc='utf-8'): + if p is None: + return None + if isinstance(p, text_type): + return p + return p.decode(enc) + +ASCII_CHARS = set(chr(x) for x in range(128)) +URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ' + 'abcdefghijklmnopqrstuvwxyz' + '0123456789' '#' '_.-/~') +IRI_UNSAFE = ASCII_CHARS - URL_SAFE + +# returns a quoted IRI (not a URI) +def quoteurl(href): + if isinstance(href,binary_type): + href = href.decode('utf-8') + result = [] + for char in href: + if char in IRI_UNSAFE: + char = "%%%02x" % ord(char) + result.append(char) + return ''.join(result) + +# unquotes url/iri +def unquoteurl(href): + if isinstance(href,binary_type): + href = href.decode('utf-8') + href = unquote(href) + return href + +# unescape html +def unescapeit(sval): + return _h.unescape(sval) + +# Python 2.X commandline parsing under Windows has been horribly broken for years! 
+# Use the following code to emulate full unicode commandline parsing on Python 2 +# ie. To get sys.argv arguments and properly encode them as unicode + +def unicode_argv(): + global iswindows + global PY3 + if PY3: + return sys.argv + if iswindows: + # Versions 2.x of Python don't support Unicode in sys.argv on + # Windows, with the underlying Windows API instead replacing multi-byte + # characters with '?'. So use shell32.GetCommandLineArgvW to get sys.argv + # as a list of Unicode strings + from ctypes import POINTER, byref, cdll, c_int, windll + from ctypes.wintypes import LPCWSTR, LPWSTR + + GetCommandLineW = cdll.kernel32.GetCommandLineW + GetCommandLineW.argtypes = [] + GetCommandLineW.restype = LPCWSTR + + CommandLineToArgvW = windll.shell32.CommandLineToArgvW + CommandLineToArgvW.argtypes = [LPCWSTR, POINTER(c_int)] + CommandLineToArgvW.restype = POINTER(LPWSTR) + + cmd = GetCommandLineW() + argc = c_int(0) + argv = CommandLineToArgvW(cmd, byref(argc)) + if argc.value > 0: + # Remove Python executable and commands if present + start = argc.value - len(sys.argv) + return [argv[i] for i in + range(start, argc.value)] + # this should never happen + return None + else: + argv = [] + argvencoding = sys.stdin.encoding + if argvencoding is None: + argvencoding = sys.getfilesystemencoding() + if argvencoding is None: + argvencoding = 'utf-8' + for arg in sys.argv: + if isinstance(arg, text_type): + argv.append(arg) + else: + argv.append(arg.decode(argvencoding)) + return argv + + +# Python 2.X is broken in that it does not recognize CP65001 as UTF-8 +def add_cp65001_codec(): + if PY2: + try: + codecs.lookup('cp65001') + except LookupError: + codecs.register( + lambda name: name == 'cp65001' and codecs.lookup('utf-8') or None) + return diff --git a/src/epy_reader/tools/KindleUnpack/kindleunpack.py b/src/epy_reader/tools/KindleUnpack/kindleunpack.py new file mode 100644 index 0000000..317941a --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/kindleunpack.py @@ -0,0 +1,1029 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +import os + +__path__ = ["lib", os.path.dirname(os.path.realpath(__file__)), "kindleunpack"] + +import sys +import codecs +import traceback + +from .compatibility_utils import PY2, binary_type, utf8_str, unicode_str +from .compatibility_utils import unicode_argv, add_cp65001_codec +from .compatibility_utils import hexlify + +add_cp65001_codec() + +from .unipath import pathof + +if PY2: + range = xrange + # since will be printing unicode under python 2 need to protect + # against sys.stdout.encoding being None stupidly forcing forcing ascii encoding + if sys.stdout.encoding is None: + sys.stdout = codecs.getwriter("utf-8")(sys.stdout) + else: + encoding = sys.stdout.encoding + sys.stdout = codecs.getwriter(encoding)(sys.stdout) + +# Changelog +# 0.11 - Version by adamselene +# 0.11pd - Tweaked version by pdurrant +# 0.12 - extracts pictures too, and all into a folder. 
+# 0.13 - added back in optional output dir for those who don't want it based on infile +# 0.14 - auto flush stdout and wrapped in main, added proper return codes +# 0.15 - added support for metadata +# 0.16 - metadata now starting to be output as an opf file (PD) +# 0.17 - Also created tweaked text as source for Mobipocket Creator +# 0.18 - removed raw mobi file completely but kept _meta.html file for ease of conversion +# 0.19 - added in metadata for ASIN, Updated Title and Rights to the opf +# 0.20 - remove _meta.html since no longer needed +# 0.21 - Fixed some typos in the opf output, and also updated handling +# of test for trailing data/multibyte characters +# 0.22 - Fixed problem with > 9 images +# 0.23 - Now output Start guide item +# 0.24 - Set firstaddl value for 'TEXtREAd' +# 0.25 - Now added character set metadata to html file for utf-8 files. +# 0.26 - Dictionary support added. Image handling speed improved. +# For huge files create temp files to speed up decoding. +# Language decoding fixed. Metadata is now converted to utf-8 when written to opf file. +# 0.27 - Add idx:entry attribute "scriptable" if dictionary contains entry length tags. +# Don't save non-image sections as images. Extract and save source zip file +# included by kindlegen as kindlegensrc.zip. +# 0.28 - Added back correct image file name extensions, created FastConcat class to simplify and clean up +# 0.29 - Metadata handling reworked, multiple entries of the same type are now supported. +# Several missing types added. +# FastConcat class has been removed as in-memory handling with lists is faster, even for huge files. +# 0.30 - Add support for outputting **all** metadata values - encode content with hex if of unknown type +# 0.31 - Now supports Print Replica ebooks, outputting PDF and mysterious data sections +# 0.32 - Now supports NCX file extraction/building. +# Overhauled the structure of mobiunpack to be more class oriented. 
+# 0.33 - Split Classes into separate files and added prelim support for KF8 format eBooks
+# 0.34 - Improved KF8 support, guide support, bug fixes
+# 0.35 - Added splitting combo mobi7/mobi8 into standalone mobi7 and mobi8 files
+#        Also handle mobi8-only file properly
+# 0.36 - very minor changes to support KF8 mobis with no flow items, no ncx, etc
+# 0.37 - separate output, add command line switches to control, interface to Mobi_Unpack.pyw
+# 0.38 - improve split function by resetting flags properly, fix bug in Thumbnail Images
+# 0.39 - improve split function so that ToC info is not lost for standalone mobi8s
+# 0.40 - make mobi7 split match official versions, add support for graphic novel metadata,
+#        improve debug for KF8
+# 0.41 - fix when StartOffset set to 0xffffffff, fix to work with older mobi versions,
+#        fix other minor metadata issues
+# 0.42 - add new class interface to allow it to integrate more easily with internal calibre routines
+# 0.43 - bug fixes for new class interface
+# 0.44 - more bug fixes and fix for potential bug caused by not properly closing created zip archive
+# 0.45 - sync to version in the new Mobi_Unpack plugin
+# 0.46 - fixes for: obfuscated fonts, improper toc links and ncx, add support for opentype fonts
+# 0.47 - minor opf improvements
+# 0.48 - ncx link fixes
+# 0.49 - use azw3 when splitting mobis
+# 0.50 - unknown change
+# 0.51 - fix for converting filepos links to hrefs, Added GPL3 notice, made KF8 extension just '.azw3'
+# 0.52 - fix for cover metadata (no support for Mobipocket Creator)
+# 0.53 - fix for proper identification of embedded fonts, added new metadata items
+# 0.54 - Added error-handling so wonky embedded fonts don't bomb the whole unpack process,
+#        entity escape KF8 metadata to ensure valid OPF.
+# 0.55 - Strip extra StartOffset EXTH from the mobi8 header when splitting, keeping only the relevant one
+#        For mobi8 files, don't generate duplicate guide entries from the metadata if we could extract one
+#        from the OTH table.
+# 0.56 - Added further entity escaping of OPF text.
+#        Allow unicode string file paths to be passed as arguments to the unpackBook method without blowing up later
+#        when the attempt to "re"-unicode a portion of that filename occurs in the process_all_mobi_headers method.
+# 0.57 - Fixed error when splitting Preview files downloaded from KDP website
+# 0.58 - Output original kindlegen build log ('CMET' record) if included in the package.
+# 0.58 - Include and extend functionality of DumpMobiHeader, replacing DEBUG with DUMP
+# 0.59 - Much added DUMP functionality, including full dumping and descriptions of sections
+# 0.60 - Bug fixes in opf, div tables, bad links, page breaks, section descriptions
+#      - plus a number of other bugs fixed that were found by Sergey Dubinets
+#      - fixes for file/paths that require full unicode to work properly
+#      - replace subprocess with multiprocessing to remove need for unbuffered stdout
+# 0.61 - renamed to be KindleUnpack and more unicode/utf-8 path bug fixes and other minor fixes
+# 0.62 - fix for multiprocessing on Windows, split fixes, opf improvements
+# 0.63 - Modified to process right to left page progression books properly.
+#      - Added some id_map_strings and RESC section processing; metadata and
+#      - spine in the RESC are integrated partly to content.opf.
+# 0.63a- Separated K8 RESC processor to an individual file. Bug fixes. Added cover page creation.
+# 0.64 - minor bug fixes to more properly handle unicode command lines, and support for more jpeg types
+# 0.64a- Modified to handle some irregular mobi and azw3 files.
+# 0.64b- Modified to create k8resc.spine for no RESC files.
+# 0.65 - Bug fixes to shorten title and remove epub3 "properties" to make the output epub2 compliant
+# 0.65a- Bug fixes to extract RESC section correctly, to prevent item id confliction
+#      - and to process multiline comments in RESC.
+# 0.66 - Bug fix to deal with missing first resource information sometimes generated by calibre
+# 0.66a- Fixed minor bugs, which probably do not affect the output at all
+# 0.67 - Fixed Mobi Split functionality bug with azw3 images not being properly copied
+# 0.68 - preliminary support for handling PAGE sections to create page-map.xml
+# 0.69 - preliminary support for CONT and CRES for HD Images
+# 0.70 - preliminary support for decoding apnx files when used with azw3 ebooks
+# 0.71 - extensive refactoring of kindleunpack.py to make it more manageable
+# 0.72 - many bug fixes from tkeo: fix pageProcessing, fix print replica, fix resc usage, fix font mangling, etc.
+# 0.72a- fix for still broken PrintReplica support
+# 0.72b- preview for primary epub3 support. A parameter epubver (default='2') is added to process_all_mobi_headers(), unpackBook().
+# 0.72c- preview for apnx page support
+# 0.72d- more bugs fixed in preview features, much improved GUI with ability to dynamically grow the Log Window with preference support
+# 0.72e- more bug fixes, Tk GUI adds support for epub version and HDImage use
+# 0.72f- more bug fixes, implement use hd images if present
+# 0.72g- minor bug fixes and cleanups from tkeo
+# 0.72h- updated mobi_header and mobi_k8proc to use the correct fragment and guide terms in place of div and other
+#        to better match the terms that both Calibre and Amazon use internally to their own software
+# 0.72x- very experimental conversion to use new mobi_k8resc.py and some of its associated changes
+# 0.72y- more changes to simplify and integrate in epub3 support in a simpler manner
+# 0.72z- remove redundancy in mobi_opf.py and bug fixes for mobi_k8resc.py
+# 0.73 - faster mobi split, numerous bug fixes in mobi_k8proc, mobi_header, mobi_opf, mobi_k8resc, etc
+# 0.74 - added refines metadata, fixed language code in ncx and title in nav, added support for opf: from refines
+# 0.75 - much improved dictionary support including support for multiple inflection sections, minor mobi_opf fixes
+# 0.76 - pre-release version, only fix name related issues in opf by not using original file name in mobi7
+# 0.77 - bug fix for unpacking HDImages with included Fonts
+# 0.80 - converted to work with both python 2.7 and Python 3.3 and later
+# 0.81 - various fixes
+# 0.82 - Handle calibre-generated mobis that can have skeletons with no fragments
+# 0.83 - Fix header item 114 being mistakenly treated as a string instead of a value
+
+DUMP = False
+""" Set to True to dump all possible information. """
+
+WRITE_RAW_DATA = False
+""" Set to True to create additional files with raw data for debugging/reverse engineering. """
+
+SPLIT_COMBO_MOBIS = False
+""" Set to True to split combination mobis into mobi7 and mobi8 pieces. """
+
+CREATE_COVER_PAGE = True  # XXX experimental
+""" Create and insert a cover xhtml page. """
+
+EOF_RECORD = b'\xe9\x8e' + b'\r\n'
+""" The EOF record content.
""" + +TERMINATION_INDICATOR1 = b'\x00' +TERMINATION_INDICATOR2 = b'\x00\x00' +TERMINATION_INDICATOR3 = b'\x00\x00\x00' + +KINDLEGENSRC_FILENAME = "kindlegensrc.zip" +""" The name for the kindlegen source archive. """ + +KINDLEGENLOG_FILENAME = "kindlegenbuild.log" +""" The name for the kindlegen build log. """ + +K8_BOUNDARY = b'BOUNDARY' +""" The section data that divides K8 mobi ebooks. """ + +import os +import struct +import re +import zlib +import getopt + +class unpackException(Exception): + pass + + +# import the kindleunpack support libraries +from .unpack_structure import fileNames +from .mobi_sectioner import Sectionizer, describe +from .mobi_header import MobiHeader, dump_contexth +from .mobi_utils import toBase32 +from .mobi_opf import OPFProcessor +from .mobi_html import HTMLProcessor, XHTMLK8Processor +from .mobi_ncx import ncxExtract +from .mobi_k8proc import K8Processor +from .mobi_split import mobi_split +from .mobi_k8resc import K8RESCProcessor +from .mobi_nav import NAVProcessor +from .mobi_cover import CoverProcessor, get_image_type +from .mobi_pagemap import PageMapProcessor +from .mobi_dict import dictSupport + + +def processSRCS(i, files, rscnames, sect, data): + # extract the source zip archive and save it. + print("File contains kindlegen source archive, extracting as %s" % KINDLEGENSRC_FILENAME) + srcname = os.path.join(files.outdir, KINDLEGENSRC_FILENAME) + with open(pathof(srcname), 'wb') as f: + f.write(data[16:]) + rscnames.append(None) + sect.setsectiondescription(i,"Zipped Source Files") + return rscnames + + +def processPAGE(i, files, rscnames, sect, data, mh, pagemapproc): + # process any page map information and create an apnx file + pagemapproc = PageMapProcessor(mh, data) + rscnames.append(None) + sect.setsectiondescription(i,"PageMap") + apnx_meta = {} + acr = sect.palmname.decode('latin-1').rstrip('\x00') + apnx_meta['acr'] = acr + apnx_meta['cdeType'] = mh.metadata['cdeType'][0] + apnx_meta['contentGuid'] = hex(int(mh.metadata['UniqueID'][0]))[2:] + apnx_meta['asin'] = mh.metadata['ASIN'][0] + apnx_meta['pageMap'] = pagemapproc.getPageMap() + if mh.version == 8: + apnx_meta['format'] = 'MOBI_8' + else: + apnx_meta['format'] = 'MOBI_7' + apnx_data = pagemapproc.generateAPNX(apnx_meta) + if mh.isK8(): + outname = os.path.join(files.outdir, 'mobi8-'+files.getInputFileBasename() + '.apnx') + else: + outname = os.path.join(files.outdir, 'mobi7-'+files.getInputFileBasename() + '.apnx') + with open(pathof(outname), 'wb') as f: + f.write(apnx_data) + return rscnames, pagemapproc + + +def processCMET(i, files, rscnames, sect, data): + # extract the build log + print("File contains kindlegen build log, extracting as %s" % KINDLEGENLOG_FILENAME) + srcname = os.path.join(files.outdir, KINDLEGENLOG_FILENAME) + with open(pathof(srcname), 'wb') as f: + f.write(data[10:]) + rscnames.append(None) + sect.setsectiondescription(i,"Kindlegen log") + return rscnames + + +# fonts only exist in KF8 ebooks +# Format: bytes 0 - 3: 'FONT' +# bytes 4 - 7: uncompressed size +# bytes 8 - 11: flags +# flag bit 0x0001 - zlib compression +# flag bit 0x0002 - obfuscated with xor string +# bytes 12 - 15: offset to start of compressed font data +# bytes 16 - 19: length of xor string stored before the start of the comnpress font data +# bytes 20 - 23: start of xor string +def processFONT(i, files, rscnames, sect, data, obfuscate_data, beg, rsc_ptr): + fontname = "font%05d" % i + ext = '.dat' + font_error = False + font_data = data + try: + usize, fflags, dstart, xor_len, xor_start = 
struct.unpack_from(b'>LLLLL',data,4) + except: + print("Failed to extract font: {0:s} from section {1:d}".format(fontname,i)) + font_error = True + ext = '.failed' + pass + if not font_error: + print("Extracting font:", fontname) + font_data = data[dstart:] + extent = len(font_data) + extent = min(extent, 1040) + if fflags & 0x0002: + # obfuscated so need to de-obfuscate the first 1040 bytes + key = bytearray(data[xor_start: xor_start+ xor_len]) + buf = bytearray(font_data) + for n in range(extent): + buf[n] ^= key[n%xor_len] + font_data = bytes(buf) + if fflags & 0x0001: + # ZLIB compressed data + font_data = zlib.decompress(font_data) + hdr = font_data[0:4] + if hdr == b'\0\1\0\0' or hdr == b'true' or hdr == b'ttcf': + ext = '.ttf' + elif hdr == b'OTTO': + ext = '.otf' + else: + print("Warning: unknown font header %s" % hexlify(hdr)) + if (ext == '.ttf' or ext == '.otf') and (fflags & 0x0002): + obfuscate_data.append(fontname + ext) + fontname += ext + outfnt = os.path.join(files.imgdir, fontname) + with open(pathof(outfnt), 'wb') as f: + f.write(font_data) + rscnames.append(fontname) + sect.setsectiondescription(i,"Font {0:s}".format(fontname)) + if rsc_ptr == -1: + rsc_ptr = i - beg + return rscnames, obfuscate_data, rsc_ptr + + +def processCRES(i, files, rscnames, sect, data, beg, rsc_ptr, use_hd): + # extract an HDImage + global DUMP + data = data[12:] + imgtype = get_image_type(None, data) + + if imgtype is None: + print("Warning: CRES Section %s does not contain a recognised resource" % i) + rscnames.append(None) + sect.setsectiondescription(i,"Mysterious CRES data, first four bytes %s" % describe(data[0:4])) + if DUMP: + fname = "unknown%05d.dat" % i + outname= os.path.join(files.outdir, fname) + with open(pathof(outname), 'wb') as f: + f.write(data) + sect.setsectiondescription(i,"Mysterious CRES data, first four bytes %s extracting as %s" % (describe(data[0:4]), fname)) + rsc_ptr += 1 + return rscnames, rsc_ptr + + if use_hd: + # overwrite corresponding lower res image with hd version + imgname = rscnames[rsc_ptr] + imgdest = files.imgdir + else: + imgname = "HDimage%05d.%s" % (i, imgtype) + imgdest = files.hdimgdir + print("Extracting HD image: {0:s} from section {1:d}".format(imgname,i)) + outimg = os.path.join(imgdest, imgname) + with open(pathof(outimg), 'wb') as f: + f.write(data) + rscnames.append(None) + sect.setsectiondescription(i,"Optional HD Image {0:s}".format(imgname)) + rsc_ptr += 1 + return rscnames, rsc_ptr + + +def processCONT(i, files, rscnames, sect, data): + global DUMP + # process a container header, most of this is unknown + # right now only extract its EXTH + dt = data[0:12] + if dt == b"CONTBOUNDARY": + rscnames.append(None) + sect.setsectiondescription(i,"CONTAINER BOUNDARY") + else: + sect.setsectiondescription(i,"CONT Header") + rscnames.append(None) + if DUMP: + cpage, = struct.unpack_from(b'>L', data, 12) + contexth = data[48:] + print("\n\nContainer EXTH Dump") + dump_contexth(cpage, contexth) + fname = "CONT_Header%05d.dat" % i + outname= os.path.join(files.outdir, fname) + with open(pathof(outname), 'wb') as f: + f.write(data) + return rscnames + + +def processkind(i, files, rscnames, sect, data): + global DUMP + dt = data[0:12] + if dt == b"kindle:embed": + if DUMP: + print("\n\nHD Image Container Description String") + print(data) + sect.setsectiondescription(i,"HD Image Container Description String") + rscnames.append(None) + return rscnames + + +# spine information from the original content.opf +def processRESC(i, files, rscnames, sect, data, 
k8resc): + global DUMP + if DUMP: + rescname = "RESC%05d.dat" % i + print("Extracting Resource: ", rescname) + outrsc = os.path.join(files.outdir, rescname) + with open(pathof(outrsc), 'wb') as f: + f.write(data) + if True: # try: + # parse the spine and metadata from RESC + k8resc = K8RESCProcessor(data[16:], DUMP) + else: # except: + print("Warning: cannot extract information from RESC.") + k8resc = None + rscnames.append(None) + sect.setsectiondescription(i,"K8 RESC section") + return rscnames, k8resc + + +def processImage(i, files, rscnames, sect, data, beg, rsc_ptr, cover_offset, thumb_offset): + global DUMP + # Extract an Image + imgtype = get_image_type(None, data) + if imgtype is None: + print("Warning: Section %s does not contain a recognised resource" % i) + rscnames.append(None) + sect.setsectiondescription(i,"Mysterious Section, first four bytes %s" % describe(data[0:4])) + if DUMP: + fname = "unknown%05d.dat" % i + outname= os.path.join(files.outdir, fname) + with open(pathof(outname), 'wb') as f: + f.write(data) + sect.setsectiondescription(i,"Mysterious Section, first four bytes %s extracting as %s" % (describe(data[0:4]), fname)) + return rscnames, rsc_ptr + + imgname = "image%05d.%s" % (i, imgtype) + if cover_offset is not None and i == beg + cover_offset: + imgname = "cover%05d.%s" % (i, imgtype) + if thumb_offset is not None and i == beg + thumb_offset: + imgname = "thumb%05d.%s" % (i, imgtype) + print("Extracting image: {0:s} from section {1:d}".format(imgname,i)) + outimg = os.path.join(files.imgdir, imgname) + with open(pathof(outimg), 'wb') as f: + f.write(data) + rscnames.append(imgname) + sect.setsectiondescription(i,"Image {0:s}".format(imgname)) + if rsc_ptr == -1: + rsc_ptr = i - beg + return rscnames, rsc_ptr + + +def processPrintReplica(metadata, files, rscnames, mh): + global DUMP + global WRITE_RAW_DATA + rawML = mh.getRawML() + if DUMP or WRITE_RAW_DATA: + outraw = os.path.join(files.outdir,files.getInputFileBasename() + '.rawpr') + with open(pathof(outraw),'wb') as f: + f.write(rawML) + + fileinfo = [] + print("Print Replica ebook detected") + try: + numTables, = struct.unpack_from(b'>L', rawML, 0x04) + tableIndexOffset = 8 + 4*numTables + # for each table, read in count of sections, assume first section is a PDF + # and output other sections as binary files + for i in range(numTables): + sectionCount, = struct.unpack_from(b'>L', rawML, 0x08 + 4*i) + for j in range(sectionCount): + sectionOffset, sectionLength, = struct.unpack_from(b'>LL', rawML, tableIndexOffset) + tableIndexOffset += 8 + if j == 0: + entryName = os.path.join(files.outdir, files.getInputFileBasename() + ('.%03d.pdf' % (i+1))) + else: + entryName = os.path.join(files.outdir, files.getInputFileBasename() + ('.%03d.%03d.data' % ((i+1),j))) + with open(pathof(entryName), 'wb') as f: + f.write(rawML[sectionOffset:(sectionOffset+sectionLength)]) + except Exception as e: + print('Error processing Print Replica: ' + str(e)) + + fileinfo.append([None,'', files.getInputFileBasename() + '.pdf']) + usedmap = {} + for name in rscnames: + if name is not None: + usedmap[name] = 'used' + opf = OPFProcessor(files, metadata, fileinfo, rscnames, False, mh, usedmap) + opf.writeOPF() + + +def processMobi8(mh, metadata, sect, files, rscnames, pagemapproc, k8resc, obfuscate_data, apnxfile=None, epubver='2'): + global DUMP + global WRITE_RAW_DATA + + # extract raw markup langauge + rawML = mh.getRawML() + if DUMP or WRITE_RAW_DATA: + outraw = os.path.join(files.k8dir,files.getInputFileBasename() + '.rawml') + 
with open(pathof(outraw),'wb') as f:
+            f.write(rawML)
+
+    # KF8 requires other indexes which contain parsing information and the FDST info
+    # to process the rawml back into the xhtml files, css files, svg image files, etc
+    k8proc = K8Processor(mh, sect, files, DUMP)
+    k8proc.buildParts(rawML)
+
+    # collect information for the guide first
+    guidetext = unicode_str(k8proc.getGuideText())
+
+    # if the guide was empty, add in any guide info from metadata, such as StartOffset
+    if not guidetext and 'StartOffset' in metadata:
+        # Apparently, KG 2.5 carries over the StartOffset from the mobi7 part...
+        # Taking that into account, we only care about the *last* StartOffset, which
+        # should always be the correct one in these cases (the one actually pointing
+        # to the right place in the mobi8 part).
+        starts = metadata['StartOffset']
+        last_start = starts[-1]
+        last_start = int(last_start)
+        if last_start == 0xffffffff:
+            last_start = 0
+        seq, idtext = k8proc.getFragTblInfo(last_start)
+        filename, idtext = k8proc.getIDTagByPosFid(toBase32(seq), b'0000000000')
+        linktgt = filename
+        idtext = unicode_str(idtext, mh.codec)
+        if idtext != '':
+            linktgt += '#' + idtext
+        guidetext += '<reference type="text" href="%s" />\n' % linktgt
+
+    # if apnxfile is passed in use it for page map information
+    if apnxfile is not None and pagemapproc is None:
+        with open(apnxfile, 'rb') as f:
+            apnxdata = b"00000000" + f.read()
+        pagemapproc = PageMapProcessor(mh, apnxdata)
+
+    # generate the page map
+    pagemapxml = ''
+    if pagemapproc is not None:
+        pagemapxml = pagemapproc.generateKF8PageMapXML(k8proc)
+        outpm = os.path.join(files.k8oebps, 'page-map.xml')
+        with open(pathof(outpm),'wb') as f:
+            f.write(pagemapxml.encode('utf-8'))
+        if DUMP:
+            print(pagemapproc.getNames())
+            print(pagemapproc.getOffsets())
+            print("\n\nPage Map")
+            print(pagemapxml)
+
+    # process the toc ncx
+    # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
+    print("Processing ncx / toc")
+    ncx = ncxExtract(mh, files)
+    ncx_data = ncx.parseNCX()
+    # extend the ncx data with filenames and proper internal idtags
+    for i in range(len(ncx_data)):
+        ncxmap = ncx_data[i]
+        [junk1, junk2, junk3, fid, junk4, off] = ncxmap['pos_fid'].split(':')
+        filename, idtag = k8proc.getIDTagByPosFid(fid, off)
+        ncxmap['filename'] = filename
+        ncxmap['idtag'] = unicode_str(idtag)
+        ncx_data[i] = ncxmap
+
+    # convert the rawML to a set of xhtml files
+    print("Building an epub-like structure")
+    htmlproc = XHTMLK8Processor(rscnames, k8proc)
+    usedmap = htmlproc.buildXHTML()
+
+    # write out the xhtml, svg, and css files
+    # fileinfo = [skelid|coverpage, dir, name]
+    fileinfo = []
+    # first create a cover page if none exists
+    if CREATE_COVER_PAGE:
+        cover = CoverProcessor(files, metadata, rscnames)
+        cover_img = utf8_str(cover.getImageName())
+        need_to_create_cover_page = False
+        if cover_img is not None:
+            if k8resc is None or not k8resc.hasSpine():
+                part = k8proc.getPart(0)
+                if part.find(cover_img) == -1:
+                    need_to_create_cover_page = True
+            else:
+                if "coverpage" not in k8resc.spine_idrefs:
+                    part = k8proc.getPart(int(k8resc.spine_order[0]))
+                    if part.find(cover_img) == -1:
+                        k8resc.prepend_to_spine("coverpage", "inserted", "no", None)
+                if k8resc.spine_order[0] == "coverpage":
+                    need_to_create_cover_page = True
+            if need_to_create_cover_page:
+                filename = cover.getXHTMLName()
+                fileinfo.append(["coverpage", 'Text', filename])
+                guidetext += cover.guide_toxml()
+                cover.writeXHTML()
+
+    n = k8proc.getNumberOfParts()
+    for i in range(n):
+        part = k8proc.getPart(i)
+        [skelnum, dir, filename, beg, end, aidtext] = k8proc.getPartInfo(i)
+        fileinfo.append([str(skelnum), dir, filename])
+        fname = os.path.join(files.k8oebps, dir, filename)
+        with open(pathof(fname),'wb') as f:
+            f.write(part)
+    n = k8proc.getNumberOfFlows()
+    for i in range(1, n):
+        [ptype, pformat, pdir, filename] = k8proc.getFlowInfo(i)
+        flowpart = k8proc.getFlow(i)
+        if pformat == b'file':
+            fileinfo.append([None, pdir, filename])
+            fname = os.path.join(files.k8oebps, pdir, filename)
+            with open(pathof(fname),'wb') as f:
+                f.write(flowpart)
+
+    # create the opf
+    opf = OPFProcessor(files, metadata.copy(), fileinfo, rscnames, True, mh, usedmap,
+                       pagemapxml=pagemapxml, guidetext=guidetext, k8resc=k8resc, epubver=epubver)
+    uuid = opf.writeOPF(bool(obfuscate_data))
+
+    if opf.hasNCX():
+        # Create a toc.ncx.
+        ncx.writeK8NCX(ncx_data, metadata)
+    if opf.hasNAV():
+        # Create a navigation document.
+        nav = NAVProcessor(files)
+        nav.writeNAV(ncx_data, guidetext, metadata)
+
+    # make an epub-like structure of it all
+    print("Creating an epub-like file")
+    files.makeEPUB(usedmap, obfuscate_data, uuid)
+
+
+def processMobi7(mh, metadata, sect, files, rscnames):
+    global DUMP
+    global WRITE_RAW_DATA
+    # An original Mobi
+    rawML = mh.getRawML()
+    if DUMP or WRITE_RAW_DATA:
+        outraw = os.path.join(files.mobi7dir, files.getInputFileBasename() + '.rawml')
+        with open(pathof(outraw),'wb') as f:
+            f.write(rawML)
+
+    # process the toc ncx
+    # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
+    ncx = ncxExtract(mh, files)
+    ncx_data = ncx.parseNCX()
+    ncx.writeNCX(metadata)
+
+    positionMap = {}
+
+    # if Dictionary build up the positionMap
+    if mh.isDictionary():
+        if mh.DictInLanguage():
+            metadata['DictInLanguage'] = [mh.DictInLanguage()]
+        if mh.DictOutLanguage():
+            metadata['DictOutLanguage'] = [mh.DictOutLanguage()]
+        positionMap = dictSupport(mh, sect).getPositionMap()
+
+    # convert the rawml back to Mobi ml
+    proc = HTMLProcessor(files, metadata, rscnames)
+    srctext = proc.findAnchors(rawML, ncx_data, positionMap)
+    srctext, usedmap = proc.insertHREFS()
+
+    # write the proper mobi html
+    fileinfo = []
+    # fname = files.getInputFileBasename() + '.html'
+    fname = 'book.html'
+    fileinfo.append([None, '', fname])
+    outhtml = os.path.join(files.mobi7dir, fname)
+    with open(pathof(outhtml), 'wb') as f:
+        f.write(srctext)
+
+    # extract guidetext from srctext
+    guidetext = b''
+    # no pagemap support for older mobis
+    # pagemapxml = None
+    guidematch = re.search(br'''<guide>(.*)</guide>''', srctext, re.IGNORECASE + re.DOTALL)
+    if guidematch:
+        guidetext = guidematch.group(1)
+        # sometimes old mobi guide from srctext horribly written so need to clean up
+        guidetext = guidetext.replace(b"\r", b"")
+        guidetext = guidetext.replace(b"<REFERENCE", b"<reference")
+        ref_tag_pattern = re.compile(br'''(<reference [^>]*>)''', re.IGNORECASE)
+        guidepieces = ref_tag_pattern.split(guidetext)
+        for i in range(1, len(guidepieces), 2):
+            reftag = guidepieces[i]
+            # remove any href there now to replace with filepos
+            reftag = re.sub(br'''href\s*=[^'"]*['"][^'"]*['"]''', b'', reftag)
+            # make sure the reference tag ends properly
+            if not reftag.endswith(b"/>"):
+                reftag = reftag[0:-1] + b"/>"
+            guidepieces[i] = reftag
+        guidetext = b''.join(guidepieces)
+        replacetext = br'''href="''' + utf8_str(fileinfo[0][2]) + br'''#filepos\1"'''
+        guidetext = re.sub(br'''filepos=['"]{0,1}0*(\d+)['"]{0,1}''', replacetext, guidetext)
+        guidetext += b'\n'
+
+    if 'StartOffset' in metadata:
+        for value in metadata['StartOffset']:
+            if int(value) == 0xffffffff:
+                value = '0'
+            starting_offset = value
+        # get guide items from metadata
+        metaguidetext = b'<reference type="text" href="' + utf8_str(fileinfo[0][2]) + b'#filepos' + utf8_str(starting_offset) + b'" />\n'
+        guidetext += metaguidetext
+
+    if isinstance(guidetext, binary_type):
+        guidetext = guidetext.decode(mh.codec)
+
+    # create an OPF
+    opf = OPFProcessor(files, metadata, fileinfo, rscnames, ncx.isNCX, mh, usedmap, guidetext=guidetext)
+    opf.writeOPF()
+
+
+def processUnknownSections(mh, sect, files, K8Boundary):
+    global DUMP
+    global TERMINATION_INDICATOR1
+    global TERMINATION_INDICATOR2
+    global TERMINATION_INDICATOR3
+    if DUMP:
+        print("Unpacking any remaining unknown records")
+    beg = mh.start
+    end = sect.num_sections
+    if beg < K8Boundary:
+        # then we're processing the first part of a combination file
+        end = K8Boundary
+    for i in range(beg, end):
+        if sect.sectiondescriptions[i] == "":
+            data = sect.loadSection(i)
+            type = data[0:4]
+            if type == TERMINATION_INDICATOR3:
+                description = "Termination Marker 3 Nulls"
+            elif type == TERMINATION_INDICATOR2:
+                description = "Termination Marker 2 Nulls"
+            elif type == TERMINATION_INDICATOR1:
+                description = "Termination Marker 1 Null"
+            elif type == b"INDX":
+                fname = "Unknown%05d_INDX.dat" % i
+                description = "Unknown INDX section"
+                if DUMP:
+                    outname = os.path.join(files.outdir, fname)
+                    with open(pathof(outname), 'wb') as f:
+                        f.write(data)
+                    print("Extracting %s: %s from section %d" % (description, fname, i))
+                    description = description + ", extracting as %s" % fname
+            else:
+                fname = "unknown%05d.dat" % i
+                description = "Mysterious Section, first four bytes %s" % describe(data[0:4])
+                if DUMP:
+                    outname = os.path.join(files.outdir, fname)
+                    with open(pathof(outname), 'wb') as f:
+                        f.write(data)
+                    print("Extracting %s: %s from section %d" % (description, fname, i))
+                    description = description + ", extracting as %s" % fname
+            sect.setsectiondescription(i, description)
+
+
+def process_all_mobi_headers(files, apnxfile, sect, mhlst, K8Boundary, k8only=False, epubver='2', use_hd=False):
+    global DUMP
+    global WRITE_RAW_DATA
+    rscnames = []
+    rsc_ptr = -1
+    k8resc = None
+    obfuscate_data = []
+    for mh in mhlst:
+        pagemapproc = None
+        if mh.isK8():
+            sect.setsectiondescription(mh.start, "KF8 Header")
+            mhname = os.path.join(files.outdir, "header_K8.dat")
+            print("Processing K8 section of book...")
+        elif mh.isPrintReplica():
+            sect.setsectiondescription(mh.start, "Print Replica Header")
+            mhname = os.path.join(files.outdir, "header_PR.dat")
+            print("Processing PrintReplica section of book...")
+        else:
+            if mh.version == 0:
+                sect.setsectiondescription(mh.start, "PalmDoc Header")
+            else:
+                sect.setsectiondescription(mh.start, "Mobipocket {0:d} Header".format(mh.version))
+            mhname = os.path.join(files.outdir, "header.dat")
+            print("Processing Mobipocket {0:d} section of book...".format(mh.version))
+
+        if DUMP:
+            # write out raw mobi header data
+            with open(pathof(mhname), 'wb') as f:
+                f.write(mh.header)
+
+        # process each mobi header
+        metadata = mh.getMetaData()
+        mh.describeHeader(DUMP)
+        if mh.isEncrypted():
+            raise unpackException('Book is encrypted')
+
+        pagemapproc = None
+
+        # first handle all of the different resource sections: images, resources, fonts, etc.
+        # build up a list of image names to use to postprocess the ebook
+
+        print("Unpacking images, resources, fonts, etc")
+        beg = mh.firstresource
+        end = sect.num_sections
+        if beg < K8Boundary:
+            # processing first part of a combination file
+            end = K8Boundary
+
+        # Not sure the try/except is necessary, but just in case
+        try:
+            thumb_offset = int(metadata.get('ThumbOffset', ['-1'])[0])
+        except:
thumb_offset = None + + cover_offset = int(metadata.get('CoverOffset', ['-1'])[0]) + if not CREATE_COVER_PAGE: + cover_offset = None + + for i in range(beg, end): + data = sect.loadSection(i) + type = data[0:4] + + # handle the basics first + if type in [b"FLIS", b"FCIS", b"FDST", b"DATP"]: + if DUMP: + fname = unicode_str(type) + "%05d" % i + if mh.isK8(): + fname += "_K8" + fname += '.dat' + outname= os.path.join(files.outdir, fname) + with open(pathof(outname), 'wb') as f: + f.write(data) + print("Dumping section {0:d} type {1:s} to file {2:s} ".format(i,unicode_str(type),outname)) + sect.setsectiondescription(i,"Type {0:s}".format(unicode_str(type))) + rscnames.append(None) + elif type == b"SRCS": + rscnames = processSRCS(i, files, rscnames, sect, data) + elif type == b"PAGE": + rscnames, pagemapproc = processPAGE(i, files, rscnames, sect, data, mh, pagemapproc) + elif type == b"CMET": + rscnames = processCMET(i, files, rscnames, sect, data) + elif type == b"FONT": + rscnames, obfuscate_data, rsc_ptr = processFONT(i, files, rscnames, sect, data, obfuscate_data, beg, rsc_ptr) + elif type == b"CRES": + rscnames, rsc_ptr = processCRES(i, files, rscnames, sect, data, beg, rsc_ptr, use_hd) + elif type == b"CONT": + rscnames = processCONT(i, files, rscnames, sect, data) + elif type == b"kind": + rscnames = processkind(i, files, rscnames, sect, data) + elif type == b'\xa0\xa0\xa0\xa0': + sect.setsectiondescription(i,"Empty_HD_Image/Resource_Placeholder") + rscnames.append(None) + rsc_ptr += 1 + elif type == b"RESC": + rscnames, k8resc = processRESC(i, files, rscnames, sect, data, k8resc) + elif data == EOF_RECORD: + sect.setsectiondescription(i,"End Of File") + rscnames.append(None) + elif data[0:8] == b"BOUNDARY": + sect.setsectiondescription(i,"BOUNDARY Marker") + rscnames.append(None) + else: + # if reached here should be an image ow treat as unknown + rscnames, rsc_ptr = processImage(i, files, rscnames, sect, data, beg, rsc_ptr, cover_offset, thumb_offset) + # done unpacking resources + + # Print Replica + if mh.isPrintReplica() and not k8only: + processPrintReplica(metadata, files, rscnames, mh) + continue + + # KF8 (Mobi 8) + if mh.isK8(): + processMobi8(mh, metadata, sect, files, rscnames, pagemapproc, k8resc, obfuscate_data, apnxfile, epubver) + + # Old Mobi (Mobi 7) + elif not k8only: + processMobi7(mh, metadata, sect, files, rscnames) + + # process any remaining unknown sections of the palm file + processUnknownSections(mh, sect, files, K8Boundary) + + return + + +def unpackBook(infile, outdir, apnxfile=None, epubver='2', use_hd=False, dodump=False, dowriteraw=False, dosplitcombos=False): + global DUMP + global WRITE_RAW_DATA + global SPLIT_COMBO_MOBIS + if DUMP or dodump: + DUMP = True + if WRITE_RAW_DATA or dowriteraw: + WRITE_RAW_DATA = True + if SPLIT_COMBO_MOBIS or dosplitcombos: + SPLIT_COMBO_MOBIS = True + + infile = unicode_str(infile) + outdir = unicode_str(outdir) + if apnxfile is not None: + apnxfile = unicode_str(apnxfile) + + files = fileNames(infile, outdir) + + # process the PalmDoc database header and verify it is a mobi + sect = Sectionizer(infile) + if sect.ident != b'BOOKMOBI' and sect.ident != b'TEXtREAd': + raise unpackException('Invalid file format') + if DUMP: + sect.dumppalmheader() + else: + print("Palm DB type: %s, %d sections." % (sect.ident.decode('utf-8'),sect.num_sections)) + + # scan sections to see if this is a compound mobi file (K8 format) + # and build a list of all mobi headers to process. 
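+ # (Editorial sketch, not part of KindleUnpack: the boundary scan below leans
+ # on the PalmDB record table, i.e. a big-endian uint16 record count at byte
+ # 76 followed by 8-byte entries at byte 78 whose first four bytes are each
+ # section's file offset. A combination file shows up as a section exactly 8
+ # bytes long whose payload equals the KF8 boundary marker. A toy reader:
+ #
+ #     import struct
+ #     def palmdb_offsets(data):
+ #         n, = struct.unpack_from(b'>H', data, 76)
+ #         offs = [struct.unpack_from(b'>L', data, 78 + 8 * i)[0] for i in range(n)]
+ #         return offs + [len(data)]  # sentinel so section sizes can be computed
+ # )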
+ mhlst = [] + mh = MobiHeader(sect,0) + # if this is a mobi8-only file hasK8 here will be true + mhlst.append(mh) + K8Boundary = -1 + + if mh.isK8(): + print("Unpacking a KF8 book...") + hasK8 = True + else: + # This is either a Mobipocket 7 or earlier, or a combi M7/KF8 + # Find out which + hasK8 = False + for i in range(len(sect.sectionoffsets)-1): + before, after = sect.sectionoffsets[i:i+2] + if (after - before) == 8: + data = sect.loadSection(i) + if data == K8_BOUNDARY: + sect.setsectiondescription(i,"Mobi/KF8 Boundary Section") + mh = MobiHeader(sect,i+1) + hasK8 = True + mhlst.append(mh) + K8Boundary = i + break + if hasK8: + print("Unpacking a Combination M{0:d}/KF8 book...".format(mh.version)) + if SPLIT_COMBO_MOBIS: + # if this is a combination mobi7-mobi8 file split them up + mobisplit = mobi_split(infile) + if mobisplit.combo: + outmobi7 = os.path.join(files.outdir, 'mobi7-'+files.getInputFileBasename() + '.mobi') + outmobi8 = os.path.join(files.outdir, 'mobi8-'+files.getInputFileBasename() + '.azw3') + with open(pathof(outmobi7), 'wb') as f: + f.write(mobisplit.getResult7()) + with open(pathof(outmobi8), 'wb') as f: + f.write(mobisplit.getResult8()) + else: + print("Unpacking a Mobipocket {0:d} book...".format(mh.version)) + + if hasK8: + files.makeK8Struct() + + process_all_mobi_headers(files, apnxfile, sect, mhlst, K8Boundary, False, epubver, use_hd) + + if DUMP: + sect.dumpsectionsinfo() + return + + +def usage(progname): + print("") + print("Description:") + print(" Unpacks an unencrypted Kindle/MobiPocket ebook to html and images") + print(" or an unencrypted Kindle/Print Replica ebook to PDF and images") + print(" into the specified output folder.") + print("Usage:") + print(" %s -r -s -p apnxfile -d -h --epub_version= infile [outdir]" % progname) + print("Options:") + print(" -h print this help message") + print(" -i use HD Images, if present, to overwrite reduced resolution images") + print(" -s split combination mobis into mobi7 and mobi8 ebooks") + print(" -p APNXFILE path to an .apnx file associated with the azw3 input (optional)") + print(" --epub_version= specify epub version to unpack to: 2, 3, A (for automatic) or ") + print(" F (force to fit to epub2 definitions), default is 2") + print(" -d dump headers and other info to output and extra files") + print(" -r write raw data to the output folder") + + +def main(argv=unicode_argv()): + global DUMP + global WRITE_RAW_DATA + global SPLIT_COMBO_MOBIS + + print("KindleUnpack v0.83") + print(" Based on initial mobipocket version Copyright © 2009 Charles M. Hannum ") + print(" Extensive Extensions and Improvements Copyright © 2009-2020 ") + print(" by: P. Durrant, K. Hendricks, S. 
Siebert, fandrieu, DiapDealer, nickredding, tkeo.") + print(" This program is free software: you can redistribute it and/or modify") + print(" it under the terms of the GNU General Public License as published by") + print(" the Free Software Foundation, version 3.") + + progname = os.path.basename(argv[0]) + try: + opts, args = getopt.getopt(argv[1:], "dhirsp:", ['epub_version=']) + except getopt.GetoptError as err: + print(str(err)) + usage(progname) + sys.exit(2) + + if len(args)<1: + usage(progname) + sys.exit(2) + + apnxfile = None + epubver = '2' + use_hd = False + + for o, a in opts: + if o == "-h": + usage(progname) + sys.exit(0) + if o == "-i": + use_hd = True + if o == "-d": + DUMP = True + if o == "-r": + WRITE_RAW_DATA = True + if o == "-s": + SPLIT_COMBO_MOBIS = True + if o == "-p": + apnxfile = a + if o == "--epub_version": + epubver = a + + if len(args) > 1: + infile, outdir = args + else: + infile = args[0] + outdir = os.path.splitext(infile)[0] + + infileext = os.path.splitext(infile)[1].upper() + if infileext not in ['.MOBI', '.PRC', '.AZW', '.AZW3', '.AZW4']: + print("Error: first parameter must be a Kindle/Mobipocket ebook or a Kindle/Print Replica ebook.") + return 1 + + try: + print('Unpacking Book...') + unpackBook(infile, outdir, apnxfile, epubver, use_hd) + print('Completed') + + except ValueError as e: + print("Error: %s" % e) + print(traceback.format_exc()) + return 1 + + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/epy_reader/tools/KindleUnpack/mobi_cover.py b/src/epy_reader/tools/KindleUnpack/mobi_cover.py new file mode 100644 index 0000000..3078ac4 --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_cover.py @@ -0,0 +1,238 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import unicode_str + +from .unipath import pathof +import os +import imghdr + +import struct +# note: struct pack, unpack, unpack_from all require bytestring format +# data all the way up to at least python 2.7.5, python 3 okay with bytestring + +USE_SVG_WRAPPER = True +""" Set to True to use svg wrapper for default. """ + +FORCE_DEFAULT_TITLE = False +""" Set to True to force to use the default title. """ + +COVER_PAGE_FINENAME = 'cover_page.xhtml' +""" The name for the cover page. """ + +DEFAULT_TITLE = 'Cover' +""" The default title for the cover page. """ + +MAX_WIDTH = 4096 +""" The max width for the svg cover page. """ + +MAX_HEIGHT = 4096 +""" The max height for the svg cover page. """ + + +def get_image_type(imgname, imgdata=None): + imgtype = unicode_str(imghdr.what(pathof(imgname), imgdata)) + + # imghdr only checks for JFIF or Exif JPEG files. Apparently, there are some + # with only the magic JPEG bytes out there... + # ImageMagick handles those, so, do it too. + if imgtype is None: + if imgdata is None: + with open(pathof(imgname), 'rb') as f: + imgdata = f.read() + if imgdata[0:2] == b'\xFF\xD8': + # Get last non-null bytes + last = len(imgdata) + while (imgdata[last-1:last] == b'\x00'): + last-=1 + # Be extra safe, check the trailing bytes, too. + if imgdata[last-2:last] == b'\xFF\xD9': + imgtype = "jpeg" + return imgtype + + +def get_image_size(imgname, imgdata=None): + '''Determine the image type of imgname (or imgdata) and return its size. + + Originally, + Determine the image type of fhandle and return its size. 
+ from draco''' + if imgdata is None: + fhandle = open(pathof(imgname), 'rb') + head = fhandle.read(24) + else: + head = imgdata[0:24] + if len(head) != 24: + return + + imgtype = get_image_type(imgname, imgdata) + if imgtype == 'png': + check = struct.unpack(b'>i', head[4:8])[0] + if check != 0x0d0a1a0a: + return + width, height = struct.unpack(b'>ii', head[16:24]) + elif imgtype == 'gif': + width, height = struct.unpack(b'H', fhandle.read(2))[0] - 2 + # We are at a SOFn block + fhandle.seek(1, 1) # Skip `precision' byte. + height, width = struct.unpack(b'>HH', fhandle.read(4)) + except Exception: # IGNORE:W0703 + return + elif imgtype == 'jpeg' and imgdata is not None: + try: + pos = 0 + size = 2 + ftype = 0 + while not 0xc0 <= ftype <= 0xcf: + pos += size + byte = imgdata[pos:pos+1] + pos += 1 + while ord(byte) == 0xff: + byte = imgdata[pos:pos+1] + pos += 1 + ftype = ord(byte) + size = struct.unpack(b'>H', imgdata[pos:pos+2])[0] - 2 + pos += 2 + # We are at a SOFn block + pos += 1 # Skip `precision' byte. + height, width = struct.unpack(b'>HH', imgdata[pos:pos+4]) + pos += 4 + except Exception: # IGNORE:W0703 + return + else: + return + return width, height + +# XXX experimental +class CoverProcessor(object): + + """Create a cover page. + + """ + def __init__(self, files, metadata, rscnames, imgname=None, imgdata=None): + self.files = files + self.metadata = metadata + self.rscnames = rscnames + self.cover_page = COVER_PAGE_FINENAME + self.use_svg = USE_SVG_WRAPPER # Use svg wrapper. + self.lang = metadata.get('Language', ['en'])[0] + # This should ensure that if the methods to find the cover image's + # dimensions should fail for any reason, the SVG routine will not be used. + [self.width, self.height] = (-1,-1) + if FORCE_DEFAULT_TITLE: + self.title = DEFAULT_TITLE + else: + self.title = metadata.get('Title', [DEFAULT_TITLE])[0] + + self.cover_image = None + if imgname is not None: + self.cover_image = imgname + elif 'CoverOffset' in metadata: + imageNumber = int(metadata['CoverOffset'][0]) + cover_image = self.rscnames[imageNumber] + if cover_image is not None: + self.cover_image = cover_image + else: + print('Warning: Cannot identify the cover image.') + if self.use_svg: + try: + if imgdata is None: + fname = os.path.join(files.imgdir, self.cover_image) + [self.width, self.height] = get_image_size(fname) + else: + [self.width, self.height] = get_image_size(None, imgdata) + except: + self.use_svg = False + width = self.width + height = self.height + if width < 0 or height < 0 or width > MAX_WIDTH or height > MAX_HEIGHT: + self.use_svg = False + return + + def getImageName(self): + return self.cover_image + + def getXHTMLName(self): + return self.cover_page + + def buildXHTML(self): + print('Building a cover page.') + files = self.files + cover_image = self.cover_image + title = self.title + lang = self.lang + + image_dir = os.path.normpath(os.path.relpath(files.k8images, files.k8text)) + image_path = os.path.join(image_dir, cover_image).replace('\\', '/') + + if not self.use_svg: + data = '' + data += '' + data += 'L', idata, 0x14) + count, = struct.unpack_from(b'>L', idata, 0x18) + self.starts.append(start) + self.counts.append(count) + + def lookup(self, lookupvalue): + i = 0 + rvalue = lookupvalue + while rvalue >= self.counts[i]: + rvalue = rvalue - self.counts[i] + i += 1 + if i == len(self.counts): + print("Error: Problem with multiple inflections data sections") + return lookupvalue, self.starts[0], self.counts[0], self.infldatas[0] + return rvalue, self.starts[i], 
self.counts[i], self.infldatas[i] + + def offsets(self, value): + rvalue, start, count, data = self.lookup(value) + offset, = struct.unpack_from(b'>H', data, start + 4 + (2 * rvalue)) + if rvalue + 1 < count: + nextOffset, = struct.unpack_from(b'>H',data, start + 4 + (2 * (rvalue + 1))) + else: + nextOffset = None + return offset, nextOffset, data + + +class dictSupport(object): + + def __init__(self, mh, sect): + self.mh = mh + self.header = mh.header + self.sect = sect + self.metaOrthIndex = mh.metaOrthIndex + self.metaInflIndex = mh.metaInflIndex + + def parseHeader(self, data): + "read INDX header" + if not data[:4] == b'INDX': + print("Warning: index section is not INDX") + return False + words = ( + 'len', 'nul1', 'type', 'gen', 'start', 'count', 'code', + 'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc' + ) + num = len(words) + values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)]) + header = {} + for n in range(num): + header[words[n]] = values[n] + + ordt1 = None + ordt2 = None + + otype, oentries, op1, op2, otagx = struct.unpack_from(b'>LLLLL',data, 0xa4) + header['otype'] = otype + header['oentries'] = oentries + + if DEBUG_DICT: + print("otype %d, oentries %d, op1 %d, op2 %d, otagx %d" % (otype, oentries, op1, op2, otagx)) + + if header['code'] == 0xfdea or oentries > 0: + # some dictionaries seem to be codepage 65002 (0xFDEA) which seems + # to be some sort of strange EBCDIC utf-8 or 16 encoded strings + # So we need to look for them and store them away to process leading text + # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries + # we only ever seem to use the second but ... + # + # if otype = 0, ORDT table uses 16 bit values as offsets into the table + # if otype = 1, ORDT table uses 8 bit values as offsets inot the table + + assert(data[op1:op1+4] == b'ORDT') + assert(data[op2:op2+4] == b'ORDT') + ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4) + ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4) + + if DEBUG_DICT: + print("parsed INDX header:") + for key in header: + print(key, "%x" % header[key],) + print("\n") + return header, ordt1, ordt2 + + def getPositionMap(self): + sect = self.sect + + positionMap = {} + + metaOrthIndex = self.metaOrthIndex + metaInflIndex = self.metaInflIndex + + decodeInflection = True + if metaOrthIndex != 0xFFFFFFFF: + print("Info: Document contains orthographic index, handle as dictionary") + if metaInflIndex == 0xFFFFFFFF: + decodeInflection = False + else: + metaInflIndexData = sect.loadSection(metaInflIndex) + + print("\nParsing metaInflIndexData") + midxhdr, mhordt1, mhordt2 = self.parseHeader(metaInflIndexData) + + metaIndexCount = midxhdr['count'] + idatas = [] + for j in range(metaIndexCount): + idatas.append(sect.loadSection(metaInflIndex + 1 + j)) + dinfl = InflectionData(idatas) + + inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount) + tagSectionStart = midxhdr['len'] + inflectionControlByteCount, inflectionTagTable = readTagSection(tagSectionStart, metaInflIndexData) + if DEBUG_DICT: + print("inflectionTagTable: %s" % inflectionTagTable) + if self.hasTag(inflectionTagTable, 0x07): + print("Error: Dictionary uses obsolete inflection rule scheme which is not yet supported") + decodeInflection = False + + data = sect.loadSection(metaOrthIndex) + + print("\nParsing metaOrthIndex") + idxhdr, hordt1, hordt2 = self.parseHeader(data) + + tagSectionStart = idxhdr['len'] + controlByteCount, tagTable = readTagSection(tagSectionStart, data) + orthIndexCount = idxhdr['count'] + 
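+ # (Editorial aside, not from the source: each index record below carries an
+ # IDXT position table, big-endian uint16 offsets located just past a 4-byte
+ # b'IDXT' tag at hdrinfo['start'], and consecutive offsets delimit one
+ # dictionary entry each:
+ #
+ #     positions = [struct.unpack_from(b'>H', data, idxtPos + 4 + 2 * j)[0]
+ #                  for j in range(entryCount)]
+ #
+ # which is exactly the inline loop that follows.)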
print("orthIndexCount is", orthIndexCount) + if DEBUG_DICT: + print("orthTagTable: %s" % tagTable) + if hordt2 is not None: + print("orth entry uses ordt2 lookup table of type ", idxhdr['otype']) + hasEntryLength = self.hasTag(tagTable, 0x02) + if not hasEntryLength: + print("Info: Index doesn't contain entry length tags") + + print("Read dictionary index data") + for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount): + data = sect.loadSection(i) + hdrinfo, ordt1, ordt2 = self.parseHeader(data) + idxtPos = hdrinfo['start'] + entryCount = hdrinfo['count'] + idxPositions = [] + for j in range(entryCount): + pos, = struct.unpack_from(b'>H', data, idxtPos + 4 + (2 * j)) + idxPositions.append(pos) + # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!) + idxPositions.append(idxtPos) + for j in range(entryCount): + startPos = idxPositions[j] + endPos = idxPositions[j+1] + textLength = ord(data[startPos:startPos+1]) + text = data[startPos+1:startPos+1+textLength] + if hordt2 is not None: + utext = u"" + if idxhdr['otype'] == 0: + pattern = b'>H' + inc = 2 + else: + pattern = b'>B' + inc = 1 + pos = 0 + while pos < textLength: + off, = struct.unpack_from(pattern, text, pos) + if off < len(hordt2): + utext += unichr(hordt2[off]) + else: + utext += unichr(off) + pos += inc + text = utext.encode('utf-8') + + tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos) + if 0x01 in tagMap: + if decodeInflection and 0x2a in tagMap: + inflectionGroups = self.getInflectionGroups(text, inflectionControlByteCount, inflectionTagTable, + dinfl, inflNameData, tagMap[0x2a]) + else: + inflectionGroups = b'' + assert len(tagMap[0x01]) == 1 + entryStartPosition = tagMap[0x01][0] + if hasEntryLength: + # The idx:entry attribute "scriptable" must be present to create entry length tags. + ml = b'' + inflectionGroups + b'' + if entryStartPosition in positionMap: + positionMap[entryStartPosition] = positionMap[entryStartPosition] + ml + else: + positionMap[entryStartPosition] = ml + assert len(tagMap[0x02]) == 1 + entryEndPosition = entryStartPosition + tagMap[0x02][0] + if entryEndPosition in positionMap: + positionMap[entryEndPosition] = b"" + positionMap[entryEndPosition] + else: + positionMap[entryEndPosition] = b"" + + else: + indexTags = b'\n\n' + inflectionGroups + b'\n' + if entryStartPosition in positionMap: + positionMap[entryStartPosition] = positionMap[entryStartPosition] + indexTags + else: + positionMap[entryStartPosition] = indexTags + return positionMap + + def hasTag(self, tagTable, tag): + ''' + Test if tag table contains given tag. + + @param tagTable: The tag table. + @param tag: The tag to search. + @return: True if tag table contains given tag; False otherwise. + ''' + for currentTag, _, _, _ in tagTable: + if currentTag == tag: + return True + return False + + def getInflectionGroups(self, mainEntry, controlByteCount, tagTable, dinfl, inflectionNames, groupList): + ''' + Create string which contains the inflection groups with inflection rules as mobipocket tags. + + @param mainEntry: The word to inflect. + @param controlByteCount: The number of control bytes. + @param tagTable: The tag table. + @param data: The Inflection data object to properly select the right inflection data section to use + @param inflectionNames: The inflection rule name data. + @param groupList: The list of inflection groups to process. + @return: String with inflection groups and rules or empty string if required tags are not available. 
+ ''' + result = b"" + for value in groupList: + offset, nextOffset, data = dinfl.offsets(value) + + # First byte seems to be always 0x00 and must be skipped. + assert ord(data[offset:offset+1]) == 0x00 + tagMap = getTagMap(controlByteCount, tagTable, data, offset + 1, nextOffset) + + # Make sure that the required tags are available. + if 0x05 not in tagMap: + print("Error: Required tag 0x05 not found in tagMap") + return "" + if 0x1a not in tagMap: + print("Error: Required tag 0x1a not found in tagMap") + return b'' + + result += b'' + + for i in range(len(tagMap[0x05])): + + # Get name of inflection rule. + value = tagMap[0x05][i] + consumed, textLength = getVariableWidthValue(inflectionNames, value) + inflectionName = inflectionNames[value+consumed:value+consumed+textLength] + + # Get and apply inflection rule across possibly multiple inflection data sections + value = tagMap[0x1a][i] + rvalue, start, count, data = dinfl.lookup(value) + offset, = struct.unpack_from(b'>H', data, start + 4 + (2 * rvalue)) + textLength = ord(data[offset:offset+1]) + inflection = self.applyInflectionRule(mainEntry, data, offset+1, offset+1+textLength) + if inflection is not None: + result += b' ' + + result += b'' + return result + + def applyInflectionRule(self, mainEntry, inflectionRuleData, start, end): + ''' + Apply inflection rule. + + @param mainEntry: The word to inflect. + @param inflectionRuleData: The inflection rules. + @param start: The start position of the inflection rule to use. + @param end: The end position of the inflection rule to use. + @return: The string with the inflected word or None if an error occurs. + ''' + mode = -1 + byteArray = array.array(array_format, mainEntry) + position = len(byteArray) + for charOffset in range(start, end): + char = inflectionRuleData[charOffset:charOffset+1] + abyte = ord(char) + if abyte >= 0x0a and abyte <= 0x13: + # Move cursor backwards + offset = abyte - 0x0a + if mode not in [0x02, 0x03]: + mode = 0x02 + position = len(byteArray) + position -= offset + elif abyte > 0x13: + if mode == -1: + print("Error: Unexpected first byte %i of inflection rule" % abyte) + return None + elif position == -1: + print("Error: Unexpected first byte %i of inflection rule" % abyte) + return None + else: + if mode == 0x01: + # Insert at word start + byteArray.insert(position, abyte) + position += 1 + elif mode == 0x02: + # Insert at word end + byteArray.insert(position, abyte) + elif mode == 0x03: + # Delete at word end + position -= 1 + deleted = byteArray.pop(position) + if bchr(deleted) != char: + if DEBUG_DICT: + print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted))) + print("Error: Delete operation of inflection rule failed") + return None + elif mode == 0x04: + # Delete at word start + deleted = byteArray.pop(position) + if bchr(deleted) != char: + if DEBUG_DICT: + print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted))) + print("Error: Delete operation of inflection rule failed") + return None + else: + print("Error: Inflection rule mode %x is not implemented" % mode) + return None + elif abyte == 0x01: + # Insert at word start + if mode not in [0x01, 0x04]: + position = 0 + mode = abyte + elif abyte == 0x02: + # Insert at word end + if mode not in [0x02, 0x03]: + position = len(byteArray) + mode = abyte + elif abyte == 0x03: + # Delete at word end + if mode not in [0x02, 0x03]: + position = len(byteArray) + mode = abyte + elif abyte == 0x04: + # Delete at word start + if 
mode not in [0x01, 0x04]: + position = 0 + # Delete at word start + mode = abyte + else: + print("Error: Inflection rule mode %x is not implemented" % abyte) + return None + return utf8_str(byteArray.tostring()) diff --git a/src/epy_reader/tools/KindleUnpack/mobi_header.py b/src/epy_reader/tools/KindleUnpack/mobi_header.py new file mode 100644 index 0000000..a15f636 --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_header.py @@ -0,0 +1,936 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +DEBUG_USE_ORDERED_DICTIONARY = False # OrderedDict is supoorted >= python 2.7. +""" set to True to use OrderedDict for MobiHeader.metadata.""" + +if DEBUG_USE_ORDERED_DICTIONARY: + from collections import OrderedDict as dict_ +else: + dict_ = dict + +from .compatibility_utils import PY2, unicode_str, hexlify, bord + +if PY2: + range = xrange + +import struct +import uuid + +# import the mobiunpack support libraries +from .mobi_utils import getLanguage +from .mobi_uncompress import HuffcdicReader, PalmdocReader, UncompressedReader + +class unpackException(Exception): + pass + + +def sortedHeaderKeys(mheader): + hdrkeys = sorted(list(mheader.keys()), key=lambda akey: mheader[akey][0]) + return hdrkeys + + +# HD Containers have their own headers and their own EXTH +# this is just guesswork so far, making big assumption that +# metavalue key numbers remain the same in the CONT EXTH + +# Note: The layout of the CONT Header is still unknown +# so just deal with their EXTH sections for now + +def dump_contexth(cpage, extheader): + # determine text encoding + codec = 'windows-1252' + codec_map = { + 1252 : 'windows-1252', + 65001: 'utf-8', + } + if cpage in codec_map: + codec = codec_map[cpage] + if extheader == b'': + return + id_map_strings = { + 1 : 'Drm Server Id', + 2 : 'Drm Commerce Id', + 3 : 'Drm Ebookbase Book Id', + 4 : 'Drm Ebookbase Dep Id', + 100 : 'Creator', + 101 : 'Publisher', + 102 : 'Imprint', + 103 : 'Description', + 104 : 'ISBN', + 105 : 'Subject', + 106 : 'Published', + 107 : 'Review', + 108 : 'Contributor', + 109 : 'Rights', + 110 : 'SubjectCode', + 111 : 'Type', + 112 : 'Source', + 113 : 'ASIN', + # 114 : 'versionNumber', + 117 : 'Adult', + 118 : 'Retail-Price', + 119 : 'Retail-Currency', + 120 : 'TSC', + 122 : 'fixed-layout', + 123 : 'book-type', + 124 : 'orientation-lock', + 126 : 'original-resolution', + 127 : 'zero-gutter', + 128 : 'zero-margin', + 129 : 'MetadataResourceURI', + 132 : 'RegionMagnification', + 150 : 'LendingEnabled', + 200 : 'DictShortName', + 501 : 'cdeType', + 502 : 'last_update_time', + 503 : 'Updated_Title', + 504 : 'CDEContentKey', + 505 : 'AmazonContentReference', + 506 : 'Title-Language', + 507 : 'Title-Display-Direction', + 508 : 'Title-Pronunciation', + 509 : 'Title-Collation', + 510 : 'Secondary-Title', + 511 : 'Secondary-Title-Language', + 512 : 'Secondary-Title-Direction', + 513 : 'Secondary-Title-Pronunciation', + 514 : 'Secondary-Title-Collation', + 515 : 'Author-Language', + 516 : 'Author-Display-Direction', + 517 : 'Author-Pronunciation', + 518 : 'Author-Collation', + 519 : 'Author-Type', + 520 : 'Publisher-Language', + 521 : 'Publisher-Display-Direction', + 522 : 'Publisher-Pronunciation', + 523 : 'Publisher-Collation', + 524 : 'Content-Language-Tag', + 525 : 'primary-writing-mode', + 526 : 'NCX-Ingested-By-Software', + 527 : 'page-progression-direction', + 528 : 'override-kindle-fonts', + 529 : 
'Compression-Upgraded', + 530 : 'Soft-Hyphens-In-Content', + 531 : 'Dictionary_In_Langague', + 532 : 'Dictionary_Out_Language', + 533 : 'Font_Converted', + 534 : 'Amazon_Creator_Info', + 535 : 'Creator-Build-Tag', + 536 : 'HD-Media-Containers-Info', # CONT_Header is 0, Ends with CONTAINER_BOUNDARY (or Asset_Type?) + 538 : 'Resource-Container-Fidelity', + 539 : 'HD-Container-Mimetype', + 540 : 'Sample-For_Special-Purpose', + 541 : 'Kindletool-Operation-Information', + 542 : 'Container_Id', + 543 : 'Asset-Type', # FONT_CONTAINER, BW_CONTAINER, HD_CONTAINER + 544 : 'Unknown_544', + } + id_map_values = { + 114 : 'versionNumber', + 115 : 'sample', + 116 : 'StartOffset', + 121 : 'Mobi8-Boundary-Section', + 125 : 'Embedded-Record-Count', + 130 : 'Offline-Sample', + 131 : 'Metadata-Record-Offset', + 201 : 'CoverOffset', + 202 : 'ThumbOffset', + 203 : 'HasFakeCover', + 204 : 'Creator-Software', + 205 : 'Creator-Major-Version', + 206 : 'Creator-Minor-Version', + 207 : 'Creator-Build-Number', + 401 : 'Clipping-Limit', + 402 : 'Publisher-Limit', + 404 : 'Text-to-Speech-Disabled', + 406 : 'Rental-Expiration-Time', + } + id_map_hexstrings = { + 208 : 'Watermark_(hex)', + 209 : 'Tamper-Proof-Keys_(hex)', + 300 : 'Font-Signature_(hex)', + 403 : 'Unknown_(403)_(hex)', + 405 : 'Ownership-Type_(hex)', + 407 : 'Unknown_(407)_(hex)', + 420 : 'Multimedia-Content-Reference_(hex)', + 450 : 'Locations_Match_(hex)', + 451 : 'Full-Story-Length_(hex)', + 452 : 'Sample-Start_Location_(hex)', + 453 : 'Sample-End-Location_(hex)', + } + _length, num_items = struct.unpack(b'>LL', extheader[4:12]) + extheader = extheader[12:] + pos = 0 + for _ in range(num_items): + id, size = struct.unpack(b'>LL', extheader[pos:pos+8]) + content = extheader[pos + 8: pos + size] + if id in id_map_strings: + name = id_map_strings[id] + print('\n Key: "%s"\n Value: "%s"' % (name, content.decode(codec, errors='replace'))) + elif id in id_map_values: + name = id_map_values[id] + if size == 9: + value, = struct.unpack(b'B',content) + print('\n Key: "%s"\n Value: 0x%01x' % (name, value)) + elif size == 10: + value, = struct.unpack(b'>H',content) + print('\n Key: "%s"\n Value: 0x%02x' % (name, value)) + elif size == 12: + value, = struct.unpack(b'>L',content) + print('\n Key: "%s"\n Value: 0x%04x' % (name, value)) + else: + print("\nError: Value for %s has unexpected size of %s" % (name, size)) + elif id in id_map_hexstrings: + name = id_map_hexstrings[id] + print('\n Key: "%s"\n Value: 0x%s' % (name, hexlify(content))) + else: + print("\nWarning: Unknown metadata with id %s found" % id) + name = str(id) + ' (hex)' + print(' Key: "%s"\n Value: 0x%s' % (name, hexlify(content))) + pos += size + return + + +class MobiHeader: + # all values are packed in big endian format + palmdoc_header = { + 'compression_type' : (0x00, b'>H', 2), + 'fill0' : (0x02, b'>H', 2), + 'text_length' : (0x04, b'>L', 4), + 'text_records' : (0x08, b'>H', 2), + 'max_section_size' : (0x0a, b'>H', 2), + 'read_pos ' : (0x0c, b'>L', 4), + } + + mobi6_header = { + 'compression_type' : (0x00, b'>H', 2), + 'fill0' : (0x02, b'>H', 2), + 'text_length' : (0x04, b'>L', 4), + 'text_records' : (0x08, b'>H', 2), + 'max_section_size' : (0x0a, b'>H', 2), + 'crypto_type' : (0x0c, b'>H', 2), + 'fill1' : (0x0e, b'>H', 2), + 'magic' : (0x10, b'4s', 4), + 'header_length (from MOBI)' : (0x14, b'>L', 4), + 'type' : (0x18, b'>L', 4), + 'codepage' : (0x1c, b'>L', 4), + 'unique_id' : (0x20, b'>L', 4), + 'version' : (0x24, b'>L', 4), + 'metaorthindex' : (0x28, b'>L', 4), + 'metainflindex' : (0x2c, 
b'>L', 4), + 'index_names' : (0x30, b'>L', 4), + 'index_keys' : (0x34, b'>L', 4), + 'extra_index0' : (0x38, b'>L', 4), + 'extra_index1' : (0x3c, b'>L', 4), + 'extra_index2' : (0x40, b'>L', 4), + 'extra_index3' : (0x44, b'>L', 4), + 'extra_index4' : (0x48, b'>L', 4), + 'extra_index5' : (0x4c, b'>L', 4), + 'first_nontext' : (0x50, b'>L', 4), + 'title_offset' : (0x54, b'>L', 4), + 'title_length' : (0x58, b'>L', 4), + 'language_code' : (0x5c, b'>L', 4), + 'dict_in_lang' : (0x60, b'>L', 4), + 'dict_out_lang' : (0x64, b'>L', 4), + 'min_version' : (0x68, b'>L', 4), + 'first_resc_offset' : (0x6c, b'>L', 4), + 'huff_offset' : (0x70, b'>L', 4), + 'huff_num' : (0x74, b'>L', 4), + 'huff_tbl_offset' : (0x78, b'>L', 4), + 'huff_tbl_len' : (0x7c, b'>L', 4), + 'exth_flags' : (0x80, b'>L', 4), + 'fill3_a' : (0x84, b'>L', 4), + 'fill3_b' : (0x88, b'>L', 4), + 'fill3_c' : (0x8c, b'>L', 4), + 'fill3_d' : (0x90, b'>L', 4), + 'fill3_e' : (0x94, b'>L', 4), + 'fill3_f' : (0x98, b'>L', 4), + 'fill3_g' : (0x9c, b'>L', 4), + 'fill3_h' : (0xa0, b'>L', 4), + 'unknown0' : (0xa4, b'>L', 4), + 'drm_offset' : (0xa8, b'>L', 4), + 'drm_count' : (0xac, b'>L', 4), + 'drm_size' : (0xb0, b'>L', 4), + 'drm_flags' : (0xb4, b'>L', 4), + 'fill4_a' : (0xb8, b'>L', 4), + 'fill4_b' : (0xbc, b'>L', 4), + 'first_content' : (0xc0, b'>H', 2), + 'last_content' : (0xc2, b'>H', 2), + 'unknown0' : (0xc4, b'>L', 4), + 'fcis_offset' : (0xc8, b'>L', 4), + 'fcis_count' : (0xcc, b'>L', 4), + 'flis_offset' : (0xd0, b'>L', 4), + 'flis_count' : (0xd4, b'>L', 4), + 'unknown1' : (0xd8, b'>L', 4), + 'unknown2' : (0xdc, b'>L', 4), + 'srcs_offset' : (0xe0, b'>L', 4), + 'srcs_count' : (0xe4, b'>L', 4), + 'unknown3' : (0xe8, b'>L', 4), + 'unknown4' : (0xec, b'>L', 4), + 'fill5' : (0xf0, b'>H', 2), + 'traildata_flags' : (0xf2, b'>H', 2), + 'ncx_index' : (0xf4, b'>L', 4), + 'unknown5' : (0xf8, b'>L', 4), + 'unknown6' : (0xfc, b'>L', 4), + 'datp_offset' : (0x100, b'>L', 4), + 'unknown7' : (0x104, b'>L', 4), + 'Unknown ' : (0x108, b'>L', 4), + 'Unknown ' : (0x10C, b'>L', 4), + 'Unknown ' : (0x110, b'>L', 4), + 'Unknown ' : (0x114, b'>L', 4), + 'Unknown ' : (0x118, b'>L', 4), + 'Unknown ' : (0x11C, b'>L', 4), + 'Unknown ' : (0x120, b'>L', 4), + 'Unknown ' : (0x124, b'>L', 4), + 'Unknown ' : (0x128, b'>L', 4), + 'Unknown ' : (0x12C, b'>L', 4), + 'Unknown ' : (0x130, b'>L', 4), + 'Unknown ' : (0x134, b'>L', 4), + 'Unknown ' : (0x138, b'>L', 4), + 'Unknown ' : (0x11C, b'>L', 4), + } + + mobi8_header = { + 'compression_type' : (0x00, b'>H', 2), + 'fill0' : (0x02, b'>H', 2), + 'text_length' : (0x04, b'>L', 4), + 'text_records' : (0x08, b'>H', 2), + 'max_section_size' : (0x0a, b'>H', 2), + 'crypto_type' : (0x0c, b'>H', 2), + 'fill1' : (0x0e, b'>H', 2), + 'magic' : (0x10, b'4s', 4), + 'header_length (from MOBI)' : (0x14, b'>L', 4), + 'type' : (0x18, b'>L', 4), + 'codepage' : (0x1c, b'>L', 4), + 'unique_id' : (0x20, b'>L', 4), + 'version' : (0x24, b'>L', 4), + 'metaorthindex' : (0x28, b'>L', 4), + 'metainflindex' : (0x2c, b'>L', 4), + 'index_names' : (0x30, b'>L', 4), + 'index_keys' : (0x34, b'>L', 4), + 'extra_index0' : (0x38, b'>L', 4), + 'extra_index1' : (0x3c, b'>L', 4), + 'extra_index2' : (0x40, b'>L', 4), + 'extra_index3' : (0x44, b'>L', 4), + 'extra_index4' : (0x48, b'>L', 4), + 'extra_index5' : (0x4c, b'>L', 4), + 'first_nontext' : (0x50, b'>L', 4), + 'title_offset' : (0x54, b'>L', 4), + 'title_length' : (0x58, b'>L', 4), + 'language_code' : (0x5c, b'>L', 4), + 'dict_in_lang' : (0x60, b'>L', 4), + 'dict_out_lang' : (0x64, b'>L', 4), + 'min_version' : (0x68, 
b'>L', 4), + 'first_resc_offset' : (0x6c, b'>L', 4), + 'huff_offset' : (0x70, b'>L', 4), + 'huff_num' : (0x74, b'>L', 4), + 'huff_tbl_offset' : (0x78, b'>L', 4), + 'huff_tbl_len' : (0x7c, b'>L', 4), + 'exth_flags' : (0x80, b'>L', 4), + 'fill3_a' : (0x84, b'>L', 4), + 'fill3_b' : (0x88, b'>L', 4), + 'fill3_c' : (0x8c, b'>L', 4), + 'fill3_d' : (0x90, b'>L', 4), + 'fill3_e' : (0x94, b'>L', 4), + 'fill3_f' : (0x98, b'>L', 4), + 'fill3_g' : (0x9c, b'>L', 4), + 'fill3_h' : (0xa0, b'>L', 4), + 'unknown0' : (0xa4, b'>L', 4), + 'drm_offset' : (0xa8, b'>L', 4), + 'drm_count' : (0xac, b'>L', 4), + 'drm_size' : (0xb0, b'>L', 4), + 'drm_flags' : (0xb4, b'>L', 4), + 'fill4_a' : (0xb8, b'>L', 4), + 'fill4_b' : (0xbc, b'>L', 4), + 'fdst_offset' : (0xc0, b'>L', 4), + 'fdst_flow_count' : (0xc4, b'>L', 4), + 'fcis_offset' : (0xc8, b'>L', 4), + 'fcis_count' : (0xcc, b'>L', 4), + 'flis_offset' : (0xd0, b'>L', 4), + 'flis_count' : (0xd4, b'>L', 4), + 'unknown1' : (0xd8, b'>L', 4), + 'unknown2' : (0xdc, b'>L', 4), + 'srcs_offset' : (0xe0, b'>L', 4), + 'srcs_count' : (0xe4, b'>L', 4), + 'unknown3' : (0xe8, b'>L', 4), + 'unknown4' : (0xec, b'>L', 4), + 'fill5' : (0xf0, b'>H', 2), + 'traildata_flags' : (0xf2, b'>H', 2), + 'ncx_index' : (0xf4, b'>L', 4), + 'fragment_index' : (0xf8, b'>L', 4), + 'skeleton_index' : (0xfc, b'>L', 4), + 'datp_offset' : (0x100, b'>L', 4), + 'guide_index' : (0x104, b'>L', 4), + 'Unknown ' : (0x108, b'>L', 4), + 'Unknown ' : (0x10C, b'>L', 4), + 'Unknown ' : (0x110, b'>L', 4), + 'Unknown ' : (0x114, b'>L', 4), + 'Unknown ' : (0x118, b'>L', 4), + 'Unknown ' : (0x11C, b'>L', 4), + 'Unknown ' : (0x120, b'>L', 4), + 'Unknown ' : (0x124, b'>L', 4), + 'Unknown ' : (0x128, b'>L', 4), + 'Unknown ' : (0x12C, b'>L', 4), + 'Unknown ' : (0x130, b'>L', 4), + 'Unknown ' : (0x134, b'>L', 4), + 'Unknown ' : (0x138, b'>L', 4), + 'Unknown ' : (0x11C, b'>L', 4), + } + + palmdoc_header_sorted_keys = sortedHeaderKeys(palmdoc_header) + mobi6_header_sorted_keys = sortedHeaderKeys(mobi6_header) + mobi8_header_sorted_keys = sortedHeaderKeys(mobi8_header) + + id_map_strings = { + 1 : 'Drm Server Id', + 2 : 'Drm Commerce Id', + 3 : 'Drm Ebookbase Book Id', + 4 : 'Drm Ebookbase Dep Id', + 100 : 'Creator', + 101 : 'Publisher', + 102 : 'Imprint', + 103 : 'Description', + 104 : 'ISBN', + 105 : 'Subject', + 106 : 'Published', + 107 : 'Review', + 108 : 'Contributor', + 109 : 'Rights', + 110 : 'SubjectCode', + 111 : 'Type', + 112 : 'Source', + 113 : 'ASIN', + # 114 : 'versionNumber', + 117 : 'Adult', + 118 : 'Retail-Price', + 119 : 'Retail-Currency', + 120 : 'TSC', + 122 : 'fixed-layout', + 123 : 'book-type', + 124 : 'orientation-lock', + 126 : 'original-resolution', + 127 : 'zero-gutter', + 128 : 'zero-margin', + 129 : 'MetadataResourceURI', + 132 : 'RegionMagnification', + 150 : 'LendingEnabled', + 200 : 'DictShortName', + 501 : 'cdeType', + 502 : 'last_update_time', + 503 : 'Updated_Title', + 504 : 'CDEContentKey', + 505 : 'AmazonContentReference', + 506 : 'Title-Language', + 507 : 'Title-Display-Direction', + 508 : 'Title-Pronunciation', + 509 : 'Title-Collation', + 510 : 'Secondary-Title', + 511 : 'Secondary-Title-Language', + 512 : 'Secondary-Title-Direction', + 513 : 'Secondary-Title-Pronunciation', + 514 : 'Secondary-Title-Collation', + 515 : 'Author-Language', + 516 : 'Author-Display-Direction', + 517 : 'Author-Pronunciation', + 518 : 'Author-Collation', + 519 : 'Author-Type', + 520 : 'Publisher-Language', + 521 : 'Publisher-Display-Direction', + 522 : 'Publisher-Pronunciation', + 523 : 'Publisher-Collation', + 
524 : 'Content-Language-Tag', + 525 : 'primary-writing-mode', + 526 : 'NCX-Ingested-By-Software', + 527 : 'page-progression-direction', + 528 : 'override-kindle-fonts', + 529 : 'Compression-Upgraded', + 530 : 'Soft-Hyphens-In-Content', + 531 : 'Dictionary_In_Langague', + 532 : 'Dictionary_Out_Language', + 533 : 'Font_Converted', + 534 : 'Amazon_Creator_Info', + 535 : 'Creator-Build-Tag', + 536 : 'HD-Media-Containers-Info', # CONT_Header is 0, Ends with CONTAINER_BOUNDARY (or Asset_Type?) + 538 : 'Resource-Container-Fidelity', + 539 : 'HD-Container-Mimetype', + 540 : 'Sample-For_Special-Purpose', + 541 : 'Kindletool-Operation-Information', + 542 : 'Container_Id', + 543 : 'Asset-Type', # FONT_CONTAINER, BW_CONTAINER, HD_CONTAINER + 544 : 'Unknown_544', + } + id_map_values = { + 114 : 'versionNumber', + 115 : 'sample', + 116 : 'StartOffset', + 121 : 'Mobi8-Boundary-Section', + 125 : 'Embedded-Record-Count', + 130 : 'Offline-Sample', + 131 : 'Metadata-Record-Offset', + 201 : 'CoverOffset', + 202 : 'ThumbOffset', + 203 : 'HasFakeCover', + 204 : 'Creator-Software', + 205 : 'Creator-Major-Version', + 206 : 'Creator-Minor-Version', + 207 : 'Creator-Build-Number', + 401 : 'Clipping-Limit', + 402 : 'Publisher-Limit', + 404 : 'Text-to-Speech-Disabled', + 406 : 'Rental-Expiration-Time', + } + id_map_hexstrings = { + 208 : 'Watermark_(hex)', + 209 : 'Tamper-Proof-Keys_(hex)', + 300 : 'Font-Signature_(hex)', + 403 : 'Unknown_(403)_(hex)', + 405 : 'Ownership-Type_(hex)', + 407 : 'Unknown_(407)_(hex)', + 420 : 'Multimedia-Content-Reference_(hex)', + 450 : 'Locations_Match_(hex)', + 451 : 'Full-Story-Length_(hex)', + 452 : 'Sample-Start_Location_(hex)', + 453 : 'Sample-End-Location_(hex)', + } + + def __init__(self, sect, sectNumber): + self.sect = sect + self.start = sectNumber + self.header = self.sect.loadSection(self.start) + if len(self.header)>20 and self.header[16:20] == b'MOBI': + self.sect.setsectiondescription(0,"Mobipocket Header") + self.palm = False + elif self.sect.ident == b'TEXtREAd': + self.sect.setsectiondescription(0, "PalmDOC Header") + self.palm = True + else: + raise unpackException('Unknown File Format') + + self.records, = struct.unpack_from(b'>H', self.header, 0x8) + + # set defaults in case this is a PalmDOC + self.title = self.sect.palmname.decode('latin-1', errors='replace') + self.length = len(self.header)-16 + self.type = 3 + self.codepage = 1252 + self.codec = 'windows-1252' + self.unique_id = 0 + self.version = 0 + self.hasExth = False + self.exth = b'' + self.exth_offset = self.length + 16 + self.exth_length = 0 + self.crypto_type = 0 + self.firstnontext = self.start+self.records + 1 + self.firstresource = self.start+self.records + 1 + self.ncxidx = 0xffffffff + self.metaOrthIndex = 0xffffffff + self.metaInflIndex = 0xffffffff + self.skelidx = 0xffffffff + self.fragidx = 0xffffffff + self.guideidx = 0xffffffff + self.fdst = 0xffffffff + self.mlstart = self.sect.loadSection(self.start+1)[:4] + self.rawSize = 0 + self.metadata = dict_() + + # set up for decompression/unpacking + self.compression, = struct.unpack_from(b'>H', self.header, 0x0) + if self.compression == 0x4448: + reader = HuffcdicReader() + huffoff, huffnum = struct.unpack_from(b'>LL', self.header, 0x70) + huffoff = huffoff + self.start + self.sect.setsectiondescription(huffoff,"Huffman Compression Seed") + reader.loadHuff(self.sect.loadSection(huffoff)) + for i in range(1, huffnum): + self.sect.setsectiondescription(huffoff+i,"Huffman CDIC Compression Seed %d" % i) + 
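+ # (Editorial note, not in the source: a HUFF-compressed book stores its
+ # Huffman tables in the single seed section at 'huff_offset' and its code
+ # dictionaries in the huff_num-1 CDIC sections loaded here; every seed must
+ # be read before any text record can be decompressed, while PalmDoc (type 2)
+ # and uncompressed (type 1) books need no seed sections at all.)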
reader.loadCdic(self.sect.loadSection(huffoff+i)) + self.unpack = reader.unpack + elif self.compression == 2: + self.unpack = PalmdocReader().unpack + elif self.compression == 1: + self.unpack = UncompressedReader().unpack + else: + raise unpackException('invalid compression type: 0x%4x' % self.compression) + + if self.palm: + return + + self.length, self.type, self.codepage, self.unique_id, self.version = struct.unpack(b'>LLLLL', self.header[20:40]) + codec_map = { + 1252 : 'windows-1252', + 65001: 'utf-8', + } + if self.codepage in codec_map: + self.codec = codec_map[self.codepage] + + # title + toff, tlen = struct.unpack(b'>II', self.header[0x54:0x5c]) + tend = toff + tlen + self.title=self.header[toff:tend].decode(self.codec, errors='replace') + + exth_flag, = struct.unpack(b'>L', self.header[0x80:0x84]) + self.hasExth = exth_flag & 0x40 + self.exth_offset = self.length + 16 + self.exth_length = 0 + if self.hasExth: + self.exth_length, = struct.unpack_from(b'>L', self.header, self.exth_offset+4) + self.exth_length = ((self.exth_length + 3)>>2)<<2 # round to next 4 byte boundary + self.exth = self.header[self.exth_offset:self.exth_offset+self.exth_length] + + # parse the exth / metadata + self.parseMetaData() + + # self.mlstart = self.sect.loadSection(self.start+1) + # self.mlstart = self.mlstart[0:4] + self.crypto_type, = struct.unpack_from(b'>H', self.header, 0xC) + + # Start sector for additional files such as images, fonts, resources, etc + # Can be missing so fall back to default set previously + ofst, = struct.unpack_from(b'>L', self.header, 0x6C) + if ofst != 0xffffffff: + self.firstresource = ofst + self.start + ofst, = struct.unpack_from(b'>L', self.header, 0x50) + if ofst != 0xffffffff: + self.firstnontext = ofst + self.start + + if self.isPrintReplica(): + return + + if self.version < 8: + # Dictionary metaOrthIndex + self.metaOrthIndex, = struct.unpack_from(b'>L', self.header, 0x28) + if self.metaOrthIndex != 0xffffffff: + self.metaOrthIndex += self.start + + # Dictionary metaInflIndex + self.metaInflIndex, = struct.unpack_from(b'>L', self.header, 0x2C) + if self.metaInflIndex != 0xffffffff: + self.metaInflIndex += self.start + + # handle older headers without any ncxindex info and later + # specifically 0xe4 headers + if self.length + 16 < 0xf8: + return + + # NCX Index + self.ncxidx, = struct.unpack(b'>L', self.header[0xf4:0xf8]) + if self.ncxidx != 0xffffffff: + self.ncxidx += self.start + + # K8 specific Indexes + if self.start != 0 or self.version == 8: + # Index into file skeletons in RawML + self.skelidx, = struct.unpack_from(b'>L', self.header, 0xfc) + if self.skelidx != 0xffffffff: + self.skelidx += self.start + + # Index into
sections in RawML + self.fragidx, = struct.unpack_from(b'>L', self.header, 0xf8) + if self.fragidx != 0xffffffff: + self.fragidx += self.start + + # Index into Other files + self.guideidx, = struct.unpack_from(b'>L', self.header, 0x104) + if self.guideidx != 0xffffffff: + self.guideidx += self.start + + # dictionaries do not seem to use the same approach in K8's + # so disable them + self.metaOrthIndex = 0xffffffff + self.metaInflIndex = 0xffffffff + + # need to use the FDST record to find out how to properly unpack + # the rawML into pieces + # it is simply a table of start and end locations for each flow piece + self.fdst, = struct.unpack_from(b'>L', self.header, 0xc0) + self.fdstcnt, = struct.unpack_from(b'>L', self.header, 0xc4) + # if cnt is 1 or less, fdst section mumber can be garbage + if self.fdstcnt <= 1: + self.fdst = 0xffffffff + if self.fdst != 0xffffffff: + self.fdst += self.start + # setting of fdst section description properly handled in mobi_kf8proc + + def dump_exth(self): + # determine text encoding + codec=self.codec + if (not self.hasExth) or (self.exth_length) == 0 or (self.exth == b''): + return + num_items, = struct.unpack(b'>L', self.exth[8:12]) + pos = 12 + print("Key Size Description Value") + for _ in range(num_items): + id, size = struct.unpack(b'>LL', self.exth[pos:pos+8]) + contentsize = size-8 + content = self.exth[pos + 8: pos + size] + if id in MobiHeader.id_map_strings: + exth_name = MobiHeader.id_map_strings[id] + print('{0: >3d} {1: >4d} {2: <30s} {3:s}'.format(id, contentsize, exth_name, content.decode(codec, errors='replace'))) + elif id in MobiHeader.id_map_values: + exth_name = MobiHeader.id_map_values[id] + if size == 9: + value, = struct.unpack(b'B',content) + print('{0:3d} byte {1:<30s} {2:d}'.format(id, exth_name, value)) + elif size == 10: + value, = struct.unpack(b'>H',content) + print('{0:3d} word {1:<30s} 0x{2:0>4X} ({2:d})'.format(id, exth_name, value)) + elif size == 12: + value, = struct.unpack(b'>L',content) + print('{0:3d} long {1:<30s} 0x{2:0>8X} ({2:d})'.format(id, exth_name, value)) + else: + print('{0: >3d} {1: >4d} {2: <30s} (0x{3:s})'.format(id, contentsize, "Bad size for "+exth_name, hexlify(content))) + elif id in MobiHeader.id_map_hexstrings: + exth_name = MobiHeader.id_map_hexstrings[id] + print('{0:3d} {1:4d} {2:<30s} 0x{3:s}'.format(id, contentsize, exth_name, hexlify(content))) + else: + exth_name = "Unknown EXTH ID {0:d}".format(id) + print("{0: >3d} {1: >4d} {2: <30s} 0x{3:s}".format(id, contentsize, exth_name, hexlify(content))) + pos += size + return + + def dumpheader(self): + # first 16 bytes are not part of the official mobiheader + # but we will treat it as such + # so section 0 is 16 (decimal) + self.length in total == at least 0x108 bytes for Mobi 8 headers + print("Dumping section %d, Mobipocket Header version: %d, total length %d" % (self.start,self.version, self.length+16)) + self.hdr = {} + # set it up for the proper header version + if self.version == 0: + self.mobi_header = MobiHeader.palmdoc_header + self.mobi_header_sorted_keys = MobiHeader.palmdoc_header_sorted_keys + elif self.version < 8: + self.mobi_header = MobiHeader.mobi6_header + self.mobi_header_sorted_keys = MobiHeader.mobi6_header_sorted_keys + else: + self.mobi_header = MobiHeader.mobi8_header + self.mobi_header_sorted_keys = MobiHeader.mobi8_header_sorted_keys + + # parse the header information + for key in self.mobi_header_sorted_keys: + (pos, format, tot_len) = self.mobi_header[key] + if pos < (self.length + 16): + val, = 
struct.unpack_from(format, self.header, pos) + self.hdr[key] = val + + if 'title_offset' in self.hdr: + title_offset = self.hdr['title_offset'] + title_length = self.hdr['title_length'] + else: + title_offset = 0 + title_length = 0 + if title_offset == 0: + title_offset = len(self.header) + title_length = 0 + self.title = self.sect.palmname.decode('latin-1', errors='replace') + else: + self.title = self.header[title_offset:title_offset+title_length].decode(self.codec, errors='replace') + # title record always padded with two nul bytes and then padded with nuls to next 4 byte boundary + title_length = ((title_length+2+3)>>2)<<2 + + self.extra1 = self.header[self.exth_offset+self.exth_length:title_offset] + self.extra2 = self.header[title_offset+title_length:] + + print("Mobipocket header from section %d" % self.start) + print(" Offset Value Hex Dec Description") + for key in self.mobi_header_sorted_keys: + (pos, format, tot_len) = self.mobi_header[key] + if pos < (self.length + 16): + if key != 'magic': + fmt_string = "0x{0:0>3X} ({0:3d}){1: >" + str(9-2*tot_len) +"s}0x{2:0>" + str(2*tot_len) + "X} {2:10d} {3:s}" + else: + self.hdr[key] = unicode_str(self.hdr[key]) + fmt_string = "0x{0:0>3X} ({0:3d}){2:>11s} {3:s}" + print(fmt_string.format(pos, " ",self.hdr[key], key)) + print("") + + if self.exth_length > 0: + print("EXTH metadata, offset %d, padded length %d" % (self.exth_offset,self.exth_length)) + self.dump_exth() + print("") + + if len(self.extra1) > 0: + print("Extra data between EXTH and Title, length %d" % len(self.extra1)) + print(hexlify(self.extra1)) + print("") + + if title_length > 0: + print("Title in header at offset %d, padded length %d: '%s'" %(title_offset,title_length,self.title)) + print("") + + if len(self.extra2) > 0: + print("Extra data between Title and end of header, length %d" % len(self.extra2)) + print(hexlify(self.extra2)) + print("") + + def isPrintReplica(self): + return self.mlstart[0:4] == b"%MOP" + + def isK8(self): + return self.start != 0 or self.version == 8 + + def isEncrypted(self): + return self.crypto_type != 0 + + def hasNCX(self): + return self.ncxidx != 0xffffffff + + def isDictionary(self): + return self.metaOrthIndex != 0xffffffff + + def getncxIndex(self): + return self.ncxidx + + def decompress(self, data): + return self.unpack(data) + + def Language(self): + langcode = struct.unpack(b'!L', self.header[0x5c:0x60])[0] + langid = langcode & 0xFF + sublangid = (langcode >> 8) & 0xFF + return getLanguage(langid, sublangid) + + def DictInLanguage(self): + if self.isDictionary(): + langcode = struct.unpack(b'!L', self.header[0x60:0x64])[0] + langid = langcode & 0xFF + sublangid = (langcode >> 10) & 0xFF + if langid != 0: + return getLanguage(langid, sublangid) + return False + + def DictOutLanguage(self): + if self.isDictionary(): + langcode = struct.unpack(b'!L', self.header[0x64:0x68])[0] + langid = langcode & 0xFF + sublangid = (langcode >> 10) & 0xFF + if langid != 0: + return getLanguage(langid, sublangid) + return False + + def getRawML(self): + def getSizeOfTrailingDataEntry(data): + num = 0 + for v in data[-4:]: + if bord(v) & 0x80: + num = 0 + num = (num << 7) | (bord(v) & 0x7f) + return num + def trimTrailingDataEntries(data): + for _ in range(trailers): + num = getSizeOfTrailingDataEntry(data) + data = data[:-num] + if multibyte: + num = (ord(data[-1:]) & 3) + 1 + data = data[:-num] + return data + multibyte = 0 + trailers = 0 + if self.sect.ident == b'BOOKMOBI': + mobi_length, = struct.unpack_from(b'>L', self.header, 0x14) + 
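+ # (Editorial note, not in the source: the flags word read from offset 0xF2
+ # below drives trimTrailingDataEntries above; bit 0 marks a multibyte
+ # character-overlap entry at each text record's tail, and every further set
+ # bit adds one backward variable-width entry, so e.g. flags == 0b0011 strips
+ # one sized trailer plus the multibyte entry before decompression.)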
mobi_version, = struct.unpack_from(b'>L', self.header, 0x68) + if (mobi_length >= 0xE4) and (mobi_version >= 5): + flags, = struct.unpack_from(b'>H', self.header, 0xF2) + multibyte = flags & 1 + while flags > 1: + if flags & 2: + trailers += 1 + flags = flags >> 1 + # get raw mobi markup languge + print("Unpacking raw markup language") + dataList = [] + # offset = 0 + for i in range(1, self.records+1): + data = trimTrailingDataEntries(self.sect.loadSection(self.start + i)) + dataList.append(self.unpack(data)) + if self.isK8(): + self.sect.setsectiondescription(self.start + i,"KF8 Text Section {0:d}".format(i)) + elif self.version == 0: + self.sect.setsectiondescription(self.start + i,"PalmDOC Text Section {0:d}".format(i)) + else: + self.sect.setsectiondescription(self.start + i,"Mobipocket Text Section {0:d}".format(i)) + rawML = b''.join(dataList) + self.rawSize = len(rawML) + return rawML + + # all metadata is stored in a dictionary with key and returns a *list* of values + # a list is used to allow for multiple creators, multiple contributors, etc + def parseMetaData(self): + def addValue(name, value): + if name not in self.metadata: + self.metadata[name] = [value] + else: + self.metadata[name].append(value) + + codec=self.codec + if self.hasExth: + extheader=self.exth + _length, num_items = struct.unpack(b'>LL', extheader[4:12]) + extheader = extheader[12:] + pos = 0 + for _ in range(num_items): + id, size = struct.unpack(b'>LL', extheader[pos:pos+8]) + content = extheader[pos + 8: pos + size] + if id in MobiHeader.id_map_strings: + name = MobiHeader.id_map_strings[id] + addValue(name, content.decode(codec, errors='replace')) + elif id in MobiHeader.id_map_values: + name = MobiHeader.id_map_values[id] + if size == 9: + value, = struct.unpack(b'B',content) + addValue(name, unicode_str(str(value))) + elif size == 10: + value, = struct.unpack(b'>H',content) + addValue(name, unicode_str(str(value))) + elif size == 12: + value, = struct.unpack(b'>L',content) + # handle special case of missing CoverOffset or missing ThumbOffset + if id == 201 or id == 202: + if value != 0xffffffff: + addValue(name, unicode_str(str(value))) + else: + addValue(name, unicode_str(str(value))) + else: + print("Warning: Bad key, size, value combination detected in EXTH ", id, size, hexlify(content)) + addValue(name, hexlify(content)) + elif id in MobiHeader.id_map_hexstrings: + name = MobiHeader.id_map_hexstrings[id] + addValue(name, hexlify(content)) + else: + name = unicode_str(str(id)) + ' (hex)' + addValue(name, hexlify(content)) + pos += size + + # add the basics to the metadata each as a list element + self.metadata['Language'] = [self.Language()] + self.metadata['Title'] = [unicode_str(self.title,self.codec)] + self.metadata['Codec'] = [self.codec] + self.metadata['UniqueID'] = [unicode_str(str(self.unique_id))] + # if no asin create one using a uuid + if 'ASIN' not in self.metadata: + self.metadata['ASIN'] = [unicode_str(str(uuid.uuid4()))] + # if no cdeType set it to "EBOK" + if 'cdeType' not in self.metadata: + self.metadata['cdeType'] = ['EBOK'] + + def getMetaData(self): + return self.metadata + + def describeHeader(self, DUMP): + print("Mobi Version:", self.version) + print("Codec:", self.codec) + print("Title:", self.title) + if 'Updated_Title' in self.metadata: + print("EXTH Title:", self.metadata['Updated_Title'][0]) + if self.compression == 0x4448: + print("Huffdic compression") + elif self.compression == 2: + print("Palmdoc compression") + elif self.compression == 1: + print("No compression") + 
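+ # (Editorial sketch, not KindleUnpack code, of the EXTH walk that
+ # parseMetaData() performs above: a 12-byte header of b'EXTH' magic, total
+ # length and record count, followed by records of big-endian id, total size,
+ # and size-8 payload bytes:
+ #
+ #     _length, count = struct.unpack(b'>LL', exth[4:12])
+ #     pos, body = 0, exth[12:]
+ #     for _ in range(count):
+ #         rec_id, size = struct.unpack(b'>LL', body[pos:pos + 8])
+ #         payload = body[pos + 8:pos + size]
+ #         pos += size
+ # )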
if DUMP: + self.dumpheader() diff --git a/src/epy_reader/tools/KindleUnpack/mobi_html.py b/src/epy_reader/tools/KindleUnpack/mobi_html.py new file mode 100644 index 0000000..eda766c --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_html.py @@ -0,0 +1,439 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import PY2, utf8_str + +if PY2: + range = xrange + +import re +# note: re requites the pattern to be the exact same type as the data to be searched in python3 +# but u"" is not allowed for the pattern itself only b"" + +from .mobi_utils import fromBase32 + +class HTMLProcessor: + + def __init__(self, files, metadata, rscnames): + self.files = files + self.metadata = metadata + self.rscnames = rscnames + # for original style mobis, default to including all image files in the opf manifest + self.used = {} + for name in rscnames: + self.used[name] = 'used' + + def findAnchors(self, rawtext, indx_data, positionMap): + # process the raw text + # find anchors... + print("Find link anchors") + link_pattern = re.compile(br'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''', re.IGNORECASE) + # TEST NCX: merge in filepos from indx + pos_links = [int(m.group(1)) for m in link_pattern.finditer(rawtext)] + if indx_data: + pos_indx = [e['pos'] for e in indx_data if e['pos']>0] + pos_links = list(set(pos_links + pos_indx)) + + for position in pos_links: + if position in positionMap: + positionMap[position] = positionMap[position] + utf8_str('' % position) + else: + positionMap[position] = utf8_str('' % position) + + # apply dictionary metadata and anchors + print("Insert data into html") + pos = 0 + lastPos = len(rawtext) + dataList = [] + for end in sorted(positionMap.keys()): + if end == 0 or end > lastPos: + continue # something's up - can't put a tag in outside ... 
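findAnchors() collects every filepos target into positionMap and then splices the anchor strings in ascending offset order, so no insertion ever shifts a later offset. The same technique in isolation, as a sketch (splice_at_offsets is an illustrative name, not part of this patch):

    def splice_at_offsets(raw, insertions):
        # Walk offsets in sorted order, accumulate the pieces in a list
        # and join once at the end; offsets outside the text are skipped,
        # mirroring the guard in the loop above.
        out, pos = [], 0
        for off in sorted(insertions):
            if off <= 0 or off > len(raw):
                continue
            out.append(raw[pos:off])
            out.append(insertions[off])
            pos = off
        out.append(raw[pos:])
        return b''.join(out)

    # e.g. splice_at_offsets(b'abcdef', {3: b'<a id="filepos3"/>'})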
+ dataList.append(rawtext[pos:end]) + dataList.append(positionMap[end]) + pos = end + dataList.append(rawtext[pos:]) + srctext = b"".join(dataList) + rawtext = None + dataList = None + self.srctext = srctext + self.indx_data = indx_data + return srctext + + def insertHREFS(self): + srctext = self.srctext + rscnames = self.rscnames + metadata = self.metadata + + # put in the hrefs + print("Insert hrefs into html") + # There doesn't seem to be a standard, so search as best as we can + + link_pattern = re.compile(br''']*?)filepos=['"]{0,1}0*(\d+)['"]{0,1}([^>]*?)>''', re.IGNORECASE) + srctext = link_pattern.sub(br'''''', srctext) + + # remove empty anchors + print("Remove empty anchors from html") + srctext = re.sub(br"",br"", srctext) + srctext = re.sub(br"\s*",br"", srctext) + + # convert image references + print("Insert image references into html") + # split string into image tag pieces and other pieces + image_pattern = re.compile(br'''()''', re.IGNORECASE) + image_index_pattern = re.compile(br'''recindex=['"]{0,1}([0-9]+)['"]{0,1}''', re.IGNORECASE) + srcpieces = image_pattern.split(srctext) + srctext = self.srctext = None + + # all odd pieces are image tags (nulls string on even pieces if no space between them in srctext) + for i in range(1, len(srcpieces), 2): + tag = srcpieces[i] + for m in image_index_pattern.finditer(tag): + imageNumber = int(m.group(1)) + imageName = rscnames[imageNumber-1] + if imageName is None: + print("Error: Referenced image %s was not recognized as a valid image" % imageNumber) + else: + replacement = b'src="Images/' + utf8_str(imageName) + b'"' + tag = image_index_pattern.sub(replacement, tag, 1) + srcpieces[i] = tag + srctext = b"".join(srcpieces) + + # add in character set meta into the html header if needed + if 'Codec' in metadata: + srctext = srctext[0:12]+b''+srctext[12:] + return srctext, self.used + + +class XHTMLK8Processor: + + def __init__(self, rscnames, k8proc): + self.rscnames = rscnames + self.k8proc = k8proc + self.used = {} + + def buildXHTML(self): + + # first need to update all links that are internal which + # are based on positions within the xhtml files **BEFORE** + # cutting and pasting any pieces into the xhtml text files + + # kindle:pos:fid:XXXX:off:YYYYYYYYYY (used for internal link within xhtml) + # XXXX is the offset in records into divtbl + # YYYYYYYYYYYY is a base32 number you add to the divtbl insertpos to get final position + + # pos:fid pattern + posfid_pattern = re.compile(br'''()''', re.IGNORECASE) + posfid_index_pattern = re.compile(br'''['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']''') + + parts = [] + print("Building proper xhtml for each file") + for i in range(self.k8proc.getNumberOfParts()): + part = self.k8proc.getPart(i) + [partnum, dir, filename, beg, end, aidtext] = self.k8proc.getPartInfo(i) + + # internal links + srcpieces = posfid_pattern.split(part) + for j in range(1, len(srcpieces),2): + tag = srcpieces[j] + if tag.startswith(b'<'): + for m in posfid_index_pattern.finditer(tag): + posfid = m.group(1) + offset = m.group(2) + filename, idtag = self.k8proc.getIDTagByPosFid(posfid, offset) + if idtag == b'': + replacement= b'"' + utf8_str(filename) + b'"' + else: + replacement = b'"' + utf8_str(filename) + b'#' + idtag + b'"' + tag = posfid_index_pattern.sub(replacement, tag, 1) + srcpieces[j] = tag + part = b"".join(srcpieces) + parts.append(part) + + # we are free to cut and paste as we see fit + # we can safely remove all of the Kindlegen generated aid tags + # change aid ids that are in 
k8proc.linked_aids to xhtml ids + find_tag_with_aid_pattern = re.compile(br'''(<[^>]*\said\s*=[^>]*>)''', re.IGNORECASE) + within_tag_aid_position_pattern = re.compile(br'''\said\s*=['"]([^'"]*)['"]''') + for i in range(len(parts)): + part = parts[i] + srcpieces = find_tag_with_aid_pattern.split(part) + for j in range(len(srcpieces)): + tag = srcpieces[j] + if tag.startswith(b'<'): + for m in within_tag_aid_position_pattern.finditer(tag): + try: + aid = m.group(1) + except IndexError: + aid = None + replacement = b'' + if aid in self.k8proc.linked_aids: + replacement = b' id="aid-' + aid + b'"' + tag = within_tag_aid_position_pattern.sub(replacement, tag, 1) + srcpieces[j] = tag + part = b"".join(srcpieces) + parts[i] = part + + # we can safely replace all of the Kindlegen generated data-AmznPageBreak tags + # with page-break-after style patterns + find_tag_with_AmznPageBreak_pattern = re.compile(br'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''', re.IGNORECASE) + within_tag_AmznPageBreak_position_pattern = re.compile(br'''\sdata-AmznPageBreak=['"]([^'"]*)['"]''') + for i in range(len(parts)): + part = parts[i] + srcpieces = find_tag_with_AmznPageBreak_pattern.split(part) + for j in range(len(srcpieces)): + tag = srcpieces[j] + if tag.startswith(b'<'): + srcpieces[j] = within_tag_AmznPageBreak_position_pattern.sub( + lambda m:b' style="page-break-after:' + m.group(1) + b'"', tag) + part = b"".join(srcpieces) + parts[i] = part + + # we have to handle substitutions for the flows pieces first as they may + # be inlined into the xhtml text + # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images) + # kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc) + # kindle:embed:XXXX (used for fonts) + + flows = [] + flows.append(None) + flowinfo = [] + flowinfo.append([None, None, None, None]) + + # regular expression search patterns + img_pattern = re.compile(br'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE) + img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE) + + tag_pattern = re.compile(br'''(<[^>]*>)''') + flow_pattern = re.compile(br'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE) + + url_pattern = re.compile(br'''(url\(.*?\))''', re.IGNORECASE) + url_img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)\?mime=image/[^\)]*["')]''', re.IGNORECASE) + font_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)["')]''', re.IGNORECASE) + url_css_index_pattern = re.compile(br'''kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*''', re.IGNORECASE) + url_svg_image_pattern = re.compile(br'''kindle:flow:([0-9|A-V]+)\?mime=image/svg\+xml[^\)]*''', re.IGNORECASE) + + for i in range(1, self.k8proc.getNumberOfFlows()): + [ftype, format, dir, filename] = self.k8proc.getFlowInfo(i) + flowpart = self.k8proc.getFlow(i) + + # links to raster image files from image tags + # image_pattern + srcpieces = img_pattern.split(flowpart) + for j in range(1, len(srcpieces),2): + tag = srcpieces[j] + if tag.startswith(b']*>)''') + flow_pattern = re.compile(br'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE) + for i in range(len(parts)): + part = parts[i] + [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i] + # flow pattern + srcpieces = tag_pattern.split(part) + for j in range(1, len(srcpieces),2): + tag = srcpieces[j] + if tag.startswith(b'<'): + for m in flow_pattern.finditer(tag): + num = fromBase32(m.group(1)) + if num > 0 and num < len(self.k8proc.flowinfo): + [typ, fmt, pdir, 
fnm] = self.k8proc.getFlowInfo(num) + flowpart = flows[num] + if fmt == b'inline': + tag = flowpart + else: + replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"' + tag = flow_pattern.sub(replacement, tag, 1) + self.used[fnm] = 'used' + else: + print("warning: ignoring non-existent flow link", tag, " value 0x%x" % num) + srcpieces[j] = tag + part = b''.join(srcpieces) + + # store away modified version + parts[i] = part + + # Handle any embedded raster images links in style= attributes urls + style_pattern = re.compile(br'''(<[a-zA-Z0-9]+\s[^>]*style\s*=\s*[^>]*>)''', re.IGNORECASE) + img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE) + + for i in range(len(parts)): + part = parts[i] + [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i] + + # replace urls in style attributes + srcpieces = style_pattern.split(part) + for j in range(1, len(srcpieces),2): + tag = srcpieces[j] + if b'kindle:embed' in tag: + for m in img_index_pattern.finditer(tag): + imageNumber = fromBase32(m.group(1)) + imageName = self.rscnames[imageNumber-1] + osep = m.group()[0:1] + csep = m.group()[-1:] + if imageName is not None: + replacement = osep + b'../Images/'+ utf8_str(imageName) + csep + self.used[imageName] = 'used' + tag = img_index_pattern.sub(replacement, tag, 1) + else: + print("Error: Referenced image %s in style url was not recognized in %s" % (imageNumber, tag)) + srcpieces[j] = tag + part = b"".join(srcpieces) + + # store away modified version + parts[i] = part + + # Handle any embedded raster images links in the xhtml text + # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images) + img_pattern = re.compile(br'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE) + img_index_pattern = re.compile(br'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''') + + for i in range(len(parts)): + part = parts[i] + [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i] + + # links to raster image files + # image_pattern + srcpieces = img_pattern.split(part) + for j in range(1, len(srcpieces),2): + tag = srcpieces[j] + if tag.startswith(b' remove value="XX" attributes since these are illegal + tag_pattern = re.compile(br'''(<[^>]*>)''') + li_value_pattern = re.compile(br'''\svalue\s*=\s*['"][^'"]*['"]''', re.IGNORECASE) + + for i in range(len(parts)): + part = parts[i] + [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i] + + # tag pattern + srcpieces = tag_pattern.split(part) + for j in range(1, len(srcpieces),2): + tag = srcpieces[j] + if tag.startswith(b'H', data, idxtPos + 4 + (2 * j)) + idxPositions.append(pos) + # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!) 
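The index reader above locates each entry through the IDXT section: after the 4-byte tag come one 2-byte big-endian offset per entry, with the IDXT position itself appended as a final sentinel so that entry j spans offsets[j]:offsets[j+1]. A sketch under those assumptions (read_idxt_offsets is an illustrative name; trimming any zero fill before IDXT is left to the caller, as the comment above notes):

    import struct

    def read_idxt_offsets(data, idxt_pos, entry_count):
        # One big-endian uint16 per entry, then idxt_pos as sentinel.
        assert data[idxt_pos:idxt_pos + 4] == b'IDXT'
        offsets = [struct.unpack_from(b'>H', data, idxt_pos + 4 + 2 * j)[0]
                   for j in range(entry_count)]
        offsets.append(idxt_pos)
        return offsets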
+ idxPositions.append(idxtPos) + # for each entry in the IDXT build up the tagMap and any associated text + for j in range(entryCount): + startPos = idxPositions[j] + endPos = idxPositions[j+1] + textLength = ord(data[startPos:startPos+1]) + text = data[startPos+1:startPos+1+textLength] + if hordt2 is not None: + text = b''.join(bchr(hordt2[bord(x)]) for x in text) + tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos) + outtbl.append([text, tagMap]) + if self.DEBUG: + print(tagMap) + print(text) + return outtbl, ctoc_text + + def parseINDXHeader(self, data): + "read INDX header" + if not data[:4] == b'INDX': + print("Warning: index section is not INDX") + return False + words = ( + 'len', 'nul1', 'type', 'gen', 'start', 'count', 'code', + 'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc' + ) + num = len(words) + values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)]) + header = {} + for n in range(num): + header[words[n]] = values[n] + + ordt1 = None + ordt2 = None + + ocnt, oentries, op1, op2, otagx = struct.unpack_from(b'>LLLLL',data, 0xa4) + if header['code'] == 0xfdea or ocnt != 0 or oentries > 0: + # horribly hacked up ESP (sample) mobi books use two ORDT sections but never specify + # them in the proper place in the header. They seem to be codepage 65002 which seems + # to be some sort of strange EBCDIC utf-8 or 16 encoded strings + + # so we need to look for them and store them away to process leading text + # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries + # we only ever seem to use the seocnd but ... + assert(ocnt == 1) + assert(data[op1:op1+4] == b'ORDT') + assert(data[op2:op2+4] == b'ORDT') + ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4) + ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4) + + if self.DEBUG: + print("parsed INDX header:") + for n in words: + print(n, "%X" % header[n],) + print("") + return header, ordt1, ordt2 + + def readCTOC(self, txtdata): + # read all blocks from CTOC + ctoc_data = {} + offset = 0 + while offset next bytes: name + name = txtdata[offset:offset+ilen] + offset += ilen + if self.DEBUG: + print("name length is ", ilen) + print(idx_offs, name) + ctoc_data[idx_offs] = name + return ctoc_data + + +def getVariableWidthValue(data, offset): + ''' + Decode variable width value from given bytes. + + @param data: The bytes to decode. + @param offset: The start offset into data. + @return: Tuple of consumed bytes count and decoded value. + ''' + value = 0 + consumed = 0 + finished = False + while not finished: + v = data[offset + consumed: offset + consumed + 1] + consumed += 1 + if ord(v) & 0x80: + finished = True + value = (value << 7) | (ord(v) & 0x7f) + return consumed, value + + +def readTagSection(start, data): + ''' + Read tag section from given data. + + @param start: The start position in the data. + @param data: The data to process. + @return: Tuple of control byte count and list of tag tuples. + ''' + controlByteCount = 0 + tags = [] + if data[start:start+4] == b"TAGX": + firstEntryOffset, = struct.unpack_from(b'>L', data, start + 0x04) + controlByteCount, = struct.unpack_from(b'>L', data, start + 0x08) + + # Skip the first 12 bytes already read above. + for i in range(12, firstEntryOffset, 4): + pos = start + i + tags.append((ord(data[pos:pos+1]), ord(data[pos+1:pos+2]), ord(data[pos+2:pos+3]), ord(data[pos+3:pos+4]))) + return controlByteCount, tags + + +def countSetBits(value, bits=8): + ''' + Count the set bits in the given value. 
+ + @param value: Integer value. + @param bits: The number of bits of the input value (defaults to 8). + @return: Number of set bits. + ''' + count = 0 + for _ in range(bits): + if value & 0x01 == 0x01: + count += 1 + value = value >> 1 + return count + + +def getTagMap(controlByteCount, tagTable, entryData, startPos, endPos): + ''' + Create a map of tags and values from the given byte section. + + @param controlByteCount: The number of control bytes. + @param tagTable: The tag table. + @param entryData: The data to process. + @param startPos: The starting position in entryData. + @param endPos: The end position in entryData or None if it is unknown. + @return: Hashmap of tag and list of values. + ''' + tags = [] + tagHashMap = {} + controlByteIndex = 0 + dataStart = startPos + controlByteCount + + for tag, valuesPerEntry, mask, endFlag in tagTable: + if endFlag == 0x01: + controlByteIndex += 1 + continue + cbyte = ord(entryData[startPos + controlByteIndex:startPos + controlByteIndex+1]) + if 0: + print("Control Byte Index %0x , Control Byte Value %0x" % (controlByteIndex, cbyte)) + + value = ord(entryData[startPos + controlByteIndex:startPos + controlByteIndex+1]) & mask + if value != 0: + if value == mask: + if countSetBits(mask) > 1: + # If all bits of masked value are set and the mask has more than one bit, a variable width value + # will follow after the control bytes which defines the length of bytes (NOT the value count!) + # which will contain the corresponding variable width values. + consumed, value = getVariableWidthValue(entryData, dataStart) + dataStart += consumed + tags.append((tag, None, value, valuesPerEntry)) + else: + tags.append((tag, 1, None, valuesPerEntry)) + else: + # Shift bits to get the masked value. + while mask & 0x01 == 0: + mask = mask >> 1 + value = value >> 1 + tags.append((tag, value, None, valuesPerEntry)) + for tag, valueCount, valueBytes, valuesPerEntry in tags: + values = [] + if valueCount is not None: + # Read valueCount * valuesPerEntry variable width values. + for _ in range(valueCount): + for _ in range(valuesPerEntry): + consumed, data = getVariableWidthValue(entryData, dataStart) + dataStart += consumed + values.append(data) + else: + # Convert valueBytes to variable width values. + totalConsumed = 0 + while totalConsumed < valueBytes: + # Does this work for valuesPerEntry != 1? + consumed, data = getVariableWidthValue(entryData, dataStart) + dataStart += consumed + totalConsumed += consumed + values.append(data) + if totalConsumed != valueBytes: + print("Error: Should consume %s bytes, but consumed %s" % (valueBytes, totalConsumed)) + tagHashMap[tag] = values + # Test that all bytes have been processed if endPos is given. + if endPos is not None and dataStart != endPos: + # The last entry might have some zero padding bytes, so complain only if non zero bytes are left. 
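getTagMap() leans on getVariableWidthValue(), MOBI's forward-read varint: seven data bits per byte, most significant group first, with the high bit marking the final byte. A hypothetical encoder, useful for checking the decoder above, could look like this:

    def encode_variable_width(value):
        # Inverse of getVariableWidthValue() above: emit 7-bit groups,
        # most significant first, and set the high bit on the last byte.
        groups = [value & 0x7F]
        value >>= 7
        while value:
            groups.append(value & 0x7F)
            value >>= 7
        groups.reverse()
        groups[-1] |= 0x80  # terminator flag
        return bytes(groups)

    # round trip: getVariableWidthValue(encode_variable_width(300), 0) == (2, 300)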
+ for char in entryData[dataStart:endPos]: + if bord(char) != 0: + print("Warning: There are unprocessed index bytes left: %s" % toHex(entryData[dataStart:endPos])) + if 0: + print("controlByteCount: %s" % controlByteCount) + print("tagTable: %s" % tagTable) + print("data: %s" % toHex(entryData[startPos:endPos])) + print("tagHashMap: %s" % tagHashMap) + break + + return tagHashMap diff --git a/src/epy_reader/tools/KindleUnpack/mobi_k8proc.py b/src/epy_reader/tools/KindleUnpack/mobi_k8proc.py new file mode 100644 index 0000000..5b8274e --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_k8proc.py @@ -0,0 +1,496 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import PY2, bstr, utf8_str + +if PY2: + range = xrange + +import os + +import struct +# note: struct pack, unpack, unpack_from all require bytestring format +# data all the way up to at least python 2.7.5, python 3 okay with bytestring + +import re +# note: re requites the pattern to be the exact same type as the data to be searched in python3 +# but u"" is not allowed for the pattern itself only b"" + +from .mobi_index import MobiIndex +from .mobi_utils import fromBase32 +from .unipath import pathof + +_guide_types = [b'cover',b'title-page',b'toc',b'index',b'glossary',b'acknowledgements', + b'bibliography',b'colophon',b'copyright-page',b'dedication', + b'epigraph',b'foreward',b'loi',b'lot',b'notes',b'preface',b'text'] + +# locate beginning and ending positions of tag with specific aid attribute +def locate_beg_end_of_tag(ml, aid): + pattern = utf8_str(r'''<[^>]*\said\s*=\s*['"]%s['"][^>]*>''' % aid) + aid_pattern = re.compile(pattern,re.IGNORECASE) + for m in re.finditer(aid_pattern, ml): + plt = m.start() + pgt = ml.find(b'>',plt+1) + return plt, pgt + return 0, 0 + + +# iterate over all tags in block in reverse order, i.e. 
last ta to first tag +def reverse_tag_iter(block): + end = len(block) + while True: + pgt = block.rfind(b'>', 0, end) + if pgt == -1: + break + plt = block.rfind(b'<', 0, pgt) + if plt == -1: + break + yield block[plt:pgt+1] + end = plt + + +class K8Processor: + + def __init__(self, mh, sect, files, debug=False): + self.sect = sect + self.files = files + self.mi = MobiIndex(sect) + self.mh = mh + self.skelidx = mh.skelidx + self.fragidx = mh.fragidx + self.guideidx = mh.guideidx + self.fdst = mh.fdst + self.flowmap = {} + self.flows = None + self.flowinfo = [] + self.parts = None + self.partinfo = [] + self.linked_aids = set() + self.fdsttbl= [0,0xffffffff] + self.DEBUG = debug + + # read in and parse the FDST info which is very similar in format to the Palm DB section + # parsing except it provides offsets into rawML file and not the Palm DB file + # this is needed to split up the final css, svg, etc flow section + # that can exist at the end of the rawML file + if self.fdst != 0xffffffff: + header = self.sect.loadSection(self.fdst) + if header[0:4] == b"FDST": + num_sections, = struct.unpack_from(b'>L', header, 0x08) + self.fdsttbl = struct.unpack_from(bstr('>%dL' % (num_sections*2)), header, 12)[::2] + (mh.rawSize, ) + sect.setsectiondescription(self.fdst,"KF8 FDST INDX") + if self.DEBUG: + print("\nFDST Section Map: %d sections" % num_sections) + for j in range(num_sections): + print("Section %d: 0x%08X - 0x%08X" % (j, self.fdsttbl[j],self.fdsttbl[j+1])) + else: + print("\nError: K8 Mobi with Missing FDST info") + + # read/process skeleton index info to create the skeleton table + skeltbl = [] + if self.skelidx != 0xffffffff: + # for i in range(2): + # fname = 'skel%04d.dat' % i + # data = self.sect.loadSection(self.skelidx + i) + # with open(pathof(fname), 'wb') as f: + # f.write(data) + outtbl, ctoc_text = self.mi.getIndexData(self.skelidx, "KF8 Skeleton") + fileptr = 0 + for [text, tagMap] in outtbl: + # file number, skeleton name, fragtbl record count, start position, length + skeltbl.append([fileptr, text, tagMap[1][0], tagMap[6][0], tagMap[6][1]]) + fileptr += 1 + self.skeltbl = skeltbl + if self.DEBUG: + print("\nSkel Table: %d entries" % len(self.skeltbl)) + print("table: filenum, skeleton name, frag tbl record count, start position, length") + for j in range(len(self.skeltbl)): + print(self.skeltbl[j]) + + # read/process the fragment index to create the fragment table + fragtbl = [] + if self.fragidx != 0xffffffff: + # for i in range(3): + # fname = 'frag%04d.dat' % i + # data = self.sect.loadSection(self.fragidx + i) + # with open(pathof(fname), 'wb') as f: + # f.write(data) + outtbl, ctoc_text = self.mi.getIndexData(self.fragidx, "KF8 Fragment") + for [text, tagMap] in outtbl: + # insert position, ctoc offset (aidtext), file number, sequence number, start position, length + ctocoffset = tagMap[2][0] + ctocdata = ctoc_text[ctocoffset] + fragtbl.append([int(text), ctocdata, tagMap[3][0], tagMap[4][0], tagMap[6][0], tagMap[6][1]]) + self.fragtbl = fragtbl + if self.DEBUG: + print("\nFragment Table: %d entries" % len(self.fragtbl)) + print("table: file position, link id text, file num, sequence number, start position, length") + for j in range(len(self.fragtbl)): + print(self.fragtbl[j]) + + # read / process guide index for guide elements of opf + guidetbl = [] + if self.guideidx != 0xffffffff: + # for i in range(3): + # fname = 'guide%04d.dat' % i + # data = self.sect.loadSection(self.guideidx + i) + # with open(pathof(fname), 'wb') as f: + # f.write(data) + outtbl, ctoc_text = 
self.mi.getIndexData(self.guideidx, "KF8 Guide elements)") + for [text, tagMap] in outtbl: + # ref_type, ref_title, frag number + ctocoffset = tagMap[1][0] + ref_title = ctoc_text[ctocoffset] + ref_type = text + fileno = None + if 3 in tagMap: + fileno = tagMap[3][0] + if 6 in tagMap: + fileno = tagMap[6][0] + guidetbl.append([ref_type, ref_title, fileno]) + self.guidetbl = guidetbl + if self.DEBUG: + print("\nGuide Table: %d entries" % len(self.guidetbl)) + print("table: ref_type, ref_title, fragtbl entry number") + for j in range(len(self.guidetbl)): + print(self.guidetbl[j]) + + def buildParts(self, rawML): + # now split the rawML into its flow pieces + self.flows = [] + for j in range(0, len(self.fdsttbl)-1): + start = self.fdsttbl[j] + end = self.fdsttbl[j+1] + self.flows.append(rawML[start:end]) + + # the first piece represents the xhtml text + text = self.flows[0] + self.flows[0] = b'' + + # walk the and fragment tables to build original source xhtml files + # *without* destroying any file position information needed for later href processing + # and create final list of file separation start: stop points and etc in partinfo + if self.DEBUG: + print("\nRebuilding flow piece 0: the main body of the ebook") + self.parts = [] + self.partinfo = [] + fragptr = 0 + baseptr = 0 + cnt = 0 + filename = 'part%04d.xhtml' % cnt + for [skelnum, skelname, fragcnt, skelpos, skellen] in self.skeltbl: + baseptr = skelpos + skellen + skeleton = text[skelpos: baseptr] + aidtext = "0" + for i in range(fragcnt): + [insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[fragptr] + aidtext = idtext[12:-2] + if i == 0: + filename = 'part%04d.xhtml' % filenum + slice = text[baseptr: baseptr + length] + insertpos = insertpos - skelpos + head = skeleton[:insertpos] + tail = skeleton[insertpos:] + actual_inspos = insertpos + if (tail.find(b'>') < tail.find(b'<') or head.rfind(b'>') < head.rfind(b'<')): + # There is an incomplete tag in either the head or tail. + # This can happen for some badly formed KF8 files + print('The fragment table for %s has incorrect insert position. Calculating manually.' % skelname) + bp, ep = locate_beg_end_of_tag(skeleton, aidtext) + if bp != ep: + actual_inspos = ep + 1 + startpos + if insertpos != actual_inspos: + print("fixed corrupt fragment table insert position", insertpos+skelpos, actual_inspos+skelpos) + insertpos = actual_inspos + self.fragtbl[fragptr][0] = actual_inspos + skelpos + skeleton = skeleton[0:insertpos] + slice + skeleton[insertpos:] + baseptr = baseptr + length + fragptr += 1 + cnt += 1 + self.parts.append(skeleton) + self.partinfo.append([skelnum, 'Text', filename, skelpos, baseptr, aidtext]) + + assembled_text = b''.join(self.parts) + if self.DEBUG: + outassembled = os.path.join(self.files.k8dir, 'assembled_text.dat') + with open(pathof(outassembled),'wb') as f: + f.write(assembled_text) + + # The primary css style sheet is typically stored next followed by any + # snippets of code that were previously inlined in the + # original xhtml but have been stripped out and placed here. + # This can include local CDATA snippets and and svg sections. 
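The splice in buildParts() above rebuilds each original xhtml file by inserting fragments into its skeleton at the positions recorded in the fragment table, with each position first rebased against the skeleton assembled so far. Reduced to its core, the operation is (assemble_part is an illustrative name, assuming positions already made skeleton-relative):

    def assemble_part(skeleton, fragments):
        # fragments: iterable of (insertpos, data) in table order; this is
        # the "skeleton[0:insertpos] + slice + skeleton[insertpos:]" step
        # from buildParts() above, without the corruption repair.
        for insertpos, data in fragments:
            skeleton = skeleton[:insertpos] + data + skeleton[insertpos:]
        return skeleton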
+ + # The problem is that for most browsers and ereaders, you can not + # use to import any svg image that itself + # properly uses an tag to import some raster image - it + # should work according to the spec but does not for almost all browsers + # and ereaders and causes epub validation issues because those raster + # images are in manifest but not in xhtml text - since they only + # referenced from an svg image + + # So we need to check the remaining flow pieces to see if they are css + # or svg images. if svg images, we must check if they have an + # and if so inline them into the xhtml text pieces. + + # there may be other sorts of pieces stored here but until we see one + # in the wild to reverse engineer we won't be able to tell + self.flowinfo.append([None, None, None, None]) + svg_tag_pattern = re.compile(br'''(]*>)''', re.IGNORECASE) + image_tag_pattern = re.compile(br'''(]*>)''', re.IGNORECASE) + for j in range(1,len(self.flows)): + flowpart = self.flows[j] + nstr = '%04d' % j + m = re.search(svg_tag_pattern, flowpart) + if m is not None: + # svg + ptype = b'svg' + start = m.start() + m2 = re.search(image_tag_pattern, flowpart) + if m2 is not None: + pformat = b'inline' + pdir = None + fname = None + # strip off anything before = 0: + ptype = b'css' + flowpart = b'\n' + pformat = b'inline' + pdir = None + fname = None + else: + # css - assume as standalone css file + ptype = b'css' + pformat = b'file' + pdir = "Styles" + fname = 'style' + nstr + '.css' + + self.flows[j] = flowpart + self.flowinfo.append([ptype, pformat, pdir, fname]) + + if self.DEBUG: + print("\nFlow Map: %d entries" % len(self.flowinfo)) + for fi in self.flowinfo: + print(fi) + print("\n") + + print("\nXHTML File Part Position Information: %d entries" % len(self.partinfo)) + for pi in self.partinfo: + print(pi) + + if False: # self.Debug: + # dump all of the locations of the aid tags used in TEXT + # find id links only inside of tags + # inside any < > pair find all "aid=' and return whatever is inside the quotes + # [^>]* means match any amount of chars except for '>' char + # [^'"] match any amount of chars except for the quote character + # \s* means match any amount of whitespace + print("\npositions of all aid= pieces") + id_pattern = re.compile(br'''<[^>]*\said\s*=\s*['"]([^'"]*)['"][^>]*>''',re.IGNORECASE) + for m in re.finditer(id_pattern, rawML): + [filename, partnum, start, end] = self.getFileInfo(m.start()) + [seqnum, idtext] = self.getFragTblInfo(m.start()) + value = fromBase32(m.group(1)) + print(" aid: %s value: %d at: %d -> part: %d, start: %d, end: %d" % (m.group(1), value, m.start(), partnum, start, end)) + print(" %s fragtbl entry %d" % (idtext, seqnum)) + + return + + # get information fragment table entry by pos + def getFragTblInfo(self, pos): + for j in range(len(self.fragtbl)): + [insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[j] + if pos >= insertpos and pos < (insertpos + length): + # why are these "in: and before: added here + return seqnum, b'in: ' + idtext + if pos < insertpos: + return seqnum, b'before: ' + idtext + return None, None + + # get information about the part (file) that exists at pos in original rawML + def getFileInfo(self, pos): + for [partnum, pdir, filename, start, end, aidtext] in self.partinfo: + if pos >= start and pos < end: + return filename, partnum, start, end + return None, None, None, None + + # accessor functions to properly protect the internal structure + def getNumberOfParts(self): + return len(self.parts) + + def getPart(self,i): 
+ if i >= 0 and i < len(self.parts): + return self.parts[i] + return None + + def getPartInfo(self, i): + if i >= 0 and i < len(self.partinfo): + return self.partinfo[i] + return None + + def getNumberOfFlows(self): + return len(self.flows) + + def getFlow(self,i): + # note flows[0] is empty - it was all of the original text + if i > 0 and i < len(self.flows): + return self.flows[i] + return None + + def getFlowInfo(self,i): + # note flowinfo[0] is empty - it was all of the original text + if i > 0 and i < len(self.flowinfo): + return self.flowinfo[i] + return None + + def getIDTagByPosFid(self, posfid, offset): + # first convert kindle:pos:fid and offset info to position in file + # (fromBase32 can handle both string types on input) + row = fromBase32(posfid) + off = fromBase32(offset) + [insertpos, idtext, filenum, seqnm, startpos, length] = self.fragtbl[row] + pos = insertpos + off + fname, pn, skelpos, skelend = self.getFileInfo(pos) + if fname is None: + # pos does not exist + # default to skeleton pos instead + print("Link To Position", pos, "does not exist, retargeting to top of target") + pos = self.skeltbl[filenum][3] + fname, pn, skelpos, skelend = self.getFileInfo(pos) + # an existing "id=" or "name=" attribute must exist in original xhtml otherwise it would not have worked for linking. + # Amazon seems to have added its own additional "aid=" inside tags whose contents seem to represent + # some position information encoded into Base32 name. + # so find the closest "id=" before position the file by actually searching in that file + idtext = self.getIDTag(pos) + return fname, idtext + + def getIDTag(self, pos): + # find the first tag with a named anchor (name or id attribute) before pos + fname, pn, skelpos, skelend = self.getFileInfo(pos) + if pn is None and skelpos is None: + print("Error: getIDTag - no file contains ", pos) + textblock = self.parts[pn] + npos = pos - skelpos + # if npos inside a tag then search all text before the its end of tag marker + pgt = textblock.find(b'>',npos) + plt = textblock.find(b'<',npos) + if plt == npos or pgt < plt: + npos = pgt + 1 + # find id and name attributes only inside of tags + # use a reverse tag search since that is faster + # inside any < > pair find "id=" and "name=" attributes return it + # [^>]* means match any amount of chars except for '>' char + # [^'"] match any amount of chars except for the quote character + # \s* means match any amount of whitespace + textblock = textblock[0:npos] + id_pattern = re.compile(br'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE) + name_pattern = re.compile(br'''<[^>]*\sname\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE) + aid_pattern = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''') + for tag in reverse_tag_iter(textblock): + # any ids in the body should default to top of file + if tag[0:6] == b'= start and pos < end: + return [partnum, pdir, filename, start, end, aidtext] + return [None, None, None, None, None, None] + + # fileno is actually a reference into fragtbl (a fragment) + def getGuideText(self): + guidetext = b'' + for [ref_type, ref_title, fileno] in self.guidetbl: + if ref_type == b'thumbimagestandard': + continue + if ref_type not in _guide_types and not ref_type.startswith(b'other.'): + if ref_type == b'start': + ref_type = b'text' + else: + ref_type = b'other.' 
+ ref_type + [pos, idtext, filenum, seqnm, startpos, length] = self.fragtbl[fileno] + [pn, pdir, filename, skelpos, skelend, aidtext] = self.getSkelInfo(pos) + idtext = self.getIDTag(pos) + linktgt = filename.encode('utf-8') + if idtext != b'': + linktgt += b'#' + idtext + guidetext += b'\n' + # opf is encoded utf-8 so must convert any titles properly + guidetext = (guidetext.decode(self.mh.codec)).encode("utf-8") + return guidetext + + def getPageIDTag(self, pos): + # find the first tag with a named anchor (name or id attribute) before pos + # but page map offsets need to little more leeway so if the offset points + # into a tag look for the next ending tag "/>" or "',npos) + plt = textblock.find(b'<',npos) + if plt == npos or pgt < plt: + # we are in a tag + # so find first ending tag + pend1 = textblock.find(b'/>', npos) + pend2 = textblock.find(b' pair find "id=" and "name=" attributes return it + # [^>]* means match any amount of chars except for '>' char + # [^'"] match any amount of chars except for the quote character + # \s* means match any amount of whitespace + textblock = textblock[0:npos] + id_pattern = re.compile(br'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE) + name_pattern = re.compile(br'''<[^>]*\sname\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE) + for tag in reverse_tag_iter(textblock): + # any ids in the body should default to top of file + if tag[0:6] == b'= python 2.7. +""" set to True to use OrderedDict for K8RESCProcessor.parsetag.tattr.""" + +if DEBUG_USE_ORDERED_DICTIONARY: + from collections import OrderedDict as dict_ +else: + dict_ = dict + +from .compatibility_utils import unicode_str + +from .mobi_utils import fromBase32 + +_OPF_PARENT_TAGS = ['xml', 'package', 'metadata', 'dc-metadata', + 'x-metadata', 'manifest', 'spine', 'tours', 'guide'] + +class K8RESCProcessor(object): + + def __init__(self, data, debug=False): + self._debug = debug + self.resc = None + self.opos = 0 + self.extrameta = [] + self.cover_name = None + self.spine_idrefs = {} + self.spine_order = [] + self.spine_pageattributes = {} + self.spine_ppd = None + # need3 indicate the book has fields which require epub3. + # but the estimation of the source epub version from the fields is difficult. + self.need3 = False + self.package_ver = None + self.extra_metadata = [] + self.refines_metadata = [] + self.extra_attributes = [] + # get header + start_pos = data.find(b'<') + self.resc_header = data[:start_pos] + # get resc data length + start = self.resc_header.find(b'=') + 1 + end = self.resc_header.find(b'&', start) + resc_size = 0 + if end > 0: + resc_size = fromBase32(self.resc_header[start:end]) + resc_rawbytes = len(data) - start_pos + if resc_rawbytes == resc_size: + self.resc_length = resc_size + else: + # Most RESC has a nul string at its tail but some do not. 
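The RESC length read here, like the kindle:pos:fid and off values earlier, uses Kindle's base-32 alphabet: the digits 0-9 followed by A-V, matching the [0-9A-V] character classes in the regexes above. A minimal reading of fromBase32() under that assumption (the real implementation lives in mobi_utils and may differ in its case handling):

    def from_base32(text):
        # 32 symbols: '0'-'9' then 'A'-'V'; input is a small bytestring.
        digits = b'0123456789ABCDEFGHIJKLMNOPQRSTUV'
        value = 0
        for ch in text.upper():
            value = value * 32 + digits.index(ch)
        return value

    # from_base32(b'10') == 32, from_base32(b'V') == 31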
+ end_pos = data.find(b'\x00', start_pos) + if end_pos < 0: + self.resc_length = resc_rawbytes + else: + self.resc_length = end_pos - start_pos + if self.resc_length != resc_size: + print("Warning: RESC section length({:d}bytes) does not match its size({:d}bytes).".format(self.resc_length, resc_size)) + # now parse RESC after converting it to unicode from utf-8 + try: + self.resc = unicode_str(data[start_pos:start_pos+self.resc_length]) + except UnicodeDecodeError: + self.resc = unicode_str(data[start_pos:start_pos+self.resc_length], enc='latin-1') + self.parseData() + + def prepend_to_spine(self, key, idref, linear, properties): + self.spine_order = [key] + self.spine_order + self.spine_idrefs[key] = idref + attributes = {} + if linear is not None: + attributes['linear'] = linear + if properties is not None: + attributes['properties'] = properties + self.spine_pageattributes[key] = attributes + + # RESC tag iterator + def resc_tag_iter(self): + tcontent = last_tattr = None + prefix = [''] + while True: + text, tag = self.parseresc() + if text is None and tag is None: + break + if text is not None: + tcontent = text.rstrip(' \r\n') + else: # we have a tag + ttype, tname, tattr = self.parsetag(tag) + if ttype == 'begin': + tcontent = None + prefix.append(tname + '.') + if tname in _OPF_PARENT_TAGS: + yield ''.join(prefix), tname, tattr, tcontent + else: + last_tattr = tattr + else: # single or end + if ttype == 'end': + prefix.pop() + tattr = last_tattr + last_tattr = None + if tname in _OPF_PARENT_TAGS: + tname += '-end' + yield ''.join(prefix), tname, tattr, tcontent + tcontent = None + + # now parse the RESC to extract spine and extra metadata info + def parseData(self): + for prefix, tname, tattr, tcontent in self.resc_tag_iter(): + if self._debug: + print(" Parsing RESC: ", prefix, tname, tattr, tcontent) + if tname == 'package': + self.package_ver = tattr.get('version', '2.0') + package_prefix = tattr.get('prefix','') + if self.package_ver.startswith('3') or package_prefix.startswith('rendition'): + self.need3 = True + if tname == 'spine': + self.spine_ppd = tattr.get('page-progession-direction', None) + if self.spine_ppd is not None and self.spine_ppd == 'rtl': + self.need3 = True + if tname == 'itemref': + skelid = tattr.pop('skelid', None) + if skelid is None and len(self.spine_order) == 0: + # assume it was removed initial coverpage + skelid = 'coverpage' + tattr['linear'] = 'no' + self.spine_order.append(skelid) + idref = tattr.pop('idref', None) + if idref is not None: + idref = 'x_' + idref + self.spine_idrefs[skelid] = idref + if 'id' in tattr: + del tattr['id'] + # tattr["id"] = 'x_' + tattr["id"] + if 'properties' in tattr: + self.need3 = True + self.spine_pageattributes[skelid] = tattr + if tname == 'meta' or tname.startswith('dc:'): + if 'refines' in tattr or 'property' in tattr: + self.need3 = True + if tattr.get('name','') == 'cover': + cover_name = tattr.get('content',None) + if cover_name is not None: + cover_name = 'x_' + cover_name + self.cover_name = cover_name + else: + self.extrameta.append([tname, tattr, tcontent]) + + # parse and return either leading text or the next tag + def parseresc(self): + p = self.opos + if p >= len(self.resc): + return None, None + if self.resc[p] != '<': + res = self.resc.find('<',p) + if res == -1 : + res = len(self.resc) + self.opos = res + return self.resc[p:res], None + # handle comment as a special case + if self.resc[p:p+4] == '',p+1) + if te != -1: + te = te+2 + else: + te = self.resc.find('>',p+1) + ntb = 
self.resc.find('<',p+1) + if ntb != -1 and ntb < te: + self.opos = ntb + return self.resc[p:ntb], None + self.opos = te + 1 + return None, self.resc[p:te+1] + + # parses tag to identify: [tname, ttype, tattr] + # tname: tag name + # ttype: tag type ('begin', 'end' or 'single'); + # tattr: dictionary of tag atributes + def parsetag(self, s): + p = 1 + tname = None + ttype = None + tattr = dict_() + while s[p:p+1] == ' ' : + p += 1 + if s[p:p+1] == '/': + ttype = 'end' + p += 1 + while s[p:p+1] == ' ' : + p += 1 + b = p + while s[p:p+1] not in ('>', '/', ' ', '"', "'",'\r','\n') : + p += 1 + tname=s[b:p].lower() + # some special cases + if tname == '?xml': + tname = 'xml' + if tname == '!--': + ttype = 'single' + comment = s[p:-3].strip() + tattr['comment'] = comment + if ttype is None: + # parse any attributes of begin or single tags + while s.find('=',p) != -1 : + while s[p:p+1] == ' ' : + p += 1 + b = p + while s[p:p+1] != '=' : + p += 1 + aname = s[b:p].lower() + aname = aname.rstrip(' ') + p += 1 + while s[p:p+1] == ' ' : + p += 1 + if s[p:p+1] in ('"', "'") : + p = p + 1 + b = p + while s[p:p+1] not in ('"', "'"): + p += 1 + val = s[b:p] + p += 1 + else : + b = p + while s[p:p+1] not in ('>', '/', ' ') : + p += 1 + val = s[b:p] + tattr[aname] = val + if ttype is None: + ttype = 'begin' + if s.find('/',p) >= 0: + ttype = 'single' + return ttype, tname, tattr + + def taginfo_toxml(self, taginfo): + res = [] + tname, tattr, tcontent = taginfo + res.append('<' + tname) + if tattr is not None: + for key in tattr: + res.append(' ' + key + '="'+tattr[key]+'"') + if tcontent is not None: + res.append('>' + tcontent + '\n') + else: + res.append('/>\n') + return "".join(res) + + def hasSpine(self): + return len(self.spine_order) > 0 + + def needEPUB3(self): + return self.need3 + + def hasRefines(self): + for [tname, tattr, tcontent] in self.extrameta: + if 'refines' in tattr: + return True + return False + + def createMetadata(self, epubver): + for taginfo in self.extrameta: + tname, tattr, tcontent = taginfo + if 'refines' in tattr: + if epubver == 'F' and 'property' in tattr: + attr = ' id="%s" opf:%s="%s"\n' % (tattr['refines'], tattr['property'], tcontent) + self.extra_attributes.append(attr) + else: + tag = self.taginfo_toxml(taginfo) + self.refines_metadata.append(tag) + else: + tag = self.taginfo_toxml(taginfo) + self.extra_metadata.append(tag) diff --git a/src/epy_reader/tools/KindleUnpack/mobi_nav.py b/src/epy_reader/tools/KindleUnpack/mobi_nav.py new file mode 100644 index 0000000..16fb0be --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_nav.py @@ -0,0 +1,187 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import unicode_str +import os +from .unipath import pathof + +import re +# note: re requites the pattern to be the exact same type as the data to be searched in python3 +# but u"" is not allowed for the pattern itself only b"" + +DEBUG_NAV = False + +FORCE_DEFAULT_TITLE = False +""" Set to True to force to use the default title. """ + +NAVIGATION_FINENAME = 'nav.xhtml' +""" The name for the navigation document. """ + +DEFAULT_TITLE = 'Navigation' +""" The default title for the navigation document. 
""" + +class NAVProcessor(object): + + def __init__(self, files): + self.files = files + self.navname = NAVIGATION_FINENAME + + def buildLandmarks(self, guidetext): + header = '' + header += ' \n' + + type_map = { + 'cover' : 'cover', + 'title-page' : 'title-page', + # ?: 'frontmatter', + 'text' : 'bodymatter', + # ?: 'backmatter', + 'toc' : 'toc', + 'loi' : 'loi', + 'lot' : 'lot', + 'preface' : 'preface', + 'bibliography' : 'bibliography', + 'index' : 'index', + 'glossary' : 'glossary', + 'acknowledgements' : 'acknowledgements', + 'colophon' : None, + 'copyright-page' : None, + 'dedication' : None, + 'epigraph' : None, + 'foreword' : None, + 'notes' : None + } + + re_type = re.compile(r'\s+type\s*=\s*"(.*?)"', re.I) + re_title = re.compile(r'\s+title\s*=\s*"(.*?)"', re.I) + re_link = re.compile(r'\s+href\s*=\s*"(.*?)"', re.I) + dir_ = os.path.relpath(self.files.k8text, self.files.k8oebps).replace('\\', '/') + + data = '' + references = re.findall(r'', unicode_str(guidetext), re.I) + for reference in references: + mo_type = re_type.search(reference) + mo_title = re_title.search(reference) + mo_link = re_link.search(reference) + if mo_type is not None: + type_ = type_map.get(mo_type.group(1), None) + else: + type_ = None + if mo_title is not None: + title = mo_title.group(1) + else: + title = None + if mo_link is not None: + link = mo_link.group(1) + else: + link = None + + if type_ is not None and title is not None and link is not None: + link = os.path.relpath(link, dir_).replace('\\', '/') + data += element.format(type_, link, title) + if len(data) > 0: + return header + data + footer + else: + return '' + + def buildTOC(self, indx_data): + header = '' + header += ' \n' + + # recursive part + def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1): + if start>len(indx_data) or end>len(indx_data): + print("Warning (in buildTOC): missing INDX child entries", start, end, len(indx_data)) + return '' + if DEBUG_NAV: + print("recursINDX (in buildTOC) lvl %d from %d to %d" % (lvl, start, end)) + xhtml = '' + if start <= 0: + start = 0 + if end <= 0: + end = len(indx_data) + if lvl > max_lvl: + max_lvl = lvl + + indent1 = ' ' * (2 + lvl * 2) + indent2 = ' ' * (3 + lvl * 2) + xhtml += indent1 + '
<ol>\n' + for i in range(start, end): + e = indx_data[i] + htmlfile = e['filename'] + desttag = e['idtag'] + text = e['text'] + if not e['hlvl'] == lvl: + continue + num += 1 + if desttag == '': + link = htmlfile + else: + link = '{:s}#{:s}'.format(htmlfile, desttag) + xhtml += indent2 + '<li>' + entry = '<a href="{:s}">{:s}</a>'.format(link, text) + xhtml += entry + # recurs + if e['child1'] >= 0: + xhtml += '\n' + xhtmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1, + e['child1'], e['childn'] + 1) + xhtml += xhtmlrec + xhtml += indent2 + # close entry + xhtml += '</li>\n' + xhtml += indent1 + '</ol>
\n' + return xhtml, max_lvl, num + + data, max_lvl, num = recursINDX() + if not len(indx_data) == num: + print("Warning (in buildTOC): different number of entries in NCX", len(indx_data), num) + return header + data + footer + + def buildNAV(self, ncx_data, guidetext, title, lang): + print("Building Navigation Document.") + if FORCE_DEFAULT_TITLE: + title = DEFAULT_TITLE + nav_header = '' + nav_header += '\n' + nav_header += ' + + + + + + + + + +%s + + +''' + + ncx_footer = \ +''' + +''' + + ncx_entry = \ +''' + +%s + +''' + + # recursive part + def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1): + if start>len(indx_data) or end>len(indx_data): + print("Warning: missing INDX child entries", start, end, len(indx_data)) + return '' + if DEBUG_NCX: + print("recursINDX lvl %d from %d to %d" % (lvl, start, end)) + xml = '' + if start <= 0: + start = 0 + if end <= 0: + end = len(indx_data) + if lvl > max_lvl: + max_lvl = lvl + indent = ' ' * (2 + lvl) + + for i in range(start, end): + e = indx_data[i] + if not e['hlvl'] == lvl: + continue + # open entry + num += 1 + link = '%s#filepos%d' % (htmlfile, e['pos']) + tagid = 'np_%d' % num + entry = ncx_entry % (tagid, num, xmlescape(unescapeit(e['text'])), link) + entry = re.sub(re.compile('^', re.M), indent, entry, 0) + xml += entry + '\n' + # recurs + if e['child1']>=0: + xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1, + e['child1'], e['childn'] + 1) + xml += xmlrec + # close entry + xml += indent + '\n' + return xml, max_lvl, num + + body, max_lvl, num = recursINDX() + header = ncx_header % (lang, ident, max_lvl + 1, title) + ncx = header + body + ncx_footer + if not len(indx_data) == num: + print("Warning: different number of entries in NCX", len(indx_data), num) + return ncx + + def writeNCX(self, metadata): + # build the xml + self.isNCX = True + print("Write ncx") + # htmlname = os.path.basename(self.files.outbase) + # htmlname += '.html' + htmlname = 'book.html' + xml = self.buildNCX(htmlname, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language')[0]) + # write the ncx file + # ncxname = os.path.join(self.files.mobi7dir, self.files.getInputFileBasename() + '.ncx') + ncxname = os.path.join(self.files.mobi7dir, 'toc.ncx') + with open(pathof(ncxname), 'wb') as f: + f.write(xml.encode('utf-8')) + + def buildK8NCX(self, indx_data, title, ident, lang): + ncx_header = \ +''' + + + + + + + + + +%s + + +''' + + ncx_footer = \ +''' + +''' + + ncx_entry = \ +''' + +%s + +''' + + # recursive part + def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1): + if start>len(indx_data) or end>len(indx_data): + print("Warning: missing INDX child entries", start, end, len(indx_data)) + return '' + if DEBUG_NCX: + print("recursINDX lvl %d from %d to %d" % (lvl, start, end)) + xml = '' + if start <= 0: + start = 0 + if end <= 0: + end = len(indx_data) + if lvl > max_lvl: + max_lvl = lvl + indent = ' ' * (2 + lvl) + + for i in range(start, end): + e = indx_data[i] + htmlfile = e['filename'] + desttag = e['idtag'] + if not e['hlvl'] == lvl: + continue + # open entry + num += 1 + if desttag == '': + link = 'Text/%s' % htmlfile + else: + link = 'Text/%s#%s' % (htmlfile, desttag) + tagid = 'np_%d' % num + entry = ncx_entry % (tagid, num, xmlescape(unescapeit(e['text'])), link) + entry = re.sub(re.compile('^', re.M), indent, entry, 0) + xml += entry + '\n' + # recurs + if e['child1']>=0: + xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1, + e['child1'], e['childn'] + 1) + xml += xmlrec + # close entry + xml += indent + '\n' + 
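buildNCX() and buildK8NCX() emit one navPoint per TOC entry, with playOrder following the depth-first visiting order and child navPoints nested before the closing tag, which is what the recursINDX() helpers implement. A sketch of a single entry under those assumptions (nav_point is an illustrative helper, not the patch's actual template):

    from xml.sax.saxutils import escape

    def nav_point(play_order, label, src, children=()):
        # Children are pre-rendered navPoint strings, embedded before
        # the closing tag exactly as recursINDX() appends its recursion.
        inner = ''.join(children)
        return ('<navPoint id="np_%d" playOrder="%d">'
                '<navLabel><text>%s</text></navLabel>'
                '<content src="%s"/>%s</navPoint>'
                % (play_order, play_order, escape(label), src, inner))

    # nav_point(1, 'Chapter 1', 'Text/part0000.xhtml#ch1')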
return xml, max_lvl, num + + body, max_lvl, num = recursINDX() + header = ncx_header % (lang, ident, max_lvl + 1, title) + ncx = header + body + ncx_footer + if not len(indx_data) == num: + print("Warning: different number of entries in NCX", len(indx_data), num) + return ncx + + def writeK8NCX(self, ncx_data, metadata): + # build the xml + self.isNCX = True + print("Write K8 ncx") + xml = self.buildK8NCX(ncx_data, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language')[0]) + bname = 'toc.ncx' + ncxname = os.path.join(self.files.k8oebps,bname) + with open(pathof(ncxname), 'wb') as f: + f.write(xml.encode('utf-8')) diff --git a/src/epy_reader/tools/KindleUnpack/mobi_opf.py b/src/epy_reader/tools/KindleUnpack/mobi_opf.py new file mode 100644 index 0000000..742d776 --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_opf.py @@ -0,0 +1,686 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import unicode_str, unescapeit +from .compatibility_utils import lzip + +from .unipath import pathof + +from xml.sax.saxutils import escape as xmlescape + +import os +import uuid +from datetime import datetime + +# In EPUB3, NCX and MAY exist in OPF, although the NCX is superseded +# by the Navigation Document and the is deprecated. Currently, EPUB3_WITH_NCX +# and EPUB3_WITH_GUIDE are set to True due to compatibility with epub2 reading systems. +# They might be change to set to False in the future. + +EPUB3_WITH_NCX = True # Do not set to False except for debug. +""" Set to True to create a toc.ncx when converting to epub3. """ + +EPUB3_WITH_GUIDE = True # Do not set to False except for debug. +""" Set to True to create a guide element in an opf when converting to epub3. """ + +EPUB_OPF = 'content.opf' +""" The name for the OPF of EPUB. """ + +TOC_NCX = 'toc.ncx' +""" The name for the TOC of EPUB2. """ + +NAVIGATION_DOCUMENT = 'nav.xhtml' +""" The name for the navigation document of EPUB3. """ + +BEGIN_INFO_ONLY = '' +""" The comment to indicate the end of metadata which will be ignored by kindlegen. """ + +EXTH_TITLE_FURIGANA = 'Title-Pronunciation' +""" The name for Title Furigana(similar to file-as) set by KDP. """ + +EXTH_CREATOR_FURIGANA = 'Author-Pronunciation' +""" The name for Creator Furigana(similar to file-as) set by KDP. """ + +EXTH_PUBLISHER_FURIGANA = 'Publisher-Pronunciation' +""" The name for Publisher Furigana(similar to file-as) set by KDP. 
""" + +EXTRA_ENTITIES = {'"': '"', "'": "'"} + +class OPFProcessor(object): + + def __init__(self, files, metadata, fileinfo, rscnames, hasNCX, mh, usedmap, pagemapxml='', guidetext='', k8resc=None, epubver='2'): + self.files = files + self.metadata = metadata + self.fileinfo = fileinfo + self.rscnames = rscnames + self.has_ncx = hasNCX + self.codec = mh.codec + self.isK8 = mh.isK8() + self.printReplica = mh.isPrintReplica() + self.guidetext = unicode_str(guidetext) + self.used = usedmap + self.k8resc = k8resc + self.covername = None + self.cover_id = 'cover_img' + if self.k8resc is not None and self.k8resc.cover_name is not None: + # update cover id info from RESC if available + self.cover_id = self.k8resc.cover_name + # Create a unique urn uuid + self.BookId = unicode_str(str(uuid.uuid4())) + self.pagemap = pagemapxml + + self.ncxname = None + self.navname = None + + # page-progression-direction is only set in spine + self.page_progression_direction = metadata.pop('page-progression-direction', [None])[0] + if 'rl' in metadata.get('primary-writing-mode', [''])[0]: + self.page_progression_direction = 'rtl' + self.epubver = epubver # the epub version set by user + self.target_epubver = epubver # the epub vertion set by user or detected automatically + if self.epubver == 'A': + self.target_epubver = self.autodetectEPUBVersion() + elif self.epubver == 'F': + self.target_epubver = '2' + elif self.epubver != '2' and self.epubver != '3': + self.target_epubver = '2' + + # id for rifine attributes + self.title_id = {} + self.creator_id = {} + self.publisher_id = {} + # extra attributes + self.title_attrib = {} + self.creator_attrib = {} + self.publisher_attrib = {} + self.extra_attributes = [] # for force epub2 option + # Create epub3 metadata from EXTH. + self.exth_solved_refines_metadata = [] + self.exth_refines_metadata = [] + self.exth_fixedlayout_metadata = [] + + self.defineRefinesID() + self.processRefinesMetadata() + if self.k8resc is not None: + # Create metadata in RESC section. + self.k8resc.createMetadata(epubver) + if self.target_epubver == "3": + self.createMetadataForFixedlayout() + + def escapeit(self, sval, EXTRAS=None): + # note, xmlescape and unescape do not work with utf-8 bytestrings + sval = unicode_str(sval) + if EXTRAS: + res = xmlescape(unescapeit(sval), EXTRAS) + else: + res = xmlescape(unescapeit(sval)) + return res + + def createMetaTag(self, data, property, content, refid=''): + refines = '' + if refid: + refines = ' refines="#%s"' % refid + data.append('%s\n' % (property, refines, content)) + + def buildOPFMetadata(self, start_tag, has_obfuscated_fonts=False): + # convert from EXTH metadata format to target epub version metadata + # epub 3 will ignore style metatags + # but allows them to be present for backwards compatibility + # instead the new format is + # property_value + # and DCMES elements such as: + # value + + metadata = self.metadata + k8resc = self.k8resc + + META_TAGS = ['Drm Server Id', 'Drm Commerce Id', 'Drm Ebookbase Book Id', 'ASIN', 'ThumbOffset', 'Fake Cover', + 'Creator Software', 'Creator Major Version', 'Creator Minor Version', 'Creator Build Number', + 'Watermark', 'Clipping Limit', 'Publisher Limit', 'Text to Speech Disabled', 'CDE Type', + 'Updated Title', 'Font Signature (hex)', 'Tamper Proof Keys (hex)',] + + # def handleTag(data, metadata, key, tag, ids={}): + def handleTag(data, metadata, key, tag, attrib={}): + '''Format metadata values. + + @param data: List of formatted metadata entries. + @param metadata: The metadata dictionary. 
+ @param key: The key of the metadata value to handle. + @param tag: The opf tag corresponds to the metadata value. + ###@param ids: The ids in tags for refines property of epub3. + @param attrib: The extra attibute for refines or opf prefixs. + ''' + if key in metadata: + for i, value in enumerate(metadata[key]): + closingTag = tag.split(" ")[0] + res = '<%s%s>%s\n' % (tag, attrib.get(i, ''), self.escapeit(value), closingTag) + data.append(res) + del metadata[key] + + # these are allowed but ignored by epub3 + def handleMetaPairs(data, metadata, key, name): + if key in metadata: + for value in metadata[key]: + res = '\n' % (name, self.escapeit(value, EXTRA_ENTITIES)) + data.append(res) + del metadata[key] + + data = [] + data.append(start_tag + '\n') + # Handle standard metadata + if 'Title' in metadata: + handleTag(data, metadata, 'Title', 'dc:title', self.title_attrib) + else: + data.append('Untitled\n') + handleTag(data, metadata, 'Language', 'dc:language') + if 'UniqueID' in metadata: + handleTag(data, metadata, 'UniqueID', 'dc:identifier id="uid"') + else: + # No unique ID in original, give it a generic one. + data.append('0\n') + + if self.target_epubver == '3': + # epub version 3 minimal metadata requires a dcterms:modifed date tag + self.createMetaTag(data, 'dcterms:modified', datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")) + + if self.isK8 and has_obfuscated_fonts: + # Use the random generated urn:uuid so obuscated fonts work. + # It doesn't need to be _THE_ unique identifier to work as a key + # for obfuscated fonts in Sigil, ADE and calibre. Its just has + # to use the opf:scheme="UUID" and have the urn:uuid: prefix. + if self.target_epubver == '3': + data.append('urn:uuid:'+self.BookId+'\n') + else: + data.append('urn:uuid:'+self.BookId+'\n') + + handleTag(data, metadata, 'Creator', 'dc:creator', self.creator_attrib) + handleTag(data, metadata, 'Contributor', 'dc:contributor') + handleTag(data, metadata, 'Publisher', 'dc:publisher', self.publisher_attrib) + handleTag(data, metadata, 'Source', 'dc:source') + handleTag(data, metadata, 'Type', 'dc:type') + if self.target_epubver == '3': + if 'ISBN' in metadata: + for i, value in enumerate(metadata['ISBN']): + res = 'urn:isbn:%s\n' % self.escapeit(value) + data.append(res) + else: + handleTag(data, metadata, 'ISBN', 'dc:identifier opf:scheme="ISBN"') + if 'Subject' in metadata: + if 'SubjectCode' in metadata: + codeList = metadata['SubjectCode'] + del metadata['SubjectCode'] + else: + codeList = None + for i in range(len(metadata['Subject'])): + if codeList and i < len(codeList): + data.append('') + else: + data.append('') + data.append(self.escapeit(metadata['Subject'][i])+'\n') + del metadata['Subject'] + handleTag(data, metadata, 'Description', 'dc:description') + if self.target_epubver == '3': + if 'Published' in metadata: + for i, value in enumerate(metadata['Published']): + res = '%s\n' % self.escapeit(value) + data.append(res) + else: + handleTag(data, metadata, 'Published', 'dc:date opf:event="publication"') + handleTag(data, metadata, 'Rights', 'dc:rights') + + if self.epubver == 'F': + if self.extra_attributes or k8resc is not None and k8resc.extra_attributes: + data.append('\n') + else: + # Append refines metadata. + if self.exth_solved_refines_metadata: + data.append('\n') + data += self.exth_solved_refines_metadata + if self.exth_refines_metadata or k8resc is not None and k8resc.refines_metadata: + data.append('\n') + + # Append metadata in RESC section. 
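handleTag() is why parseMetaData() stores every metadata value as a list: each value under a key becomes one more repeated element, so a book can carry several creators, contributors or subjects. Stripped of the id/attribute plumbing, the idea is (dc_elements is an illustrative name):

    from xml.sax.saxutils import escape

    def dc_elements(metadata, key, tag):
        # One element per stored value, in order.
        return ['<%s>%s</%s>\n' % (tag, escape(v), tag)
                for v in metadata.get(key, [])]

    # dc_elements({'Creator': ['A', 'B']}, 'Creator', 'dc:creator')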
+ if k8resc is not None and k8resc.extra_metadata: + data.append('\n') + + if 'CoverOffset' in metadata: + imageNumber = int(metadata['CoverOffset'][0]) + self.covername = self.rscnames[imageNumber] + if self.covername is None: + print("Error: Cover image %s was not recognized as a valid image" % imageNumber) + else: + # is obsoleted in EPUB3, but kindlegen v2.9 requires it. + data.append('\n') + self.used[self.covername] = 'used' + del metadata['CoverOffset'] + + handleMetaPairs(data, metadata, 'Codec', 'output encoding') + # handle kindlegen specifc tags + handleTag(data, metadata, 'DictInLanguage', 'DictionaryInLanguage') + handleTag(data, metadata, 'DictOutLanguage', 'DictionaryOutLanguage') + handleMetaPairs(data, metadata, 'RegionMagnification', 'RegionMagnification') + handleMetaPairs(data, metadata, 'book-type', 'book-type') + handleMetaPairs(data, metadata, 'zero-gutter', 'zero-gutter') + handleMetaPairs(data, metadata, 'zero-margin', 'zero-margin') + handleMetaPairs(data, metadata, 'primary-writing-mode', 'primary-writing-mode') + handleMetaPairs(data, metadata, 'fixed-layout', 'fixed-layout') + handleMetaPairs(data, metadata, 'orientation-lock', 'orientation-lock') + handleMetaPairs(data, metadata, 'original-resolution', 'original-resolution') + + # these are not allowed in epub2 or 3 so convert them to meta name content pairs + # perhaps these could better be mapped into the dcterms namespace instead + handleMetaPairs(data, metadata, 'Review', 'review') + handleMetaPairs(data, metadata, 'Imprint', 'imprint') + handleMetaPairs(data, metadata, 'Adult', 'adult') + handleMetaPairs(data, metadata, 'DictShortName', 'DictionaryVeryShortName') + + # these are needed by kobo books upon submission but not sure if legal metadata in epub2 or epub3 + if 'Price' in metadata and 'Currency' in metadata: + priceList = metadata['Price'] + currencyList = metadata['Currency'] + if len(priceList) != len(currencyList): + print("Error: found %s price entries, but %s currency entries.") + else: + for i in range(len(priceList)): + data.append(''+priceList[i]+'\n') + del metadata['Price'] + del metadata['Currency'] + + if self.target_epubver == '3': + # Append metadata for EPUB3. 
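+ # (self.exth_fixedlayout_metadata is filled by createMetadataForFixedlayout(), called from __init__ when targeting epub3)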
+ if self.exth_fixedlayout_metadata: + data.append('\n') + data += self.exth_fixedlayout_metadata + + # all that remains is extra EXTH info we will store inside a comment inside meta name/content pairs + # so it can not impact anything and will be automatically stripped out if found again in a RESC section + data.append(BEGIN_INFO_ONLY + '\n') + if 'ThumbOffset' in metadata: + imageNumber = int(metadata['ThumbOffset'][0]) + # Some bad books give image indexes that are 'out of range' + try: + imageName = self.rscnames[imageNumber] + except: + print('Number given for Cover Thumbnail is out of range: %s' % imageNumber) + imageName = None + if imageName is None: + print("Error: Cover Thumbnail image %s was not recognized as a valid image" % imageNumber) + else: + data.append('\n') + # self.used[imageName] = 'used' # thumbnail image is always generated by Kindlegen, so don't include in manifest + self.used[imageName] = 'not used' + del metadata['ThumbOffset'] + for metaName in META_TAGS: + if metaName in metadata: + for value in metadata[metaName]: + data.append('\n') + del metadata[metaName] + for key in list(metadata.keys()): + for value in metadata[key]: + data.append('\n') + del metadata[key] + data.append(END_INFO_ONLY + '\n') + data.append('\n') + return data + + def buildOPFManifest(self, ncxname, navname=None): + # buildManifest for mobi7, azw4, epub2 and epub3. + k8resc = self.k8resc + cover_id = self.cover_id + hasK8RescSpine = k8resc is not None and k8resc.hasSpine() + self.ncxname = ncxname + self.navname = navname + + data = [] + data.append('\n') + media_map = { + '.jpg' : 'image/jpeg', + '.jpeg' : 'image/jpeg', + '.png' : 'image/png', + '.gif' : 'image/gif', + '.svg' : 'image/svg+xml', + '.xhtml': 'application/xhtml+xml', + '.html' : 'text/html', # for mobi7 + '.pdf' : 'application/pdf', # for azw4(print replica textbook) + '.ttf' : 'application/x-font-ttf', + '.otf' : 'application/x-font-opentype', # replaced? 
+ '.css' : 'text/css', + # '.html' : 'text/x-oeb1-document', # for mobi7 + # '.otf' : 'application/vnd.ms-opentype', # [OpenType] OpenType fonts + # '.woff' : 'application/font-woff', # [WOFF] WOFF fonts + # '.smil' : 'application/smil+xml', # [MediaOverlays301] EPUB Media Overlay documents + # '.pls' : 'application/pls+xml', # [PLS] Text-to-Speech (TTS) Pronunciation lexicons + # '.mp3' : 'audio/mpeg', + # '.mp4' : 'video/mp4', + # '.js' : 'text/javascript', # not supported in K8 + } + spinerefs = [] + + idcnt = 0 + for [key,dir,fname] in self.fileinfo: + name, ext = os.path.splitext(fname) + ext = ext.lower() + media = media_map.get(ext) + ref = "item%d" % idcnt + if hasK8RescSpine: + if key is not None and key in k8resc.spine_idrefs: + ref = k8resc.spine_idrefs[key] + properties = '' + if dir != '': + fpath = dir + '/' + fname + else: + fpath = fname + data.append('\n'.format(ref, media, fpath, properties)) + + if ext in ['.xhtml', '.html']: + spinerefs.append(ref) + idcnt += 1 + + for fname in self.rscnames: + if fname is not None: + if self.used.get(fname,'not used') == 'not used': + continue + name, ext = os.path.splitext(fname) + ext = ext.lower() + media = media_map.get(ext,ext[1:]) + properties = '' + if fname == self.covername: + ref = cover_id + if self.target_epubver == '3': + properties = 'properties="cover-image"' + else: + ref = "item%d" % idcnt + if ext == '.ttf' or ext == '.otf': + if self.isK8: # fonts are only used in Mobi 8 + fpath = 'Fonts/' + fname + data.append('\n'.format(ref, media, fpath, properties)) + else: + fpath = 'Images/' + fname + data.append('\n'.format(ref, media, fpath, properties)) + idcnt += 1 + + if self.target_epubver == '3' and navname is not None: + data.append('\n') + if self.has_ncx and ncxname is not None: + data.append('\n') + if self.pagemap != '': + data.append('\n') + data.append('\n') + return [data, spinerefs] + + def buildOPFSpine(self, spinerefs, isNCX): + # build spine + k8resc = self.k8resc + hasK8RescSpine = k8resc is not None and k8resc.hasSpine() + data = [] + ppd = '' + if self.isK8 and self.page_progression_direction is not None: + ppd = ' page-progression-direction="{:s}"'.format(self.page_progression_direction) + ncx = '' + if isNCX: + ncx = ' toc="ncx"' + map='' + if self.pagemap != '': + map = ' page-map="map"' + if self.epubver == 'F': + if ppd: + ppd = '' + spine_start_tag = '{0:s}\n'.format(ppd, map, ncx) + else: + spine_start_tag = '\n'.format(ppd, map, ncx) + data.append(spine_start_tag) + + if hasK8RescSpine: + for key in k8resc.spine_order: + idref = k8resc.spine_idrefs[key] + attribs = k8resc.spine_pageattributes[key] + tag = '\n' % entry) + start += 1 + for entry in spinerefs[start:]: + data.append('\n') + data.append('\n') + return data + + def buildMobi7OPF(self): + # Build an OPF for mobi7 and azw4. 
+ print("Building an opf for mobi7/azw4.") + data = [] + data.append('\n') + data.append('\n') + metadata_tag = '' + opf_metadata = self.buildOPFMetadata(metadata_tag) + data += opf_metadata + if self.has_ncx: + # ncxname = self.files.getInputFileBasename() + '.ncx' + ncxname = 'toc.ncx' + else: + ncxname = None + [opf_manifest, spinerefs] = self.buildOPFManifest(ncxname) + data += opf_manifest + opf_spine = self.buildOPFSpine(spinerefs, self.has_ncx) + data += opf_spine + data.append('\n\n') + if not self.printReplica: + guide ='\n' + self.guidetext + '\n' + data.append(guide) + data.append('\n') + return ''.join(data) + + def buildEPUBOPF(self, has_obfuscated_fonts=False): + print("Building an opf for mobi8 using epub version: ", self.target_epubver) + if self.target_epubver == '2': + has_ncx = self.has_ncx + has_guide = True + ncxname = None + ncxname = TOC_NCX + navname = None + package = '\n' + tours = '\n\n' + metadata_tag = '' + else: + has_ncx = EPUB3_WITH_NCX + has_guide = EPUB3_WITH_GUIDE + ncxname = None + if has_ncx: + ncxname = TOC_NCX + navname = NAVIGATION_DOCUMENT + package = '\n' + tours = '' + metadata_tag = '' + + data = [] + data.append('\n') + data.append(package) + opf_metadata = self.buildOPFMetadata(metadata_tag, has_obfuscated_fonts) + data += opf_metadata + [opf_manifest, spinerefs] = self.buildOPFManifest(ncxname, navname) + data += opf_manifest + opf_spine = self.buildOPFSpine(spinerefs, has_ncx) + data += opf_spine + data.append(tours) + if has_guide: + guide ='\n' + self.guidetext + '\n' + data.append(guide) + data.append('\n') + return ''.join(data) + + def writeOPF(self, has_obfuscated_fonts=False): + if self.isK8: + data = self.buildEPUBOPF(has_obfuscated_fonts) + outopf = os.path.join(self.files.k8oebps, EPUB_OPF) + with open(pathof(outopf), 'wb') as f: + f.write(data.encode('utf-8')) + return self.BookId + else: + data = self.buildMobi7OPF() + outopf = os.path.join(self.files.mobi7dir, 'content.opf') + with open(pathof(outopf), 'wb') as f: + f.write(data.encode('utf-8')) + return 0 + + def getBookId(self): + return self.BookId + + def getNCXName(self): + return self.ncxname + + def getNAVName(self): + return self.navname + + def getEPUBVersion(self): + return self.target_epubver + + def hasNCX(self): + return self.ncxname is not None and self.has_ncx + + def hasNAV(self): + return self.navname is not None + + def autodetectEPUBVersion(self): + # Determine EPUB version from metadata and RESC. + metadata = self.metadata + k8resc = self.k8resc + epubver = '2' + if 'true' == metadata.get('fixed-layout', [''])[0].lower(): + epubver = '3' + elif metadata.get('orientation-lock', [''])[0].lower() in ['portrait', 'landscape']: + epubver = '3' + elif self.page_progression_direction == 'rtl': + epubver = '3' + elif EXTH_TITLE_FURIGANA in metadata: + epubver = '3' + elif EXTH_CREATOR_FURIGANA in metadata: + epubver = '3' + elif EXTH_PUBLISHER_FURIGANA in metadata: + epubver = '3' + elif k8resc is not None and k8resc.needEPUB3(): + epubver = '3' + return epubver + + def defineRefinesID(self): + # the following EXTH are set by KDP. + # 'Title_Furigana_(508)' + # 'Creator_Furigana_(517)', + # 'Publisher_Furigana_(522)' + # It is difficult to find correspondence between Title, Creator, Publisher + # and EXTH 508,512, 522 if they have more than two values since KDP seems not preserve the oders of EXTH 508,512 and 522. + # It is also difficult to find correspondence between them and tags which have refine attributes in RESC. + # So editing manually is required. 
+ metadata = self.metadata + + needRefinesId = False + if self.k8resc is not None: + needRefinesId = self.k8resc.hasRefines() + # Create id for rifine attributes + if (needRefinesId or EXTH_TITLE_FURIGANA in metadata) and 'Title' in metadata: + for i in range(len(metadata.get('Title'))): + self.title_id[i] = 'title%02d' % (i+1) + + if (needRefinesId or EXTH_CREATOR_FURIGANA in metadata) and 'Creator' in metadata: + for i in range(len(metadata.get('Creator'))): + self.creator_id[i] = 'creator%02d' % (i+1) + + if (needRefinesId or EXTH_PUBLISHER_FURIGANA in metadata) and 'Publisher' in metadata: + for i in range(len(metadata.get('Publisher'))): + self.publisher_id[i] = 'publisher%02d' % (i+1) + + def processRefinesMetadata(self): + # create refines metadata defined in epub3 or convert refines property to opf: attribues for epub2. + metadata = self.metadata + + refines_list = [ + [EXTH_TITLE_FURIGANA, self.title_id, self.title_attrib, 'title00'], + [EXTH_CREATOR_FURIGANA, self.creator_id, self.creator_attrib, 'creator00'], + [EXTH_PUBLISHER_FURIGANA, self.publisher_id, self.publisher_attrib, 'publisher00'] + ] + + create_refines_metadata = False + for EXTH in lzip(*refines_list)[0]: + if EXTH in metadata: + create_refines_metadata = True + break + if create_refines_metadata: + for [EXTH, id, attrib, defaultid] in refines_list: + if self.target_epubver == '3': + for i, value in list(id.items()): + attrib[i] = ' id="%s"' % value + + if EXTH in metadata: + if len(metadata[EXTH]) == 1 and len(id) == 1: + self.createMetaTag(self.exth_solved_refines_metadata, 'file-as', metadata[EXTH][0], id[0]) + else: + for i, value in enumerate(metadata[EXTH]): + self.createMetaTag(self.exth_refines_metadata, 'file-as', value, id.get(i, defaultid)) + else: + if EXTH in metadata: + if len(metadata[EXTH]) == 1 and len(id) == 1: + attr = ' opf:file-as="%s"' % metadata[EXTH][0] + attrib[0] = attr + else: + for i, value in enumerate(metadata[EXTH]): + attr = ' id="#%s" opf:file-as="%s"\n' % (id.get(i, defaultid), value) + self.extra_attributes.append(attr) + + def createMetadataForFixedlayout(self): + # convert fixed layout to epub3 format if needed. 
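+ # EXTH 'fixed-layout' == 'true' becomes rendition:layout 'pre-paginated'; any other value becomes 'reflowable'.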
+ metadata = self.metadata + + if 'fixed-layout' in metadata: + fixedlayout = metadata['fixed-layout'][0] + content = {'true' : 'pre-paginated'}.get(fixedlayout.lower(), 'reflowable') + self.createMetaTag(self.exth_fixedlayout_metadata, 'rendition:layout', content) + + if 'orientation-lock' in metadata: + content = metadata['orientation-lock'][0].lower() + if content == 'portrait' or content == 'landscape': + self.createMetaTag(self.exth_fixedlayout_metadata, 'rendition:orientation', content) + + # according to epub3 spec about correspondence with Amazon + # if 'original-resolution' is provided it needs to be converted to + # meta viewport property tag stored in the of **each** + # xhtml page - so this tag would need to be handled by editing each part + # before reaching this routine + # we need to add support for this to the k8html routine + # if 'original-resolution' in metadata.keys(): + # resolution = metadata['original-resolution'][0].lower() + # width, height = resolution.split('x') + # if width.isdigit() and int(width) > 0 and height.isdigit() and int(height) > 0: + # viewport = 'width=%s, height=%s' % (width, height) + # self.createMetaTag(self.exth_fixedlayout_metadata, 'rendition:viewport', viewport) diff --git a/src/epy_reader/tools/KindleUnpack/mobi_pagemap.py b/src/epy_reader/tools/KindleUnpack/mobi_pagemap.py new file mode 100644 index 0000000..5228d4e --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_pagemap.py @@ -0,0 +1,158 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import PY2, unicode_str + +if PY2: + range = xrange + +import struct +# note: struct pack, unpack, unpack_from all require bytestring format +# data all the way up to at least python 2.7.5, python 3 okay with bytestring + +import re +# note: re requites the pattern to be the exact same type as the data to be searched in python3 +# but u"" is not allowed for the pattern itself only b"" + + +_TABLE = [('m', 1000), ('cm', 900), ('d', 500), ('cd', 400), ('c', 100), ('xc', 90), ('l', 50), ('xl', 40), ('x', 10), ('ix', 9), ('v', 5), ('iv', 4), ('i', 1)] + +def int_to_roman(i): + parts = [] + num = i + for letter, value in _TABLE: + while value <= num: + num -= value + parts.append(letter) + return ''.join(parts) + +def roman_to_int(s): + result = 0 + rnstr = s + for letter, value in _TABLE: + while rnstr.startswith(letter): + result += value + rnstr = rnstr[len(letter):] + return result + +_pattern = r'''\(([^\)]*)\)''' +_tup_pattern = re.compile(_pattern,re.IGNORECASE) + + +def _parseNames(numpages, data): + data = unicode_str(data) + pagenames = [] + pageMap = '' + for i in range(numpages): + pagenames.append(None) + for m in re.finditer(_tup_pattern, data): + tup = m.group(1) + if pageMap != '': + pageMap += ',' + pageMap += '(' + tup + ')' + spos, nametype, svalue = tup.split(",") + # print(spos, nametype, svalue) + if nametype == 'a' or nametype == 'r': + svalue = int(svalue) + spos = int(spos) + for i in range(spos - 1, numpages): + if nametype == 'r': + pname = int_to_roman(svalue) + svalue += 1 + elif nametype == 'a': + pname = "%s" % svalue + svalue += 1 + elif nametype == 'c': + sp = svalue.find('|') + if sp == -1: + pname = svalue + else: + pname = svalue[0:sp] + svalue = svalue[sp+1:] + else: + print("Error: unknown page numbering type", nametype) + pagenames[i] = pname + return pagenames, pageMap + + +class PageMapProcessor: 
+ + def __init__(self, mh, data): + self.mh = mh + self.data = data + self.pagenames = [] + self.pageoffsets = [] + self.pageMap = '' + self.pm_len = 0 + self.pm_nn = 0 + self.pn_bits = 0 + self.pmoff = None + self.pmstr = '' + print("Extracting Page Map Information") + rev_len, = struct.unpack_from(b'>L', self.data, 0x10) + # skip over header, revision string length data, and revision string + ptr = 0x14 + rev_len + pm_1, self.pm_len, self.pm_nn, self.pm_bits = struct.unpack_from(b'>4H', self.data, ptr) + # print(pm_1, self.pm_len, self.pm_nn, self.pm_bits) + self.pmstr = self.data[ptr+8:ptr+8+self.pm_len] + self.pmoff = self.data[ptr+8+self.pm_len:] + offsize = b">L" + offwidth = 4 + if self.pm_bits == 16: + offsize = b">H" + offwidth = 2 + ptr = 0 + for i in range(self.pm_nn): + od, = struct.unpack_from(offsize, self.pmoff, ptr) + ptr += offwidth + self.pageoffsets.append(od) + self.pagenames, self.pageMap = _parseNames(self.pm_nn, self.pmstr) + + def getPageMap(self): + return self.pageMap + + def getNames(self): + return self.pagenames + + def getOffsets(self): + return self.pageoffsets + + # page-map.xml will be unicode but encoded to utf-8 immediately before being written to a file + def generateKF8PageMapXML(self, k8proc): + pagemapxml = '\n' + for i in range(len(self.pagenames)): + pos = self.pageoffsets[i] + name = self.pagenames[i] + if name is not None and name != "": + [pn, dir, filename, skelpos, skelend, aidtext] = k8proc.getSkelInfo(pos) + idtext = unicode_str(k8proc.getPageIDTag(pos)) + linktgt = unicode_str(filename) + if idtext != '': + linktgt += '#' + idtext + pagemapxml += '\n' % (name, dir, linktgt) + pagemapxml += "\n" + return pagemapxml + + def generateAPNX(self, apnx_meta): + if apnx_meta['format'] == 'MOBI_8': + content_header = '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","format":"%(format)s","fileRevisionId":"1","acr":"%(acr)s"}' %apnx_meta + else: + content_header = '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","fileRevisionId":"1"}' % apnx_meta + content_header = content_header.encode('utf-8') + page_header = '{"asin":"%(asin)s","pageMap":"%(pageMap)s"}' % apnx_meta + page_header = page_header.encode('utf-8') + apnx = struct.pack(b'>H',1) + struct.pack(b'>H',1) + apnx += struct.pack(b'>I', 12 + len(content_header)) + apnx += struct.pack(b'>I', len(content_header)) + apnx += content_header + apnx += struct.pack(b'>H', 1) + apnx += struct.pack(b'>H', len(page_header)) + apnx += struct.pack(b'>H', self.pm_nn) + apnx += struct.pack(b'>H', 32) + apnx += page_header + for page in self.pageoffsets: + apnx += struct.pack(b'>L', page) + return apnx diff --git a/src/epy_reader/tools/KindleUnpack/mobi_sectioner.py b/src/epy_reader/tools/KindleUnpack/mobi_sectioner.py new file mode 100644 index 0000000..81f62bb --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_sectioner.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import PY2, hexlify, bstr, bord, bchar + +import datetime + +if PY2: + range = xrange + +# note: struct pack, unpack, unpack_from all require bytestring format +# data all the way up to at least python 2.7.5, python 3 okay with bytestring +import struct + +from .unipath import pathof + +DUMP = False +""" Set to True to dump all possible information. 
""" + +class unpackException(Exception): + pass + + +def describe(data): + txtans = '' + hexans = hexlify(data) + for i in data: + if bord(i) < 32 or bord(i) > 127: + txtans += '?' + else: + txtans += bchar(i).decode('latin-1') + return '"' + txtans + '"' + ' 0x'+ hexans + +def datetimefrompalmtime(palmtime): + if palmtime > 0x7FFFFFFF: + pythondatetime = datetime.datetime(year=1904,month=1,day=1)+datetime.timedelta(seconds=palmtime) + else: + pythondatetime = datetime.datetime(year=1970,month=1,day=1)+datetime.timedelta(seconds=palmtime) + return pythondatetime + + +class Sectionizer: + + def __init__(self, filename): + self.data = b'' + with open(pathof(filename), 'rb') as f: + self.data = f.read() + self.palmheader = self.data[:78] + self.palmname = self.data[:32] + self.ident = self.palmheader[0x3C:0x3C+8] + self.num_sections, = struct.unpack_from(b'>H', self.palmheader, 76) + self.filelength = len(self.data) + sectionsdata = struct.unpack_from(bstr('>%dL' % (self.num_sections*2)), self.data, 78) + (self.filelength, 0) + self.sectionoffsets = sectionsdata[::2] + self.sectionattributes = sectionsdata[1::2] + self.sectiondescriptions = ["" for x in range(self.num_sections+1)] + self.sectiondescriptions[-1] = "File Length Only" + return + + def dumpsectionsinfo(self): + print("Section Offset Length UID Attribs Description") + for i in range(self.num_sections): + print("%3d %3X 0x%07X 0x%05X % 8d % 7d %s" % (i,i, self.sectionoffsets[i], self.sectionoffsets[ + i+1] - self.sectionoffsets[i], self.sectionattributes[i]&0xFFFFFF, (self.sectionattributes[i]>>24)&0xFF, self.sectiondescriptions[i])) + print("%3d %3X 0x%07X %s" % + (self.num_sections,self.num_sections, self.sectionoffsets[self.num_sections], self.sectiondescriptions[self.num_sections])) + + def setsectiondescription(self, section, description): + if section < len(self.sectiondescriptions): + self.sectiondescriptions[section] = description + else: + print("Section out of range: %d, description %s" % (section,description)) + + def dumppalmheader(self): + print("Palm Database Header") + print("Database name: " + repr(self.palmheader[:32])) + dbattributes, = struct.unpack_from(b'>H', self.palmheader, 32) + print("Bitfield attributes: 0x%0X" % dbattributes,) + if dbattributes != 0: + print(" (",) + if (dbattributes & 2): + print("Read-only; ",) + if (dbattributes & 4): + print("Dirty AppInfoArea; ",) + if (dbattributes & 8): + print("Needs to be backed up; ",) + if (dbattributes & 16): + print("OK to install over newer; ",) + if (dbattributes & 32): + print("Reset after installation; ",) + if (dbattributes & 64): + print("No copying by PalmPilot beaming; ",) + print(")") + else: + print("") + print("File version: %d" % struct.unpack_from(b'>H', self.palmheader, 34)[0]) + dbcreation, = struct.unpack_from(b'>L', self.palmheader, 36) + print("Creation Date: " + str(datetimefrompalmtime(dbcreation))+ (" (0x%0X)" % dbcreation)) + dbmodification, = struct.unpack_from(b'>L', self.palmheader, 40) + print("Modification Date: " + str(datetimefrompalmtime(dbmodification))+ (" (0x%0X)" % dbmodification)) + dbbackup, = struct.unpack_from(b'>L', self.palmheader, 44) + if dbbackup != 0: + print("Backup Date: " + str(datetimefrompalmtime(dbbackup))+ (" (0x%0X)" % dbbackup)) + print("Modification No.: %d" % struct.unpack_from(b'>L', self.palmheader, 48)[0]) + print("App Info offset: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 52)[0]) + print("Sort Info offset: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 56)[0]) + print("Type/Creator: 
%s/%s" % (repr(self.palmheader[60:64]), repr(self.palmheader[64:68]))) + print("Unique seed: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 68)[0]) + expectedzero, = struct.unpack_from(b'>L', self.palmheader, 72) + if expectedzero != 0: + print("Should be zero but isn't: %d" % struct.unpack_from(b'>L', self.palmheader, 72)[0]) + print("Number of sections: %d" % struct.unpack_from(b'>H', self.palmheader, 76)[0]) + return + + def loadSection(self, section): + before, after = self.sectionoffsets[section:section+2] + return self.data[before:after] diff --git a/src/epy_reader/tools/KindleUnpack/mobi_split.py b/src/epy_reader/tools/KindleUnpack/mobi_split.py new file mode 100755 index 0000000..3535029 --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_split.py @@ -0,0 +1,438 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +import struct +# note: struct pack, unpack, unpack_from all require bytestring format +# data all the way up to at least python 2.7.5, python 3 okay with bytestring + +from .unipath import pathof + + +# important pdb header offsets +unique_id_seed = 68 +number_of_pdb_records = 76 + +# important palmdoc header offsets +book_length = 4 +book_record_count = 8 +first_pdb_record = 78 + +# important rec0 offsets +length_of_book = 4 +mobi_header_base = 16 +mobi_header_length = 20 +mobi_type = 24 +mobi_version = 36 +first_non_text = 80 +title_offset = 84 +first_resc_record = 108 +first_content_index = 192 +last_content_index = 194 +kf8_fdst_index = 192 # for KF8 mobi headers +fcis_index = 200 +flis_index = 208 +srcs_index = 224 +srcs_count = 228 +primary_index = 244 +datp_index = 256 +huffoff = 112 +hufftbloff = 120 + +def getint(datain,ofs,sz=b'L'): + i, = struct.unpack_from(b'>'+sz,datain,ofs) + return i + +def writeint(datain,ofs,n,len=b'L'): + if len==b'L': + return datain[:ofs]+struct.pack(b'>L',n)+datain[ofs+4:] + else: + return datain[:ofs]+struct.pack(b'>H',n)+datain[ofs+2:] + +def getsecaddr(datain,secno): + nsec = getint(datain,number_of_pdb_records,b'H') + assert secno>=0 & secnoL',2*nsec+1)) + datalst.append(datain[unique_id_seed+4:number_of_pdb_records]) + datalst.append(struct.pack(b'>H',nsec)) + newstart = zerosecstart + for i in range(0,secno): + ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) + datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval)) + datalst.append(struct.pack(b'>L', secstart) + struct.pack(b'>L', (2*secno))) + for i in range(secno+1,nsec): + ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) + ofs = ofs + dif + datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval)) + lpad = newstart - (first_pdb_record + 8*nsec) + if lpad > 0: + datalst.append(b'\0' * lpad) + datalst.append(datain[zerosecstart:secstart]) + datalst.append(secdata) + datalst.append(datain[secend:]) + dataout = b''.join(datalst) + return dataout + +def nullsection(datain,secno): # make it zero-length without deleting it + datalst = [] + nsec = getint(datain,number_of_pdb_records,b'H') + secstart, secend = getsecaddr(datain,secno) + zerosecstart, zerosecend = getsecaddr(datain, 0) + dif = secend-secstart + datalst.append(datain[:first_pdb_record]) + for i in range(0,secno+1): + ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) + datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval)) + for i in range(secno+1, nsec): + ofs, flgval = 
struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) + ofs = ofs - dif + datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval)) + lpad = zerosecstart - (first_pdb_record + 8*nsec) + if lpad > 0: + datalst.append(b'\0' * lpad) + datalst.append(datain[zerosecstart: secstart]) + datalst.append(datain[secend:]) + dataout = b''.join(datalst) + return dataout + +def deletesectionrange(datain,firstsec,lastsec): # delete a range of sections + datalst = [] + firstsecstart,firstsecend = getsecaddr(datain,firstsec) + lastsecstart,lastsecend = getsecaddr(datain,lastsec) + zerosecstart, zerosecend = getsecaddr(datain, 0) + dif = lastsecend - firstsecstart + 8*(lastsec-firstsec+1) + nsec = getint(datain,number_of_pdb_records,b'H') + datalst.append(datain[:unique_id_seed]) + datalst.append(struct.pack(b'>L',2*(nsec-(lastsec-firstsec+1))+1)) + datalst.append(datain[unique_id_seed+4:number_of_pdb_records]) + datalst.append(struct.pack(b'>H',nsec-(lastsec-firstsec+1))) + newstart = zerosecstart - 8*(lastsec-firstsec+1) + for i in range(0,firstsec): + ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) + ofs = ofs-8*(lastsec-firstsec+1) + datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval)) + for i in range(lastsec+1,nsec): + ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) + ofs = ofs - dif + flgval = 2*(i-(lastsec-firstsec+1)) + datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval)) + lpad = newstart - (first_pdb_record + 8*(nsec - (lastsec - firstsec + 1))) + if lpad > 0: + datalst.append(b'\0' * lpad) + datalst.append(datain[zerosecstart:firstsecstart]) + datalst.append(datain[lastsecend:]) + dataout = b''.join(datalst) + return dataout + +def insertsection(datain,secno,secdata): # insert a new section + datalst = [] + nsec = getint(datain,number_of_pdb_records,b'H') + # print("inserting secno" , secno, "into" ,nsec, "sections") + secstart,secend = getsecaddr(datain,secno) + zerosecstart,zerosecend = getsecaddr(datain,0) + dif = len(secdata) + datalst.append(datain[:unique_id_seed]) + datalst.append(struct.pack(b'>L',2*(nsec+1)+1)) + datalst.append(datain[unique_id_seed+4:number_of_pdb_records]) + datalst.append(struct.pack(b'>H',nsec+1)) + newstart = zerosecstart + 8 + for i in range(0,secno): + ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) + ofs += 8 + datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval)) + datalst.append(struct.pack(b'>L', secstart + 8) + struct.pack(b'>L', (2*secno))) + for i in range(secno,nsec): + ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) + ofs = ofs + dif + 8 + flgval = 2*(i+1) + datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval)) + lpad = newstart - (first_pdb_record + 8*(nsec + 1)) + if lpad > 0: + datalst.append(b'\0' * lpad) + datalst.append(datain[zerosecstart:secstart]) + datalst.append(secdata) + datalst.append(datain[secstart:]) + dataout = b''.join(datalst) + return dataout + + +def insertsectionrange(sectionsource,firstsec,lastsec,sectiontarget,targetsec): # insert a range of sections + # print("inserting secno" , firstsec, "to", lastsec, "into" ,targetsec, "sections") + # dataout = sectiontarget + # for idx in range(lastsec,firstsec-1,-1): + # dataout = insertsection(dataout,targetsec,readsection(sectionsource,idx)) + # return dataout + datalst = [] + nsec = getint(sectiontarget,number_of_pdb_records,b'H') + zerosecstart, zerosecend = getsecaddr(sectiontarget,0) + insstart, nul = getsecaddr(sectiontarget,targetsec) 
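+ # The pdb header grows by 8 bytes per inserted record (8*nins below), so offsets before the
+ # insertion point shift by 8*nins; offsets after it also shift by the length of the inserted
+ # data (srcend - srcstart).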
+ nins = lastsec - firstsec + 1 + srcstart, nul = getsecaddr(sectionsource,firstsec) + nul, srcend = getsecaddr(sectionsource,lastsec) + newstart = zerosecstart + 8*nins + + datalst.append(sectiontarget[:unique_id_seed]) + datalst.append(struct.pack(b'>L',2*(nsec+nins)+1)) + datalst.append(sectiontarget[unique_id_seed+4:number_of_pdb_records]) + datalst.append(struct.pack(b'>H',nsec+nins)) + for i in range(0,targetsec): + ofs, flgval = struct.unpack_from(b'>2L',sectiontarget,first_pdb_record+i*8) + ofsnew = ofs + 8*nins + flgvalnew = flgval + datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L', flgvalnew)) + # print(ofsnew, flgvalnew, ofs, flgval) + srcstart0, nul = getsecaddr(sectionsource,firstsec) + for i in range(nins): + isrcstart, nul = getsecaddr(sectionsource,firstsec+i) + ofsnew = insstart + (isrcstart-srcstart0) + 8*nins + flgvalnew = 2*(targetsec+i) + datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L', flgvalnew)) + # print(ofsnew, flgvalnew) + dif = srcend - srcstart + for i in range(targetsec,nsec): + ofs, flgval = struct.unpack_from(b'>2L',sectiontarget,first_pdb_record+i*8) + ofsnew = ofs + dif + 8*nins + flgvalnew = 2*(i+nins) + datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L',flgvalnew)) + # print(ofsnew, flgvalnew, ofs, flgval) + lpad = newstart - (first_pdb_record + 8*(nsec + nins)) + if lpad > 0: + datalst.append(b'\0' * lpad) + datalst.append(sectiontarget[zerosecstart:insstart]) + datalst.append(sectionsource[srcstart:srcend]) + datalst.append(sectiontarget[insstart:]) + dataout = b''.join(datalst) + return dataout + +def get_exth_params(rec0): + ebase = mobi_header_base + getint(rec0,mobi_header_length) + elen = getint(rec0,ebase+4) + enum = getint(rec0,ebase+8) + return ebase,elen,enum + +def add_exth(rec0,exth_num,exth_bytes): + ebase,elen,enum = get_exth_params(rec0) + newrecsize = 8+len(exth_bytes) + newrec0 = rec0[0:ebase+4]+struct.pack(b'>L',elen+newrecsize)+struct.pack(b'>L',enum+1)+\ + struct.pack(b'>L',exth_num)+struct.pack(b'>L',newrecsize)+exth_bytes+rec0[ebase+12:] + newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)+newrecsize) + return newrec0 + +def read_exth(rec0,exth_num): + exth_values = [] + ebase,elen,enum = get_exth_params(rec0) + ebase = ebase+12 + while enum>0: + exth_id = getint(rec0,ebase) + if exth_id == exth_num: + # We might have multiple exths, so build a list. 
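+ # (e.g. EXTH 116 StartOffset may occur more than once; mobi_split keeps only the last one below)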
+ exth_values.append(rec0[ebase+8:ebase+getint(rec0,ebase+4)]) + enum = enum-1 + ebase = ebase+getint(rec0,ebase+4) + return exth_values + +def write_exth(rec0,exth_num,exth_bytes): + ebase,elen,enum = get_exth_params(rec0) + ebase_idx = ebase+12 + enum_idx = enum + while enum_idx>0: + exth_id = getint(rec0,ebase_idx) + if exth_id == exth_num: + dif = len(exth_bytes)+8-getint(rec0,ebase_idx+4) + newrec0 = rec0 + if dif != 0: + newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)+dif) + return newrec0[:ebase+4]+struct.pack(b'>L',elen+len(exth_bytes)+8-getint(rec0,ebase_idx+4))+\ + struct.pack(b'>L',enum)+rec0[ebase+12:ebase_idx+4]+\ + struct.pack(b'>L',len(exth_bytes)+8)+exth_bytes+\ + rec0[ebase_idx+getint(rec0,ebase_idx+4):] + enum_idx = enum_idx-1 + ebase_idx = ebase_idx+getint(rec0,ebase_idx+4) + return rec0 + +def del_exth(rec0,exth_num): + ebase,elen,enum = get_exth_params(rec0) + ebase_idx = ebase+12 + enum_idx = 0 + while enum_idx < enum: + exth_id = getint(rec0,ebase_idx) + exth_size = getint(rec0,ebase_idx+4) + if exth_id == exth_num: + newrec0 = rec0 + newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)-exth_size) + newrec0 = newrec0[:ebase_idx]+newrec0[ebase_idx+exth_size:] + newrec0 = newrec0[0:ebase+4]+struct.pack(b'>L',elen-exth_size)+struct.pack(b'>L',enum-1)+newrec0[ebase+12:] + return newrec0 + enum_idx += 1 + ebase_idx = ebase_idx+exth_size + return rec0 + + +class mobi_split: + + def __init__(self, infile): + datain = b'' + with open(pathof(infile), 'rb') as f: + datain = f.read() + datain_rec0 = readsection(datain,0) + ver = getint(datain_rec0,mobi_version) + self.combo = (ver!=8) + if not self.combo: + return + exth121 = read_exth(datain_rec0,121) + if len(exth121) == 0: + self.combo = False + return + else: + # only pay attention to first exth121 + # (there should only be one) + datain_kf8, = struct.unpack_from(b'>L',exth121[0],0) + if datain_kf8 == 0xffffffff: + self.combo = False + return + datain_kfrec0 =readsection(datain,datain_kf8) + + # create the standalone mobi7 + num_sec = getint(datain,number_of_pdb_records,b'H') + # remove BOUNDARY up to but not including ELF record + self.result_file7 = deletesectionrange(datain,datain_kf8-1,num_sec-2) + # check if there are SRCS records and delete them + srcs = getint(datain_rec0,srcs_index) + num_srcs = getint(datain_rec0,srcs_count) + if srcs != 0xffffffff and num_srcs > 0: + self.result_file7 = deletesectionrange(self.result_file7,srcs,srcs+num_srcs-1) + datain_rec0 = writeint(datain_rec0,srcs_index,0xffffffff) + datain_rec0 = writeint(datain_rec0,srcs_count,0) + # reset the EXTH 121 KF8 Boundary meta data to 0xffffffff + datain_rec0 = write_exth(datain_rec0,121, struct.pack(b'>L', 0xffffffff)) + # datain_rec0 = del_exth(datain_rec0,121) + # datain_rec0 = del_exth(datain_rec0,534) + # don't remove the EXTH 125 KF8 Count of Resources, seems to be present in mobi6 files as well + # set the EXTH 129 KF8 Masthead / Cover Image string to the null string + datain_rec0 = write_exth(datain_rec0,129, b'') + # don't remove the EXTH 131 KF8 Unidentified Count, seems to be present in mobi6 files as well + + # need to reset flags stored in 0x80-0x83 + # old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050 + # Bit Flags + # 0x1000 = Bit 12 indicates if embedded fonts are used or not + # 0x0800 = means this Header points to *shared* images/resource/fonts ?? + # 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8? 
+ # 0x0040 = exth exists + # 0x0010 = Not sure but this is always set so far + fval, = struct.unpack_from(b'>L',datain_rec0, 0x80) + # need to remove flag 0x0800 for KindlePreviewer 2.8 and unset Bit 12 for embedded fonts + fval = fval & 0x07FF + datain_rec0 = datain_rec0[:0x80] + struct.pack(b'>L',fval) + datain_rec0[0x84:] + + self.result_file7 = writesection(self.result_file7,0,datain_rec0) + + # no need to replace kf8 style fcis with mobi 7 one + # fcis_secnum, = struct.unpack_from(b'>L',datain_rec0, 0xc8) + # if fcis_secnum != 0xffffffff: + # fcis_info = readsection(datain, fcis_secnum) + # text_len, = struct.unpack_from(b'>L', fcis_info, 0x14) + # new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00' + # new_fcis += struct.pack(b'>L',text_len) + # new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00' + # self.result_file7 = writesection(self.result_file7, fcis_secnum, new_fcis) + + firstimage = getint(datain_rec0,first_resc_record) + lastimage = getint(datain_rec0,last_content_index,b'H') + # print("Old First Image, last Image", firstimage,lastimage) + if lastimage == 0xffff: + # find the lowest of the next sections and copy up to that. + ofs_list = [(fcis_index,b'L'),(flis_index,b'L'),(datp_index,b'L'),(hufftbloff, b'L')] + for ofs,sz in ofs_list: + n = getint(datain_rec0,ofs,sz) + # print("n",n) + if n > 0 and n < lastimage: + lastimage = n-1 + print("First Image, last Image", firstimage,lastimage) + + # Try to null out FONT and RES, but leave the (empty) PDB record so image refs remain valid + for i in range(firstimage,lastimage): + imgsec = readsection(self.result_file7,i) + if imgsec[0:4] in [b'RESC',b'FONT']: + self.result_file7 = nullsection(self.result_file7,i) + + # mobi7 finished + + # create standalone mobi8 + self.result_file8 = deletesectionrange(datain,0,datain_kf8-1) + target = getint(datain_kfrec0,first_resc_record) + self.result_file8 = insertsectionrange(datain,firstimage,lastimage,self.result_file8,target) + datain_kfrec0 =readsection(self.result_file8,0) + + # Only keep the correct EXTH 116 StartOffset, KG 2.5 carries over the one from the mobi7 part, which then points at garbage in the mobi8 part, and confuses FW 3.4 + kf8starts = read_exth(datain_kfrec0,116) + # If we have multiple StartOffset, keep only the last one + kf8start_count = len(kf8starts) + while kf8start_count > 1: + kf8start_count -= 1 + datain_kfrec0 = del_exth(datain_kfrec0,116) + + # update the EXTH 125 KF8 Count of Images/Fonts/Resources + datain_kfrec0 = write_exth(datain_kfrec0,125,struct.pack(b'>L',lastimage-firstimage+1)) + + # need to reset flags stored in 0x80-0x83 + # old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050 + # standalone mobi8 with exth: 0x0050 + # Bit Flags + # 0x1000 = Bit 12 indicates if embedded fonts are used or not + # 0x0800 = means this Header points to *shared* images/resource/fonts ?? + # 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8? 
+ # 0x0040 = exth exists + # 0x0010 = Not sure but this is always set so far + fval, = struct.unpack_from('>L',datain_kfrec0, 0x80) + fval = fval & 0x1FFF + fval |= 0x0800 + datain_kfrec0 = datain_kfrec0[:0x80] + struct.pack(b'>L',fval) + datain_kfrec0[0x84:] + + # properly update other index pointers that have been shifted by the insertion of images + ofs_list = [(kf8_fdst_index,b'L'),(fcis_index,b'L'),(flis_index,b'L'),(datp_index,b'L'),(hufftbloff, b'L')] + for ofs,sz in ofs_list: + n = getint(datain_kfrec0,ofs,sz) + if n != 0xffffffff: + datain_kfrec0 = writeint(datain_kfrec0,ofs,n+lastimage-firstimage+1,sz) + self.result_file8 = writesection(self.result_file8,0,datain_kfrec0) + + # no need to replace kf8 style fcis with mobi 7 one + # fcis_secnum, = struct.unpack_from(b'>L',datain_kfrec0, 0xc8) + # if fcis_secnum != 0xffffffff: + # fcis_info = readsection(self.result_file8, fcis_secnum) + # text_len, = struct.unpack_from(b'>L', fcis_info, 0x14) + # new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00' + # new_fcis += struct.pack(b'>L',text_len) + # new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00' + # self.result_file8 = writesection(self.result_file8, fcis_secnum, new_fcis) + + # mobi8 finished + + def getResult8(self): + return self.result_file8 + + def getResult7(self): + return self.result_file7 diff --git a/src/epy_reader/tools/KindleUnpack/mobi_uncompress.py b/src/epy_reader/tools/KindleUnpack/mobi_uncompress.py new file mode 100644 index 0000000..c5fad85 --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_uncompress.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import PY2, bchr, lmap, bstr + +if PY2: + range = xrange + +import struct +# note: struct pack, unpack, unpack_from all require bytestring format +# data all the way up to at least python 2.7.5, python 3 okay with bytestring + + +class unpackException(Exception): + pass + +class UncompressedReader: + + def unpack(self, data): + return data + +class PalmdocReader: + + def unpack(self, i): + o, p = b'', 0 + while p < len(i): + # for python 3 must use slice since i[p] returns int while slice returns character + c = ord(i[p:p+1]) + p += 1 + if (c >= 1 and c <= 8): + o += i[p:p+c] + p += c + elif (c < 128): + o += bchr(c) + elif (c >= 192): + o += b' ' + bchr(c ^ 128) + else: + if p < len(i): + c = (c << 8) | ord(i[p:p+1]) + p += 1 + m = (c >> 3) & 0x07ff + n = (c & 7) + 3 + if (m > n): + o += o[-m:n-m] + else: + for _ in range(n): + # because of completely ass-backwards decision by python mainters for python 3 + # we must use slice for bytes as i[p] returns int while slice returns character + if m == 1: + o += o[-m:] + else: + o += o[-m:-m+1] + return o + +class HuffcdicReader: + q = struct.Struct(b'>Q').unpack_from + + def loadHuff(self, huff): + if huff[0:8] != b'HUFF\x00\x00\x00\x18': + raise unpackException('invalid huff header') + off1, off2 = struct.unpack_from(b'>LL', huff, 8) + + def dict1_unpack(v): + codelen, term, maxcode = v&0x1f, v&0x80, v>>8 + assert codelen != 0 + if codelen <= 8: + assert term + maxcode = ((maxcode + 1) << (32 - codelen)) - 1 + return (codelen, term, maxcode) + self.dict1 = lmap(dict1_unpack, struct.unpack_from(b'>256L', huff, off1)) + + dict2 = struct.unpack_from(b'>64L', huff, off2) + self.mincode, self.maxcode = (), () + for 
codelen, mincode in enumerate((0,) + dict2[0::2]): + self.mincode += (mincode << (32 - codelen), ) + for codelen, maxcode in enumerate((0,) + dict2[1::2]): + self.maxcode += (((maxcode + 1) << (32 - codelen)) - 1, ) + + self.dictionary = [] + + def loadCdic(self, cdic): + if cdic[0:8] != b'CDIC\x00\x00\x00\x10': + raise unpackException('invalid cdic header') + phrases, bits = struct.unpack_from(b'>LL', cdic, 8) + n = min(1<H').unpack_from + def getslice(off): + blen, = h(cdic, 16+off) + slice = cdic[18+off:18+off+(blen&0x7fff)] + return (slice, blen&0x8000) + self.dictionary += lmap(getslice, struct.unpack_from(bstr('>%dH' % n), cdic, 16)) + + def unpack(self, data): + q = HuffcdicReader.q + + bitsleft = len(data) * 8 + data += b"\x00\x00\x00\x00\x00\x00\x00\x00" + pos = 0 + x, = q(data, pos) + n = 32 + + s = b'' + while True: + if n <= 0: + pos += 4 + x, = q(data, pos) + n += 32 + code = (x >> n) & ((1 << 32) - 1) + + codelen, term, maxcode = self.dict1[code >> 24] + if not term: + while code < self.mincode[codelen]: + codelen += 1 + maxcode = self.maxcode[codelen] + + n -= codelen + bitsleft -= codelen + if bitsleft < 0: + break + + r = (maxcode - code) >> (32 - codelen) + slice, flag = self.dictionary[r] + if not flag: + self.dictionary[r] = None + slice = self.unpack(slice) + self.dictionary[r] = (slice, 1) + s += slice + return s diff --git a/src/epy_reader/tools/KindleUnpack/mobi_utils.py b/src/epy_reader/tools/KindleUnpack/mobi_utils.py new file mode 100644 index 0000000..6791e0d --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_utils.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab +# flake8: noqa + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import PY2, text_type, bchr, bord + +import binascii + +if PY2: + range = xrange + +from itertools import cycle + +def getLanguage(langID, sublangID): + mobilangdict = { + 54 : {0 : 'af'}, # Afrikaans + 28 : {0 : 'sq'}, # Albanian + 1 : {0 : 'ar' , 5 : 'ar-dz' , 15 : 'ar-bh' , 3 : 'ar-eg' , 2 : 'ar-iq', 11 : 'ar-jo' , 13 : 'ar-kw' , 12 : 'ar-lb' , 4: 'ar-ly', + 6 : 'ar-ma' , 8 : 'ar-om' , 16 : 'ar-qa' , 1 : 'ar-sa' , 10 : 'ar-sy' , 7 : 'ar-tn' , 14 : 'ar-ae' , 9 : 'ar-ye'}, + # Arabic, Arabic (Algeria), Arabic (Bahrain), Arabic (Egypt), Arabic + # (Iraq), Arabic (Jordan), Arabic (Kuwait), Arabic (Lebanon), Arabic + # (Libya), Arabic (Morocco), Arabic (Oman), Arabic (Qatar), Arabic + # (Saudi Arabia), Arabic (Syria), Arabic (Tunisia), Arabic (United Arab + # Emirates), Arabic (Yemen) + 43 : {0 : 'hy'}, # Armenian + 77 : {0 : 'as'}, # Assamese + 44 : {0 : 'az'}, # "Azeri (IANA: Azerbaijani) + 45 : {0 : 'eu'}, # Basque + 35 : {0 : 'be'}, # Belarusian + 69 : {0 : 'bn'}, # Bengali + 2 : {0 : 'bg'}, # Bulgarian + 3 : {0 : 'ca'}, # Catalan + 4 : {0 : 'zh' , 3 : 'zh-hk' , 2 : 'zh-cn' , 4 : 'zh-sg' , 1 : 'zh-tw'}, + # Chinese, Chinese (Hong Kong), Chinese (PRC), Chinese (Singapore), Chinese (Taiwan) + 26 : {0 : 'hr', 3 : 'sr'}, # Croatian, Serbian + 5 : {0 : 'cs'}, # Czech + 6 : {0 : 'da'}, # Danish + 19 : {0: 'nl', 1 : 'nl' , 2 : 'nl-be'}, # Dutch / Flemish, Dutch (Belgium) + 9 : {0: 'en', 1 : 'en' , 3 : 'en-au' , 40 : 'en-bz' , 4 : 'en-ca' , 6 : 'en-ie' , 8 : 'en-jm' , 5 : 'en-nz' , 13 : 'en-ph' , + 7 : 'en-za' , 11 : 'en-tt' , 2 : 'en-gb', 1 : 'en-us' , 12 : 'en-zw'}, + # English, English (Australia), English (Belize), English (Canada), + # English (Ireland), English (Jamaica), English (New Zealand), 
English + # (Philippines), English (South Africa), English (Trinidad), English + # (United Kingdom), English (United States), English (Zimbabwe) + 37 : {0 : 'et'}, # Estonian + 56 : {0 : 'fo'}, # Faroese + 41 : {0 : 'fa'}, # Farsi / Persian + 11 : {0 : 'fi'}, # Finnish + 12 : {0 : 'fr', 1 : 'fr' , 2 : 'fr-be' , 3 : 'fr-ca' , 5 : 'fr-lu' , 6 : 'fr-mc' , 4 : 'fr-ch'}, + # French, French (Belgium), French (Canada), French (Luxembourg), French (Monaco), French (Switzerland) + 55 : {0 : 'ka'}, # Georgian + 7 : {0 : 'de', 1 : 'de' , 3 : 'de-at' , 5 : 'de-li' , 4 : 'de-lu' , 2 : 'de-ch'}, + # German, German (Austria), German (Liechtenstein), German (Luxembourg), German (Switzerland) + 8 : {0 : 'el'}, # Greek, Modern (1453-) + 71 : {0 : 'gu'}, # Gujarati + 13 : {0 : 'he'}, # Hebrew (also code 'iw'?) + 57 : {0 : 'hi'}, # Hindi + 14 : {0 : 'hu'}, # Hungarian + 15 : {0 : 'is'}, # Icelandic + 33 : {0 : 'id'}, # Indonesian + 16 : {0 : 'it', 1 : 'it' , 2 : 'it-ch'}, # Italian, Italian (Switzerland) + 17 : {0 : 'ja'}, # Japanese + 75 : {0 : 'kn'}, # Kannada + 63 : {0 : 'kk'}, # Kazakh + 87 : {0 : 'x-kok'}, # Konkani (real language code is 'kok'?) + 18 : {0 : 'ko'}, # Korean + 38 : {0 : 'lv'}, # Latvian + 39 : {0 : 'lt'}, # Lithuanian + 47 : {0 : 'mk'}, # Macedonian + 62 : {0 : 'ms'}, # Malay + 76 : {0 : 'ml'}, # Malayalam + 58 : {0 : 'mt'}, # Maltese + 78 : {0 : 'mr'}, # Marathi + 97 : {0 : 'ne'}, # Nepali + 20 : {0 : 'no'}, # Norwegian + 72 : {0 : 'or'}, # Oriya + 21 : {0 : 'pl'}, # Polish + 22 : {0 : 'pt', 2 : 'pt' , 1 : 'pt-br'}, # Portuguese, Portuguese (Brazil) + 70 : {0 : 'pa'}, # Punjabi + 23 : {0 : 'rm'}, # "Rhaeto-Romanic" (IANA: Romansh) + 24 : {0 : 'ro'}, # Romanian + 25 : {0 : 'ru'}, # Russian + 59 : {0 : 'sz'}, # "Sami (Lappish)" (not an IANA language code) + # IANA code for "Northern Sami" is 'se' + # 'SZ' is the IANA region code for Swaziland + 79 : {0 : 'sa'}, # Sanskrit + 27 : {0 : 'sk'}, # Slovak + 36 : {0 : 'sl'}, # Slovenian + 46 : {0 : 'sb'}, # "Sorbian" (not an IANA language code) + # 'SB' is IANA region code for 'Solomon Islands' + # Lower Sorbian = 'dsb' + # Upper Sorbian = 'hsb' + # Sorbian Languages = 'wen' + 10 : {0 : 'es' , 4 : 'es' , 44 : 'es-ar' , 64 : 'es-bo' , 52 : 'es-cl' , 36 : 'es-co' , 20 : 'es-cr' , 28 : 'es-do' , + 48 : 'es-ec' , 68 : 'es-sv' , 16 : 'es-gt' , 72 : 'es-hn' , 8 : 'es-mx' , 76 : 'es-ni' , 24 : 'es-pa' , + 60 : 'es-py' , 40 : 'es-pe' , 80 : 'es-pr' , 56 : 'es-uy' , 32 : 'es-ve'}, + # Spanish, Spanish (Mobipocket bug?), Spanish (Argentina), Spanish + # (Bolivia), Spanish (Chile), Spanish (Colombia), Spanish (Costa Rica), + # Spanish (Dominican Republic), Spanish (Ecuador), Spanish (El + # Salvador), Spanish (Guatemala), Spanish (Honduras), Spanish (Mexico), + # Spanish (Nicaragua), Spanish (Panama), Spanish (Paraguay), Spanish + # (Peru), Spanish (Puerto Rico), Spanish (Uruguay), Spanish (Venezuela) + 48 : {0 : 'sx'}, # "Sutu" (not an IANA language code) + # "Sutu" is another name for "Southern Sotho"? 
+ # IANA code for "Southern Sotho" is 'st' + 65 : {0 : 'sw'}, # Swahili + 29 : {0 : 'sv' , 1 : 'sv' , 8 : 'sv-fi'}, # Swedish, Swedish (Finland) + 73 : {0 : 'ta'}, # Tamil + 68 : {0 : 'tt'}, # Tatar + 74 : {0 : 'te'}, # Telugu + 30 : {0 : 'th'}, # Thai + 49 : {0 : 'ts'}, # Tsonga + 50 : {0 : 'tn'}, # Tswana + 31 : {0 : 'tr'}, # Turkish + 34 : {0 : 'uk'}, # Ukrainian + 32 : {0 : 'ur'}, # Urdu + 67 : {0 : 'uz', 2 : 'uz'}, # Uzbek + 42 : {0 : 'vi'}, # Vietnamese + 52 : {0 : 'xh'}, # Xhosa + 53 : {0 : 'zu'}, # Zulu + } + lang = "en" + if langID in mobilangdict: + subdict = mobilangdict[langID] + lang = subdict[0] + if sublangID in subdict: + lang = subdict[sublangID] + return lang + + +def toHex(byteList): + return binascii.hexlify(byteList) + +# returns base32 bytestring +def toBase32(value, npad=4): + digits = b'0123456789ABCDEFGHIJKLMNOPQRSTUV' + num_string=b'' + current = value + while current != 0: + next, remainder = divmod(current, 32) + rem_string = digits[remainder:remainder+1] + num_string = rem_string + num_string + current=next + if num_string == b'': + num_string = b'0' + pad = npad - len(num_string) + if pad > 0: + num_string = b'0' * pad + num_string + return num_string + + +# converts base32 string to value +def fromBase32(str_num): + if isinstance(str_num, text_type): + str_num = str_num.encode('latin-1') + scalelst = [1,32,1024,32768,1048576,33554432,1073741824,34359738368] + value = 0 + j = 0 + n = len(str_num) + scale = 0 + for i in range(n): + c = str_num[n-i-1:n-i] + if c in b'0123456789': + v = ord(c) - ord(b'0') + else: + v = ord(c) - ord(b'A') + 10 + if j < len(scalelst): + scale = scalelst[j] + else: + scale = scale * 32 + j += 1 + if v != 0: + value = value + (v * scale) + return value + + +# note: if decode a bytestring using 'latin-1' (or any other 0-255 encoding) +# in place of ascii you will get a byte to half-word or integer +# one to one mapping of values from 0 - 255 + +def mangle_fonts(encryption_key, data): + if isinstance(encryption_key, text_type): + encryption_key = encryption_key.encode('latin-1') + crypt = data[:1024] + key = cycle(iter(map(bord, encryption_key))) + # encrypt = ''.join([chr(ord(x)^key.next()) for x in crypt]) + encrypt = b''.join([bchr(bord(x)^next(key)) for x in crypt]) + return encrypt + data[1024:] diff --git a/src/epy_reader/tools/KindleUnpack/mobiml2xhtml.py b/src/epy_reader/tools/KindleUnpack/mobiml2xhtml.py new file mode 100755 index 0000000..94fc671 --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobiml2xhtml.py @@ -0,0 +1,527 @@ +#! 
/usr/bin/python +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + + +# this program works in concert with the output from KindleUnpack + +''' +Convert from Mobi ML to XHTML +''' + +from __future__ import division, absolute_import, print_function + +import os +import sys +import re + +SPECIAL_HANDLING_TAGS = { + '?xml' : ('xmlheader', -1), + '!--' : ('comment', -3), + '!DOCTYPE' : ('doctype', -1), +} + +SPECIAL_HANDLING_TYPES = ['xmlheader', 'doctype', 'comment'] + +SELF_CLOSING_TAGS = ['br' , 'hr', 'input', 'img', 'image', 'meta', 'spacer', 'link', 'frame', 'base', 'col', 'reference'] + +class MobiMLConverter(object): + + PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE) + IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex') + + def __init__(self, filename): + self.base_css_rules = 'blockquote { margin: 0em 0em 0em 1.25em }\n' + self.base_css_rules += 'p { margin: 0em }\n' + self.base_css_rules += '.bold { font-weight: bold }\n' + self.base_css_rules += '.italic { font-style: italic }\n' + self.base_css_rules += '.mbp_pagebreak { page-break-after: always; margin: 0; display: block }\n' + self.tag_css_rules = {} + self.tag_css_rule_cnt = 0 + self.path = [] + self.filename = filename + self.wipml = open(self.filename, 'r').read() + self.pos = 0 + self.opfname = self.filename.rsplit('.',1)[0] + '.opf' + self.opos = 0 + self.meta = '' + self.cssname = os.path.join(os.path.dirname(self.filename),'styles.css') + self.current_font_size = 3 + self.font_history = [] + + def cleanup_html(self): + self.wipml = re.sub(r'
', '', self.wipml) + self.wipml = self.wipml.replace('\r\n', '\n') + self.wipml = self.wipml.replace('> <', '>\n<') + self.wipml = self.wipml.replace(']*>', '', self.wipml) + self.wipml = self.wipml.replace('

','
') + + def replace_page_breaks(self): + self.wipml = self.PAGE_BREAK_PAT.sub( + '
', + self.wipml) + + # parse leading text of ml and tag + def parseml(self): + p = self.pos + if p >= len(self.wipml): + return None + if self.wipml[p] != '<': + res = self.wipml.find('<',p) + if res == -1 : + res = len(self.wipml) + self.pos = res + return self.wipml[p:res], None + # handle comment as a special case to deal with multi-line comments + if self.wipml[p:p+4] == '',p+1) + if te != -1: + te = te+2 + else : + te = self.wipml.find('>',p+1) + ntb = self.wipml.find('<',p+1) + if ntb != -1 and ntb < te: + self.pos = ntb + return self.wipml[p:ntb], None + self.pos = te + 1 + return None, self.wipml[p:te+1] + + # parses string version of tag to identify its name, + # its type 'begin', 'end' or 'single', + # plus build a hashtable of its attributes + # code is written to handle the possiblity of very poor formating + def parsetag(self, s): + p = 1 + # get the tag name + tname = None + ttype = None + tattr = {} + while s[p:p+1] == ' ' : + p += 1 + if s[p:p+1] == '/': + ttype = 'end' + p += 1 + while s[p:p+1] == ' ' : + p += 1 + b = p + while s[p:p+1] not in ('>', '/', ' ', '"', "'", "\r", "\n") : + p += 1 + tname=s[b:p].lower() + if tname == '!doctype': + tname = '!DOCTYPE' + # special cases + if tname in SPECIAL_HANDLING_TAGS: + ttype, backstep = SPECIAL_HANDLING_TAGS[tname] + tattr['special'] = s[p:backstep] + if ttype is None: + # parse any attributes + while s.find('=',p) != -1 : + while s[p:p+1] == ' ' : + p += 1 + b = p + while s[p:p+1] != '=' : + p += 1 + aname = s[b:p].lower() + aname = aname.rstrip(' ') + p += 1 + while s[p:p+1] == ' ' : + p += 1 + if s[p:p+1] in ('"', "'") : + p = p + 1 + b = p + while s[p:p+1] not in ('"', "'") : + p += 1 + val = s[b:p] + p += 1 + else : + b = p + while s[p:p+1] not in ('>', '/', ' ') : + p += 1 + val = s[b:p] + tattr[aname] = val + # label beginning and single tags + if ttype is None: + ttype = 'begin' + if s.find(' /',p) >= 0: + ttype = 'single_ext' + elif s.find('/',p) >= 0: + ttype = 'single' + return ttype, tname, tattr + + # main routine to convert from mobi markup language to html + def processml(self): + + # are these really needed + html_done = False + head_done = False + body_done = False + + skip = False + + htmlstr = '' + self.replace_page_breaks() + self.cleanup_html() + + # now parse the cleaned up ml into standard xhtml + while True: + + r = self.parseml() + if not r: + break + + text, tag = r + + if text: + if not skip: + htmlstr += text + + if tag: + ttype, tname, tattr = self.parsetag(tag) + + # If we run into a DTD or xml declarations inside the body ... bail. + if tname in SPECIAL_HANDLING_TAGS and tname != 'comment' and body_done: + htmlstr += '\n' + break + + # make sure self-closing tags actually self-close + if ttype == 'begin' and tname in SELF_CLOSING_TAGS: + ttype = 'single' + + # make sure any end tags of self-closing tags are discarded + if ttype == 'end' and tname in SELF_CLOSING_TAGS: + continue + + # remove embedded guide and refernces from old mobis + if tname in ('guide', 'ncx', 'reference') and ttype in ('begin', 'single', 'single_ext'): + tname = 'removeme:{0}'.format(tname) + tattr = None + if tname in ('guide', 'ncx', 'reference', 'font', 'span') and ttype == 'end': + if self.path[-1] == 'removeme:{0}'.format(tname): + tname = 'removeme:{0}'.format(tname) + tattr = None + + # Get rid of font tags that only have a color attribute. 
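+ # (their face and size attributes are mapped to css in processtag(); a color-only font tag carries nothing we keep)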
+                if tname == 'font' and ttype in ('begin', 'single', 'single_ext'):
+                    if 'color' in tattr and len(tattr) == 1:
+                        tname = 'removeme:{0}'.format(tname)
+                        tattr = None
+
+                # Get rid of empty spans in the markup.
+                if tname == 'span' and ttype in ('begin', 'single', 'single_ext') and not len(tattr):
+                    tname = 'removeme:{0}'.format(tname)
+
+                # need to handle fonts outside of the normal methods
+                # so font tags won't be added to the self.path since we keep track
+                # of font tags separately with self.font_history
+                if tname == 'font' and ttype == 'begin':
+                    # check for nested font start tags
+                    if len(self.font_history) > 0:
+                        # inject a font end tag
+                        taginfo = ('end', 'font', None)
+                        htmlstr += self.processtag(taginfo)
+                    self.font_history.append((ttype, tname, tattr))
+                    # handle the current font start tag
+                    taginfo = (ttype, tname, tattr)
+                    htmlstr += self.processtag(taginfo)
+                    continue
+
+                # check for nested font tags and unnest them
+                if tname == 'font' and ttype == 'end':
+                    self.font_history.pop()
+                    # handle this font end tag
+                    taginfo = ('end', 'font', None)
+                    htmlstr += self.processtag(taginfo)
+                    # check if we were nested
+                    if len(self.font_history) > 0:
+                        # inject a copy of the most recent font start tag from history
+                        taginfo = self.font_history[-1]
+                        htmlstr += self.processtag(taginfo)
+                    continue
+
+                # keep track of nesting path
+                if ttype == 'begin':
+                    self.path.append(tname)
+                elif ttype == 'end':
+                    if tname != self.path[-1]:
+                        print('improper nesting: ', self.path, tname, ttype)
+                        if tname not in self.path:
+                            # handle case of end tag with no beginning by injecting empty begin tag
+                            taginfo = ('begin', tname, None)
+                            htmlstr += self.processtag(taginfo)
+                            print(" - fixed by injecting empty start tag ", tname)
+                            self.path.append(tname)
+                        elif len(self.path) > 1 and tname == self.path[-2]:
+                            # handle case of dangling missing end
+                            taginfo = ('end', self.path[-1], None)
+                            htmlstr += self.processtag(taginfo)
+                            print(" - fixed by injecting end tag ", self.path[-1])
+                            self.path.pop()
+                    self.path.pop()
+
+                if tname.startswith('removeme'):
+                    if ttype in ('begin', 'single', 'single_ext'):
+                        skip = True
+                    else:
+                        skip = False
+                else:
+                    taginfo = (ttype, tname, tattr)
+                    htmlstr += self.processtag(taginfo)
+
+                # handle potential issue of multiple html, head, and body sections
+                if tname == 'html' and ttype == 'begin' and not html_done:
+                    htmlstr += '\n'
+                    html_done = True
+
+                if tname == 'head' and ttype == 'begin' and not head_done:
+                    htmlstr += '\n'
+                    # also add in metadata and style link tags
+                    htmlstr += self.meta
+                    htmlstr += '<link href="styles.css" rel="stylesheet" type="text/css" />\n'
+                    head_done = True
+
+                if tname == 'body' and ttype == 'begin' and not body_done:
+                    htmlstr += '\n'
+                    body_done = True
+
+        # handle issue of possibly missing html, head, and body tags
+        # I have not seen this but the original did something like this so ...
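+        # (the wrappers are patched on inside-out: body first, then head,
+        # then html, so the pieces nest correctly)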
+        if not body_done:
+            htmlstr = '<body>\n' + htmlstr + '</body>\n'
+        if not head_done:
+            headstr = '<head>\n'
+            headstr += self.meta
+            headstr += '<link href="styles.css" rel="stylesheet" type="text/css" />\n'
+            headstr += '</head>\n'
+            htmlstr = headstr + htmlstr
+        if not html_done:
+            htmlstr = '<html>\n' + htmlstr + '</html>\n'
+
+        # finally add DOCTYPE info
+        htmlstr = '<?xml version="1.0" encoding="utf-8"?>\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n' + htmlstr
+
+        css = self.base_css_rules
+        for cls, rule in self.tag_css_rules.items():
+            css += '.%s { %s }\n' % (cls, rule)
+
+        return (htmlstr, css, self.cssname)
+
+    def ensure_unit(self, raw, unit='px'):
+        if re.search(r'\d+$', raw) is not None:
+            raw += unit
+        return raw
+
+    # flatten possibly modified tag back to string
+    def taginfo_tostring(self, taginfo):
+        (ttype, tname, tattr) = taginfo
+        if ttype is None or tname is None:
+            return ''
+        if ttype == 'end':
+            return '</%s>' % tname
+        if ttype in SPECIAL_HANDLING_TYPES and tattr is not None and 'special' in tattr:
+            info = tattr['special']
+            if ttype == 'comment':
+                return '<%s %s-->' % (tname, info)
+            else:
+                return '<%s %s>' % (tname, info)
+        res = []
+        res.append('<%s' % tname)
+        if tattr is not None:
+            for key in tattr:
+                res.append(' %s="%s"' % (key, tattr[key]))
+        if ttype == 'single':
+            res.append('/>')
+        elif ttype == 'single_ext':
+            res.append(' />')
+        else:
+            res.append('>')
+        return "".join(res)
+
+    # routine to convert mobi ml tag attributes to xhtml attributes and styles
+    def processtag(self, taginfo):
+        # Converting mobi font sizes to numerics
+        size_map = {
+            'xx-small': '1',
+            'x-small': '2',
+            'small': '3',
+            'medium': '4',
+            'large': '5',
+            'x-large': '6',
+            'xx-large': '7',
+        }
+
+        size_to_em_map = {
+            '1': '.65em',
+            '2': '.75em',
+            '3': '1em',
+            '4': '1.125em',
+            '5': '1.25em',
+            '6': '1.5em',
+            '7': '2em',
+        }
+
+        # current tag to work on
+        (ttype, tname, tattr) = taginfo
+        if not tattr:
+            tattr = {}
+
+        styles = []
+
+        if tname is None or tname.startswith('removeme'):
+            return ''
+
+        # have not seen an example of this yet so keep it here to be safe
+        # until this is better understood
+        if tname in ('country-region', 'place', 'placetype', 'placename',
+                'state', 'city', 'street', 'address', 'content'):
+            tname = 'div' if tname == 'content' else 'span'
+            for key in list(tattr):
+                tattr.pop(key)
+
+        # handle general case of style, height, width, bgcolor in any tag
+        if 'style' in tattr:
+            style = tattr.pop('style').strip()
+            if style:
+                styles.append(style)
+
+        if 'align' in tattr:
+            align = tattr.pop('align').strip()
+            if align:
+                if tname in ('table', 'td', 'tr'):
+                    pass
+                else:
+                    styles.append('text-align: %s' % align)
+
+        if 'height' in tattr:
+            height = tattr.pop('height').strip()
+            if height and '<' not in height and '>' not in height and re.search(r'\d+', height):
+                if tname in ('table', 'td', 'tr'):
+                    pass
+                elif tname == 'img':
+                    tattr['height'] = height
+                else:
+                    styles.append('margin-top: %s' % self.ensure_unit(height))
+
+        if 'width' in tattr:
+            width = tattr.pop('width').strip()
+            if width and re.search(r'\d+', width):
+                if tname in ('table', 'td', 'tr'):
+                    pass
+                elif tname == 'img':
+                    tattr['width'] = width
+                else:
+                    styles.append('text-indent: %s' % self.ensure_unit(width))
+                    if width.startswith('-'):
+                        styles.append('margin-left: %s' % self.ensure_unit(width[1:]))
+
+        if 'bgcolor' in tattr:
+            # no proprietary html allowed
+            if tname == 'div':
+                del tattr['bgcolor']
+
+        elif tname == 'font':
+            # Change font tags to span tags
+            tname = 'span'
+            if ttype in ('begin', 'single', 'single_ext'):
+                # move the face attribute to css font-family
+                if 'face' in tattr:
+                    face = tattr.pop('face').strip()
+                    styles.append('font-family: "%s"' % face)
+
+                # Monitor the constantly changing font sizes, change them to ems and move
+                # them to css. The following will work for 'flat' font tags, but nested font tags
+                # will cause things to go wonky. Need to revert to the parent font tag's size
+                # when a closing tag is encountered.
+                if 'size' in tattr:
+                    sz = tattr.pop('size').strip().lower()
+                    try:
+                        float(sz)
+                    except ValueError:
+                        if sz in size_map:
+                            sz = size_map[sz]
+                    else:
+                        if sz.startswith('-') or sz.startswith('+'):
+                            sz = self.current_font_size + float(sz)
+                            if sz > 7:
+                                sz = 7
+                            elif sz < 1:
+                                sz = 1
+                            sz = str(int(sz))
+                    styles.append('font-size: %s' % size_to_em_map[sz])
+                    self.current_font_size = int(sz)
+
+        elif tname == 'img':
+            for attr in ('width', 'height'):
+                if attr in tattr:
+                    val = tattr[attr]
+                    if val.lower().endswith('em'):
+                        try:
+                            nval = float(val[:-2])
+                            nval *= 16 * (168.451/72)  # Assume this was set using the Kindle profile
+                            tattr[attr] = "%dpx" % int(nval)
+                        except ValueError:
+                            del tattr[attr]
+                    elif val.lower().endswith('%'):
+                        del tattr[attr]
+
+        # convert the anchor tags
+        if 'filepos-id' in tattr:
+            tattr['id'] = tattr.pop('filepos-id')
+            if 'name' in tattr and tattr['name'] != tattr['id']:
+                tattr['name'] = tattr['id']
+
+        if 'filepos' in tattr:
+            filepos = tattr.pop('filepos')
+            try:
+                tattr['href'] = "#filepos%d" % int(filepos)
+            except ValueError:
+                pass
+
+        if styles:
+            ncls = None
+            rule = '; '.join(styles)
+            for sel, srule in self.tag_css_rules.items():
+                if srule == rule:
+                    ncls = sel
+                    break
+            if ncls is None:
+                self.tag_css_rule_cnt += 1
+                ncls = 'rule_%d' % self.tag_css_rule_cnt
+                self.tag_css_rules[ncls] = rule
+            cls = tattr.get('class', '')
+            cls = cls + (' ' if cls else '') + ncls
+            tattr['class'] = cls
+
+        # convert updated tag back to string representation
+        if len(tattr) == 0:
+            tattr = None
+        taginfo = (ttype, tname, tattr)
+        return self.taginfo_tostring(taginfo)
+
+
+''' main only left in for testing outside of plugin '''
+
+def main(argv=sys.argv):
+    if len(argv) != 2:
+        return 1
+    else:
+        infile = argv[1]
+
+    try:
+        print('Converting Mobi Markup Language to XHTML')
+        mlc = MobiMLConverter(infile)
+        print('Processing ...')
+        htmlstr, css, cssname = mlc.processml()
+        outname = infile.rsplit('.',1)[0] + '_converted.html'
+        open(outname, 'w').write(htmlstr)
+        open(cssname, 'w').write(css)
+        print('Completed')
+        print('XHTML version of book can be found at: ' + outname)
+
+    except ValueError as e:
+        print("Error: %s" % e)
+        return 1
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/src/epy_reader/tools/KindleUnpack/unipath.py b/src/epy_reader/tools/KindleUnpack/unipath.py
new file mode 100755
index 0000000..2416279
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/unipath.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+# Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without modification,
+# are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this list of
+# conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice, this list
+# of conditions and the following disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
+# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+from .compatibility_utils import PY2, text_type, binary_type
+
+import sys
+import os
+
+# utility routines to convert all paths to be full unicode
+
+# Under Python 2, if a bytestring, try to convert it to unicode using sys.getfilesystemencoding
+# Under Python 3, if bytes, try to convert it to unicode using os.fsdecode()
+
+# Mac OS X and Windows will happily support full unicode paths
+# Linux can support full unicode paths but allows arbitrary byte paths
+# which may be inconsistent with unicode
+
+fsencoding = sys.getfilesystemencoding()
+
+def pathof(s, enc=fsencoding):
+    if s is None:
+        return None
+    if isinstance(s, text_type):
+        return s
+    if isinstance(s, binary_type):
+        try:
+            return s.decode(enc)
+        except UnicodeDecodeError:
+            pass
+    return s
+
+def exists(s):
+    return os.path.exists(pathof(s))
+
+def isfile(s):
+    return os.path.isfile(pathof(s))
+
+def isdir(s):
+    return os.path.isdir(pathof(s))
+
+def mkdir(s):
+    return os.mkdir(pathof(s))
+
+def listdir(s):
+    rv = []
+    for file in os.listdir(pathof(s)):
+        rv.append(pathof(file))
+    return rv
+
+def getcwd():
+    if PY2:
+        return os.getcwdu()
+    return os.getcwd()
+
+def walk(top):
+    top = pathof(top)
+    rv = []
+    for base, dnames, names in os.walk(top):
+        base = pathof(base)
+        for name in names:
+            name = pathof(name)
+            rv.append(relpath(os.path.join(base, name), top))
+    return rv
+
+def relpath(path, start=None):
+    return os.path.relpath(pathof(path), pathof(start))
+
+def abspath(path):
+    return os.path.abspath(pathof(path))
diff --git a/src/epy_reader/tools/KindleUnpack/unpack_structure.py b/src/epy_reader/tools/KindleUnpack/unpack_structure.py
new file mode 100644
index 0000000..2e66eb8
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/unpack_structure.py
@@ -0,0 +1,167 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+from .compatibility_utils import text_type
+
+from . import unipath
+from .unipath import pathof
+
+DUMP = False
+""" Set to True to dump all possible information. """
+
+import os
+
+import re
+# note: re requires the pattern to be the exact same type as the data to be
+# searched in python3, but u"" is not allowed for the pattern itself, only b""
+
+import zipfile
+import binascii
+from .mobi_utils import mangle_fonts
+
+class unpackException(Exception):
+    pass
+
+class ZipInfo(zipfile.ZipInfo):
+
+    def __init__(self, *args, **kwargs):
+        compress_type = kwargs.pop('compress_type', None)
+        super(ZipInfo, self).__init__(*args, **kwargs)
+        if compress_type is not None:
+            self.compress_type = compress_type
+
+class fileNames:
+
+    def __init__(self, infile, outdir):
+        self.infile = infile
+        self.outdir = outdir
+        if not unipath.exists(self.outdir):
+            unipath.mkdir(self.outdir)
+        self.mobi7dir = os.path.join(self.outdir,'mobi7')
+        if not unipath.exists(self.mobi7dir):
+            unipath.mkdir(self.mobi7dir)
+        self.imgdir = os.path.join(self.mobi7dir, 'Images')
+        if not unipath.exists(self.imgdir):
+            unipath.mkdir(self.imgdir)
+        self.hdimgdir = os.path.join(self.outdir,'HDImages')
+        if not unipath.exists(self.hdimgdir):
+            unipath.mkdir(self.hdimgdir)
+        self.outbase = os.path.join(self.outdir, os.path.splitext(os.path.split(infile)[1])[0])
+
+    def getInputFileBasename(self):
+        return os.path.splitext(os.path.basename(self.infile))[0]
+
+    def makeK8Struct(self):
+        self.k8dir = os.path.join(self.outdir,'mobi8')
+        if not unipath.exists(self.k8dir):
+            unipath.mkdir(self.k8dir)
+        self.k8metainf = os.path.join(self.k8dir,'META-INF')
+        if not unipath.exists(self.k8metainf):
+            unipath.mkdir(self.k8metainf)
+        self.k8oebps = os.path.join(self.k8dir,'OEBPS')
+        if not unipath.exists(self.k8oebps):
+            unipath.mkdir(self.k8oebps)
+        self.k8images = os.path.join(self.k8oebps,'Images')
+        if not unipath.exists(self.k8images):
+            unipath.mkdir(self.k8images)
+        self.k8fonts = os.path.join(self.k8oebps,'Fonts')
+        if not unipath.exists(self.k8fonts):
+            unipath.mkdir(self.k8fonts)
+        self.k8styles = os.path.join(self.k8oebps,'Styles')
+        if not unipath.exists(self.k8styles):
+            unipath.mkdir(self.k8styles)
+        self.k8text = os.path.join(self.k8oebps,'Text')
+        if not unipath.exists(self.k8text):
+            unipath.mkdir(self.k8text)
+
+    # recursive zip creation support routine
+    def zipUpDir(self, myzip, tdir, localname):
+        currentdir = tdir
+        if localname != "":
+            currentdir = os.path.join(currentdir,localname)
+        list = unipath.listdir(currentdir)
+        for file in list:
+            afilename = file
+            localfilePath = os.path.join(localname, afilename)
+            realfilePath = os.path.join(currentdir,file)
+            if unipath.isfile(realfilePath):
+                myzip.write(pathof(realfilePath), pathof(localfilePath), zipfile.ZIP_DEFLATED)
+            elif unipath.isdir(realfilePath):
+                self.zipUpDir(myzip, tdir, localfilePath)
+
+    def makeEPUB(self, usedmap, obfuscate_data, uid):
+        bname = os.path.join(self.k8dir, self.getInputFileBasename() + '.epub')
+        # Create an encryption key for Adobe font obfuscation
+        # based on the epub's uid
+        if isinstance(uid,text_type):
+            uid = uid.encode('ascii')
+        if obfuscate_data:
+            key = re.sub(br'[^a-fA-F0-9]', b'', uid)
+            key = binascii.unhexlify((key + key)[:32])
+
+        # copy over all images and fonts that are actually used in the ebook
+        # and remove all font files from mobi7 since fonts are not supported there
+        imgnames = unipath.listdir(self.imgdir)
+        for name in imgnames:
+            if usedmap.get(name,'not used') == 'used':
+                filein = os.path.join(self.imgdir,name)
+                if name.endswith(".ttf"):
+                    fileout = os.path.join(self.k8fonts,name)
+                elif name.endswith(".otf"):
+                    fileout = os.path.join(self.k8fonts,name)
+                elif name.endswith(".failed"):
+                    fileout = os.path.join(self.k8fonts,name)
+                else:
+                    fileout = os.path.join(self.k8images,name)
+                data = b''
+                with open(pathof(filein),'rb') as f:
+                    data = f.read()
+                if obfuscate_data:
+                    if name in obfuscate_data:
+                        data = mangle_fonts(key, data)
+                open(pathof(fileout),'wb').write(data)
+                if name.endswith(".ttf") or name.endswith(".otf"):
+                    os.remove(pathof(filein))
+
+        # opf file name hard coded to "content.opf"
+        container = '<?xml version="1.0" encoding="UTF-8"?>\n'
+        container += '<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">\n'
+        container += '    <rootfiles>\n'
+        container += '<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>'
+        container += '    </rootfiles>\n</container>\n'
+        fileout = os.path.join(self.k8metainf,'container.xml')
+        with open(pathof(fileout),'wb') as f:
+            f.write(container.encode('utf-8'))
+
+        if obfuscate_data:
+            encryption = '<encryption xmlns="urn:oasis:names:tc:opendocument:xmlns:container">\n'
+            for font in obfuscate_data:
+                encryption += '  <EncryptedData xmlns="http://www.w3.org/2001/04/xmlenc#">\n'
+                encryption += '    <EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>\n'
+                encryption += '    <CipherData>\n'
+                encryption += '      <CipherReference URI="OEBPS/Fonts/' + font + '"/>\n'
+                encryption += '    </CipherData>\n'
+                encryption += '  </EncryptedData>\n'
+            encryption += '</encryption>\n'
+            fileout = os.path.join(self.k8metainf,'encryption.xml')
+            with open(pathof(fileout),'wb') as f:
+                f.write(encryption.encode('utf-8'))
+
+        # ready to build epub
+        self.outzip = zipfile.ZipFile(pathof(bname), 'w')
+
+        # add the mimetype file uncompressed
+        mimetype = b'application/epub+zip'
+        fileout = os.path.join(self.k8dir,'mimetype')
+        with open(pathof(fileout),'wb') as f:
+            f.write(mimetype)
+        nzinfo = ZipInfo('mimetype', compress_type=zipfile.ZIP_STORED)
+        nzinfo.external_attr = 0o600 << 16  # make this a normal file
+        self.outzip.writestr(nzinfo, mimetype)
+        self.zipUpDir(self.outzip,self.k8dir,'META-INF')
+        self.zipUpDir(self.outzip,self.k8dir,'OEBPS')
+        self.outzip.close()
diff --git a/src/epy_reader/tools/__init__.py b/src/epy_reader/tools/__init__.py
new file mode 100644
index 0000000..d97cba1
--- /dev/null
+++ b/src/epy_reader/tools/__init__.py
@@ -0,0 +1,3 @@
+__all__ = ["unpack_kindle_book"]
+
+from epy_reader.tools.KindleUnpack.kindleunpack import unpackBook as unpack_kindle_book
diff --git a/src/epy_reader/utils.py b/src/epy_reader/utils.py
new file mode 100644
index 0000000..5bba7f6
--- /dev/null
+++ b/src/epy_reader/utils.py
@@ -0,0 +1,377 @@
+import curses
+import os
+import re
+import sys
+import textwrap
+from functools import wraps
+from typing import List, Mapping, Optional, Sequence, Tuple, Union
+
+from epy_reader.ebooks import URL, Azw, Ebook, Epub, FictionBook, Mobi
+from epy_reader.lib import is_url, tuple_subtract
+from epy_reader.models import Key, LettersCount, NoUpdate, ReadingState, TextStructure, TocEntry
+from epy_reader.parser import parse_html
+from epy_reader.speakers import SpeakerBaseModel, SpeakerMimic, SpeakerPico
+
+
+def get_ebook_obj(filepath: str) -> Ebook:
+    file_ext = os.path.splitext(filepath)[1].lower()
+    if is_url(filepath):
+        return URL(filepath)
+    elif file_ext in {".epub", ".epub3"}:
+        return Epub(filepath)
+    elif file_ext == ".fb2":
+        return FictionBook(filepath)
+    elif file_ext == ".mobi":
+        return Mobi(filepath)
+    elif file_ext in {".azw", ".azw3"}:
+        return Azw(filepath)
+    else:
+        sys.exit("ERROR: Format not supported. (Supported: epub, epub3, fb2, mobi, azw, azw3)")
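+
+# Usage sketch: get_ebook_obj("novel.epub") returns an Epub instance, while
+# anything matching is_url() returns a URL instance regardless of extension.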
+
+
+def safe_curs_set(state: int) -> None:
+    try:
+        curses.curs_set(state)
+    except curses.error:
+        return
+
+
+def find_current_content_index(
+    toc_entries: Tuple[TocEntry, ...], toc_secid: Mapping[str, int], index: int, y: int
+) -> int:
+    ntoc = 0
+    for n, toc_entry in enumerate(toc_entries):
+        if toc_entry.content_index <= index:
+            if y >= toc_secid.get(toc_entry.section, 0):  # type: ignore
+                ntoc = n
+    return ntoc
+
+
+def pgup(current_row: int, window_height: int, counter: int = 1) -> int:
+    if current_row >= (window_height) * counter:
+        return current_row - (window_height) * counter
+    else:
+        return 0
+
+
+def pgdn(current_row: int, total_lines: int, window_height: int, counter: int = 1) -> int:
+    if current_row + (window_height * counter) <= total_lines - window_height:
+        return current_row + (window_height * counter)
+    else:
+        current_row = total_lines - window_height
+        if current_row < 0:
+            return 0
+        return current_row
+
+
+def pgend(total_lines: int, window_height: int) -> int:
+    if total_lines - window_height >= 0:
+        return total_lines - window_height
+    else:
+        return 0
+
+
+def choice_win(allowdel=False):
+    """
+    Conjure an options window by wrapping a window function
+    whose return type is a tuple in the form of
+    (title, list_to_choose, initial_active_index, windows_key_to_toggle)
+    and return a tuple of (returned_key, chosen_index, chosen_index_to_delete).
+    """
+
+    def inner_f(listgen):
+        @wraps(listgen)
+        def wrapper(self, *args, **kwargs):
+            rows, cols = self.screen.getmaxyx()
+            hi, wi = rows - 4, cols - 4
+            Y, X = 2, 2
+            chwin = curses.newwin(hi, wi, Y, X)
+            if self.is_color_supported:
+                chwin.bkgd(self.screen.getbkgd())
+
+            title, ch_list, index, key = listgen(self, *args, **kwargs)
+
+            if len(title) > cols - 8:
+                title = title[: cols - 8]
+
+            chwin.box()
+            chwin.keypad(True)
+            chwin.addstr(1, 2, title)
+            chwin.addstr(2, 2, "-" * len(title))
+            if allowdel:
+                chwin.addstr(3, 2, "HINT: Press 'd' to delete.")
+            key_chwin = 0
+
+            totlines = len(ch_list)
+            chwin.refresh()
+            pad = curses.newpad(totlines, wi - 2)
+            if self.is_color_supported:
+                pad.bkgd(self.screen.getbkgd())
+
+            pad.keypad(True)
+
+            padhi = rows - 5 - Y - 4 + 1 - (1 if allowdel else 0)
+            # padhi = rows - 5 - Y - 4 + 1 - 1
+            y = 0
+            if index in range(padhi // 2, totlines - padhi // 2):
+                y = index - padhi // 2 + 1
+            span = []
+
+            for n, i in enumerate(ch_list):
+                # strs = " " + str(n+1).rjust(d) + " " + i[0]
+                # remove newline from choice entries
+                # mostly happens in FictionBook (.fb2) format
+                strs = "  " + i.replace("\n", " ")
+                strs = strs[0 : wi - 3]
+                pad.addstr(n, 0, strs)
+                span.append(len(strs))
+
+            countstring = ""
+            while key_chwin not in self.keymap.Quit + key:
+                if countstring == "":
+                    count = 1
+                else:
+                    count = int(countstring)
+                if key_chwin in tuple(Key(i) for i in range(48, 58)):  # i.e., k is a numeral
+                    countstring = countstring + key_chwin.char
+                else:
+                    if key_chwin in self.keymap.ScrollUp + self.keymap.PageUp:
+                        index -= count
+                        if index < 0:
+                            index = 0
+                    elif key_chwin in self.keymap.ScrollDown or key_chwin in self.keymap.PageDown:
+                        index += count
+                        if index + 1 >= totlines:
+                            index = totlines - 1
+                    elif key_chwin in self.keymap.Follow:
+                        chwin.clear()
+                        chwin.refresh()
+                        return None, index, None
+                    elif key_chwin in self.keymap.BeginningOfCh:
+                        index = 0
+                    elif key_chwin in self.keymap.EndOfCh:
+                        index = totlines - 1
+                    elif key_chwin == Key("D") and allowdel:
+                        return None, (0 if index == 0 else index - 1), index
+                        # chwin.redrawwin()
+                        # chwin.refresh()
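+                    # 'd' asks for confirmation first, whereas 'D' above
+                    # deletes immediately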
+                    elif key_chwin == Key("d") and allowdel:
+                        resk, resp, _ = self.show_win_options(
+                            "Delete '{}'?".format(ch_list[index]),
+                            ["(Y)es", "(N)o"],
+                            0,
+                            (Key("n"),),
+                        )
+                        if resk is not None:
+                            key_chwin = resk
+                            continue
+                        elif resp == 0:
+                            return None, (0 if index == 0 else index - 1), index
+                        chwin.redrawwin()
+                        chwin.refresh()
+                    elif key_chwin in {Key(i) for i in ["Y", "y", "N", "n"]} and ch_list == [
+                        "(Y)es",
+                        "(N)o",
+                    ]:
+                        if key_chwin in {Key("Y"), Key("y")}:
+                            return None, 0, None
+                        else:
+                            return None, 1, None
+                    elif key_chwin in tuple_subtract(self._win_keys, key):
+                        chwin.clear()
+                        chwin.refresh()
+                        return key_chwin, index, None
+                    countstring = ""
+
+                while index not in range(y, y + padhi):
+                    if index < y:
+                        y -= 1
+                    else:
+                        y += 1
+
+                for n in range(totlines):
+                    att = curses.A_REVERSE if index == n else curses.A_NORMAL
+                    pre = ">>" if index == n else "  "
+                    pad.addstr(n, 0, pre)
+                    pad.chgat(n, 0, span[n], pad.getbkgd() | att)
+
+                pad.refresh(y, 0, Y + 4 + (1 if allowdel else 0), X + 4, rows - 5, cols - 6)
+                # pad.refresh(y, 0, Y+5, X+4, rows - 5, cols - 6)
+                key_chwin = Key(chwin.getch())
+                if key_chwin == Key(curses.KEY_MOUSE):
+                    mouse_event = curses.getmouse()
+                    if mouse_event[4] == curses.BUTTON4_PRESSED:
+                        key_chwin = self.keymap.ScrollUp[0]
+                    elif mouse_event[4] == 2097152:
+                        key_chwin = self.keymap.ScrollDown[0]
+                    elif mouse_event[4] == curses.BUTTON1_DOUBLE_CLICKED:
+                        if (
+                            mouse_event[2] >= 6
+                            and mouse_event[2] < rows - 4
+                            and mouse_event[2] < 6 + totlines
+                        ):
+                            index = mouse_event[2] - 6 + y
+                            key_chwin = self.keymap.Follow[0]
+                    elif (
+                        mouse_event[4] == curses.BUTTON1_CLICKED
+                        and mouse_event[2] >= 6
+                        and mouse_event[2] < rows - 4
+                        and mouse_event[2] < 6 + totlines
+                    ):
+                        if index == mouse_event[2] - 6 + y:
+                            key_chwin = self.keymap.Follow[0]
+                            continue
+                        index = mouse_event[2] - 6 + y
+                    elif mouse_event[4] == curses.BUTTON3_CLICKED:
+                        key_chwin = self.keymap.Quit[0]
+
+            chwin.clear()
+            chwin.refresh()
+            return None, None, None
+
+        return wrapper
+
+    return inner_f
+
+
+def text_win(textfunc):
+    @wraps(textfunc)
+    def wrapper(self, *args, **kwargs) -> Union[NoUpdate, Key]:
+        rows, cols = self.screen.getmaxyx()
+        hi, wi = rows - 4, cols - 4
+        Y, X = 2, 2
+        textw = curses.newwin(hi, wi, Y, X)
+        if self.is_color_supported:
+            textw.bkgd(self.screen.getbkgd())
+
+        title, raw_texts, key = textfunc(self, *args, **kwargs)
+
+        if len(title) > cols - 8:
+            title = title[: cols - 8]
+
+        texts = []
+        for i in raw_texts.splitlines():
+            texts += textwrap.wrap(i, wi - 6, drop_whitespace=False)
+
+        textw.box()
+        textw.keypad(True)
+        textw.addstr(1, 2, title)
+        textw.addstr(2, 2, "-" * len(title))
+        key_textw: Union[NoUpdate, Key] = NoUpdate()
+
+        totlines = len(texts)
+
+        pad = curses.newpad(totlines, wi - 2)
+        if self.is_color_supported:
+            pad.bkgd(self.screen.getbkgd())
+
+        pad.keypad(True)
+        for n, i in enumerate(texts):
+            pad.addstr(n, 0, i)
+        y = 0
+        textw.refresh()
+        pad.refresh(y, 0, Y + 4, X + 4, rows - 5, cols - 6)
+        padhi = rows - 8 - Y
+
+        while key_textw not in self.keymap.Quit + key:
+            if key_textw in self.keymap.ScrollUp and y > 0:
+                y -= 1
+            elif key_textw in self.keymap.ScrollDown and y < totlines - hi + 6:
+                y += 1
+            elif key_textw in self.keymap.PageUp:
+                y = pgup(y, padhi)
+            elif key_textw in self.keymap.PageDown:
+                y = pgdn(y, totlines, padhi)
+            elif key_textw in self.keymap.BeginningOfCh:
+                y = 0
+            elif key_textw in self.keymap.EndOfCh:
+                y = pgend(totlines, padhi)
+            elif key_textw in tuple_subtract(self._win_keys, key):
+                textw.clear()
+                textw.refresh()
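+                # any other registered key is handed back to the caller so the
+                # main reader loop can process it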
+                return key_textw
+            pad.refresh(y, 0, 6, 5, rows - 5, cols - 5)
+            key_textw = Key(textw.getch())
+
+        textw.clear()
+        textw.refresh()
+        return NoUpdate()
+
+    return wrapper
+
+
+def merge_text_structures(
+    text_structure_first: TextStructure, text_structure_second: TextStructure
+) -> TextStructure:
+    return TextStructure(
+        text_lines=text_structure_first.text_lines + text_structure_second.text_lines,
+        image_maps={**text_structure_first.image_maps, **text_structure_second.image_maps},
+        section_rows={**text_structure_first.section_rows, **text_structure_second.section_rows},
+        formatting=text_structure_first.formatting + text_structure_second.formatting,
+    )
+
+
+def construct_relative_reading_state(
+    abs_reading_state: ReadingState, totlines_per_content: Sequence[int]
+) -> ReadingState:
+    """
+    :param abs_reading_state: ReadingState absolute to whole book when Setting.Seamless==True
+    :param totlines_per_content: sequence of total lines per book content
+    :return: new ReadingState relative to per content of the book
+    """
+    index = 0
+    cumulative_contents_lines = 0
+    all_contents_lines = sum(totlines_per_content)
+    # for n, content_lines in enumerate(totlines_per_content):
+    #     cumulative_contents_lines += content_lines
+    #     if cumulative_contents_lines > abs_reading_state.row:
+    #         return
+    while True:
+        content_lines = totlines_per_content[index]
+        cumulative_contents_lines += content_lines
+        if cumulative_contents_lines > abs_reading_state.row:
+            break
+        index += 1
+
+    return ReadingState(
+        content_index=index,
+        textwidth=abs_reading_state.textwidth,
+        row=abs_reading_state.row - cumulative_contents_lines + content_lines,
+        rel_pctg=abs_reading_state.rel_pctg
+        - ((cumulative_contents_lines - content_lines) / all_contents_lines)
+        if abs_reading_state.rel_pctg
+        else None,
+        section=abs_reading_state.section,
+    )
+
+
+def count_letters(ebook: Ebook) -> LettersCount:
+    per_content_counts: List[int] = []
+    cumulative_counts: List[int] = []
+    # assert isinstance(ebook.contents, tuple)
+    for i in ebook.contents:
+        content = ebook.get_raw_text(i)
+        src_lines = parse_html(content)
+        assert isinstance(src_lines, tuple)
+        cumulative_counts.append(sum(per_content_counts))
+        per_content_counts.append(sum([len(re.sub(r"\s", "", j)) for j in src_lines]))
+
+    return LettersCount(all=sum(per_content_counts), cumulative=tuple(cumulative_counts))
+
+
+def count_letters_parallel(ebook: Ebook, child_conn) -> None:
+    child_conn.send(count_letters(ebook))
+    child_conn.close()
+
+
+def construct_speaker(
+    preferred: Optional[str] = None, args: List[str] = []
+) -> Optional[SpeakerBaseModel]:
+    available_speakers = [SpeakerMimic, SpeakerPico]
+    sorted_speakers = (
+        sorted(available_speakers, key=lambda x: int(x.cmd == preferred), reverse=True)
+        if preferred
+        else available_speakers
+    )
+    speaker = next((speaker for speaker in sorted_speakers if speaker.available), None)
+    return speaker(args) if speaker else None
--
cgit