| field | value | date |
|---|---|---|
| author | Benawi Adha <benawiadha@gmail.com> | 2022-10-02 21:22:38 +0700 |
| committer | Benawi Adha <benawiadha@gmail.com> | 2022-10-02 21:22:38 +0700 |
| commit | 258c30d2e088cd4ab091a53794da3f93af79915d (patch) | |
| tree | f49340bf565deb20c730358af74a01bcc231de53 /src | |
| parent | d43533f01d9d5baf5f78b71f832641382bd5962a (diff) | |
| download | epy-258c30d2e088cd4ab091a53794da3f93af79915d.tar.gz | |
Major refactor: break down epy.py script
into a package project structure for easier
development
Squashed commit of the following:
commit 01309b961a4ab32394bff0d90949b57435dfda47
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sun Oct 2 21:15:04 2022 +0700
Fix missing objects
commit aab2e773c30b255c81b1250b3b20967d5da40338
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sun Oct 2 21:09:31 2022 +0700
Update README.md
commit d4e98926bcd9b00ce0410ad71249d24e6315abc5
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sun Oct 2 21:07:28 2022 +0700
Add keywords in pyproject.toml
commit 432055af8245560a3ff2e046aef0b4e87da44930
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sun Oct 2 21:04:34 2022 +0700
Bump version and deprecate setup.py
commit 51dd15aab8f8ff5996f822f8378e813f0b9fb80d
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sun Oct 2 20:56:38 2022 +0700
Formatting
commit 81fb35e3b6fa0e27d79ef1da77202ed81eb99500
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sun Oct 2 20:55:08 2022 +0700
Fix speakers module
commit 3b852e7c59b38d5a28520038e35f50a95270d2f1
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 20:52:46 2022 +0700
Fix circular import
commit 061e8a2649dabacd28a9e2f972559475316c654c
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 20:39:27 2022 +0700
Run formatting
commit abc2d0ab156992c63dc04745d14a69679a60accb
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 20:39:00 2022 +0700
Update isort and black config in pyproject
commit 5dc2e41bab5b997bd719bdc1561eb51ba0c17a83
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 20:31:00 2022 +0700
Add app Config
commit ed485a2ea8281585bf86dc5772f0c6dd9c803cc4
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 20:23:02 2022 +0700
Update debugpy script
commit 68b0553dd4d63eb4b847132c68ea4018587fa8ec
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 20:14:11 2022 +0700
Connect reader to main script
commit 63c3dd176f18a784a4ed2e88aa72b13d1c2b0990
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 20:11:17 2022 +0700
Implement reader
commit ce5eec8fb4e1db3870a16a07541365cd777d6c4c
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 19:29:49 2022 +0700
Fix script in pyproject.toml
commit 941e8e49f1593731fb582d92084206772b3f0442
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 19:28:39 2022 +0700
Rename modules
commit 5a3e7f766aee774c09b3b5336f3a2968e9cb1d0c
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 19:28:20 2022 +0700
Rename tool method
commit 3c0503ff475cb7eff8b12d3be0bda7a38efe1072
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 19:27:03 2022 +0700
Add ebooks lib
commit b5f71c3296a7d6f36454f6e1cbe84e15a45092ee
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 17:25:11 2022 +0700
Initial reorganization
Diffstat (limited to 'src')
45 files changed, 11365 insertions, 0 deletions
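For orientation before the file-by-file diff, this is the shape of the new package (a sketch assembled from the files added below, not the full 45-file tree):

```
src/epy_reader/
├── __init__.py     version/author metadata
├── __main__.py     console entry point
├── board.py        InfiniBoard, the curses drawing wrapper
├── cli.py          argument parsing and reading-history lookup
├── config.py       JSON configuration loading/merging
├── ebooks/         format backends: Epub, Mobi, Azw, FictionBook, URL
├── lib.py          small text/path helpers
├── models.py       shared dataclasses and types
├── parser.py       HTML-to-lines parser
└── reader.py       the main curses Reader
```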
diff --git a/src/epy_reader/__init__.py b/src/epy_reader/__init__.py
new file mode 100644
index 0000000..97e99a2
--- /dev/null
+++ b/src/epy_reader/__init__.py
@@ -0,0 +1,5 @@
+__version__ = "2022.10.2"
+__license__ = "GPL-3.0"
+__author__ = "Benawi Adha"
+__email__ = "benawiadha@gmail.com"
+__url__ = "https://github.com/wustho/epy"
diff --git a/src/epy_reader/__main__.py b/src/epy_reader/__main__.py
new file mode 100644
index 0000000..ce7d1b2
--- /dev/null
+++ b/src/epy_reader/__main__.py
@@ -0,0 +1,23 @@
+import curses
+import multiprocessing
+import sys
+
+import epy_reader.cli as cli
+import epy_reader.reader as reader
+
+
+def main():
+    # On Windows, calling this method is necessary
+    # On Linux/OSX, this method does nothing
+    multiprocessing.freeze_support()
+    filepath, dump_only = cli.find_file()
+    if dump_only:
+        sys.exit(cli.dump_ebook_content(filepath))
+
+    while True:
+        filepath = curses.wrapper(reader.start_reading, filepath)
+
+
+# https://setuptools.pypa.io/en/latest/userguide/entry_point.html
+if __name__ == "__main__":
+    main()
diff --git a/src/epy_reader/board.py b/src/epy_reader/board.py
new file mode 100644
index 0000000..0562d3f
--- /dev/null
+++ b/src/epy_reader/board.py
@@ -0,0 +1,148 @@
+import curses
+import re
+from typing import Optional, Tuple, Union
+
+from epy_reader.models import Direction, InlineStyle, Key, NoUpdate
+from epy_reader.settings import DoubleSpreadPadding
+
+
+class InfiniBoard:
+    """
+    Wrapper for a curses screen to render infinite texts.
+    The idea is that instead of pre-rendering all the text before
+    reading, this renders only the part of the text needed for
+    the page currently on screen.
+
+    It only draws text/strings on the curses screen, without
+    calling .clear() or .refresh(), to optimize performance.
+    """
+
+    def __init__(
+        self,
+        screen,
+        text: Tuple[str, ...],
+        textwidth: int = 80,
+        default_style: Tuple[InlineStyle, ...] = tuple(),
+        spread: int = 1,
+    ):
+        self.screen = screen
+        self.screen_rows, self.screen_cols = self.screen.getmaxyx()
+        self.textwidth = textwidth
+        self.x = ((self.screen_cols - self.textwidth) // 2) + 1
+        self.text = text
+        self.total_lines = len(text)
+        self.default_style: Tuple[InlineStyle, ...] = default_style
+        self.temporary_style: Tuple[InlineStyle, ...] = ()
+        self.spread = spread
+
+        if self.spread == 2:
+            self.x = DoubleSpreadPadding.LEFT.value
+            self.x_alt = (
+                DoubleSpreadPadding.LEFT.value + self.textwidth + DoubleSpreadPadding.MIDDLE.value
+            )
+
+    def feed_temporary_style(self, styles: Optional[Tuple[InlineStyle, ...]] = None) -> None:
+        """Reset styling if `styles` is None"""
+        self.temporary_style = styles if styles else ()
+
+    def render_styles(
+        self, row: int, styles: Tuple[InlineStyle, ...] = (), bottom_padding: int = 0
+    ) -> None:
+        for i in styles:
+            if i.row in range(row, row + self.screen_rows - bottom_padding):
+                self.chgat(row, i.row, i.col, i.n_letters, self.screen.getbkgd() | i.attr)
+
+            if self.spread == 2 and i.row in range(
+                row + self.screen_rows - bottom_padding,
+                row + 2 * (self.screen_rows - bottom_padding),
+            ):
+                self.chgat(
+                    row,
+                    i.row - (self.screen_rows - bottom_padding),
+                    -self.x + self.x_alt + i.col,
+                    i.n_letters,
+                    self.screen.getbkgd() | i.attr,
+                )
+
+    def getch(self) -> Union[NoUpdate, Key]:
+        input = self.screen.getch()
+        if input == -1:
+            return NoUpdate()
+        return Key(input)
+
+    def getbkgd(self):
+        return self.screen.getbkgd()
+
+    def chgat(self, row: int, y: int, x: int, n: int, attr: int) -> None:
+        self.screen.chgat(y - row, self.x + x, n, attr)
+
+    def write(self, row: int, bottom_padding: int = 0) -> None:
+        for n_row in range(min(self.screen_rows - bottom_padding, self.total_lines - row)):
+            text_line = self.text[row + n_row]
+            self.screen.addstr(n_row, self.x, text_line)
+
+            if (
+                self.spread == 2
+                and row + self.screen_rows - bottom_padding + n_row < self.total_lines
+            ):
+                text_line = self.text[row + self.screen_rows - bottom_padding + n_row]
+                # TODO: clean this up
+                if re.search("\\[IMG:[0-9]+\\]", text_line):
+                    self.screen.addstr(
+                        n_row, self.x_alt, text_line.center(self.textwidth), curses.A_BOLD
+                    )
+                else:
+                    self.screen.addstr(n_row, self.x_alt, text_line)
+
+        self.render_styles(row, self.default_style, bottom_padding)
+        self.render_styles(row, self.temporary_style, bottom_padding)
+        # self.screen.refresh()
+
+    def write_n(
+        self,
+        row: int,
+        n: int = 1,
+        direction: Direction = Direction.FORWARD,
+        bottom_padding: int = 0,
+    ) -> None:
+        assert n > 0
+        for n_row in range(min(self.screen_rows - bottom_padding, self.total_lines - row)):
+            text_line = self.text[row + n_row]
+            if direction == Direction.FORWARD:
+                # self.screen.addnstr(n_row, self.x + self.textwidth - n, self.text[row+n_row], n)
+                # `+ " " * (self.textwidth - len(self.text[row + n_row]))` is a workaround
+                # to prevent a curses traceback caused by not calling screen.clear()
+                self.screen.addnstr(
+                    n_row,
+                    self.x + self.textwidth - n,
+                    text_line + " " * (self.textwidth - len(text_line)),
+                    n,
+                )
+
+                if (
+                    self.spread == 2
+                    and row + self.screen_rows - bottom_padding + n_row < self.total_lines
+                ):
+                    text_line_alt = self.text[row + n_row + self.screen_rows - bottom_padding]
+                    self.screen.addnstr(
+                        n_row,
+                        self.x_alt + self.textwidth - n,
+                        text_line_alt + " " * (self.textwidth - len(text_line_alt)),
+                        n,
+                    )
+
+            else:
+                if text_line[self.textwidth - n :]:
+                    self.screen.addnstr(n_row, self.x, text_line[self.textwidth - n :], n)
+
+                if (
+                    self.spread == 2
+                    and row + self.screen_rows - bottom_padding + n_row < self.total_lines
+                ):
+                    text_line_alt = self.text[row + n_row + self.screen_rows - bottom_padding]
+                    self.screen.addnstr(
+                        n_row,
+                        self.x_alt,
+                        text_line_alt[self.textwidth - n :],
+                        n,
+                    )
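A rough sketch of how InfiniBoard is meant to be driven (the values here are hypothetical; the real call sites live in reader.py later in this diff, and the text lines normally come from parse_html()):

```python
import curses

from epy_reader.board import InfiniBoard


def demo(stdscr):
    # Hypothetical text; assumes a terminal at least ~42 columns wide
    lines = tuple(f"line {i}" for i in range(500))
    board = InfiniBoard(stdscr, text=lines, textwidth=40)
    board.write(row=0)   # draw one page starting at text line 0
    board.write(row=35)  # later, overdraw with the page starting at line 35
    stdscr.getch()


curses.wrapper(demo)
```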
diff --git a/src/epy_reader/cli.py b/src/epy_reader/cli.py
new file mode 100644
index 0000000..e43b51c
--- /dev/null
+++ b/src/epy_reader/cli.py
@@ -0,0 +1,171 @@
+import argparse
+import os
+import shutil
+import sys
+import textwrap
+from difflib import SequenceMatcher as SM
+from typing import List, Optional, Tuple
+
+from epy_reader import __version__
+from epy_reader.lib import coerce_to_int, is_url, truncate
+from epy_reader.models import LibraryItem
+from epy_reader.parser import parse_html
+from epy_reader.state import State
+from epy_reader.utils import get_ebook_obj
+
+
+def cleanup_library(state: State) -> None:
+    """Clean up non-existent files from the library"""
+    library_items = state.get_from_history()
+    for item in library_items:
+        if not os.path.isfile(item.filepath) and not is_url(item.filepath):
+            state.delete_from_library(item.filepath)
+
+
+def get_nth_file_from_library(state: State, n: int) -> Optional[LibraryItem]:
+    library_items = state.get_from_history()
+    try:
+        return library_items[n - 1]
+    except IndexError:
+        return None
+
+
+def get_matching_library_item(
+    state: State, pattern: str, threshold: float = 0.5
+) -> Optional[LibraryItem]:
+    matches: List[Tuple[LibraryItem, float]] = []  # [(library_item, match_value), ...]
+    library_items = state.get_from_history()
+    if not library_items:
+        return None
+
+    for item in library_items:
+        tomatch = f"{item.title} - {item.author}"  # item.filepath
+        match_value = sum(
+            [i.size for i in SM(None, tomatch.lower(), pattern.lower()).get_matching_blocks()]
+        ) / float(len(pattern))
+        matches.append(
+            (
+                item,
+                match_value,
+            )
+        )
+
+    sorted_matches = sorted(matches, key=lambda x: -x[1])
+    first_match_item, first_match_value = sorted_matches[0]
+    if first_match_item and first_match_value >= threshold:
+        return first_match_item
+    else:
+        return None
+
+
+def print_reading_history(state: State) -> None:
+    termc, _ = shutil.get_terminal_size()
+    library_items = state.get_from_history()
+    if not library_items:
+        print("No Reading History.")
+        return
+
+    print("Reading History:")
+    dig = len(str(len(library_items) + 1))
+    tcols = termc - dig - 2
+    for n, item in enumerate(library_items):
+        print(
+            "{} {}".format(
+                str(n + 1).rjust(dig),
+                truncate(str(item), "...", tcols, tcols - 3),
+            )
+        )
+
+
+def parse_cli_args() -> argparse.Namespace:
+    prog = "epy"
+    positional_arg_help_str = "[PATH | # | PATTERN | URL]"
+    args_parser = argparse.ArgumentParser(
+        prog=prog,
+        usage=f"%(prog)s [-h] [-r] [-d] [-v] {positional_arg_help_str}",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description="Read ebook in terminal",
+        epilog=textwrap.dedent(
+            f"""\
+            examples:
+              {prog} /path/to/ebook    read /path/to/ebook file
+              {prog} 3                 read #3 file from reading history
+              {prog} count monte       read file matching 'count monte'
+                                       from reading history
+            """
+        ),
+    )
+    args_parser.add_argument("-r", "--history", action="store_true", help="print reading history")
+    args_parser.add_argument("-d", "--dump", action="store_true", help="dump the content of ebook")
+    args_parser.add_argument(
+        "-v",
+        "--version",
+        action="version",
+        version=f"v{__version__}",
+        help="print version and exit",
+    )
+    args_parser.add_argument(
+        "ebook",
+        action="store",
+        nargs="*",
+        metavar=positional_arg_help_str,
+        help="ebook path, history number, pattern or URL",
+    )
+    return args_parser.parse_args()
+
+
+def find_file() -> Tuple[str, bool]:
+    args = parse_cli_args()
+    state = State()
+    cleanup_library(state)
+
+    if args.history:
+        print_reading_history(state)
+        sys.exit()
+
+    if len(args.ebook) == 0:
+        last_read = state.get_last_read()
+        if last_read:
+            return last_read, args.dump
+        else:
+            sys.exit("ERROR: Found no last read ebook file.")
+
+    elif len(args.ebook) == 1:
+        nth = coerce_to_int(args.ebook[0])
+        if nth is not None:
+            file = get_nth_file_from_library(state, nth)
+            if file:
+                return file.filepath, args.dump
+            else:
+                print(f"ERROR: #{nth} file not found.")
+                print_reading_history(state)
+                sys.exit(1)
+        elif is_url(args.ebook[0]):
+            return args.ebook[0], args.dump
+        elif os.path.isfile(args.ebook[0]):
+            return args.ebook[0], args.dump
+
+    pattern = " ".join(args.ebook)
+    match = get_matching_library_item(state, pattern)
+    if match:
+        return match.filepath, args.dump
+    else:
+        sys.exit("ERROR: Found no matching ebook from history.")
+
+
+def dump_ebook_content(filepath: str) -> None:
+    ebook = get_ebook_obj(filepath)
+    try:
+        try:
+            ebook.initialize()
+        except Exception as e:
+            sys.exit("ERROR: Badly-structured ebook.\n" + str(e))
+        for i in ebook.contents:
+            content = ebook.get_raw_text(i)
+            src_lines = parse_html(content)
+            assert isinstance(src_lines, tuple)
+            # sys.stdout.reconfigure(encoding="utf-8")  # Python>=3.7
+            for j in src_lines:
+                sys.stdout.buffer.write((j + "\n\n").encode("utf-8"))
+    finally:
+        ebook.cleanup()
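The history lookup in get_matching_library_item scores each entry by summing SequenceMatcher matching-block sizes against the query, normalized by the query length. A minimal standalone illustration of that scoring (stdlib only; titles are made up):

```python
from difflib import SequenceMatcher as SM


def match_value(candidate: str, pattern: str) -> float:
    # Same formula as get_matching_library_item above
    blocks = SM(None, candidate.lower(), pattern.lower()).get_matching_blocks()
    return sum(block.size for block in blocks) / float(len(pattern))


# "count monte" matches strongly against a stored "title - author" string
print(match_value("The Count of Monte Cristo - Alexandre Dumas", "count monte"))  # ~1.0
print(match_value("Moby Dick - Herman Melville", "count monte"))  # scores much lower
```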
diff --git a/src/epy_reader/config.py b/src/epy_reader/config.py
new file mode 100644
index 0000000..db70a98
--- /dev/null
+++ b/src/epy_reader/config.py
@@ -0,0 +1,80 @@
+import dataclasses
+import json
+import os
+import sys
+from typing import Mapping, Tuple, Union
+
+import epy_reader.settings as settings
+from epy_reader.models import AppData, Key
+
+
+class Config(AppData):
+    def __init__(self):
+        setting_dict = dataclasses.asdict(settings.Settings())
+        keymap_dict = dataclasses.asdict(settings.CfgDefaultKeymaps())
+        keymap_builtin_dict = dataclasses.asdict(settings.CfgBuiltinKeymaps())
+
+        if os.path.isfile(self.filepath):
+            with open(self.filepath) as f:
+                cfg_user = json.load(f)
+            setting_dict = Config.update_dict(setting_dict, cfg_user["Setting"])
+            keymap_dict = Config.update_dict(keymap_dict, cfg_user["Keymap"])
+        else:
+            self.save({"Setting": setting_dict, "Keymap": keymap_dict})
+
+        keymap_dict_tuple = {k: tuple(v) for k, v in keymap_dict.items()}
+        keymap_updated = {
+            k: tuple([Key(i) for i in v])
+            for k, v in Config.update_keys_tuple(keymap_dict_tuple, keymap_builtin_dict).items()
+        }
+
+        if sys.platform == "win32":
+            setting_dict["PageScrollAnimation"] = False
+
+        self.setting = settings.Settings(**setting_dict)
+        self.keymap = settings.Keymap(**keymap_updated)
+        # to build help menu text
+        self.keymap_user_dict = keymap_dict
+
+    @property
+    def filepath(self) -> str:
+        return os.path.join(self.prefix, "configuration.json") if self.prefix else os.devnull
+
+    def save(self, cfg_dict):
+        with open(self.filepath, "w") as file:
+            json.dump(cfg_dict, file, indent=2)
+
+    @staticmethod
+    def update_dict(
+        old_dict: Mapping[str, Union[str, int, bool]],
+        new_dict: Mapping[str, Union[str, int, bool]],
+        place_new=False,
+    ) -> Mapping[str, Union[str, int, bool]]:
+        """Returns a copy of `old_dict` after updating it with `new_dict`"""
+        result = {**old_dict}
+        for k, _ in new_dict.items():
+            if k in result:
+                result[k] = new_dict[k]
+            elif place_new:
+                result[k] = new_dict[k]
+
+        return result
+
+    @staticmethod
+    def update_keys_tuple(
+        old_keys: Mapping[str, Tuple[str, ...]],
+        new_keys: Mapping[str, Tuple[str, ...]],
+        place_new: bool = False,
+    ) -> Mapping[str, Tuple[str, ...]]:
+        """Returns a copy of `old_keys` after updating it with `new_keys`
+        by appending the tuple values and removing duplicates"""
+        result = {**old_keys}
+        for k, _ in new_keys.items():
+            if k in result:
+                result[k] = tuple(set(result[k] + new_keys[k]))
+            elif place_new:
+                result[k] = tuple(set(new_keys[k]))
+
+        return result
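A small sketch of the two merge helpers above (assuming the epy_reader package is importable; both are staticmethods, so no Config instance is needed): update_dict overrides known settings only (unless place_new), and update_keys_tuple unions keymaps and deduplicates via set(), so ordering of the merged tuple is not guaranteed.

```python
from epy_reader.config import Config

defaults = {"PageScrollAnimation": True, "MouseSupport": False}
user = {"MouseSupport": True, "UnknownOption": 1}
print(Config.update_dict(defaults, user))
# {'PageScrollAnimation': True, 'MouseSupport': True}  (UnknownOption dropped)

default_keys = {"ScrollDown": ("j",)}
builtin_keys = {"ScrollDown": ("j", "KEY_DOWN")}
print(Config.update_keys_tuple(default_keys, builtin_keys))
# {'ScrollDown': ('j', 'KEY_DOWN')}  (order may vary due to set() dedup)
```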
diff --git a/src/epy_reader/ebooks/__init__.py b/src/epy_reader/ebooks/__init__.py
new file mode 100644
index 0000000..da5cfc0
--- /dev/null
+++ b/src/epy_reader/ebooks/__init__.py
@@ -0,0 +1,15 @@
+__all__ = [
+    "Ebook",
+    "Epub",
+    "FictionBook",
+    "Mobi",
+    "Azw",
+    "URL",
+]
+
+from epy_reader.ebooks.azw import Azw
+from epy_reader.ebooks.base import Ebook
+from epy_reader.ebooks.epub import Epub
+from epy_reader.ebooks.fictionbook import FictionBook
+from epy_reader.ebooks.mobi import Mobi
+from epy_reader.ebooks.url import URL
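ebooks/__init__.py only re-exports the concrete backends; the actual dispatch happens in epy_reader.utils.get_ebook_obj, which is not part of this diff. A purely hypothetical sketch of extension-based dispatch in the same spirit:

```python
import os

from epy_reader.ebooks import Azw, Ebook, Epub, FictionBook, Mobi, URL
from epy_reader.lib import is_url


def pick_backend(filepath: str) -> Ebook:
    # Hypothetical mapping; the real logic lives in epy_reader.utils.get_ebook_obj
    if is_url(filepath):
        return URL(filepath)
    ext = os.path.splitext(filepath)[1].lower()
    mapping = {".epub": Epub, ".mobi": Mobi, ".azw": Azw, ".azw3": Azw, ".fb2": FictionBook}
    return mapping[ext](filepath)
```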
"URL", +] + +from epy_reader.ebooks.azw import Azw +from epy_reader.ebooks.base import Ebook +from epy_reader.ebooks.epub import Epub +from epy_reader.ebooks.fictionbook import FictionBook +from epy_reader.ebooks.mobi import Mobi +from epy_reader.ebooks.url import URL diff --git a/src/epy_reader/ebooks/azw.py b/src/epy_reader/ebooks/azw.py new file mode 100644 index 0000000..139fcc5 --- /dev/null +++ b/src/epy_reader/ebooks/azw.py @@ -0,0 +1,26 @@ +import contextlib +import os +import shutil +import tempfile +import zipfile + +from epy_reader.ebooks.epub import Epub +from epy_reader.tools import unpack_kindle_book + + +class Azw(Epub): + def __init__(self, fileepub): + self.path = os.path.abspath(fileepub) + self.tmpdir = tempfile.mkdtemp(prefix="epy-") + basename, _ = os.path.splitext(os.path.basename(self.path)) + self.tmpepub = os.path.join(self.tmpdir, "mobi8", basename + ".epub") + + def initialize(self): + with contextlib.redirect_stdout(None): + unpack_kindle_book(self.path, self.tmpdir, epubver="A", use_hd=True) + self.file = zipfile.ZipFile(self.tmpepub, "r") + Epub.initialize(self) + + def cleanup(self) -> None: + shutil.rmtree(self.tmpdir) + return diff --git a/src/epy_reader/ebooks/base.py b/src/epy_reader/ebooks/base.py new file mode 100644 index 0000000..0869db9 --- /dev/null +++ b/src/epy_reader/ebooks/base.py @@ -0,0 +1,48 @@ +import xml.etree.ElementTree as ET +from typing import Tuple, Union + +from epy_reader.models import BookMetadata, TocEntry + + +class Ebook: + def __init__(self, fileepub: str): + raise NotImplementedError("Ebook.__init__() not implemented") + + @property + def path(self) -> str: + return self._path + + @path.setter + def path(self, value: str) -> None: + self._path = value + + @property + def contents(self) -> Union[Tuple[str, ...], Tuple[ET.Element, ...]]: + return self._contents + + @contents.setter + def contents(self, value: Union[Tuple[str, ...], Tuple[ET.Element, ...]]) -> None: + self._contents = value + + @property + def toc_entries(self) -> Tuple[TocEntry, ...]: + return self._toc_entries + + @toc_entries.setter + def toc_entries(self, value: Tuple[TocEntry, ...]) -> None: + self._toc_entries = value + + def get_meta(self) -> BookMetadata: + raise NotImplementedError("Ebook.get_meta() not implemented") + + def initialize(self) -> None: + raise NotImplementedError("Ebook.initialize() not implemented") + + def get_raw_text(self, content: Union[str, ET.Element]) -> str: + raise NotImplementedError("Ebook.get_raw_text() not implemented") + + def get_img_bytestr(self, impath: str) -> Tuple[str, bytes]: + raise NotImplementedError("Ebook.get_img_bytestr() not implemented") + + def cleanup(self) -> None: + raise NotImplementedError("Ebook.cleanup() not implemented") diff --git a/src/epy_reader/ebooks/epub.py b/src/epy_reader/ebooks/epub.py new file mode 100644 index 0000000..a8cf0fa --- /dev/null +++ b/src/epy_reader/ebooks/epub.py @@ -0,0 +1,202 @@ +import dataclasses +import os +import xml.etree.ElementTree as ET +import zipfile +import zlib +from typing import Dict, List, Optional, Sequence, Tuple, Union +from urllib.parse import unquote, urljoin + +from epy_reader.ebooks.base import Ebook +from epy_reader.models import BookMetadata, TocEntry + + +# TODO: to be deprecated +DEBUG = False + + +class Epub(Ebook): + NAMESPACE = { + "DAISY": "http://www.daisy.org/z3986/2005/ncx/", + "OPF": "http://www.idpf.org/2007/opf", + "CONT": "urn:oasis:names:tc:opendocument:xmlns:container", + "XHTML": "http://www.w3.org/1999/xhtml", + "EPUB": 
"http://www.idpf.org/2007/ops", + # Dublin Core + "DC": "http://purl.org/dc/elements/1.1/", + } + + def __init__(self, fileepub: str): + self.path: str = os.path.abspath(fileepub) + self.file: Union[zipfile.ZipFile, str] = zipfile.ZipFile(fileepub, "r") + + # populate these attributes + # by calling self.initialize() + self.root_filepath: str + self.root_dirpath: str + + def get_meta(self) -> BookMetadata: + assert isinstance(self.file, zipfile.ZipFile) + # why self.file.read(self.root_filepath) problematic + # content_opf = ET.fromstring(self.file.open(self.root_filepath).read()) + content_opf = ET.parse(self.file.open(self.root_filepath)) + return Epub._get_metadata(content_opf) + + @staticmethod + def _get_metadata(content_opf: ET.ElementTree) -> BookMetadata: + metadata: Dict[str, Optional[str]] = {} + for field in dataclasses.fields(BookMetadata): + element = content_opf.find(f".//DC:{field.name}", Epub.NAMESPACE) + if element is not None: + metadata[field.name] = element.text + + return BookMetadata(**metadata) + + @staticmethod + def _get_contents(content_opf: ET.ElementTree) -> Tuple[str, ...]: + # cont = ET.parse(self.file.open(self.root_filepath)).getroot() + manifests: List[Tuple[str, str]] = [] + for manifest_elem in content_opf.findall("OPF:manifest/*", Epub.NAMESPACE): + # EPUB3 + # if manifest_elem.get("id") != "ncx" and manifest_elem.get("properties") != "nav": + if ( + manifest_elem.get("media-type") != "application/x-dtbncx+xml" + and manifest_elem.get("properties") != "nav" + ): + manifest_id = manifest_elem.get("id") + assert manifest_id is not None + manifest_href = manifest_elem.get("href") + assert manifest_href is not None + manifests.append((manifest_id, manifest_href)) + + spines: List[str] = [] + contents: List[str] = [] + for spine_elem in content_opf.findall("OPF:spine/*", Epub.NAMESPACE): + idref = spine_elem.get("idref") + assert idref is not None + spines.append(idref) + for spine in spines: + for manifest in manifests: + if spine == manifest[0]: + # book_contents.append(root_dirpath + unquote(manifest[1])) + contents.append(unquote(manifest[1])) + manifests.remove(manifest) + # TODO: test is break necessary + break + + return tuple(contents) + + @staticmethod + def _get_tocs(toc: ET.Element, version: str, contents: Sequence[str]) -> Tuple[TocEntry, ...]: + try: + # EPUB3 + if version in {"1.0", "2.0"}: + navPoints = toc.findall("DAISY:navMap//DAISY:navPoint", Epub.NAMESPACE) + elif version == "3.0": + navPoints = toc.findall( + "XHTML:body//XHTML:nav[@EPUB:type='toc']//XHTML:a", Epub.NAMESPACE + ) + + toc_entries: List[TocEntry] = [] + for navPoint in navPoints: + if version in {"1.0", "2.0"}: + src_elem = navPoint.find("DAISY:content", Epub.NAMESPACE) + assert src_elem is not None + src = src_elem.get("src") + + name_elem = navPoint.find("DAISY:navLabel/DAISY:text", Epub.NAMESPACE) + assert name_elem is not None + name = name_elem.text + elif version == "3.0": + src_elem = navPoint + assert src_elem is not None + src = src_elem.get("href") + + name = "".join(list(navPoint.itertext())) + + assert src is not None + src_id = src.split("#") + + try: + idx = contents.index(unquote(src_id[0])) + except ValueError: + continue + + # assert name is not None + # NOTE: skip empty label + if name is not None: + toc_entries.append( + TocEntry( + label=name, + content_index=idx, + section=src_id[1] if len(src_id) == 2 else None, + ) + ) + except AttributeError as e: + # TODO: + if DEBUG: + raise e + + return tuple(toc_entries) + + def initialize(self) -> None: + assert 
diff --git a/src/epy_reader/ebooks/fictionbook.py b/src/epy_reader/ebooks/fictionbook.py
new file mode 100644
index 0000000..35611b2
--- /dev/null
+++ b/src/epy_reader/ebooks/fictionbook.py
@@ -0,0 +1,76 @@
+import base64
+import os
+import xml.etree.ElementTree as ET
+from typing import List, Tuple, Union
+
+from epy_reader.ebooks import Ebook
+from epy_reader.models import BookMetadata, TocEntry
+
+
+class FictionBook(Ebook):
+    NAMESPACE = {"FB2": "http://www.gribuser.ru/xml/fictionbook/2.0"}
+
+    def __init__(self, filefb: str):
+        self.path = os.path.abspath(filefb)
+        self.file = filefb
+
+        # populate these attributes
+        # by calling self.initialize()
+        self.root: ET.Element
+
+    def get_meta(self) -> BookMetadata:
+        title_elem = self.root.find(".//FB2:book-title", FictionBook.NAMESPACE)
+        first_name_elem = self.root.find(".//FB2:first-name", FictionBook.NAMESPACE)
+        last_name_elem = self.root.find(".//FB2:last-name", FictionBook.NAMESPACE)
+        date_elem = self.root.find(".//FB2:date", FictionBook.NAMESPACE)
+        identifier_elem = self.root.find(".//FB2:id", FictionBook.NAMESPACE)
+
+        author = first_name_elem.text if first_name_elem is not None else None
+        if last_name_elem is not None:
+            if author is not None and author != "":
+                author += f" {last_name_elem.text}"
+            else:
+                author = last_name_elem.text
+
+        return BookMetadata(
+            title=title_elem.text if title_elem is not None else None,
+            creator=author,
+            date=date_elem.text if date_elem is not None else None,
+            identifier=identifier_elem.text if identifier_elem is not None else None,
+        )
+
+    def initialize(self) -> None:
+        cont = ET.parse(self.file)
+        self.root = cont.getroot()
+
+        self.contents = tuple(self.root.findall("FB2:body/*", FictionBook.NAMESPACE))
+
+        # TODO
+        toc_entries: List[TocEntry] = []
+        for n, i in enumerate(self.contents):
+            title = i.find("FB2:title", FictionBook.NAMESPACE)
+            if title is not None:
+                toc_entries.append(
+                    TocEntry(label="".join(title.itertext()), content_index=n, section=None)
+                )
+        self.toc_entries = tuple(toc_entries)
+
+    def get_raw_text(self, node: Union[str, ET.Element]) -> str:
+        assert isinstance(node, ET.Element)
+        ET.register_namespace("", "http://www.gribuser.ru/xml/fictionbook/2.0")
+        # sys.exit(ET.tostring(node, encoding="utf8", method="html").decode("utf-8").replace("ns1:", ""))
+        return ET.tostring(node, encoding="utf8", method="html").decode("utf-8").replace("ns1:", "")
+
+    def get_img_bytestr(self, imgid: str) -> Tuple[str, bytes]:
+        # TODO: test if image works
+        imgid = imgid.replace("#", "")
+        img_elem = self.root.find("*[@id='{}']".format(imgid))
+        assert img_elem is not None
+        imgtype = img_elem.get("content-type")
+        img_elem_text = img_elem.text
+        assert imgtype is not None
+        assert img_elem_text is not None
+        return imgid + "." + imgtype.split("/")[1], base64.b64decode(img_elem_text)
+
+    def cleanup(self) -> None:
+        return
diff --git a/src/epy_reader/ebooks/mobi.py b/src/epy_reader/ebooks/mobi.py
new file mode 100644
index 0000000..39f3be4
--- /dev/null
+++ b/src/epy_reader/ebooks/mobi.py
@@ -0,0 +1,69 @@
+import contextlib
+import os
+import shutil
+import tempfile
+import xml.etree.ElementTree as ET
+from typing import Tuple, Union
+
+from epy_reader.ebooks.epub import Epub
+from epy_reader.models import BookMetadata
+from epy_reader.tools import unpack_kindle_book
+
+
+class Mobi(Epub):
+    def __init__(self, filemobi: str):
+        self.path = os.path.abspath(filemobi)
+        self.file = tempfile.mkdtemp(prefix="epy-")
+
+        # populate these attributes
+        # by calling self.initialize()
+        self.root_filepath: str
+        self.root_dirpath: str
+
+    def get_meta(self) -> BookMetadata:
+        # why self.file.read(self.root_filepath) is problematic
+        with open(os.path.join(self.root_dirpath, "content.opf")) as f:
+            content_opf = ET.parse(f)  # .getroot()
+        return Epub._get_metadata(content_opf)
+
+    def initialize(self) -> None:
+        assert isinstance(self.file, str)
+
+        with contextlib.redirect_stdout(None):
+            unpack_kindle_book(self.path, self.file, epubver="A", use_hd=True)
+            # TODO: add cleanup here
+
+        self.root_dirpath = os.path.join(self.file, "mobi7")
+        self.toc_path = os.path.join(self.root_dirpath, "toc.ncx")
+        version = "2.0"
+
+        with open(os.path.join(self.root_dirpath, "content.opf")) as f:
+            content_opf = ET.parse(f)  # .getroot()
+
+        contents = Epub._get_contents(content_opf)
+        self.contents = tuple(os.path.join(self.root_dirpath, content) for content in contents)
+
+        with open(self.toc_path) as f:
+            toc = ET.parse(f).getroot()
+        self.toc_entries = Epub._get_tocs(toc, version, contents)  # *self.contents (absolute path)
+
+    def get_raw_text(self, content_path: Union[str, ET.Element]) -> str:
+        assert isinstance(content_path, str)
+        with open(content_path, encoding="utf8") as f:
+            content = f.read()
+        # return content.decode("utf-8")
+        return content
+
+    def get_img_bytestr(self, impath: str) -> Tuple[str, bytes]:
+        # TODO: test on windows
+        # if impath "Images/asdf.png" is problematic
+        image_abspath = os.path.join(self.root_dirpath, impath)
+        image_abspath = os.path.normpath(image_abspath)  # handle cross-platform path
+        with open(image_abspath, "rb") as f:
+            src = f.read()
+        return impath, src
+
+    def cleanup(self) -> None:
+        assert isinstance(self.file, str)
+        shutil.rmtree(self.file)
+        return
diff --git a/src/epy_reader/ebooks/url.py b/src/epy_reader/ebooks/url.py
new file mode 100644
index 0000000..4356fa7
--- /dev/null
+++ b/src/epy_reader/ebooks/url.py
@@ -0,0 +1,49 @@
+from pathlib import PurePosixPath
+from typing import Tuple
+from urllib.error import HTTPError, URLError
+from urllib.parse import urljoin, urlparse
+from urllib.request import Request, urlopen
+
+from epy_reader import __version__
+from epy_reader.ebooks import Ebook
+from epy_reader.lib import is_url
+from epy_reader.models import BookMetadata
+
+
+class URL(Ebook):
+    _header = {
+        "User-Agent": f"epy/v{__version__}",
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+        "Accept-Language": "en-US,en;q=0.8",
+    }
+
+    def __init__(self, url: str):
+        self.path = url
+        self.file = url
+        self.contents = ("_",)
+        self.toc_entries = tuple()
+
+    def get_meta(self) -> BookMetadata:
+        return BookMetadata()
+
+    def initialize(self) -> None:
+        try:
+            with urlopen(Request(self.path, headers=URL._header)) as response:
+                self.html = response.read().decode()
+        except HTTPError as e:
+            raise e
+        except URLError as e:
+            raise e
+
+    def get_raw_text(self, _) -> str:
+        return self.html
+
+    def get_img_bytestr(self, src: str) -> Tuple[str, bytes]:
+        image_url = src if is_url(src) else urljoin(self.path, src)
+        # TODO: catch error on request
+        with urlopen(Request(image_url, headers=URL._header)) as response:
+            byte_str = response.read()
+        return PurePosixPath(urlparse(src).path).name, byte_str
+
+    def cleanup(self) -> None:
+        return
diff --git a/src/epy_reader/lib.py b/src/epy_reader/lib.py
new file mode 100644
index 0000000..b010323
--- /dev/null
+++ b/src/epy_reader/lib.py
@@ -0,0 +1,63 @@
+from typing import Any, Optional, Tuple
+from urllib.parse import urljoin, urlparse
+
+
+def is_url(string: str) -> bool:
+    try:
+        tmp = urlparse(string)
+        return all([tmp.scheme, tmp.netloc])
+    except ValueError:
+        return False
+
+
+def coerce_to_int(string: str) -> Optional[int]:
+    try:
+        return int(string)
+    except ValueError:
+        return None
+
+
+def truncate(teks: str, subtitution_text: str, maxlen: int, startsub: int = 0) -> str:
+    """
+    Truncate text
+
+    eg.
+    :param teks: 'This is long silly dummy text'
+    :param subtitution_text: '...'
+    :param maxlen: 12
+    :param startsub: 3
+    :return: 'Thi...y text'
+    """
+    if startsub > maxlen:
+        raise ValueError("Var startsub cannot be bigger than maxlen.")
+    elif len(teks) <= maxlen:
+        return teks
+    else:
+        lensu = len(subtitution_text)
+        beg = teks[:startsub]
+        mid = (
+            subtitution_text
+            if lensu <= maxlen - startsub
+            else subtitution_text[: maxlen - startsub]
+        )
+        end = teks[startsub + lensu - maxlen :] if lensu < maxlen - startsub else ""
+        return beg + mid + end
+
+
+def tuple_subtract(tuple_one: Tuple[Any, ...], tuple_two: Tuple[Any, ...]) -> Tuple[Any, ...]:
+    """
+    Returns a tuple with the members of tuple_one
+    that are not in tuple_two
+    """
+    return tuple(i for i in tuple_one if i not in tuple_two)
+
+
+def resolve_path(current_dir: str, relative_path: str) -> str:
+    """
+    Resolve path containing dots
+    eg. '/foo/bar/book.html' + '../img.png' = '/foo/img.png'
+    NOTE: the '/' suffix is important to indicate that the current dir is 'bar'
+    """
+    # could also use os.path.normpath()
+    # but if the image is in a zipfile then a posix path is mandatory
+    return urljoin(current_dir, relative_path)
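The helpers in lib.py are small enough to illustrate directly; the values below follow the docstring examples (note the truncate docstring was corrected above to match what the code actually returns for those arguments):

```python
from epy_reader.lib import coerce_to_int, resolve_path, truncate

print(truncate("This is long silly dummy text", "...", 12, 3))
# 'Thi...y text' -> 12 chars total; substitution inserted at index 3

print(resolve_path("/foo/bar/", "../img.png"))  # '/foo/img.png'

print(coerce_to_int("3"))    # 3
print(coerce_to_int("abc"))  # None
```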
diff --git a/src/epy_reader/models.py b/src/epy_reader/models.py
new file mode 100644
index 0000000..db4701b
--- /dev/null
+++ b/src/epy_reader/models.py
@@ -0,0 +1,232 @@
+import os
+from dataclasses import dataclass
+from datetime import datetime
+from enum import Enum
+from typing import Any, Mapping, Optional, Tuple, Union
+
+
+class Direction(Enum):
+    FORWARD = "forward"
+    BACKWARD = "backward"
+
+
+@dataclass(frozen=True)
+class BookMetadata:
+    title: Optional[str] = None
+    creator: Optional[str] = None
+    description: Optional[str] = None
+    publisher: Optional[str] = None
+    date: Optional[str] = None
+    language: Optional[str] = None
+    format: Optional[str] = None
+    identifier: Optional[str] = None
+    source: Optional[str] = None
+
+
+@dataclass(frozen=True)
+class LibraryItem:
+    last_read: datetime
+    filepath: str
+    title: Optional[str] = None
+    author: Optional[str] = None
+    reading_progress: Optional[float] = None
+
+    def __str__(self) -> str:
+        if self.reading_progress is None:
+            reading_progress_str = "N/A"
+        else:
+            reading_progress_str = f"{int(self.reading_progress * 100)}%"
+        reading_progress_str = reading_progress_str.rjust(4)
+
+        book_name: str
+        filename = self.filepath.replace(os.path.expanduser("~"), "~", 1)
+        if self.title is not None and self.author is not None:
+            book_name = f"{self.title} - {self.author} ({filename})"
+        elif self.title is None and self.author:
+            book_name = f"{filename} - {self.author}"
+        else:
+            book_name = filename
+
+        last_read_str = self.last_read.strftime("%I:%M%p %b %d")
+
+        return f"{reading_progress_str} {last_read_str}: {book_name}"
+
+
+@dataclass(frozen=True)
+class ReadingState:
+    """
+    Data model for reading state.
+
+    `row` has to be explicitly assigned a value
+    because the Seamless feature needs it to adjust from
+    a row relative to the book's content index to a row
+    absolute to the book's entire content.
+
+    `rel_pctg` and `section` default to None, and if
+    either of them is assigned a value, it
+    will override the `row` value.
+    """
+
+    content_index: int
+    textwidth: int
+    row: int
+    rel_pctg: Optional[float] = None
+    section: Optional[str] = None
+
+
+@dataclass(frozen=True)
+class SearchData:
+    direction: Direction = Direction.FORWARD
+    value: str = ""
+
+
+@dataclass(frozen=True)
+class LettersCount:
+    """
+    all: total letters in book
+    cumulative: list of total letters for previous contents
+    eg. let's say cumulative = (0, 50, 89, ...), which means:
+        0 is the total cumulative letters of book contents[-1] to contents[0]
+        50 is the total cumulative letters of book contents[0] to contents[1]
+        89 is the total cumulative letters of book contents[0] to contents[2]
+    """
+
+    all: int
+    cumulative: Tuple[int, ...]
+
+
+@dataclass(frozen=True)
+class CharPos:
+    """
+    Describes character position in text.
+    eg. ["Lorem ipsum dolor sit amet,",   # row=0
+         "consectetur adipiscing elit."]  # row=1
+            ^CharPos(row=1, col=3)
+    """
+
+    row: int
+    col: int
+
+
+@dataclass(frozen=True)
+class TextMark:
+    """
+    Describes a marking in text.
+    eg. the interval [CharPos(row=0, col=3), CharPos(row=1, col=4)]
+    note that the marking is inclusive [] on both sides
+    instead of right-exclusive [)
+    """
+
+    start: CharPos
+    end: Optional[CharPos] = None
+
+    def is_valid(self) -> bool:
+        """
+        Assert validity and check if the mark is unterminated
+        eg. <div><i>This is italic text</div>
+        (missing </i> tag)
+        """
+        if self.end is not None:
+            if self.start.row == self.end.row:
+                return self.start.col <= self.end.col
+            else:
+                return self.start.row < self.end.row
+
+        return False
+
+
+@dataclass(frozen=True)
+class TextSpan:
+    """
+    Like TextMark but using a span of letters (n_letters)
+    """
+
+    start: CharPos
+    n_letters: int
+
+
+@dataclass(frozen=True)
+class InlineStyle:
+    """
+    eg. InlineStyle(attr=curses.A_BOLD, row=3, col=4, n_letters=3)
+    """
+
+    row: int
+    col: int
+    n_letters: int
+    attr: int
+
+
+@dataclass(frozen=True)
+class TocEntry:
+    label: str
+    content_index: int
+    section: Optional[str]
+
+
+@dataclass(frozen=True)
+class TextStructure:
+    """
+    Object that describes how the text
+    should be displayed on screen.
+
+    text_lines: ("list of lines", "of text", ...)
+    image_maps: {line_num: path/to/image/in/ebook/zip}
+    section_rows: {section_id: line_num}
+    formatting: (InlineStyle, ...)
+    """
+
+    text_lines: Tuple[str, ...]
+    image_maps: Mapping[int, str]
+    section_rows: Mapping[str, int]
+    formatting: Tuple[InlineStyle, ...]
+
+
+@dataclass(frozen=True)
+class NoUpdate:
+    pass
+
+
+class Key:
+    """
+    Because ord("k") and chr(34) are confusing
+    """
+
+    def __init__(self, char_or_int: Union[str, int]):
+        self.value: int = char_or_int if isinstance(char_or_int, int) else ord(char_or_int)
+        self.char: str = char_or_int if isinstance(char_or_int, str) else chr(char_or_int)
+
+    def __eq__(self, other: Any) -> bool:
+        if isinstance(other, Key):
+            return self.value == other.value
+        return False
+
+    def __ne__(self, other: Any) -> bool:
+        # NOTE: original returned self.__eq__(other); negated here to fix the inversion bug
+        return not self.__eq__(other)
+
+    def __hash__(self) -> int:
+        return hash(self.value)
+
+
+class AppData:
+    @property
+    def prefix(self) -> Optional[str]:
+        """Return None if there exists no homedir | userdir"""
+        prefix: Optional[str] = None
+
+        # UNIX filesystem
+        homedir = os.getenv("HOME")
+        # WIN filesystem
+        userdir = os.getenv("USERPROFILE")
+
+        if homedir:
+            if os.path.isdir(os.path.join(homedir, ".config")):
+                prefix = os.path.join(homedir, ".config", "epy")
+            else:
+                prefix = os.path.join(homedir, ".epy")
+        elif userdir:
+            prefix = os.path.join(userdir, ".epy")
+
+        if prefix:
+            os.makedirs(prefix, exist_ok=True)
+
+        return prefix
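To make the CharPos/TextMark/TextSpan relationship concrete (mirroring the docstring examples above):

```python
from epy_reader.models import CharPos, TextMark, TextSpan

text = ["Lorem ipsum dolor sit amet,", "consectetur adipiscing elit."]

# Mark from row 0, col 6 down to row 1, col 11 (inclusive on both ends)
mark = TextMark(start=CharPos(row=0, col=6), end=CharPos(row=1, col=11))
print(mark.is_valid())  # True: end row comes after start row

# An unterminated mark (e.g. from a missing closing tag) is invalid
print(TextMark(start=CharPos(row=0, col=6)).is_valid())  # False

# A span expresses the same idea as start position + letter count
span = TextSpan(start=CharPos(row=0, col=6), n_letters=5)  # covers "ipsum"
```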
diff --git a/src/epy_reader/parser.py b/src/epy_reader/parser.py
new file mode 100644
index 0000000..6eced00
--- /dev/null
+++ b/src/epy_reader/parser.py
@@ -0,0 +1,421 @@
+import curses
+import dataclasses
+import re
+import textwrap
+from html import unescape
+from html.parser import HTMLParser
+from typing import Dict, List, Mapping, Optional, Sequence, Set, Tuple, Union
+from urllib.parse import unquote
+
+from epy_reader.models import CharPos, InlineStyle, TextMark, TextSpan, TextStructure
+
+
+class HTMLtoLines(HTMLParser):
+    para = {"p", "div"}
+    inde = {"q", "dt", "dd", "blockquote"}
+    pref = {"pre"}
+    bull = {"li"}
+    hide = {"script", "style", "head"}
+    ital = {"i", "em"}
+    bold = {"b", "strong"}
+    # hide = {"script", "style", "head", ", "sub}
+    # sup_lookup = "⁰¹²³⁴⁵⁶⁷⁸⁹"
+    # sub_lookup = "₀₁₂₃₄₅₆₇₈₉"
+
+    attr_bold = curses.A_BOLD
+    try:
+        attr_italic = curses.A_ITALIC
+    except AttributeError:
+        try:
+            attr_italic = curses.A_UNDERLINE
+        except AttributeError:
+            attr_italic = curses.A_NORMAL
+
+    @staticmethod
+    def _mark_to_spans(text: Sequence[str], marks: Sequence[TextMark]) -> List[TextSpan]:
+        """
+        Convert text marks in lines of text to per-line text spans.
+        Keeping duplicate spans.
+        """
+        spans: List[TextSpan] = []
+        for mark in marks:
+            if mark.is_valid():
+                # mypy issue, should be handled by mark.is_valid()
+                assert mark.end is not None
+                if mark.start.row == mark.end.row:
+                    spans.append(
+                        TextSpan(start=mark.start, n_letters=mark.end.col - mark.start.col)
+                    )
+                else:
+                    spans.append(
+                        TextSpan(
+                            start=mark.start, n_letters=len(text[mark.start.row]) - mark.start.col
+                        )
+                    )
+                    for nth_line in range(mark.start.row + 1, mark.end.row):
+                        spans.append(
+                            TextSpan(
+                                start=CharPos(row=nth_line, col=0), n_letters=len(text[nth_line])
+                            )
+                        )
+                    spans.append(
+                        TextSpan(start=CharPos(row=mark.end.row, col=0), n_letters=mark.end.col)
+                    )
+
+        return spans  # list(set(spans))
+
+    @staticmethod
+    def _adjust_wrapped_spans(
+        wrapped_lines: Sequence[str],
+        span: TextSpan,
+        *,
+        line_adjustment: int = 0,
+        left_adjustment: int = 0,
+    ) -> List[TextSpan]:
+        """
+        Adjust text span to wrapped lines.
+        Not perfect, but should be good enough considering
+        the limitations of a commandline interface.
+        """
+        # current_row = span.start.row + line_adjustment
+        current_row = line_adjustment
+        start_col = span.start.col
+        end_col = start_col + span.n_letters
+
+        prev = 0  # chars length before current line
+        spans: List[TextSpan] = []
+        for n, line in enumerate(wrapped_lines):
+            # + 1 compensates textwrap.wrap(*args, replace_whitespace=True, drop_whitespace=True)
+            line_len = len(line) + 1
+            current = prev + line_len  # chars length before next line
+
+            # -:unmarked *:marked
+            # |------*****--------|
+            if start_col in range(prev, current) and end_col in range(prev, current):
+                spans.append(
+                    TextSpan(
+                        start=CharPos(row=current_row + n, col=start_col - prev + left_adjustment),
+                        n_letters=span.n_letters,
+                    )
+                )
+
+            # |----------*********|
+            elif start_col in range(prev, current):
+                spans.append(
+                    TextSpan(
+                        start=CharPos(row=current_row + n, col=start_col - prev + left_adjustment),
+                        n_letters=current - start_col - 1,  # -1: dropped whitespace
+                    )
+                )
+
+            # |********-----------|
+            elif end_col in range(prev, current):
+                spans.append(
+                    TextSpan(
+                        start=CharPos(row=current_row + n, col=0 + left_adjustment),
+                        n_letters=end_col - prev + 1,  # +1: dropped whitespace
+                    )
+                )
+
+            # |*******************|
+            elif prev in range(start_col, end_col) and current in range(start_col, end_col):
+                spans.append(
+                    TextSpan(
+                        start=CharPos(row=current_row + n, col=0 + left_adjustment),
+                        n_letters=line_len - 1,  # -1: dropped whitespace
+                    )
+                )
+
+            elif prev > end_col:
+                break
+
+            prev = current
+
+        return spans
+
+    @staticmethod
+    def _group_spans_by_row(blocks: Sequence[TextSpan]) -> Mapping[int, List[TextSpan]]:
+        groups: Dict[int, List[TextSpan]] = {}
+        for block in blocks:
+            row = block.start.row
+            if row in groups:
+                groups[row].append(block)
+            else:
+                groups[row] = [block]
+        return groups
+    def __init__(self, sects={""}):
+        HTMLParser.__init__(self)
+        self.text = [""]
+        self.ishead = False
+        self.isinde = False
+        self.isbull = False
+        self.ispref = False
+        self.ishidden = False
+        self.idhead = set()
+        self.idinde = set()
+        self.idbull = set()
+        self.idpref = set()
+        self.idimgs = set()
+        self.sects = sects
+        self.sectsindex = {}
+        self.italic_marks: List[TextMark] = []
+        self.bold_marks: List[TextMark] = []
+        self.imgs: Dict[int, str] = dict()
+
+    def handle_starttag(self, tag, attrs):
+        if re.match("h[1-6]", tag) is not None:
+            self.ishead = True
+        elif tag in self.inde:
+            self.isinde = True
+        elif tag in self.pref:
+            self.ispref = True
+        elif tag in self.bull:
+            self.isbull = True
+        elif tag in self.hide:
+            self.ishidden = True
+        elif tag == "sup":
+            self.text[-1] += "^{"
+        elif tag == "sub":
+            self.text[-1] += "_{"
+        # NOTE: "img" and "image"
+        # In HTML, both are startendtag (no need for endtag)
+        # but in XHTML both need an endtag
+        elif tag in {"img", "image"}:
+            for i in attrs:
+                if (tag == "img" and i[0] == "src") or (tag == "image" and i[0].endswith("href")):
+                    this_line = len(self.text)
+                    self.idimgs.add(this_line)
+                    self.imgs[this_line] = unquote(i[1])
+                    self.text.append("[IMAGE]")
+        # formatting
+        elif tag in self.ital:
+            if len(self.italic_marks) == 0 or self.italic_marks[-1].is_valid():
+                char_pos = CharPos(row=len(self.text) - 1, col=len(self.text[-1]))
+                self.italic_marks.append(TextMark(start=char_pos))
+        elif tag in self.bold:
+            if len(self.bold_marks) == 0 or self.bold_marks[-1].is_valid():
+                char_pos = CharPos(row=len(self.text) - 1, col=len(self.text[-1]))
+                self.bold_marks.append(TextMark(start=char_pos))
+        if self.sects != {""}:
+            for i in attrs:
+                if i[0] == "id" and i[1] in self.sects:
+                    # self.text[-1] += " (#" + i[1] + ") "
+                    # self.sectsindex.append([len(self.text), i[1]])
+                    self.sectsindex[len(self.text) - 1] = i[1]
+
+    def handle_startendtag(self, tag, attrs):
+        if tag == "br":
+            self.text += [""]
+        elif tag in {"img", "image"}:
+            for i in attrs:
+                # if (tag == "img" and i[0] == "src")\
+                #         or (tag == "image" and i[0] == "xlink:href"):
+                if (tag == "img" and i[0] == "src") or (tag == "image" and i[0].endswith("href")):
+                    this_line = len(self.text)
+                    self.idimgs.add(this_line)
+                    self.imgs[this_line] = unquote(i[1])
+                    self.text.append("[IMAGE]")
+                    self.text.append("")
+        # sometimes attribute "id" is inside "startendtag"
+        # especially html from mobi module (kindleunpack fork)
+        if self.sects != {""}:
+            for i in attrs:
+                if i[0] == "id" and i[1] in self.sects:
+                    # self.text[-1] += " (#" + i[1] + ") "
+                    self.sectsindex[len(self.text) - 1] = i[1]
+    def handle_endtag(self, tag):
+        if re.match("h[1-6]", tag) is not None:
+            self.text.append("")
+            self.text.append("")
+            self.ishead = False
+        elif tag in self.para:
+            self.text.append("")
+        elif tag in self.hide:
+            self.ishidden = False
+        elif tag in self.inde:
+            if self.text[-1] != "":
+                self.text.append("")
+            self.isinde = False
+        elif tag in self.pref:
+            if self.text[-1] != "":
+                self.text.append("")
+            self.ispref = False
+        elif tag in self.bull:
+            if self.text[-1] != "":
+                self.text.append("")
+            self.isbull = False
+        elif tag in {"sub", "sup"}:
+            self.text[-1] += "}"
+        elif tag in {"img", "image"}:
+            self.text.append("")
+        # formatting
+        elif tag in self.ital:
+            char_pos = CharPos(row=len(self.text) - 1, col=len(self.text[-1]))
+            last_mark = self.italic_marks[-1]
+            self.italic_marks[-1] = dataclasses.replace(last_mark, end=char_pos)
+        elif tag in self.bold:
+            char_pos = CharPos(row=len(self.text) - 1, col=len(self.text[-1]))
+            last_mark = self.bold_marks[-1]
+            self.bold_marks[-1] = dataclasses.replace(last_mark, end=char_pos)
+
+    def handle_data(self, raw):
+        if raw and not self.ishidden:
+            if self.text[-1] == "":
+                tmp = raw.lstrip()
+            else:
+                tmp = raw
+            if self.ispref:
+                line = unescape(tmp)
+            else:
+                line = unescape(re.sub(r"\s+", " ", tmp))
+            self.text[-1] += line
+            if self.ishead:
+                self.idhead.add(len(self.text) - 1)
+            elif self.isbull:
+                self.idbull.add(len(self.text) - 1)
+            elif self.isinde:
+                self.idinde.add(len(self.text) - 1)
+            elif self.ispref:
+                self.idpref.add(len(self.text) - 1)
+    def get_structured_text(
+        self, textwidth: Optional[int] = 0, starting_line: int = 0
+    ) -> Union[Tuple[str, ...], TextStructure]:
+
+        if not textwidth:
+            return tuple(self.text)
+
+        text: List[str] = []
+        images: Dict[int, str] = dict()  # {line_num: path/in/zip}
+        sect: Dict[str, int] = dict()  # {section_id: line_num}
+        formatting: List[InlineStyle] = []
+
+        italic_spans: List[TextSpan] = HTMLtoLines._mark_to_spans(self.text, self.italic_marks)
+        bold_spans: List[TextSpan] = HTMLtoLines._mark_to_spans(self.text, self.bold_marks)
+        italic_groups = HTMLtoLines._group_spans_by_row(italic_spans)
+        bold_groups = HTMLtoLines._group_spans_by_row(bold_spans)
+
+        for n, line in enumerate(self.text):
+
+            startline = len(text)
+            # findsect = re.search(r"(?<= \(#).*?(?=\) )", line)
+            # if findsect is not None and findsect.group() in self.sects:
+            #     line = line.replace(" (#" + findsect.group() + ") ", "")
+            #     # line = line.replace(" (#" + findsect.group() + ") ", " "*(5+len(findsect.group())))
+            #     sect[findsect.group()] = len(text)
+            if n in self.sectsindex.keys():
+                sect[self.sectsindex[n]] = starting_line + len(text)
+            if n in self.idhead:
+                # text += [line.rjust(textwidth // 2 + len(line) // 2)] + [""]
+                text += [line.center(textwidth)] + [""]
+                formatting += [
+                    InlineStyle(
+                        row=starting_line + i, col=0, n_letters=len(text[i]), attr=self.attr_bold
+                    )
+                    for i in range(startline, len(text))
+                ]
+            elif n in self.idinde:
+                text += ["   " + i for i in textwrap.wrap(line, textwidth - 3)] + [""]
+            elif n in self.idbull:
+                tmp = textwrap.wrap(line, textwidth - 3)
+                text += [" - " + i if i == tmp[0] else "   " + i for i in tmp] + [""]
+            elif n in self.idpref:
+                tmp = line.splitlines()
+                wraptmp = []
+                for tmp_line in tmp:
+                    wraptmp += [i for i in textwrap.wrap(tmp_line, textwidth - 6)]
+                text += ["   " + i for i in wraptmp] + [""]
+            elif n in self.idimgs:
+                images[starting_line + len(text)] = self.imgs[n]
+                text += [line.center(textwidth)]
+                formatting += [
+                    InlineStyle(
+                        row=starting_line + len(text) - 1,
+                        col=0,
+                        n_letters=len(text[-1]),
+                        attr=self.attr_bold,
+                    )
+                ]
+                text += [""]
+            else:
+                text += textwrap.wrap(line, textwidth) + [""]
+
+            endline = len(text)  # -1
+
+            left_adjustment = 3 if n in self.idbull | self.idinde else 0
+
+            for spans in italic_groups.get(n, []):
+                italics = HTMLtoLines._adjust_wrapped_spans(
+                    text[startline:endline],
+                    spans,
+                    line_adjustment=startline,
+                    left_adjustment=left_adjustment,
+                )
+                for span in italics:
+                    formatting.append(
+                        InlineStyle(
+                            row=starting_line + span.start.row,
+                            col=span.start.col,
+                            n_letters=span.n_letters,
+                            attr=self.attr_italic,
+                        )
+                    )
+
+            for spans in bold_groups.get(n, []):
+                bolds = HTMLtoLines._adjust_wrapped_spans(
+                    text[startline:endline],
+                    spans,
+                    line_adjustment=startline,
+                    left_adjustment=left_adjustment,
+                )
+                for span in bolds:
+                    formatting.append(
+                        InlineStyle(
+                            row=starting_line + span.start.row,
+                            col=span.start.col,
+                            n_letters=span.n_letters,
+                            attr=self.attr_bold,
+                        )
+                    )
+
+        # chapter suffix
+        text += ["***".center(textwidth)]
+
+        return TextStructure(
+            text_lines=tuple(text),
+            image_maps=images,
+            section_rows=sect,
+            formatting=tuple(formatting),
+        )
+
+
+def parse_html(
+    html_src: str,
+    *,
+    textwidth: Optional[int] = None,
+    section_ids: Optional[Set[str]] = None,
+    starting_line: int = 0,
+) -> Union[Tuple[str, ...], TextStructure]:
+    """
+    Parse html string into TextStructure
+
+    :param html_src: html str to parse
+    :param textwidth: textwidth to count max length of returned TextStructure;
+                      if None is given, a sequence of text as paragraphs is returned
+    :param section_ids: set of section ids to look for inside html tag attr
+    :return: Tuple[str, ...] if textwidth not given else TextStructure
+    """
+    if not section_ids:
+        section_ids = set()
+
+    parser = HTMLtoLines(section_ids)
+    # try:
+    parser.feed(html_src)
+    parser.close()
+    # except:
+    #     pass
+
+    return parser.get_structured_text(textwidth, starting_line)
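End to end, the parser is used roughly like this (the HTML snippet is made up):

```python
from epy_reader.parser import parse_html

html = "<h1>Chapter I</h1><p>It was a <i>bright</i> cold day.</p>"

# Without textwidth: a plain tuple of paragraph strings
print(parse_html(html))

# With textwidth: a TextStructure with wrapped lines plus inline styling
structure = parse_html(html, textwidth=30)
print(structure.text_lines)
print(structure.formatting)  # InlineStyle entries for the <h1> and <i> spans
```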
diff --git a/src/epy_reader/reader.py b/src/epy_reader/reader.py
new file mode 100644
index 0000000..a903b62
--- /dev/null
+++ b/src/epy_reader/reader.py
@@ -0,0 +1,1610 @@
+import curses
+import dataclasses
+import multiprocessing
+import os
+import re
+import shutil
+import signal
+import sqlite3
+import subprocess
+import sys
+import tempfile
+import uuid
+import xml.etree.ElementTree as ET
+from html import unescape
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
+
+import epy_reader.settings as settings
+from epy_reader.board import InfiniBoard
+from epy_reader.config import Config
+from epy_reader.ebooks import Azw, Ebook, Epub, Mobi
+from epy_reader.lib import resolve_path
+from epy_reader.models import (
+    Direction,
+    InlineStyle,
+    Key,
+    LettersCount,
+    NoUpdate,
+    ReadingState,
+    SearchData,
+    TextStructure,
+    TocEntry,
+)
+from epy_reader.parser import parse_html
+from epy_reader.settings import DoubleSpreadPadding
+from epy_reader.speakers import SpeakerBaseModel
+from epy_reader.state import State
+from epy_reader.utils import (
+    choice_win,
+    construct_relative_reading_state,
+    construct_speaker,
+    count_letters,
+    count_letters_parallel,
+    find_current_content_index,
+    get_ebook_obj,
+    merge_text_structures,
+    pgend,
+    safe_curs_set,
+    text_win,
+)
+
+
+# TODO: to be deprecated
+DEBUG = False
+
+
+class Reader:
+    def __init__(self, screen, ebook: Ebook, config: Config, state: State):
+
+        self.setting = config.setting
+        self.keymap = config.keymap
+        # to build help menu text
+        self.keymap_user_dict = config.keymap_user_dict
+
+        self.seamless = self.setting.SeamlessBetweenChapters
+
+        # keys that will make
+        # windows exit and return the said key
+        self._win_keys = (
+            # curses.KEY_RESIZE is a must
+            (Key(curses.KEY_RESIZE),)
+            + self.keymap.TableOfContents
+            + self.keymap.Metadata
+            + self.keymap.Help
+        )
+
+        # screen initialization
+        self.screen = screen
+        self.screen.keypad(True)
+        safe_curs_set(0)
+        if self.setting.MouseSupport:
+            curses.mousemask(-1)
+            # curses.mouseinterval(0)
+        self.screen.clear()
+
+        # screen color
+        self.is_color_supported: bool = False
+        try:
+            curses.use_default_colors()
+            curses.init_pair(1, self.setting.DefaultColorFG, self.setting.DefaultColorBG)
+            curses.init_pair(2, self.setting.DarkColorFG, self.setting.DarkColorBG)
+            curses.init_pair(3, self.setting.LightColorFG, self.setting.LightColorBG)
+            self.screen.bkgd(curses.color_pair(1))
+            self.is_color_supported = True
+        except:
+            self.is_color_supported = False
+
+        # show loader and start heavy resources processes
+        self.show_loader(subtext="initializing ebook")
+
+        # main ebook object
+        self.ebook = ebook
+        try:
+            self.ebook.initialize()
+        except (KeyboardInterrupt, Exception) as e:
+            self.ebook.cleanup()
+            if DEBUG:
+                raise e
+            else:
+                sys.exit("ERROR: Badly-structured ebook.\n" + str(e))
+
+        # state
+        self.state = state
+
+        # page scroll animation
+        self.page_animation: Optional[Direction] = None
+
+        # show reading progress
+        self.show_reading_progress: bool = self.setting.ShowProgressIndicator
+        self.reading_progress: Optional[float] = None  # calculated after count_letters()
+
+        # search storage
+        self.search_data: Optional[SearchData] = None
+
+        # double spread
+        self.spread = 2 if self.setting.StartWithDoubleSpread else 1
+
+        # jumps marker container
+        self.jump_list: Dict[str, ReadingState] = dict()
+
+        # TTS speaker utils
+        self._tts_speaker: Optional[SpeakerBaseModel] = construct_speaker(
+            self.setting.PreferredTTSEngine, self.setting.TTSEngineArgs
+        )
+        self.tts_support: bool = bool(self._tts_speaker)
+        self.is_speaking: bool = False
+
+        # multi process & progress percentage
+        self._multiprocess_support: bool = False if multiprocessing.cpu_count() == 1 else True
+        self._process_counting_letter: Optional[multiprocessing.Process] = None
+        self.letters_count: Optional[LettersCount] = None
+    def run_counting_letters(self):
+        if self._multiprocess_support:
+            try:
+                self._proc_parent, self._proc_child = multiprocessing.Pipe()
+                self._process_counting_letter = multiprocessing.Process(
+                    name="epy-subprocess-counting-letters",
+                    target=count_letters_parallel,
+                    args=(self.ebook, self._proc_child),
+                )
+                # forking will raise
+                # zlib.error: Error -3 while decompressing data: invalid distance too far back
+                self._process_counting_letter.start()
+            except Exception as e:
+                if DEBUG:
+                    raise e
+                self._multiprocess_support = False
+        if not self._multiprocess_support:
+            self.letters_count = count_letters(self.ebook)
+
+    def try_assign_letters_count(self, *, force_wait=False) -> None:
+        if isinstance(self._process_counting_letter, multiprocessing.Process):
+            if force_wait and self._process_counting_letter.is_alive():
+                self._process_counting_letter.join()
+
+            if self._process_counting_letter.exitcode == 0:
+                self.letters_count = self._proc_parent.recv()
+                self._proc_parent.close()
+                self._process_counting_letter.terminate()
+                self._process_counting_letter.close()
+                self._process_counting_letter = None
+
+    def calculate_reading_progress(
+        self, letters_per_content: List[int], reading_state: ReadingState
+    ) -> None:
+        if self.letters_count:
+            self.reading_progress = (
+                self.letters_count.cumulative[reading_state.content_index]
+                + sum(
+                    letters_per_content[: reading_state.row + (self.screen_rows * self.spread) - 1]
+                )
+            ) / self.letters_count.all
+
+    @property
+    def screen_rows(self) -> int:
+        return self.screen.getmaxyx()[0]
+
+    @property
+    def screen_cols(self) -> int:
+        return self.screen.getmaxyx()[1]
+
+    @property
+    def ext_dict_app(self) -> Optional[str]:
+        self._ext_dict_app: Optional[str] = None
+
+        if shutil.which(self.setting.DictionaryClient.split()[0]):
+            self._ext_dict_app = self.setting.DictionaryClient
+        else:
+            for i in settings.DICT_PRESET_LIST:
+                if shutil.which(i) is not None:
+                    self._ext_dict_app = i
+                    break
+        if self._ext_dict_app in {"sdcv"}:
+            self._ext_dict_app += " -n"
+
+        return self._ext_dict_app
+
+    @property
+    def image_viewer(self) -> Optional[str]:
+        self._image_viewer: Optional[str] = None
+
+        if shutil.which(self.setting.DefaultViewer.split()[0]) is not None:
+            self._image_viewer = self.setting.DefaultViewer
+        elif sys.platform == "win32":
+            self._image_viewer = "start"
+        elif sys.platform == "darwin":
+            self._image_viewer = "open"
+        else:
+            for i in settings.VIEWER_PRESET_LIST:
+                if shutil.which(i) is not None:
+                    self._image_viewer = i
+                    break
+
+        if self._image_viewer in {"gio"}:
+            self._image_viewer += " open"
+
+        return self._image_viewer
+
+    def open_image(self, pad, name, bstr):
+        sfx = os.path.splitext(name)[1]
+        fd, path = tempfile.mkstemp(suffix=sfx)
+        try:
+            with os.fdopen(fd, "wb") as tmp:
+                # tmp.write(epub.file.read(src))
+                tmp.write(bstr)
+            # run(VWR + " " + path, shell=True)
+            subprocess.call(
+                self.image_viewer + " " + path,
+                shell=True,
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+            )
+            k = pad.getch()
+        finally:
+            os.remove(path)
+        return k
+
+    def show_loader(self, *, loader_str: str = "\u231B", subtext: Optional[str] = None):
+        self.screen.clear()
+        rows, cols = self.screen.getmaxyx()
+        middle_row = (rows - 1) // 2
+        self.screen.addstr(middle_row, 0, loader_str.center(cols))
+        if subtext:
+            self.screen.addstr(middle_row + 1, 0, subtext.center(cols))
+        # self.screen.addstr(((rows-2)//2)+1, (cols-len(msg))//2, msg)
+        self.screen.refresh()
+
+    @choice_win(True)
+    def show_win_options(self, title, options, active_index, key_set):
+        return title, options, active_index, key_set
+
+    @text_win
+    def show_win_error(self, title, msg, key):
+        return title, msg, key
+
+    @choice_win()
+    def toc(self, toc_entries: Tuple[TocEntry, ...], index: int):
+        return (
+            "Table of Contents",
+            [i.label for i in toc_entries],
+            index,
+            self.keymap.TableOfContents,
+        )
+
+    @text_win
+    def show_win_metadata(self):
+        if os.path.isfile(self.ebook.path):
+            mdata = "[File Info]\nPATH: {}\nSIZE: {} MB\n \n[Book Info]\n".format(
+                self.ebook.path, round(os.path.getsize(self.ebook.path) / 1024**2, 2)
+            )
+        else:
+            mdata = "[File Info]\nPATH: {}\n \n[Book Info]\n".format(self.ebook.path)
+
+        book_metadata = self.ebook.get_meta()
+        for field in dataclasses.fields(book_metadata):
+            value = getattr(book_metadata, field.name)
+            if value:
+                value = unescape(re.sub("<[^>]*>", "", value))
+                mdata += f"{field.name.title()}: {value}\n"
+
+        return "Metadata", mdata, self.keymap.Metadata
+
+    @text_win
+    def show_win_help(self):
+        src = "Key Bindings:\n"
+        dig = max([len(i) for i in self.keymap_user_dict.values()]) + 2
+        for i in self.keymap_user_dict.keys():
+            src += "{} {}\n".format(
+                self.keymap_user_dict[i].rjust(dig), " ".join(re.findall("[A-Z][^A-Z]*", i))
+            )
+        return "Help", src, self.keymap.Help
+
+    @text_win
+    def define_word(self, word):
+        rows, cols = self.screen.getmaxyx()
+        hi, wi = 5, 16
+        Y, X = (rows - hi) // 2, (cols - wi) // 2
+
+        p = subprocess.Popen(
+            "{} {}".format(self.ext_dict_app, word),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            shell=True,
+        )
+
+        dictwin = curses.newwin(hi, wi, Y, X)
+        dictwin.box()
+        dictwin.addstr((hi - 1) // 2, (wi - 10) // 2, "Loading...")
+        dictwin.refresh()
+
+        out, err = p.communicate()
+
+        dictwin.clear()
+        dictwin.refresh()
+
+        if err == b"":
+            return "Definition: " + word.upper(), out.decode(), self.keymap.DefineWord
+        else:
+            return "Error: " + self.ext_dict_app, err.decode(), self.keymap.DefineWord
+
+    def show_win_choices_bookmarks(self):
+        idx = 0
+        while True:
+            bookmarks = [i[0] for i in self.state.get_bookmarks(self.ebook)]
+            if not bookmarks:
+                return self.keymap.ShowBookmarks[0], None
+
+            retk, idx, todel = self.show_win_options(
+                "Bookmarks", bookmarks, idx, self.keymap.ShowBookmarks
+            )
+            if todel is not None:
+                self.state.delete_bookmark(self.ebook, bookmarks[todel])
+            else:
+                return retk, idx
os.path.splitext(name)[1] + fd, path = tempfile.mkstemp(suffix=sfx) + try: + with os.fdopen(fd, "wb") as tmp: + # tmp.write(epub.file.read(src)) + tmp.write(bstr) + # run(VWR + " " + path, shell=True) + subprocess.call( + self.image_viewer + " " + path, + shell=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + k = pad.getch() + finally: + os.remove(path) + return k + + def show_loader(self, *, loader_str: str = "\u231B", subtext: Optional[str] = None): + self.screen.clear() + rows, cols = self.screen.getmaxyx() + middle_row = (rows - 1) // 2 + self.screen.addstr(middle_row, 0, loader_str.center(cols)) + if subtext: + self.screen.addstr(middle_row + 1, 0, subtext.center(cols)) + # self.screen.addstr(((rows-2)//2)+1, (cols-len(msg))//2, msg) + self.screen.refresh() + + @choice_win(True) + def show_win_options(self, title, options, active_index, key_set): + return title, options, active_index, key_set + + @text_win + def show_win_error(self, title, msg, key): + return title, msg, key + + @choice_win() + def toc(self, toc_entries: Tuple[TocEntry, ...], index: int): + return ( + "Table of Contents", + [i.label for i in toc_entries], + index, + self.keymap.TableOfContents, + ) + + @text_win + def show_win_metadata(self): + if os.path.isfile(self.ebook.path): + mdata = "[File Info]\nPATH: {}\nSIZE: {} MB\n \n[Book Info]\n".format( + self.ebook.path, round(os.path.getsize(self.ebook.path) / 1024**2, 2) + ) + else: + mdata = "[File Info]\nPATH: {}\n \n[Book Info]\n".format(self.ebook.path) + + book_metadata = self.ebook.get_meta() + for field in dataclasses.fields(book_metadata): + value = getattr(book_metadata, field.name) + if value: + value = unescape(re.sub("<[^>]*>", "", value)) + mdata += f"{field.name.title()}: {value}\n" + + return "Metadata", mdata, self.keymap.Metadata + + @text_win + def show_win_help(self): + src = "Key Bindings:\n" + dig = max([len(i) for i in self.keymap_user_dict.values()]) + 2 + for i in self.keymap_user_dict.keys(): + src += "{} {}\n".format( + self.keymap_user_dict[i].rjust(dig), " ".join(re.findall("[A-Z][^A-Z]*", i)) + ) + return "Help", src, self.keymap.Help + + @text_win + def define_word(self, word): + rows, cols = self.screen.getmaxyx() + hi, wi = 5, 16 + Y, X = (rows - hi) // 2, (cols - wi) // 2 + + p = subprocess.Popen( + "{} {}".format(self.ext_dict_app, word), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True, + ) + + dictwin = curses.newwin(hi, wi, Y, X) + dictwin.box() + dictwin.addstr((hi - 1) // 2, (wi - 10) // 2, "Loading...") + dictwin.refresh() + + out, err = p.communicate() + + dictwin.clear() + dictwin.refresh() + + if err == b"": + return "Definition: " + word.upper(), out.decode(), self.keymap.DefineWord + else: + return "Error: " + self.ext_dict_app, err.decode(), self.keymap.DefineWord + + def show_win_choices_bookmarks(self): + idx = 0 + while True: + bookmarks = [i[0] for i in self.state.get_bookmarks(self.ebook)] + if not bookmarks: + return self.keymap.ShowBookmarks[0], None + + retk, idx, todel = self.show_win_options( + "Bookmarks", bookmarks, idx, self.keymap.ShowBookmarks + ) + if todel is not None: + self.state.delete_bookmark(self.ebook, bookmarks[todel]) + else: + return retk, idx + + def show_win_library(self): + while True: + library_items = self.state.get_from_history() + if not library_items: + return self.keymap.Library[0], None + + retk, choice_index, todel_index = self.show_win_options( + "Library", [str(item) for item in library_items], 0, self.keymap.Library + ) + if todel_index is not 
None: + self.state.delete_from_library(library_items[todel_index].filepath) + else: + return retk, choice_index + + def input_prompt(self, prompt: str) -> Union[NoUpdate, Key, str]: + """ + :param prompt: prompt text + :return: NoUpdate if cancelled or interrupted + Key if curses.KEY_RESIZE triggered + str for successful input + """ + # prevent pad hole when prompting for input while + # other window is active + # pad.refresh(y, 0, 0, x, rows-2, x+width) + rows, cols = self.screen.getmaxyx() + stat = curses.newwin(1, cols, rows - 1, 0) + if self.is_color_supported: + stat.bkgd(self.screen.getbkgd()) + stat.keypad(True) + curses.echo(True) + safe_curs_set(2) + + init_text = "" + + stat.addstr(0, 0, prompt, curses.A_REVERSE) + stat.addstr(0, len(prompt), init_text) + stat.refresh() + + try: + while True: + # NOTE: getch() only handles ascii + # to handle wide char like: é, use get_wch() + ipt = Key(stat.get_wch()) + # get_wch() return ambiguous type + # str for string input but int for function or special keys + # if type(ipt) == str: + # ipt = ord(ipt) + + if ipt == Key(27): + stat.clear() + stat.refresh() + curses.echo(False) + safe_curs_set(0) + return NoUpdate() + elif ipt == Key(10): + stat.clear() + stat.refresh() + curses.echo(False) + safe_curs_set(0) + return init_text + elif ipt in (Key(8), Key(127), Key(curses.KEY_BACKSPACE)): + init_text = init_text[:-1] + elif ipt == Key(curses.KEY_RESIZE): + stat.clear() + stat.refresh() + curses.echo(False) + safe_curs_set(0) + return Key(curses.KEY_RESIZE) + # elif len(init_text) <= maxlen: + else: + init_text += ipt.char + + stat.clear() + stat.addstr(0, 0, prompt, curses.A_REVERSE) + stat.addstr( + 0, + len(prompt), + init_text + if len(prompt + init_text) < cols + else "..." + init_text[len(prompt) - cols + 4 :], + ) + stat.refresh() + except KeyboardInterrupt: + stat.clear() + stat.refresh() + curses.echo(False) + safe_curs_set(0) + return NoUpdate() + + def searching( + self, board: InfiniBoard, src: Sequence[str], reading_state: ReadingState, tot + ) -> Union[NoUpdate, ReadingState, Key]: + # reusable loop indices + i: Any + j: Any + + rows, cols = self.screen.getmaxyx() + # unnecessary + # if self.spread == 2: + # reading_state = dataclasses.replace(reading_state, textwidth=(cols - 7) // 2) + + x = (cols - reading_state.textwidth) // 2 + if self.spread == 1: + x = (cols - reading_state.textwidth) // 2 + else: + x = 2 + + if not self.search_data: + candidate_text = self.input_prompt(" Regex:") + # if isinstance(candidate_text, str) and candidate_text != "": + if isinstance(candidate_text, str) and candidate_text: + self.search_data = SearchData(value=candidate_text) + else: + assert isinstance(candidate_text, NoUpdate) or isinstance(candidate_text, Key) + return candidate_text + + found = [] + try: + pattern = re.compile(self.search_data.value, re.IGNORECASE) + except re.error as reerrmsg: + self.search_data = None + tmpk = self.show_win_error("!Regex Error", str(reerrmsg), tuple()) + return tmpk + + for n, i in enumerate(src): + for j in pattern.finditer(i): + found.append([n, j.span()[0], j.span()[1] - j.span()[0]]) + + if not found: + if ( + self.search_data.direction == Direction.FORWARD + and reading_state.content_index + 1 < tot + ): + return ReadingState( + content_index=reading_state.content_index + 1, + textwidth=reading_state.textwidth, + row=0, + ) + elif ( + self.search_data.direction == Direction.BACKWARD and reading_state.content_index > 0 + ): + return ReadingState( + content_index=reading_state.content_index - 1, + 
textwidth=reading_state.textwidth, + row=0, + ) + else: + s: Union[NoUpdate, Key] = NoUpdate() + while True: + if s in self.keymap.Quit: + self.search_data = None + self.screen.clear() + self.screen.refresh() + return reading_state + # TODO: maybe >= 0? + elif s == Key("n") and reading_state.content_index == 0: + self.search_data = dataclasses.replace( + self.search_data, direction=Direction.FORWARD + ) + return ReadingState( + content_index=reading_state.content_index + 1, + textwidth=reading_state.textwidth, + row=0, + ) + elif s == Key("N") and reading_state.content_index + 1 == tot: + self.search_data = dataclasses.replace( + self.search_data, direction=Direction.BACKWARD + ) + return ReadingState( + content_index=reading_state.content_index - 1, + textwidth=reading_state.textwidth, + row=0, + ) + + self.screen.clear() + self.screen.addstr( + rows - 1, + 0, + " Finished searching: " + self.search_data.value[: cols - 22] + " ", + curses.A_REVERSE, + ) + board.write(reading_state.row, 1) + self.screen.refresh() + s = board.getch() + + sidx = len(found) - 1 + if self.search_data.direction == Direction.FORWARD: + if reading_state.row > found[-1][0]: + return ReadingState( + content_index=reading_state.content_index + 1, + textwidth=reading_state.textwidth, + row=0, + ) + for n, i in enumerate(found): + if i[0] >= reading_state.row: + sidx = n + break + + s = NoUpdate() + msg = ( + " Searching: " + + self.search_data.value + + " --- Res {}/{} Ch {}/{} ".format( + sidx + 1, len(found), reading_state.content_index + 1, tot + ) + ) + while True: + if s in self.keymap.Quit: + self.search_data = None + # for i in found: + # pad.chgat(i[0], i[1], i[2], pad.getbkgd()) + board.feed_temporary_style() + # pad.format() + # self.screen.clear() + # self.screen.refresh() + return reading_state + elif s == Key("n"): + self.search_data = dataclasses.replace( + self.search_data, direction=Direction.FORWARD + ) + if sidx == len(found) - 1: + if reading_state.content_index + 1 < tot: + return ReadingState( + content_index=reading_state.content_index + 1, + textwidth=reading_state.textwidth, + row=0, + ) + else: + s = NoUpdate() + msg = " Finished searching: " + self.search_data.value + " " + continue + else: + sidx += 1 + msg = ( + " Searching: " + + self.search_data.value + + " --- Res {}/{} Ch {}/{} ".format( + sidx + 1, len(found), reading_state.content_index + 1, tot + ) + ) + elif s == Key("N"): + self.search_data = dataclasses.replace( + self.search_data, direction=Direction.BACKWARD + ) + if sidx == 0: + if reading_state.content_index > 0: + return ReadingState( + content_index=reading_state.content_index - 1, + textwidth=reading_state.textwidth, + row=0, + ) + else: + s = NoUpdate() + msg = " Finished searching: " + self.search_data.value + " " + continue + else: + sidx -= 1 + msg = ( + " Searching: " + + self.search_data.value + + " --- Res {}/{} Ch {}/{} ".format( + sidx + 1, len(found), reading_state.content_index + 1, tot + ) + ) + elif s == Key(curses.KEY_RESIZE): + return Key(curses.KEY_RESIZE) + + # if reading_state.row + rows - 1 > pad.chunks[pad.find_chunkidx(reading_state.row)]: + # reading_state = dataclasses.replace( + # reading_state, row=pad.chunks[pad.find_chunkidx(reading_state.row)] + 1 + # ) + + while found[sidx][0] not in list( + range(reading_state.row, reading_state.row + (rows - 1) * self.spread) + ): + if found[sidx][0] > reading_state.row: + reading_state = dataclasses.replace( + reading_state, row=reading_state.row + ((rows - 1) * self.spread) + ) + else: + reading_state = 
dataclasses.replace( + reading_state, row=reading_state.row - ((rows - 1) * self.spread) + ) + if reading_state.row < 0: + reading_state = dataclasses.replace(reading_state, row=0) + + # formats = [InlineStyle(row=i[0], col=i[1], n_letters=i[2], attr=curses.A_REVERSE) for i in found] + # pad.feed_style(formats) + styles: List[InlineStyle] = [] + for n, i in enumerate(found): + attr = curses.A_REVERSE if n == sidx else curses.A_NORMAL + # pad.chgat(i[0], i[1], i[2], pad.getbkgd() | attr) + styles.append( + InlineStyle(row=i[0], col=i[1], n_letters=i[2], attr=board.getbkgd() | attr) + ) + board.feed_temporary_style(tuple(styles)) + + self.screen.clear() + self.screen.addstr(rows - 1, 0, msg, curses.A_REVERSE) + self.screen.refresh() + # pad.refresh(reading_state.row, 0, 0, x, rows - 2, x + reading_state.textwidth) + board.write(reading_state.row, 1) + s = board.getch() + + def speaking(self, text): + self.is_speaking = True + self.screen.addstr(self.screen_rows - 1, 0, " Speaking! ", curses.A_REVERSE) + self.screen.refresh() + self.screen.timeout(1) + try: + self._tts_speaker.speak(text) + + while True: + if self._tts_speaker.is_done(): + k = self.keymap.PageDown[0] + break + tmp = self.screen.getch() + k = NoUpdate() if tmp == -1 else Key(tmp) + if k == Key(curses.KEY_MOUSE): + mouse_event = curses.getmouse() + if mouse_event[4] == curses.BUTTON2_CLICKED: + k = self.keymap.Quit[0] + elif mouse_event[4] == curses.BUTTON1_CLICKED: + if mouse_event[1] < self.screen_cols // 2: + k = self.keymap.PageUp[0] + else: + k = self.keymap.PageDown[0] + elif mouse_event[4] == curses.BUTTON4_PRESSED: + k = self.keymap.ScrollUp[0] + elif mouse_event[4] == 2097152: + k = self.keymap.ScrollDown[0] + if ( + k + in self.keymap.Quit + + self.keymap.PageUp + + self.keymap.PageDown + + self.keymap.ScrollUp + + self.keymap.ScrollDown + + (curses.KEY_RESIZE,) + ): + self._tts_speaker.stop() + break + finally: + self.screen.timeout(-1) + self._tts_speaker.cleanup() + + if k in self.keymap.Quit: + self.is_speaking = False + k = NoUpdate() + return k + + def savestate(self, reading_state: ReadingState) -> None: + if self.seamless: + reading_state = self.convert_absolute_reading_state_to_relative(reading_state) + self.state.set_last_reading_state(self.ebook, reading_state) + self.state.update_library(self.ebook, self.reading_progress) + + def cleanup(self) -> None: + self.ebook.cleanup() + + if isinstance(self._process_counting_letter, multiprocessing.Process): + if self._process_counting_letter.is_alive(): + self._process_counting_letter.terminate() + # weird python multiprocessing issue, need to call .join() before .close() + # ValueError: Cannot close a process while it is still running. + # You should first call join() or terminate(). 
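A note on the join-before-close ordering flagged in the comment above: this is standard multiprocessing behavior rather than an epy quirk: Process.close() raises ValueError while the worker is still alive, so join() (or terminate()) has to come first. Below is a minimal, self-contained sketch of the same Pipe/Process lifecycle used for the letter-counting worker; count_chars is a hypothetical stand-in for count_letters_parallel:

    import multiprocessing

    def count_chars(texts, conn):
        # worker: send a single result back to the parent, then close our end
        conn.send(sum(len(t) for t in texts))
        conn.close()

    if __name__ == "__main__":
        parent_conn, child_conn = multiprocessing.Pipe()
        proc = multiprocessing.Process(target=count_chars, args=(["ab", "cde"], child_conn))
        proc.start()
        print(parent_conn.recv())  # blocks until the worker sends; prints 5
        parent_conn.close()
        proc.join()   # wait first: closing a live process raises ValueError
        proc.close()  # then release the Process object's resources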
+ self._process_counting_letter.join() + self._process_counting_letter.close() + + def convert_absolute_reading_state_to_relative(self, reading_state) -> ReadingState: + if not self.seamless: + raise RuntimeError( + "Reader.convert_absolute_reading_state_to_relative() only implemented when Seamless=True" + ) + return construct_relative_reading_state(reading_state, self.totlines_per_content) + + def convert_relative_reading_state_to_absolute( + self, reading_state: ReadingState + ) -> ReadingState: + if not self.seamless: + raise RuntimeError( + "Reader.convert_relative_reading_state_to_absolute() only implemented when Seamless=True" + ) + + absolute_row = reading_state.row + sum( + self.totlines_per_content[: reading_state.content_index] + ) + absolute_pctg = ( + absolute_row / sum(self.totlines_per_content) if reading_state.rel_pctg else None + ) + + return dataclasses.replace( + reading_state, content_index=0, row=absolute_row, rel_pctg=absolute_pctg + ) + + def get_all_book_contents( + self, reading_state: ReadingState + ) -> Tuple[TextStructure, Tuple[TocEntry, ...], Union[Tuple[str, ...], Tuple[ET.Element, ...]]]: + if not self.seamless: + raise RuntimeError("Reader.get_all_book_contents() only implemented when Seamless=True") + + contents = self.ebook.contents + toc_entries = self.ebook.toc_entries + + text_structure: TextStructure = TextStructure( + text_lines=tuple(), image_maps=dict(), section_rows=dict(), formatting=tuple() + ) + toc_entries_tmp: List[TocEntry] = [] + section_rows_tmp: Dict[str, int] = dict() + + # self.totlines_per_content only defined when Seamless=True + self.totlines_per_content: Tuple[int, ...] = tuple() + + for n, content in enumerate(contents): + self.show_loader(subtext=f"loading contents ({n+1}/{len(contents)})") + starting_line = sum(self.totlines_per_content) + assert isinstance(content, str) or isinstance(content, ET.Element) + text_structure_tmp = parse_html( + self.ebook.get_raw_text(content), + textwidth=reading_state.textwidth, + section_ids=set(toc_entry.section for toc_entry in toc_entries), # type: ignore + starting_line=starting_line, + ) + assert isinstance(text_structure_tmp, TextStructure) + # self.totlines_per_content.append(len(text_structure_tmp.text_lines)) + self.totlines_per_content += (len(text_structure_tmp.text_lines),) + + for toc_entry in toc_entries: + if toc_entry.content_index == n: + if toc_entry.section: + toc_entries_tmp.append(dataclasses.replace(toc_entry, content_index=0)) + else: + section_id_tmp = str(uuid.uuid4()) + toc_entries_tmp.append( + TocEntry(label=toc_entry.label, content_index=0, section=section_id_tmp) + ) + section_rows_tmp[section_id_tmp] = starting_line + + text_structure = merge_text_structures(text_structure, text_structure_tmp) + + text_structure = dataclasses.replace( + text_structure, section_rows={**text_structure.section_rows, **section_rows_tmp} + ) + + return text_structure, tuple(toc_entries_tmp), (self.ebook.contents[0],) + + def get_current_book_content( + self, reading_state: ReadingState + ) -> Tuple[TextStructure, Tuple[TocEntry, ...], Union[Tuple[str, ...], Tuple[ET.Element, ...]]]: + contents = self.ebook.contents + toc_entries = self.ebook.toc_entries + content_path = contents[reading_state.content_index] + content = self.ebook.get_raw_text(content_path) + text_structure = parse_html( # type: ignore + content, + textwidth=reading_state.textwidth, + section_ids=set(toc_entry.section for toc_entry in toc_entries), # type: ignore + ) + return text_structure, toc_entries, contents + + def 
read(self, reading_state: ReadingState) -> Union[ReadingState, Ebook]: + # reusable loop indices + i: Any + + k = self.keymap.RegexSearch[0] if self.search_data else NoUpdate() + rows, cols = self.screen.getmaxyx() + + mincols_doublespr = ( + DoubleSpreadPadding.LEFT.value + + 22 + + DoubleSpreadPadding.MIDDLE.value + + 22 + + DoubleSpreadPadding.RIGHT.value + ) + if cols < mincols_doublespr: + self.spread = 1 + if self.spread == 2: + reading_state = dataclasses.replace( + reading_state, + textwidth=( + cols + - sum( + [ + DoubleSpreadPadding.LEFT.value, + DoubleSpreadPadding.MIDDLE.value, + DoubleSpreadPadding.RIGHT.value, + ] + ) + ) + // 2, + ) + x = (cols - reading_state.textwidth) // 2 + if self.spread == 2: + x = DoubleSpreadPadding.LEFT.value + + self.show_loader(subtext="loading contents") + # get text structure, toc entries and contents of the book + if self.seamless: + text_structure, toc_entries, contents = self.get_all_book_contents(reading_state) + # adjustment + reading_state = self.convert_relative_reading_state_to_absolute(reading_state) + else: + text_structure, toc_entries, contents = self.get_current_book_content(reading_state) + + totlines = len(text_structure.text_lines) + + if reading_state.row < 0 and totlines <= rows * self.spread: + reading_state = dataclasses.replace(reading_state, row=0) + elif reading_state.rel_pctg is not None: + reading_state = dataclasses.replace( + reading_state, row=round(reading_state.rel_pctg * totlines) + ) + else: + reading_state = dataclasses.replace(reading_state, row=reading_state.row % totlines) + + board = InfiniBoard( + screen=self.screen, + text=text_structure.text_lines, + textwidth=reading_state.textwidth, + default_style=text_structure.formatting, + spread=self.spread, + ) + + letters_per_content: List[int] = [] + for i in text_structure.text_lines: + letters_per_content.append(len(re.sub(r"\s", "", i))) + + self.screen.clear() + self.screen.refresh() + # try-except clause if there is issue + # with curses resize event + board.write(reading_state.row) + + # if reading_state.section is not None + # then override reading_state.row to follow the section + if reading_state.section: + reading_state = dataclasses.replace( + reading_state, row=text_structure.section_rows.get(reading_state.section, 0) + ) + + checkpoint_row: Optional[int] = None + countstring = "" + + try: + while True: + if countstring == "": + count = 1 + else: + count = int(countstring) + if k in tuple(Key(i) for i in range(48, 58)): # i.e., k is a numeral + countstring = countstring + k.char + else: + if k in self.keymap.Quit: + if k == Key(27) and countstring != "": + countstring = "" + else: + self.try_assign_letters_count(force_wait=True) + self.calculate_reading_progress(letters_per_content, reading_state) + + self.savestate( + dataclasses.replace( + reading_state, rel_pctg=reading_state.row / totlines + ) + ) + sys.exit() + + elif k in self.keymap.TTSToggle and self.tts_support: + tospeak = "" + for i in text_structure.text_lines[ + reading_state.row : reading_state.row + (rows * self.spread) + ]: + if re.match(r"^\s*$", i) is not None: + tospeak += "\n. 
\n" + else: + tospeak += i + " " + k = self.speaking(tospeak) + if ( + totlines - reading_state.row <= rows + and reading_state.content_index == len(contents) - 1 + ): + self.is_speaking = False + continue + + elif k in self.keymap.DoubleSpreadToggle: + if cols < mincols_doublespr: + k = self.show_win_error( + "Screen is too small", + "Min: {} cols x {} rows".format(mincols_doublespr, 12), + (Key("D"),), + ) + self.spread = (self.spread % 2) + 1 + return ReadingState( + content_index=reading_state.content_index, + textwidth=reading_state.textwidth, + row=reading_state.row, + rel_pctg=reading_state.row / totlines, + ) + + elif k in self.keymap.ScrollUp: + if self.spread == 2: + k = self.keymap.PageUp[0] + continue + if count > 1: + checkpoint_row = reading_state.row - 1 + if reading_state.row >= count: + reading_state = dataclasses.replace( + reading_state, row=reading_state.row - count + ) + elif reading_state.row == 0 and reading_state.content_index != 0: + self.page_animation = Direction.BACKWARD + # return -1, width, -rows, None, "" + return ReadingState( + content_index=reading_state.content_index - 1, + textwidth=reading_state.textwidth, + row=-rows, + ) + else: + reading_state = dataclasses.replace(reading_state, row=0) + + elif k in self.keymap.PageUp: + if reading_state.row == 0 and reading_state.content_index != 0: + self.page_animation = Direction.BACKWARD + text_structure_content_before = parse_html( + self.ebook.get_raw_text(contents[reading_state.content_index - 1]), + textwidth=reading_state.textwidth, + ) + assert isinstance(text_structure_content_before, TextStructure) + return ReadingState( + content_index=reading_state.content_index - 1, + textwidth=reading_state.textwidth, + row=rows + * self.spread + * ( + len(text_structure_content_before.text_lines) + // (rows * self.spread) + ), + ) + else: + if reading_state.row >= rows * self.spread * count: + self.page_animation = Direction.BACKWARD + reading_state = dataclasses.replace( + reading_state, + row=reading_state.row - (rows * self.spread * count), + ) + else: + reading_state = dataclasses.replace(reading_state, row=0) + + elif k in self.keymap.ScrollDown: + if self.spread == 2: + k = self.keymap.PageDown[0] + continue + if count > 1: + checkpoint_row = reading_state.row + rows - 1 + if reading_state.row + count <= totlines - rows: + reading_state = dataclasses.replace( + reading_state, row=reading_state.row + count + ) + elif ( + reading_state.row >= totlines - rows + and reading_state.content_index != len(contents) - 1 + ): + self.page_animation = Direction.FORWARD + return ReadingState( + content_index=reading_state.content_index + 1, + textwidth=reading_state.textwidth, + row=0, + ) + + elif k in self.keymap.PageDown: + if totlines - reading_state.row > rows * self.spread: + self.page_animation = Direction.FORWARD + reading_state = dataclasses.replace( + reading_state, row=reading_state.row + (rows * self.spread) + ) + elif reading_state.content_index != len(contents) - 1: + self.page_animation = Direction.FORWARD + return ReadingState( + content_index=reading_state.content_index + 1, + textwidth=reading_state.textwidth, + row=0, + ) + + # elif k in K["HalfScreenUp"] | K["HalfScreenDown"]: + # countstring = str(rows // 2) + # k = list(K["ScrollUp" if k in K["HalfScreenUp"] else "ScrollDown"])[0] + # continue + + elif k in self.keymap.NextChapter: + ntoc = find_current_content_index( + toc_entries, + text_structure.section_rows, + reading_state.content_index, + reading_state.row, + ) + if ntoc < len(toc_entries) - 1: 
+ if reading_state.content_index == toc_entries[ntoc + 1].content_index: + try: + reading_state = dataclasses.replace( + reading_state, + row=text_structure.section_rows[ + toc_entries[ntoc + 1].section # type: ignore + ], + ) + except KeyError: + pass + else: + return ReadingState( + content_index=toc_entries[ntoc + 1].content_index, + textwidth=reading_state.textwidth, + row=0, + section=toc_entries[ntoc + 1].section, + ) + + elif k in self.keymap.PrevChapter: + ntoc = find_current_content_index( + toc_entries, + text_structure.section_rows, + reading_state.content_index, + reading_state.row, + ) + if ntoc > 0: + if reading_state.content_index == toc_entries[ntoc - 1].content_index: + reading_state = dataclasses.replace( + reading_state, + row=text_structure.section_rows.get( + toc_entries[ntoc - 1].section, 0 # type: ignore + ), + ) + else: + return ReadingState( + content_index=toc_entries[ntoc - 1].content_index, + textwidth=reading_state.textwidth, + row=0, + section=toc_entries[ntoc - 1].section, + ) + + elif k in self.keymap.BeginningOfCh: + ntoc = find_current_content_index( + toc_entries, + text_structure.section_rows, + reading_state.content_index, + reading_state.row, + ) + try: + reading_state = dataclasses.replace( + reading_state, + row=text_structure.section_rows[toc_entries[ntoc].section], # type: ignore + ) + except (KeyError, IndexError): + reading_state = dataclasses.replace(reading_state, row=0) + + elif k in self.keymap.EndOfCh: + ntoc = find_current_content_index( + toc_entries, + text_structure.section_rows, + reading_state.content_index, + reading_state.row, + ) + try: + if ( + text_structure.section_rows[toc_entries[ntoc + 1].section] - rows # type: ignore + >= 0 + ): + reading_state = dataclasses.replace( + reading_state, + row=text_structure.section_rows[toc_entries[ntoc + 1].section] # type: ignore + - rows, + ) + else: + reading_state = dataclasses.replace( + reading_state, + row=text_structure.section_rows[toc_entries[ntoc].section], # type: ignore + ) + except (KeyError, IndexError): + reading_state = dataclasses.replace( + reading_state, row=pgend(totlines, rows) + ) + + elif k in self.keymap.TableOfContents: + if not toc_entries: + k = self.show_win_error( + "Table of Contents", + "N/A: TableOfContents is unavailable for this book.", + self.keymap.TableOfContents, + ) + continue + ntoc = find_current_content_index( + toc_entries, + text_structure.section_rows, + reading_state.content_index, + reading_state.row, + ) + rettock, fllwd, _ = self.toc(toc_entries, ntoc) + if rettock is not None: # and rettock in WINKEYS: + k = rettock + continue + elif fllwd is not None: + if reading_state.content_index == toc_entries[fllwd].content_index: + try: + reading_state = dataclasses.replace( + reading_state, + row=text_structure.section_rows[toc_entries[fllwd].section], + ) + except KeyError: + reading_state = dataclasses.replace(reading_state, row=0) + else: + return ReadingState( + content_index=toc_entries[fllwd].content_index, + textwidth=reading_state.textwidth, + row=0, + section=toc_entries[fllwd].section, + ) + + elif k in self.keymap.Metadata: + k = self.show_win_metadata() + if k in self._win_keys: + continue + + elif k in self.keymap.Help: + k = self.show_win_help() + if k in self._win_keys: + continue + + elif ( + k in self.keymap.Enlarge + and (reading_state.textwidth + count) < cols - 4 + and self.spread == 1 + ): + return dataclasses.replace( + reading_state, + textwidth=reading_state.textwidth + count, + rel_pctg=reading_state.row / totlines, + ) + + 
elif ( + k in self.keymap.Shrink + and reading_state.textwidth >= 22 + and self.spread == 1 + ): + return dataclasses.replace( + reading_state, + textwidth=reading_state.textwidth - count, + rel_pctg=reading_state.row / totlines, + ) + + elif k in self.keymap.SetWidth and self.spread == 1: + if countstring == "": + # if called without a count, toggle between 80 cols and full width + if reading_state.textwidth != 80 and cols - 4 >= 80: + return ReadingState( + content_index=reading_state.content_index, + textwidth=80, + row=reading_state.row, + rel_pctg=reading_state.row / totlines, + ) + else: + return ReadingState( + content_index=reading_state.content_index, + textwidth=cols - 4, + row=reading_state.row, + rel_pctg=reading_state.row / totlines, + ) + else: + reading_state = dataclasses.replace(reading_state, textwidth=count) + if reading_state.textwidth < 20: + reading_state = dataclasses.replace(reading_state, textwidth=20) + elif reading_state.textwidth >= cols - 4: + reading_state = dataclasses.replace(reading_state, textwidth=cols - 4) + + return ReadingState( + content_index=reading_state.content_index, + textwidth=reading_state.textwidth, + row=reading_state.row, + rel_pctg=reading_state.row / totlines, + ) + + elif k in self.keymap.RegexSearch: + ret_object = self.searching( + board, + text_structure.text_lines, + reading_state, + len(contents), + ) + if isinstance(ret_object, Key) or isinstance(ret_object, NoUpdate): + k = ret_object + # k = ret_object.value + continue + elif isinstance(ret_object, ReadingState) and self.search_data: + return ret_object + # else: + elif isinstance(ret_object, ReadingState): + # y = ret_object + reading_state = ret_object + + elif k in self.keymap.OpenImage and self.image_viewer: + imgs_in_screen = list( + set( + range(reading_state.row, reading_state.row + rows * self.spread + 1) + ) + & set(text_structure.image_maps.keys()) + ) + if not imgs_in_screen: + k = NoUpdate() + continue + + imgs_in_screen.sort() + image_path: Optional[str] = None + if len(imgs_in_screen) == 1: + image_path = text_structure.image_maps[imgs_in_screen[0]] + elif len(imgs_in_screen) > 1: + imgs_rel_to_row = [i - reading_state.row for i in imgs_in_screen] + p: Union[NoUpdate, Key] = NoUpdate() + i = 0 + while p not in self.keymap.Quit and p not in self.keymap.Follow: + self.screen.move( + imgs_rel_to_row[i] % rows, + ( + x + if imgs_rel_to_row[i] // rows == 0 + else cols + - DoubleSpreadPadding.RIGHT.value + - reading_state.textwidth + ) + + reading_state.textwidth // 2, + ) + self.screen.refresh() + safe_curs_set(2) + p = board.getch() + if p in self.keymap.ScrollDown: + i += 1 + elif p in self.keymap.ScrollUp: + i -= 1 + i = i % len(imgs_rel_to_row) + + safe_curs_set(0) + if p in self.keymap.Follow: + image_path = text_structure.image_maps[imgs_in_screen[i]] + + if image_path: + try: + # if self.ebook.__class__.__name__ in {"Epub", "Mobi", "Azw"}: + if isinstance(self.ebook, (Epub, Mobi, Azw)): + # self.seamless adjustment + if self.seamless: + current_content_index = ( + self.convert_absolute_reading_state_to_relative( + reading_state + ).content_index + ) + else: + current_content_index = reading_state.content_index + # for n, content in enumerate(self.ebook.contents): + # content_path = content + # if reading_state.row < sum(totlines_per_content[:n]): + # break + + content_path = self.ebook.contents[current_content_index] + assert isinstance(content_path, str) + image_path = resolve_path(content_path, image_path) + imgnm, imgbstr = 
self.ebook.get_img_bytestr(image_path) + k = self.open_image(board, imgnm, imgbstr) + continue + except Exception as e: + self.show_win_error("Error Opening Image", str(e), tuple()) + if DEBUG: + raise e + + elif ( + k in self.keymap.SwitchColor + and self.is_color_supported + and countstring in {"", "0", "1", "2"} + ): + if countstring == "": + count_color = curses.pair_number(self.screen.getbkgd()) + if count_color not in {2, 3}: + count_color = 1 + count_color = count_color % 3 + else: + count_color = count + self.screen.bkgd(curses.color_pair(count_color + 1)) + # pad.format() + return ReadingState( + content_index=reading_state.content_index, + textwidth=reading_state.textwidth, + row=reading_state.row, + ) + + elif k in self.keymap.AddBookmark: + bmname = self.input_prompt(" Add bookmark:") + if isinstance(bmname, str) and bmname: + try: + self.state.insert_bookmark( + self.ebook, + bmname, + dataclasses.replace( + reading_state, rel_pctg=reading_state.row / totlines + ), + ) + except sqlite3.IntegrityError: + k = self.show_win_error( + "Error: Add Bookmarks", + f"Bookmark with name '{bmname}' already exists.", + (Key("B"),), + ) + continue + else: + k = bmname + continue + + elif k in self.keymap.ShowBookmarks: + bookmarks = self.state.get_bookmarks(self.ebook) + if not bookmarks: + k = self.show_win_error( + "Bookmarks", + "N/A: Bookmarks are not found in this book.", + self.keymap.ShowBookmarks, + ) + continue + else: + retk, idxchoice = self.show_win_choices_bookmarks() + if retk is not None: + k = retk + continue + elif idxchoice is not None: + bookmark_to_jump = self.state.get_bookmarks(self.ebook)[idxchoice][ + 1 + ] + if ( + bookmark_to_jump.content_index == reading_state.content_index + and bookmark_to_jump.textwidth == reading_state.textwidth + ): + reading_state = bookmark_to_jump + else: + return ReadingState( + content_index=bookmark_to_jump.content_index, + textwidth=reading_state.textwidth, + row=bookmark_to_jump.row, + rel_pctg=bookmark_to_jump.rel_pctg, + ) + + elif k in self.keymap.DefineWord and self.ext_dict_app: + word = self.input_prompt(" Define:") + if isinstance(word, str) and word: + defin = self.define_word(word) + if defin in self._win_keys: + k = defin + continue + else: + k = word + continue + + elif k in self.keymap.MarkPosition: + jumnum = board.getch() + if isinstance(jumnum, Key) and jumnum in tuple( + Key(i) for i in range(48, 58) + ): + self.jump_list[jumnum.char] = reading_state + else: + k = NoUpdate() + continue + + elif k in self.keymap.JumpToPosition: + jumnum = board.getch() + if ( + isinstance(jumnum, Key) + and jumnum in tuple(Key(i) for i in range(48, 58)) + and jumnum.char in self.jump_list + ): + marked_reading_state = self.jump_list[jumnum.char] + return dataclasses.replace( + marked_reading_state, + textwidth=reading_state.textwidth, + rel_pctg=None + if marked_reading_state.textwidth == reading_state.textwidth + else marked_reading_state.rel_pctg, + section="", + ) + else: + k = NoUpdate() + continue + + elif k in self.keymap.ShowHideProgress: + self.show_reading_progress = not self.show_reading_progress + + elif k in self.keymap.Library: + self.try_assign_letters_count(force_wait=True) + self.calculate_reading_progress(letters_per_content, reading_state) + + self.savestate( + dataclasses.replace( + reading_state, rel_pctg=reading_state.row / totlines + ) + ) + library_items = self.state.get_from_history() + if not library_items: + k = self.show_win_error( + "Library", + "N/A: No reading history.", + self.keymap.Library, + ) + 
continue + else: + retk, choice_index = self.show_win_library() + if retk is not None: + k = retk + continue + elif choice_index is not None: + return get_ebook_obj(library_items[choice_index].filepath) + + elif k == Key(curses.KEY_RESIZE): + self.savestate( + dataclasses.replace( + reading_state, rel_pctg=reading_state.row / totlines + ) + ) + # stated in pypi windows-curses page: + # to call resize_term right after KEY_RESIZE + if sys.platform == "win32": + curses.resize_term(rows, cols) + rows, cols = self.screen.getmaxyx() + else: + rows, cols = self.screen.getmaxyx() + curses.resize_term(rows, cols) + if cols < 22 or rows < 12: + sys.exit("ERROR: Screen was too small (min 22cols x 12rows).") + if cols <= reading_state.textwidth + 4: + return ReadingState( + content_index=reading_state.content_index, + textwidth=cols - 4, + row=reading_state.row, + rel_pctg=reading_state.row / totlines, + ) + else: + return ReadingState( + content_index=reading_state.content_index, + textwidth=reading_state.textwidth, + row=reading_state.row, + ) + + countstring = "" + + if checkpoint_row: + board.feed_temporary_style( + ( + InlineStyle( + row=checkpoint_row, + col=0, + n_letters=reading_state.textwidth, + attr=curses.A_UNDERLINE, + ), + ) + ) + + try: + if self.setting.PageScrollAnimation and self.page_animation: + self.screen.clear() + for i in range(1, reading_state.textwidth + 1): + curses.napms(1) + # self.screen.clear() + board.write_n(reading_state.row, i, self.page_animation) + self.screen.refresh() + self.page_animation = None + + self.screen.clear() + self.screen.addstr(0, 0, countstring) + board.write(reading_state.row) + + # check if letters counting process is done + self.try_assign_letters_count() + + # reading progress + self.calculate_reading_progress(letters_per_content, reading_state) + + # display reading progress + if ( + self.reading_progress + and self.show_reading_progress + and (cols - reading_state.textwidth - 2) // 2 > 3 + ): + reading_progress_str = "{}%".format(int(self.reading_progress * 100)) + self.screen.addstr( + 0, cols - len(reading_progress_str), reading_progress_str + ) + + self.screen.refresh() + except curses.error: + pass + + if self.is_speaking: + k = self.keymap.TTSToggle[0] + continue + + k = board.getch() + if k == Key(curses.KEY_MOUSE): + mouse_event = curses.getmouse() + if mouse_event[4] == curses.BUTTON1_CLICKED: + if mouse_event[1] < cols // 2: + k = self.keymap.PageUp[0] + else: + k = self.keymap.PageDown[0] + elif mouse_event[4] == curses.BUTTON3_CLICKED: + k = self.keymap.TableOfContents[0] + elif mouse_event[4] == curses.BUTTON4_PRESSED: + k = self.keymap.ScrollUp[0] + elif mouse_event[4] == 2097152: + k = self.keymap.ScrollDown[0] + elif mouse_event[4] == curses.BUTTON4_PRESSED + curses.BUTTON_CTRL: + k = self.keymap.Enlarge[0] + elif mouse_event[4] == 2097152 + curses.BUTTON_CTRL: + k = self.keymap.Shrink[0] + elif mouse_event[4] == curses.BUTTON2_CLICKED: + k = self.keymap.TTSToggle[0] + + if checkpoint_row: + board.feed_temporary_style() + checkpoint_row = None + + except KeyboardInterrupt: + self.savestate( + dataclasses.replace(reading_state, rel_pctg=reading_state.row / totlines) + ) + sys.exit() + + +def start_reading(stdscr, filepath: str): + + ebook = get_ebook_obj(filepath) + state = State() + config = Config() + + reader = Reader(screen=stdscr, ebook=ebook, config=config, state=state) + + def handle_signal(signum, _): + """ + Method to raise SystemExit based on signal received + to trigger `try-finally` clause + """ + msg = 
f"[{os.getpid()}] killed" + if signal.Signals(signum) == signal.SIGTERM: + msg = f"[{os.getpid()}] terminated" + sys.exit(msg) + + signal.signal(signal.SIGTERM, handle_signal) + + try: + reader.run_counting_letters() + + reading_state = state.get_last_reading_state(reader.ebook) + if reader.screen_cols <= reading_state.textwidth + 4: + reading_state = dataclasses.replace(reading_state, textwidth=reader.screen_cols - 4) + else: + reading_state = dataclasses.replace(reading_state, rel_pctg=None) + + while True: + reading_state_or_ebook = reader.read(reading_state) + + if isinstance(reading_state_or_ebook, Ebook): + return reading_state_or_ebook.path + else: + reading_state = reading_state_or_ebook + if reader.seamless: + reading_state = reader.convert_absolute_reading_state_to_relative(reading_state) + + finally: + reader.cleanup() diff --git a/src/epy_reader/settings.py b/src/epy_reader/settings.py new file mode 100644 index 0000000..f09bc98 --- /dev/null +++ b/src/epy_reader/settings.py @@ -0,0 +1,133 @@ +import curses +from dataclasses import dataclass, field +from enum import Enum +from typing import List, Optional, Tuple + +from epy_reader.models import Key + + +class DoubleSpreadPadding(Enum): + LEFT = 10 + MIDDLE = 7 + RIGHT = 10 + + +# add image viewers here +# sorted by most widely used +VIEWER_PRESET_LIST = ( + "feh", + "imv", + "gio", + "gnome-open", + "gvfs-open", + "xdg-open", + "kde-open", + "firefox", +) + +DICT_PRESET_LIST = ( + "wkdict", + "sdcv", + "dict", +) + + +@dataclass(frozen=True) +class Settings: + DefaultViewer: str = "auto" + DictionaryClient: str = "auto" + ShowProgressIndicator: bool = True + PageScrollAnimation: bool = True + MouseSupport: bool = False + StartWithDoubleSpread: bool = False + # -1 is default terminal fg/bg colors + DefaultColorFG: int = -1 + DefaultColorBG: int = -1 + DarkColorFG: int = 252 + DarkColorBG: int = 235 + LightColorFG: int = 238 + LightColorBG: int = 253 + SeamlessBetweenChapters: bool = False + PreferredTTSEngine: Optional[str] = None + TTSEngineArgs: List[str] = field(default_factory=list) + + +@dataclass(frozen=True) +class CfgDefaultKeymaps: + ScrollUp: str = "k" + ScrollDown: str = "j" + PageUp: str = "h" + PageDown: str = "l" + # HalfScreenUp: str = "h" + # HalfScreenDown: str + NextChapter: str = "L" + PrevChapter: str = "H" + BeginningOfCh: str = "g" + EndOfCh: str = "G" + Shrink: str = "-" + Enlarge: str = "+" + SetWidth: str = "=" + Metadata: str = "M" + DefineWord: str = "d" + TableOfContents: str = "t" + Follow: str = "f" + OpenImage: str = "o" + RegexSearch: str = "/" + ShowHideProgress: str = "s" + MarkPosition: str = "m" + JumpToPosition: str = "`" + AddBookmark: str = "b" + ShowBookmarks: str = "B" + Quit: str = "q" + Help: str = "?" + SwitchColor: str = "c" + TTSToggle: str = "!" + DoubleSpreadToggle: str = "D" + Library: str = "R" + + +@dataclass(frozen=True) +class CfgBuiltinKeymaps: + ScrollUp: Tuple[int, ...] = (curses.KEY_UP,) + ScrollDown: Tuple[int, ...] = (curses.KEY_DOWN,) + PageUp: Tuple[int, ...] = (curses.KEY_PPAGE, curses.KEY_LEFT) + PageDown: Tuple[int, ...] = (curses.KEY_NPAGE, ord(" "), curses.KEY_RIGHT) + BeginningOfCh: Tuple[int, ...] = (curses.KEY_HOME,) + EndOfCh: Tuple[int, ...] = (curses.KEY_END,) + TableOfContents: Tuple[int, ...] = (9, ord("\t")) + Follow: Tuple[int, ...] = (10,) + Quit: Tuple[int, ...] = (3, 27, 304) + + +@dataclass(frozen=True) +class Keymap: + # HalfScreenDown: Tuple[Key, ...] + # HalfScreenUp: Tuple[Key, ...] + AddBookmark: Tuple[Key, ...] + BeginningOfCh: Tuple[Key, ...] 
+ DefineWord: Tuple[Key, ...] + DoubleSpreadToggle: Tuple[Key, ...] + EndOfCh: Tuple[Key, ...] + Enlarge: Tuple[Key, ...] + Follow: Tuple[Key, ...] + Help: Tuple[Key, ...] + JumpToPosition: Tuple[Key, ...] + Library: Tuple[Key, ...] + MarkPosition: Tuple[Key, ...] + Metadata: Tuple[Key, ...] + NextChapter: Tuple[Key, ...] + OpenImage: Tuple[Key, ...] + PageDown: Tuple[Key, ...] + PageUp: Tuple[Key, ...] + PrevChapter: Tuple[Key, ...] + Quit: Tuple[Key, ...] + RegexSearch: Tuple[Key, ...] + ScrollDown: Tuple[Key, ...] + ScrollUp: Tuple[Key, ...] + SetWidth: Tuple[Key, ...] + ShowBookmarks: Tuple[Key, ...] + ShowHideProgress: Tuple[Key, ...] + Shrink: Tuple[Key, ...] + SwitchColor: Tuple[Key, ...] + TTSToggle: Tuple[Key, ...] + TableOfContents: Tuple[Key, ...] diff --git a/src/epy_reader/speakers/__init__.py b/src/epy_reader/speakers/__init__.py new file mode 100644 index 0000000..078be31 --- /dev/null +++ b/src/epy_reader/speakers/__init__.py @@ -0,0 +1,9 @@ +__all__ = [ + "SpeakerBaseModel", + "SpeakerMimic", + "SpeakerPico", +] + +from epy_reader.speakers.base import SpeakerBaseModel +from epy_reader.speakers.mimic import SpeakerMimic +from epy_reader.speakers.pico import SpeakerPico diff --git a/src/epy_reader/speakers/base.py b/src/epy_reader/speakers/base.py new file mode 100644 index 0000000..7c1a8d5 --- /dev/null +++ b/src/epy_reader/speakers/base.py @@ -0,0 +1,21 @@ +from typing import List + + +class SpeakerBaseModel: + cmd: str = "tts_engine_binary" + available: bool = False + + def __init__(self, args: List[str] = []): + self.args = args + + def speak(self, text: str) -> None: + raise NotImplementedError("Speaker.speak() not implemented") + + def is_done(self) -> bool: + raise NotImplementedError("Speaker.is_done() not implemented") + + def stop(self) -> None: + raise NotImplementedError("Speaker.stop() not implemented") + + def cleanup(self) -> None: + raise NotImplementedError("Speaker.cleanup() not implemented") diff --git a/src/epy_reader/speakers/mimic.py b/src/epy_reader/speakers/mimic.py new file mode 100644 index 0000000..0db4ed8 --- /dev/null +++ b/src/epy_reader/speakers/mimic.py @@ -0,0 +1,31 @@ +import shutil +import subprocess + +from epy_reader.speakers.base import SpeakerBaseModel + + +class SpeakerMimic(SpeakerBaseModel): + cmd = "mimic" + available = bool(shutil.which("mimic")) + + def speak(self, text: str) -> None: + self.process = subprocess.Popen( + [self.cmd, *self.args], + text=True, + stdin=subprocess.PIPE, + stdout=subprocess.DEVNULL, + stderr=subprocess.STDOUT, + ) + assert self.process.stdin + self.process.stdin.write(text) + self.process.stdin.close() + + def is_done(self) -> bool: + return self.process.poll() is not None + + def stop(self) -> None: + self.process.terminate() + # self.process.kill() + + def cleanup(self) -> None: + pass diff --git a/src/epy_reader/speakers/pico.py b/src/epy_reader/speakers/pico.py new file mode 100644 index 0000000..95065f1 --- /dev/null +++ b/src/epy_reader/speakers/pico.py @@ -0,0 +1,43 @@ +import os +import shutil +import subprocess +import sys +import tempfile + +from epy_reader.speakers.base import SpeakerBaseModel + + +class SpeakerPico(SpeakerBaseModel): + cmd = "pico2wave" + available = all([shutil.which(dep) for dep in ["pico2wave", "play"]]) + + def speak(self, text: str) -> None: + _, self.tmp_path = tempfile.mkstemp(suffix=".wav") + + try: + subprocess.run( + [self.cmd, *self.args, "-w", self.tmp_path, text], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + check=True, + ) + except 
subprocess.CalledProcessError as e: + if "invalid pointer" not in e.output: + sys.exit(e.output) + + self.process = subprocess.Popen( + ["play", self.tmp_path], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + + def is_done(self) -> bool: + return self.process.poll() is not None + + def stop(self) -> None: + self.process.terminate() + # self.process.kill() + + def cleanup(self) -> None: + os.remove(self.tmp_path) diff --git a/src/epy_reader/state.py b/src/epy_reader/state.py new file mode 100644 index 0000000..5129394 --- /dev/null +++ b/src/epy_reader/state.py @@ -0,0 +1,195 @@ +import dataclasses +import hashlib +import os +import sqlite3 +from datetime import datetime +from typing import List, Tuple + +from epy_reader.ebooks import Ebook +from epy_reader.models import AppData, LibraryItem, Optional, ReadingState + + +class State(AppData): + """ + Use sqlite3 instead of JSON (in older version) + to shift the weight from memory to process + """ + + def __init__(self): + if not os.path.isfile(self.filepath): + self.init_db() + + @property + def filepath(self) -> str: + return os.path.join(self.prefix, "states.db") if self.prefix else os.devnull + + def get_from_history(self) -> List[LibraryItem]: + try: + conn = sqlite3.connect(self.filepath) + cur = conn.cursor() + cur.execute( + """ + SELECT last_read, filepath, title, author, reading_progress + FROM library ORDER BY last_read DESC + """ + ) + results = cur.fetchall() + library_items: List[LibraryItem] = [] + for result in results: + library_items.append( + LibraryItem( + last_read=datetime.fromisoformat(result[0]), + filepath=result[1], + title=result[2], + author=result[3], + reading_progress=result[4], + ) + ) + return library_items + finally: + conn.close() + + def delete_from_library(self, filepath: str) -> None: + try: + conn = sqlite3.connect(self.filepath) + conn.execute("PRAGMA foreign_keys = ON") + conn.execute("DELETE FROM reading_states WHERE filepath=?", (filepath,)) + conn.commit() + finally: + conn.close() + + def get_last_read(self) -> Optional[str]: + library = self.get_from_history() + return library[0].filepath if library else None + + def update_library(self, ebook: Ebook, reading_progress: Optional[float]) -> None: + try: + metadata = ebook.get_meta() + conn = sqlite3.connect(self.filepath) + conn.execute( + """ + INSERT OR REPLACE INTO library (filepath, title, author, reading_progress) + VALUES (?, ?, ?, ?) 
+ """, + (ebook.path, metadata.title, metadata.creator, reading_progress), + ) + conn.commit() + finally: + conn.close() + + def get_last_reading_state(self, ebook: Ebook) -> ReadingState: + try: + conn = sqlite3.connect(self.filepath) + conn.row_factory = sqlite3.Row + cur = conn.cursor() + cur.execute("SELECT * FROM reading_states WHERE filepath=?", (ebook.path,)) + result = cur.fetchone() + if result: + result = dict(result) + del result["filepath"] + return ReadingState(**result, section=None) + return ReadingState(content_index=0, textwidth=80, row=0, rel_pctg=None, section=None) + finally: + conn.close() + + def set_last_reading_state(self, ebook: Ebook, reading_state: ReadingState) -> None: + try: + conn = sqlite3.connect(self.filepath) + conn.execute( + """ + INSERT OR REPLACE INTO reading_states + VALUES (:filepath, :content_index, :textwidth, :row, :rel_pctg) + """, + {"filepath": ebook.path, **dataclasses.asdict(reading_state)}, + ) + conn.commit() + finally: + conn.close() + + def insert_bookmark(self, ebook: Ebook, name: str, reading_state: ReadingState) -> None: + try: + conn = sqlite3.connect(self.filepath) + conn.execute( + """ + INSERT INTO bookmarks + VALUES (:id, :filepath, :name, :content_index, :textwidth, :row, :rel_pctg) + """, + { + "id": hashlib.sha1(f"{ebook.path}{name}".encode()).hexdigest()[:10], + "filepath": ebook.path, + "name": name, + **dataclasses.asdict(reading_state), + }, + ) + conn.commit() + finally: + conn.close() + + def delete_bookmark(self, ebook: Ebook, name: str) -> None: + try: + conn = sqlite3.connect(self.filepath) + conn.execute("DELETE FROM bookmarks WHERE filepath=? AND name=?", (ebook.path, name)) + conn.commit() + finally: + conn.close() + + def get_bookmarks(self, ebook: Ebook) -> List[Tuple[str, ReadingState]]: + try: + conn = sqlite3.connect(self.filepath) + conn.row_factory = sqlite3.Row + cur = conn.cursor() + cur.execute("SELECT * FROM bookmarks WHERE filepath=?", (ebook.path,)) + results = cur.fetchall() + bookmarks: List[Tuple[str, ReadingState]] = [] + for result in results: + tmp_dict = dict(result) + name = tmp_dict["name"] + tmp_dict = { + k: v + for k, v in tmp_dict.items() + if k in ("content_index", "textwidth", "row", "rel_pctg") + } + bookmarks.append((name, ReadingState(**tmp_dict))) + return bookmarks + finally: + conn.close() + + def init_db(self) -> None: + try: + conn = sqlite3.connect(self.filepath) + conn.executescript( + """ + CREATE TABLE reading_states ( + filepath TEXT PRIMARY KEY, + content_index INTEGER, + textwidth INTEGER, + row INTEGER, + rel_pctg REAL + ); + + CREATE TABLE library ( + last_read DATETIME DEFAULT (datetime('now','localtime')), + filepath TEXT PRIMARY KEY, + title TEXT, + author TEXT, + reading_progress REAL, + FOREIGN KEY (filepath) REFERENCES reading_states(filepath) + ON DELETE CASCADE + ); + + CREATE TABLE bookmarks ( + id TEXT PRIMARY KEY, + filepath TEXT, + name TEXT, + content_index INTEGER, + textwidth INTEGER, + row INTEGER, + rel_pctg REAL, + FOREIGN KEY (filepath) REFERENCES reading_states(filepath) + ON DELETE CASCADE + ); + """ + ) + conn.commit() + finally: + conn.close() diff --git a/src/epy_reader/tools/KindleUnpack/__init__.py b/src/epy_reader/tools/KindleUnpack/__init__.py new file mode 100644 index 0000000..0077258 --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/__init__.py @@ -0,0 +1,2 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai diff --git a/src/epy_reader/tools/KindleUnpack/compatibility_utils.py 
b/src/epy_reader/tools/KindleUnpack/compatibility_utils.py new file mode 100755 index 0000000..c46c0bb --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/compatibility_utils.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +# Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this list of +# conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, this list +# of conditions and the following disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from __future__ import unicode_literals, division, absolute_import, print_function + +import sys +import codecs + +PY2 = sys.version_info[0] == 2 +PY3 = sys.version_info[0] == 3 + +iswindows = sys.platform.startswith('win') + +try: + from urllib.parse import unquote +except ImportError: + from urllib import unquote + +if PY2: + from HTMLParser import HTMLParser + _h = HTMLParser() +elif sys.version_info[1] < 4: + import html.parser + _h = html.parser.HTMLParser() +else: + import html as _h + +if PY3: + text_type = str + binary_type = bytes + # if will be printing arbitrary binary data to stdout on python 3 + # sys.stdin = sys.stdin.detach() + # sys.stdout = sys.stdout.detach() + # sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach()) +else: + range = xrange + text_type = unicode + binary_type = str + # if will be printing unicode under python 2 need to protect + # against sys.stdout.encoding being None stupidly forcing ascii encoding of unicode + # sys.stdout = codecs.getwriter("utf-8")(sys.stdout) + # alternatively set environment variable as follows **before** launching python: export PYTHONIOENCODING=UTF-8 + +# NOTE: Python 3 is completely broken when accessing single bytes in bytes strings +# (and they amazingly claim by design and no bug!) + +# To illustrate: this works for unicode in Python 3 and for all Python 2.X for both bytestrings and unicode +# >>> o = '123456789' +# >>> o[-3] +# '7' +# >>> type(o[-3]) +# <class 'str'> +# >>> type(o) +# <class 'str'> + +# Unfortunately, this is what Python 3 does for no sane reason and only for bytestrings +# >>> o = b'123456789' +# >>> o[-3] +# 55 +# >>> type(o[-3]) +# <class 'int'> +# >>> type(o) +# <class 'bytes'> + +# This mind boggling behaviour also happens when indexing a bytestring and/or +# iterating over a bytestring.
In other words it will return an int but not +# the byte itself!!!!!!! + +# The only way to access a single byte as a byte in bytestring and get the byte in both +# Python 2 and Python 3 is to use a slice + +# This problem is so common there are horrible hacks floating around the net to **try** +# to work around it, so that code that works on both Python 2 and Python 3 is possible. + +# So in order to write code that works on both Python 2 and Python 3 +# if you index or access a single byte and want its ord() then use the bord() function. +# If instead you want it as a single character byte use the bchar() function +# both of which are defined below. + +if PY3: + # Also Note: if decode a bytestring using 'latin-1' (or any other full range 0-255 encoding) + # in place of ascii you will get a byte value to half-word or integer value + # one-to-one mapping (in the 0 - 255 range) + + def bchr(s): + return bytes([s]) + + def bstr(s): + if isinstance(s, str): + return bytes(s, 'latin-1') + else: + return bytes(s) + + def bord(s): + return s + + def bchar(s): + return bytes([s]) + +else: + def bchr(s): + return chr(s) + + def bstr(s): + return str(s) + + def bord(s): + return ord(s) + + def bchar(s): + return s + +if PY3: + # list-producing versions of the major Python iterating functions + def lrange(*args, **kwargs): + return list(range(*args, **kwargs)) + + def lzip(*args, **kwargs): + return list(zip(*args, **kwargs)) + + def lmap(*args, **kwargs): + return list(map(*args, **kwargs)) + + def lfilter(*args, **kwargs): + return list(filter(*args, **kwargs)) +else: + import __builtin__ + # Python 2-builtin ranges produce lists + lrange = __builtin__.range + lzip = __builtin__.zip + lmap = __builtin__.map + lfilter = __builtin__.filter + +# In Python 3 you can no longer use .encode('hex') on a bytestring +# instead use the following on both platforms +import binascii +def hexlify(bdata): + return (binascii.hexlify(bdata)).decode('ascii') + +# If you: import struct +# Note: struct pack, unpack, unpack_from all *require* bytestring format +# data all the way up to at least Python 2.7.5, Python 3 is okay with either + +# If you: import re +# note: Python 3 "re" requires the pattern to be the exact same type as the data to be +# searched ... 
but u"" is not allowed for the pattern itself only b"" +# Python 2.X allows the pattern to be any type and converts it to match the data +# and returns the same type as the data + +# convert string to be utf-8 encoded +def utf8_str(p, enc='utf-8'): + if p is None: + return None + if isinstance(p, text_type): + return p.encode('utf-8') + if enc != 'utf-8': + return p.decode(enc).encode('utf-8') + return p + +# convert string to be unicode encoded +def unicode_str(p, enc='utf-8'): + if p is None: + return None + if isinstance(p, text_type): + return p + return p.decode(enc) + +ASCII_CHARS = set(chr(x) for x in range(128)) +URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ' + 'abcdefghijklmnopqrstuvwxyz' + '0123456789' '#' '_.-/~') +IRI_UNSAFE = ASCII_CHARS - URL_SAFE + +# returns a quoted IRI (not a URI) +def quoteurl(href): + if isinstance(href,binary_type): + href = href.decode('utf-8') + result = [] + for char in href: + if char in IRI_UNSAFE: + char = "%%%02x" % ord(char) + result.append(char) + return ''.join(result) + +# unquotes url/iri +def unquoteurl(href): + if isinstance(href,binary_type): + href = href.decode('utf-8') + href = unquote(href) + return href + +# unescape html +def unescapeit(sval): + return _h.unescape(sval) + +# Python 2.X commandline parsing under Windows has been horribly broken for years! +# Use the following code to emulate full unicode commandline parsing on Python 2 +# ie. To get sys.argv arguments and properly encode them as unicode + +def unicode_argv(): + global iswindows + global PY3 + if PY3: + return sys.argv + if iswindows: + # Versions 2.x of Python don't support Unicode in sys.argv on + # Windows, with the underlying Windows API instead replacing multi-byte + # characters with '?'. So use shell32.GetCommandLineArgvW to get sys.argv + # as a list of Unicode strings + from ctypes import POINTER, byref, cdll, c_int, windll + from ctypes.wintypes import LPCWSTR, LPWSTR + + GetCommandLineW = cdll.kernel32.GetCommandLineW + GetCommandLineW.argtypes = [] + GetCommandLineW.restype = LPCWSTR + + CommandLineToArgvW = windll.shell32.CommandLineToArgvW + CommandLineToArgvW.argtypes = [LPCWSTR, POINTER(c_int)] + CommandLineToArgvW.restype = POINTER(LPWSTR) + + cmd = GetCommandLineW() + argc = c_int(0) + argv = CommandLineToArgvW(cmd, byref(argc)) + if argc.value > 0: + # Remove Python executable and commands if present + start = argc.value - len(sys.argv) + return [argv[i] for i in + range(start, argc.value)] + # this should never happen + return None + else: + argv = [] + argvencoding = sys.stdin.encoding + if argvencoding is None: + argvencoding = sys.getfilesystemencoding() + if argvencoding is None: + argvencoding = 'utf-8' + for arg in sys.argv: + if isinstance(arg, text_type): + argv.append(arg) + else: + argv.append(arg.decode(argvencoding)) + return argv + + +# Python 2.X is broken in that it does not recognize CP65001 as UTF-8 +def add_cp65001_codec(): + if PY2: + try: + codecs.lookup('cp65001') + except LookupError: + codecs.register( + lambda name: name == 'cp65001' and codecs.lookup('utf-8') or None) + return diff --git a/src/epy_reader/tools/KindleUnpack/kindleunpack.py b/src/epy_reader/tools/KindleUnpack/kindleunpack.py new file mode 100644 index 0000000..317941a --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/kindleunpack.py @@ -0,0 +1,1029 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +import os + +__path__ 
= ["lib", os.path.dirname(os.path.realpath(__file__)), "kindleunpack"] + +import sys +import codecs +import traceback + +from .compatibility_utils import PY2, binary_type, utf8_str, unicode_str +from .compatibility_utils import unicode_argv, add_cp65001_codec +from .compatibility_utils import hexlify + +add_cp65001_codec() + +from .unipath import pathof + +if PY2: + range = xrange + # since will be printing unicode under python 2 need to protect + # against sys.stdout.encoding being None stupidly forcing forcing ascii encoding + if sys.stdout.encoding is None: + sys.stdout = codecs.getwriter("utf-8")(sys.stdout) + else: + encoding = sys.stdout.encoding + sys.stdout = codecs.getwriter(encoding)(sys.stdout) + +# Changelog +# 0.11 - Version by adamselene +# 0.11pd - Tweaked version by pdurrant +# 0.12 - extracts pictures too, and all into a folder. +# 0.13 - added back in optional output dir for those who don't want it based on infile +# 0.14 - auto flush stdout and wrapped in main, added proper return codes +# 0.15 - added support for metadata +# 0.16 - metadata now starting to be output as an opf file (PD) +# 0.17 - Also created tweaked text as source for Mobipocket Creator +# 0.18 - removed raw mobi file completely but kept _meta.html file for ease of conversion +# 0.19 - added in metadata for ASIN, Updated Title and Rights to the opf +# 0.20 - remove _meta.html since no longer needed +# 0.21 - Fixed some typos in the opf output, and also updated handling +# of test for trailing data/multibyte characters +# 0.22 - Fixed problem with > 9 images +# 0.23 - Now output Start guide item +# 0.24 - Set firstaddl value for 'TEXtREAd' +# 0.25 - Now added character set metadata to html file for utf-8 files. +# 0.26 - Dictionary support added. Image handling speed improved. +# For huge files create temp files to speed up decoding. +# Language decoding fixed. Metadata is now converted to utf-8 when written to opf file. +# 0.27 - Add idx:entry attribute "scriptable" if dictionary contains entry length tags. +# Don't save non-image sections as images. Extract and save source zip file +# included by kindlegen as kindlegensrc.zip. +# 0.28 - Added back correct image file name extensions, created FastConcat class to simplify and clean up +# 0.29 - Metadata handling reworked, multiple entries of the same type are now supported. +# Several missing types added. +# FastConcat class has been removed as in-memory handling with lists is faster, even for huge files. +# 0.30 - Add support for outputting **all** metadata values - encode content with hex if of unknown type +# 0.31 - Now supports Print Replica ebooks, outputting PDF and mysterious data sections +# 0.32 - Now supports NCX file extraction/building. +# Overhauled the structure of mobiunpack to be more class oriented. 
+#  0.33 - Split Classes into separate files and added prelim support for KF8 format eBooks
+#  0.34 - Improved KF8 support, guide support, bug fixes
+#  0.35 - Added splitting combo mobi7/mobi8 into standalone mobi7 and mobi8 files
+#         Also handle mobi8-only file properly
+#  0.36 - very minor changes to support KF8 mobis with no flow items, no ncx, etc
+#  0.37 - separate output, add command line switches to control, interface to Mobi_Unpack.pyw
+#  0.38 - improve split function by resetting flags properly, fix bug in Thumbnail Images
+#  0.39 - improve split function so that ToC info is not lost for standalone mobi8s
+#  0.40 - make mobi7 split match official versions, add support for graphic novel metadata,
+#         improve debug for KF8
+#  0.41 - fix when StartOffset set to 0xffffffff, fix to work with older mobi versions,
+#         fix other minor metadata issues
+#  0.42 - add new class interface to allow it to integrate more easily with internal calibre routines
+#  0.43 - bug fixes for new class interface
+#  0.44 - more bug fixes and fix for potential bug caused by not properly closing created zip archive
+#  0.45 - sync to version in the new Mobi_Unpack plugin
+#  0.46 - fixes for: obfuscated fonts, improper toc links and ncx, add support for opentype fonts
+#  0.47 - minor opf improvements
+#  0.48 - ncx link fixes
+#  0.49 - use azw3 when splitting mobis
+#  0.50 - unknown change
+#  0.51 - fix for converting filepos links to hrefs, Added GPL3 notice, made KF8 extension just '.azw3'
+#  0.52 - fix for cover metadata (no support for Mobipocket Creator)
+#  0.53 - fix for proper identification of embedded fonts, added new metadata items
+#  0.54 - Added error-handling so wonky embedded fonts don't bomb the whole unpack process,
+#         entity escape KF8 metadata to ensure valid OPF.
+#  0.55 - Strip extra StartOffset EXTH from the mobi8 header when splitting, keeping only the relevant one
+#         For mobi8 files, don't generate duplicate guide entries from the metadata if we could extract one
+#         from the OTH table.
+#  0.56 - Added further entity escaping of OPF text.
+#         Allow unicode string file paths to be passed as arguments to the unpackBook method without blowing up later
+#         when the attempt to "re"-unicode a portion of that filename occurs in the process_all_mobi_headers method.
+#  0.57 - Fixed error when splitting Preview files downloaded from KDP website
+#  0.58 - Output original kindlegen build log ('CMET' record) if included in the package.
+#  0.58 - Include and extend functionality of DumpMobiHeader, replacing DEBUG with DUMP
+#  0.59 - Much added DUMP functionality, including full dumping and descriptions of sections
+#  0.60 - Bug fixes in opf, div tables, bad links, page breaks, section descriptions
+#         - plus a number of other bugs fixed that were found by Sergey Dubinets
+#         - fixes for file/paths that require full unicode to work properly
+#         - replace subprocess with multiprocessing to remove need for unbuffered stdout
+#  0.61 - renamed to be KindleUnpack and more unicode/utf-8 path bug fixes and other minor fixes
+#  0.62 - fix for multiprocessing on Windows, split fixes, opf improvements
+#  0.63 - Modified to process right to left page progression books properly.
+#         - Added some id_map_strings and RESC section processing; metadata and
+#         - spine in the RESC are integrated partly to content.opf.
+#  0.63a- Separated K8 RESC processor to an individual file. Bug fixes. Added cover page creation.
+#  0.64 - minor bug fixes to more properly handle unicode command lines, and support for more jpeg types
+#  0.64a- Modified to handle somewhat irregular mobi and azw3 files.
+#  0.64b- Modified to create k8resc.spine for files with no RESC section.
+#  0.65 - Bug fixes to shorten title and remove epub3 "properties" to make the output epub2 compliant
+#  0.65a- Bug fixes to extract RESC section correctly, to prevent item id conflicts
+#         - and to process multiline comments in RESC.
+#  0.66 - Bug fix to deal with missing first resource information sometimes generated by calibre
+#  0.66a- Fixed minor bugs, which probably do not affect the output at all
+#  0.67 - Fixed Mobi Split functionality bug with azw3 images not being properly copied
+#  0.68 - preliminary support for handling PAGE sections to create page-map.xml
+#  0.69 - preliminary support for CONT and CRES for HD Images
+#  0.70 - preliminary support for decoding apnx files when used with azw3 ebooks
+#  0.71 - extensive refactoring of kindleunpack.py to make it more manageable
+#  0.72 - many bug fixes from tkeo: fix pageProcessing, fix print replica, fix resc usage, fix font mangling, etc.
+#  0.72a- fix for still broken PrintReplica support
+#  0.72b- preview for primary epub3 support. A parameter epubver (default='2') is added to process_all_mobi_headers(), unpackBook().
+#  0.72c- preview for apnx page support
+#  0.72d- more bugs fixed in preview features, much improved GUI with ability to dynamically grow the Log Window with preference support
+#  0.72e- more bug fixes, Tk GUI adds support for epub version and HDImage use
+#  0.72f- more bug fixes, implement use hd images if present
+#  0.72g- minor bug fixes and cleanups from tkeo
+#  0.72h- updated mobi_header and mobi_k8proc to use the correct fragment and guide terms in place of div and other
+#         to better match the terms that both Calibre and Amazon use internally in their own software
+#  0.72x- very experimental conversion to use new mobi_k8resc.py and some of its associated changes
+#  0.72y- more changes to simplify and integrate in epub3 support in a simpler manner
+#  0.72z- remove redundancy in mobi_opf.py and bug fixes for mobi_k8resc.py
+#  0.73 - faster mobi split, numerous bug fixes in mobi_k8proc, mobi_header, mobi_opf, mobi_k8resc, etc
+#  0.74 - added refines metadata, fixed language code in ncx and title in nav, added support for opf: from refines
+#  0.75 - much improved dictionary support including support for multiple inflection sections, minor mobi_opf fixes
+#  0.76 - pre-release version only: fix name-related issues in opf by not using original file name in mobi7
+#  0.77 - bug fix for unpacking HDImages with included Fonts
+#  0.80 - converted to work with both python 2.7 and Python 3.3 and later
+#  0.81 - various fixes
+#  0.82 - Handle calibre-generated mobis that can have skeletons with no fragments
+#  0.83 - Fix header item 114 being mistakenly treated as a string instead of a value
+
+DUMP = False
+""" Set to True to dump all possible information. """
+
+WRITE_RAW_DATA = False
+""" Set to True to create additional files with raw data for debugging/reverse engineering. """
+
+SPLIT_COMBO_MOBIS = False
+""" Set to True to split combination mobis into mobi7 and mobi8 pieces. """
+
+CREATE_COVER_PAGE = True  # XXX experimental
+""" Create and insert a cover xhtml page. """
+
+EOF_RECORD = b'\xe9\x8e' + b'\r\n'
+""" The EOF record content. """
""" + +TERMINATION_INDICATOR1 = b'\x00' +TERMINATION_INDICATOR2 = b'\x00\x00' +TERMINATION_INDICATOR3 = b'\x00\x00\x00' + +KINDLEGENSRC_FILENAME = "kindlegensrc.zip" +""" The name for the kindlegen source archive. """ + +KINDLEGENLOG_FILENAME = "kindlegenbuild.log" +""" The name for the kindlegen build log. """ + +K8_BOUNDARY = b'BOUNDARY' +""" The section data that divides K8 mobi ebooks. """ + +import os +import struct +import re +import zlib +import getopt + +class unpackException(Exception): + pass + + +# import the kindleunpack support libraries +from .unpack_structure import fileNames +from .mobi_sectioner import Sectionizer, describe +from .mobi_header import MobiHeader, dump_contexth +from .mobi_utils import toBase32 +from .mobi_opf import OPFProcessor +from .mobi_html import HTMLProcessor, XHTMLK8Processor +from .mobi_ncx import ncxExtract +from .mobi_k8proc import K8Processor +from .mobi_split import mobi_split +from .mobi_k8resc import K8RESCProcessor +from .mobi_nav import NAVProcessor +from .mobi_cover import CoverProcessor, get_image_type +from .mobi_pagemap import PageMapProcessor +from .mobi_dict import dictSupport + + +def processSRCS(i, files, rscnames, sect, data): + # extract the source zip archive and save it. + print("File contains kindlegen source archive, extracting as %s" % KINDLEGENSRC_FILENAME) + srcname = os.path.join(files.outdir, KINDLEGENSRC_FILENAME) + with open(pathof(srcname), 'wb') as f: + f.write(data[16:]) + rscnames.append(None) + sect.setsectiondescription(i,"Zipped Source Files") + return rscnames + + +def processPAGE(i, files, rscnames, sect, data, mh, pagemapproc): + # process any page map information and create an apnx file + pagemapproc = PageMapProcessor(mh, data) + rscnames.append(None) + sect.setsectiondescription(i,"PageMap") + apnx_meta = {} + acr = sect.palmname.decode('latin-1').rstrip('\x00') + apnx_meta['acr'] = acr + apnx_meta['cdeType'] = mh.metadata['cdeType'][0] + apnx_meta['contentGuid'] = hex(int(mh.metadata['UniqueID'][0]))[2:] + apnx_meta['asin'] = mh.metadata['ASIN'][0] + apnx_meta['pageMap'] = pagemapproc.getPageMap() + if mh.version == 8: + apnx_meta['format'] = 'MOBI_8' + else: + apnx_meta['format'] = 'MOBI_7' + apnx_data = pagemapproc.generateAPNX(apnx_meta) + if mh.isK8(): + outname = os.path.join(files.outdir, 'mobi8-'+files.getInputFileBasename() + '.apnx') + else: + outname = os.path.join(files.outdir, 'mobi7-'+files.getInputFileBasename() + '.apnx') + with open(pathof(outname), 'wb') as f: + f.write(apnx_data) + return rscnames, pagemapproc + + +def processCMET(i, files, rscnames, sect, data): + # extract the build log + print("File contains kindlegen build log, extracting as %s" % KINDLEGENLOG_FILENAME) + srcname = os.path.join(files.outdir, KINDLEGENLOG_FILENAME) + with open(pathof(srcname), 'wb') as f: + f.write(data[10:]) + rscnames.append(None) + sect.setsectiondescription(i,"Kindlegen log") + return rscnames + + +# fonts only exist in KF8 ebooks +# Format: bytes 0 - 3: 'FONT' +# bytes 4 - 7: uncompressed size +# bytes 8 - 11: flags +# flag bit 0x0001 - zlib compression +# flag bit 0x0002 - obfuscated with xor string +# bytes 12 - 15: offset to start of compressed font data +# bytes 16 - 19: length of xor string stored before the start of the comnpress font data +# bytes 20 - 23: start of xor string +def processFONT(i, files, rscnames, sect, data, obfuscate_data, beg, rsc_ptr): + fontname = "font%05d" % i + ext = '.dat' + font_error = False + font_data = data + try: + usize, fflags, dstart, xor_len, xor_start = 
+    except:
+        print("Failed to extract font: {0:s} from section {1:d}".format(fontname, i))
+        font_error = True
+        ext = '.failed'
+    if not font_error:
+        print("Extracting font:", fontname)
+        font_data = data[dstart:]
+        extent = len(font_data)
+        extent = min(extent, 1040)
+        if fflags & 0x0002:
+            # obfuscated so need to de-obfuscate the first 1040 bytes
+            key = bytearray(data[xor_start: xor_start + xor_len])
+            buf = bytearray(font_data)
+            for n in range(extent):
+                buf[n] ^= key[n % xor_len]
+            font_data = bytes(buf)
+        if fflags & 0x0001:
+            # ZLIB compressed data
+            font_data = zlib.decompress(font_data)
+        hdr = font_data[0:4]
+        if hdr == b'\0\1\0\0' or hdr == b'true' or hdr == b'ttcf':
+            ext = '.ttf'
+        elif hdr == b'OTTO':
+            ext = '.otf'
+        else:
+            print("Warning: unknown font header %s" % hexlify(hdr))
+    if (ext == '.ttf' or ext == '.otf') and (fflags & 0x0002):
+        obfuscate_data.append(fontname + ext)
+    fontname += ext
+    outfnt = os.path.join(files.imgdir, fontname)
+    with open(pathof(outfnt), 'wb') as f:
+        f.write(font_data)
+    rscnames.append(fontname)
+    sect.setsectiondescription(i, "Font {0:s}".format(fontname))
+    if rsc_ptr == -1:
+        rsc_ptr = i - beg
+    return rscnames, obfuscate_data, rsc_ptr
+
+
+def processCRES(i, files, rscnames, sect, data, beg, rsc_ptr, use_hd):
+    # extract an HDImage
+    global DUMP
+    data = data[12:]
+    imgtype = get_image_type(None, data)
+
+    if imgtype is None:
+        print("Warning: CRES Section %s does not contain a recognised resource" % i)
+        rscnames.append(None)
+        sect.setsectiondescription(i, "Mysterious CRES data, first four bytes %s" % describe(data[0:4]))
+        if DUMP:
+            fname = "unknown%05d.dat" % i
+            outname = os.path.join(files.outdir, fname)
+            with open(pathof(outname), 'wb') as f:
+                f.write(data)
+            sect.setsectiondescription(i, "Mysterious CRES data, first four bytes %s extracting as %s" % (describe(data[0:4]), fname))
+        rsc_ptr += 1
+        return rscnames, rsc_ptr
+
+    if use_hd:
+        # overwrite corresponding lower res image with hd version
+        imgname = rscnames[rsc_ptr]
+        imgdest = files.imgdir
+    else:
+        imgname = "HDimage%05d.%s" % (i, imgtype)
+        imgdest = files.hdimgdir
+    print("Extracting HD image: {0:s} from section {1:d}".format(imgname, i))
+    outimg = os.path.join(imgdest, imgname)
+    with open(pathof(outimg), 'wb') as f:
+        f.write(data)
+    rscnames.append(None)
+    sect.setsectiondescription(i, "Optional HD Image {0:s}".format(imgname))
+    rsc_ptr += 1
+    return rscnames, rsc_ptr
+
+
+def processCONT(i, files, rscnames, sect, data):
+    global DUMP
+    # process a container header, most of this is unknown
+    # right now only extract its EXTH
+    dt = data[0:12]
+    if dt == b"CONTBOUNDARY":
+        rscnames.append(None)
+        sect.setsectiondescription(i, "CONTAINER BOUNDARY")
+    else:
+        sect.setsectiondescription(i, "CONT Header")
+        rscnames.append(None)
+        if DUMP:
+            cpage, = struct.unpack_from(b'>L', data, 12)
+            contexth = data[48:]
+            print("\n\nContainer EXTH Dump")
+            dump_contexth(cpage, contexth)
+            fname = "CONT_Header%05d.dat" % i
+            outname = os.path.join(files.outdir, fname)
+            with open(pathof(outname), 'wb') as f:
+                f.write(data)
+    return rscnames
+
+
+def processkind(i, files, rscnames, sect, data):
+    global DUMP
+    dt = data[0:12]
+    if dt == b"kindle:embed":
+        if DUMP:
+            print("\n\nHD Image Container Description String")
+            print(data)
+        sect.setsectiondescription(i, "HD Image Container Description String")
+    rscnames.append(None)
+    return rscnames
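# A standalone sketch of the FONT record layout documented above processFONT
# (a hypothetical helper, not part of KindleUnpack): five big-endian longs
# after the 'FONT' tag give the uncompressed size, the flags, the offset to
# the font data, and the length/offset of the xor key; de-obfuscation is
# applied to the first 1040 bytes before any zlib decompression.
import struct
import zlib

def sketch_read_font_record(record):
    assert record[0:4] == b'FONT'
    usize, flags, dstart, xor_len, xor_start = struct.unpack_from('>LLLLL', record, 4)
    font = bytearray(record[dstart:])
    if flags & 0x0002 and xor_len:
        # de-obfuscate the first 1040 bytes with the stored xor key
        key = record[xor_start:xor_start + xor_len]
        for n in range(min(len(font), 1040)):
            font[n] ^= key[n % xor_len]
    if flags & 0x0001:
        # zlib-compressed payload; usize is the expected decompressed length
        return zlib.decompress(bytes(font))
    return bytes(font)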
+
+
+# spine information from the original content.opf
+def processRESC(i, files, rscnames, sect, data, k8resc):
+    global DUMP
+    if DUMP:
+        rescname = "RESC%05d.dat" % i
+        print("Extracting Resource: ", rescname)
+        outrsc = os.path.join(files.outdir, rescname)
+        with open(pathof(outrsc), 'wb') as f:
+            f.write(data)
+    if True:  # try:
+        # parse the spine and metadata from RESC
+        k8resc = K8RESCProcessor(data[16:], DUMP)
+    else:  # except:
+        print("Warning: cannot extract information from RESC.")
+        k8resc = None
+    rscnames.append(None)
+    sect.setsectiondescription(i, "K8 RESC section")
+    return rscnames, k8resc
+
+
+def processImage(i, files, rscnames, sect, data, beg, rsc_ptr, cover_offset, thumb_offset):
+    global DUMP
+    # Extract an Image
+    imgtype = get_image_type(None, data)
+    if imgtype is None:
+        print("Warning: Section %s does not contain a recognised resource" % i)
+        rscnames.append(None)
+        sect.setsectiondescription(i, "Mysterious Section, first four bytes %s" % describe(data[0:4]))
+        if DUMP:
+            fname = "unknown%05d.dat" % i
+            outname = os.path.join(files.outdir, fname)
+            with open(pathof(outname), 'wb') as f:
+                f.write(data)
+            sect.setsectiondescription(i, "Mysterious Section, first four bytes %s extracting as %s" % (describe(data[0:4]), fname))
+        return rscnames, rsc_ptr
+
+    imgname = "image%05d.%s" % (i, imgtype)
+    if cover_offset is not None and i == beg + cover_offset:
+        imgname = "cover%05d.%s" % (i, imgtype)
+    if thumb_offset is not None and i == beg + thumb_offset:
+        imgname = "thumb%05d.%s" % (i, imgtype)
+    print("Extracting image: {0:s} from section {1:d}".format(imgname, i))
+    outimg = os.path.join(files.imgdir, imgname)
+    with open(pathof(outimg), 'wb') as f:
+        f.write(data)
+    rscnames.append(imgname)
+    sect.setsectiondescription(i, "Image {0:s}".format(imgname))
+    if rsc_ptr == -1:
+        rsc_ptr = i - beg
+    return rscnames, rsc_ptr
+
+
+def processPrintReplica(metadata, files, rscnames, mh):
+    global DUMP
+    global WRITE_RAW_DATA
+    rawML = mh.getRawML()
+    if DUMP or WRITE_RAW_DATA:
+        outraw = os.path.join(files.outdir, files.getInputFileBasename() + '.rawpr')
+        with open(pathof(outraw), 'wb') as f:
+            f.write(rawML)
+
+    fileinfo = []
+    print("Print Replica ebook detected")
+    try:
+        numTables, = struct.unpack_from(b'>L', rawML, 0x04)
+        tableIndexOffset = 8 + 4 * numTables
+        # for each table, read in count of sections, assume first section is a PDF
+        # and output other sections as binary files
+        for i in range(numTables):
+            sectionCount, = struct.unpack_from(b'>L', rawML, 0x08 + 4 * i)
+            for j in range(sectionCount):
+                sectionOffset, sectionLength, = struct.unpack_from(b'>LL', rawML, tableIndexOffset)
+                tableIndexOffset += 8
+                if j == 0:
+                    entryName = os.path.join(files.outdir, files.getInputFileBasename() + ('.%03d.pdf' % (i + 1)))
+                else:
+                    entryName = os.path.join(files.outdir, files.getInputFileBasename() + ('.%03d.%03d.data' % ((i + 1), j)))
+                with open(pathof(entryName), 'wb') as f:
+                    f.write(rawML[sectionOffset:(sectionOffset + sectionLength)])
+    except Exception as e:
+        print('Error processing Print Replica: ' + str(e))
+
+    fileinfo.append([None, '', files.getInputFileBasename() + '.pdf'])
+    usedmap = {}
+    for name in rscnames:
+        if name is not None:
+            usedmap[name] = 'used'
+    opf = OPFProcessor(files, metadata, fileinfo, rscnames, False, mh, usedmap)
+    opf.writeOPF()
+
+
+def processMobi8(mh, metadata, sect, files, rscnames, pagemapproc, k8resc, obfuscate_data, apnxfile=None, epubver='2'):
+    global DUMP
+    global WRITE_RAW_DATA
+
+    # extract the raw markup language
+    rawML = mh.getRawML()
+    if DUMP or WRITE_RAW_DATA:
+        outraw = os.path.join(files.k8dir, files.getInputFileBasename() + '.rawml')
with open(pathof(outraw),'wb') as f: + f.write(rawML) + + # KF8 require other indexes which contain parsing information and the FDST info + # to process the rawml back into the xhtml files, css files, svg image files, etc + k8proc = K8Processor(mh, sect, files, DUMP) + k8proc.buildParts(rawML) + + # collect information for the guide first + guidetext = unicode_str(k8proc.getGuideText()) + + # if the guide was empty, add in any guide info from metadata, such as StartOffset + if not guidetext and 'StartOffset' in metadata: + # Apparently, KG 2.5 carries over the StartOffset from the mobi7 part... + # Taking that into account, we only care about the *last* StartOffset, which + # should always be the correct one in these cases (the one actually pointing + # to the right place in the mobi8 part). + starts = metadata['StartOffset'] + last_start = starts[-1] + last_start = int(last_start) + if last_start == 0xffffffff: + last_start = 0 + seq, idtext = k8proc.getFragTblInfo(last_start) + filename, idtext = k8proc.getIDTagByPosFid(toBase32(seq), b'0000000000') + linktgt = filename + idtext = unicode_str(idtext, mh.codec) + if idtext != '': + linktgt += '#' + idtext + guidetext += '<reference type="text" href="Text/%s" />\n' % linktgt + + # if apnxfile is passed in use it for page map information + if apnxfile is not None and pagemapproc is None: + with open(apnxfile, 'rb') as f: + apnxdata = b"00000000" + f.read() + pagemapproc = PageMapProcessor(mh, apnxdata) + + # generate the page map + pagemapxml = '' + if pagemapproc is not None: + pagemapxml = pagemapproc.generateKF8PageMapXML(k8proc) + outpm = os.path.join(files.k8oebps,'page-map.xml') + with open(pathof(outpm),'wb') as f: + f.write(pagemapxml.encode('utf-8')) + if DUMP: + print(pagemapproc.getNames()) + print(pagemapproc.getOffsets()) + print("\n\nPage Map") + print(pagemapxml) + + # process the toc ncx + # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num + print("Processing ncx / toc") + ncx = ncxExtract(mh, files) + ncx_data = ncx.parseNCX() + # extend the ncx data with filenames and proper internal idtags + for i in range(len(ncx_data)): + ncxmap = ncx_data[i] + [junk1, junk2, junk3, fid, junk4, off] = ncxmap['pos_fid'].split(':') + filename, idtag = k8proc.getIDTagByPosFid(fid, off) + ncxmap['filename'] = filename + ncxmap['idtag'] = unicode_str(idtag) + ncx_data[i] = ncxmap + + # convert the rawML to a set of xhtml files + print("Building an epub-like structure") + htmlproc = XHTMLK8Processor(rscnames, k8proc) + usedmap = htmlproc.buildXHTML() + + # write out the xhtml svg, and css files + # fileinfo = [skelid|coverpage, dir, name] + fileinfo = [] + # first create a cover page if none exists + if CREATE_COVER_PAGE: + cover = CoverProcessor(files, metadata, rscnames) + cover_img = utf8_str(cover.getImageName()) + need_to_create_cover_page = False + if cover_img is not None: + if k8resc is None or not k8resc.hasSpine(): + part = k8proc.getPart(0) + if part.find(cover_img) == -1: + need_to_create_cover_page = True + else: + if "coverpage" not in k8resc.spine_idrefs: + part = k8proc.getPart(int(k8resc.spine_order[0])) + if part.find(cover_img) == -1: + k8resc.prepend_to_spine("coverpage", "inserted", "no", None) + if k8resc.spine_order[0] == "coverpage": + need_to_create_cover_page = True + if need_to_create_cover_page: + filename = cover.getXHTMLName() + fileinfo.append(["coverpage", 'Text', filename]) + guidetext += cover.guide_toxml() + cover.writeXHTML() + + n = k8proc.getNumberOfParts() + for 
i in range(n): + part = k8proc.getPart(i) + [skelnum, dir, filename, beg, end, aidtext] = k8proc.getPartInfo(i) + fileinfo.append([str(skelnum), dir, filename]) + fname = os.path.join(files.k8oebps,dir,filename) + with open(pathof(fname),'wb') as f: + f.write(part) + n = k8proc.getNumberOfFlows() + for i in range(1, n): + [ptype, pformat, pdir, filename] = k8proc.getFlowInfo(i) + flowpart = k8proc.getFlow(i) + if pformat == b'file': + fileinfo.append([None, pdir, filename]) + fname = os.path.join(files.k8oebps,pdir,filename) + with open(pathof(fname),'wb') as f: + f.write(flowpart) + + # create the opf + opf = OPFProcessor(files, metadata.copy(), fileinfo, rscnames, True, mh, usedmap, + pagemapxml=pagemapxml, guidetext=guidetext, k8resc=k8resc, epubver=epubver) + uuid = opf.writeOPF(bool(obfuscate_data)) + + if opf.hasNCX(): + # Create a toc.ncx. + ncx.writeK8NCX(ncx_data, metadata) + if opf.hasNAV(): + # Create a navigation document. + nav = NAVProcessor(files) + nav.writeNAV(ncx_data, guidetext, metadata) + + # make an epub-like structure of it all + print("Creating an epub-like file") + files.makeEPUB(usedmap, obfuscate_data, uuid) + + +def processMobi7(mh, metadata, sect, files, rscnames): + global DUMP + global WRITE_RAW_DATA + # An original Mobi + rawML = mh.getRawML() + if DUMP or WRITE_RAW_DATA: + outraw = os.path.join(files.mobi7dir,files.getInputFileBasename() + '.rawml') + with open(pathof(outraw),'wb') as f: + f.write(rawML) + + # process the toc ncx + # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num + ncx = ncxExtract(mh, files) + ncx_data = ncx.parseNCX() + ncx.writeNCX(metadata) + + positionMap = {} + + # if Dictionary build up the positionMap + if mh.isDictionary(): + if mh.DictInLanguage(): + metadata['DictInLanguage'] = [mh.DictInLanguage()] + if mh.DictOutLanguage(): + metadata['DictOutLanguage'] = [mh.DictOutLanguage()] + positionMap = dictSupport(mh, sect).getPositionMap() + + # convert the rawml back to Mobi ml + proc = HTMLProcessor(files, metadata, rscnames) + srctext = proc.findAnchors(rawML, ncx_data, positionMap) + srctext, usedmap = proc.insertHREFS() + + # write the proper mobi html + fileinfo=[] + # fname = files.getInputFileBasename() + '.html' + fname = 'book.html' + fileinfo.append([None,'', fname]) + outhtml = os.path.join(files.mobi7dir, fname) + with open(pathof(outhtml), 'wb') as f: + f.write(srctext) + + # extract guidetext from srctext + guidetext =b'' + # no pagemap support for older mobis + # pagemapxml = None + guidematch = re.search(br'''<guide>(.*)</guide>''',srctext,re.IGNORECASE+re.DOTALL) + if guidematch: + guidetext = guidematch.group(1) + # sometimes old mobi guide from srctext horribly written so need to clean up + guidetext = guidetext.replace(b"\r", b"") + guidetext = guidetext.replace(b'<REFERENCE', b'<reference') + guidetext = guidetext.replace(b' HREF=', b' href=') + guidetext = guidetext.replace(b' TITLE=', b' title=') + guidetext = guidetext.replace(b' TYPE=', b' type=') + # reference must be a self-closing tag + # and any href must be replaced with filepos information + ref_tag_pattern = re.compile(br'''(<reference [^>]*>)''', re.IGNORECASE) + guidepieces = ref_tag_pattern.split(guidetext) + for i in range(1,len(guidepieces), 2): + reftag = guidepieces[i] + # remove any href there now to replace with filepos + reftag = re.sub(br'''href\s*=[^'"]*['"][^'"]*['"]''',b'', reftag) + # make sure the reference tag ends properly + if not reftag.endswith(b"/>"): + reftag = reftag[0:-1] + b"/>" + 
+            guidepieces[i] = reftag
+        guidetext = b''.join(guidepieces)
+        replacetext = br'''href="''' + utf8_str(fileinfo[0][2]) + br'''#filepos\1"'''
+        guidetext = re.sub(br'''filepos=['"]{0,1}0*(\d+)['"]{0,1}''', replacetext, guidetext)
+        guidetext += b'\n'
+
+    if 'StartOffset' in metadata:
+        for value in metadata['StartOffset']:
+            if int(value) == 0xffffffff:
+                value = '0'
+            starting_offset = value
+        # get guide items from metadata
+        metaguidetext = b'<reference type="text" href="' + utf8_str(fileinfo[0][2]) + b'#filepos' + utf8_str(starting_offset) + b'" />\n'
+        guidetext += metaguidetext
+
+    if isinstance(guidetext, binary_type):
+        guidetext = guidetext.decode(mh.codec)
+
+    # create an OPF
+    opf = OPFProcessor(files, metadata, fileinfo, rscnames, ncx.isNCX, mh, usedmap, guidetext=guidetext)
+    opf.writeOPF()
+
+
+def processUnknownSections(mh, sect, files, K8Boundary):
+    global DUMP
+    global TERMINATION_INDICATOR1
+    global TERMINATION_INDICATOR2
+    global TERMINATION_INDICATOR3
+    if DUMP:
+        print("Unpacking any remaining unknown records")
+    beg = mh.start
+    end = sect.num_sections
+    if beg < K8Boundary:
+        # then we're processing the first part of a combination file
+        end = K8Boundary
+    for i in range(beg, end):
+        if sect.sectiondescriptions[i] == "":
+            data = sect.loadSection(i)
+            type = data[0:4]
+            if type == TERMINATION_INDICATOR3:
+                description = "Termination Marker 3 Nulls"
+            elif type == TERMINATION_INDICATOR2:
+                description = "Termination Marker 2 Nulls"
+            elif type == TERMINATION_INDICATOR1:
+                description = "Termination Marker 1 Null"
+            elif type == b"INDX":
+                fname = "Unknown%05d_INDX.dat" % i
+                description = "Unknown INDX section"
+                if DUMP:
+                    outname = os.path.join(files.outdir, fname)
+                    with open(pathof(outname), 'wb') as f:
+                        f.write(data)
+                    print("Extracting %s: %s from section %d" % (description, fname, i))
+                    description = description + ", extracting as %s" % fname
+            else:
+                fname = "unknown%05d.dat" % i
+                description = "Mysterious Section, first four bytes %s" % describe(data[0:4])
+                if DUMP:
+                    outname = os.path.join(files.outdir, fname)
+                    with open(pathof(outname), 'wb') as f:
+                        f.write(data)
+                    print("Extracting %s: %s from section %d" % (description, fname, i))
+                    description = description + ", extracting as %s" % fname
+            sect.setsectiondescription(i, description)
+
+
+def process_all_mobi_headers(files, apnxfile, sect, mhlst, K8Boundary, k8only=False, epubver='2', use_hd=False):
+    global DUMP
+    global WRITE_RAW_DATA
+    rscnames = []
+    rsc_ptr = -1
+    k8resc = None
+    obfuscate_data = []
+    for mh in mhlst:
+        pagemapproc = None
+        if mh.isK8():
+            sect.setsectiondescription(mh.start, "KF8 Header")
+            mhname = os.path.join(files.outdir, "header_K8.dat")
+            print("Processing K8 section of book...")
+        elif mh.isPrintReplica():
+            sect.setsectiondescription(mh.start, "Print Replica Header")
+            mhname = os.path.join(files.outdir, "header_PR.dat")
+            print("Processing PrintReplica section of book...")
+        else:
+            if mh.version == 0:
+                sect.setsectiondescription(mh.start, "PalmDoc Header")
+            else:
+                sect.setsectiondescription(mh.start, "Mobipocket {0:d} Header".format(mh.version))
+            mhname = os.path.join(files.outdir, "header.dat")
+            print("Processing Mobipocket {0:d} section of book...".format(mh.version))
+
+        if DUMP:
+            # write out raw mobi header data
+            with open(pathof(mhname), 'wb') as f:
+                f.write(mh.header)
+
+        # process each mobi header
+        metadata = mh.getMetaData()
+        mh.describeHeader(DUMP)
+        if mh.isEncrypted():
+            raise unpackException('Book is encrypted')
+
+        pagemapproc = None
+
+        # first
handle all of the different resource sections: images, resources, fonts, and etc + # build up a list of image names to use to postprocess the ebook + + print("Unpacking images, resources, fonts, etc") + beg = mh.firstresource + end = sect.num_sections + if beg < K8Boundary: + # processing first part of a combination file + end = K8Boundary + + # Not sure the try/except is necessary, but just in case + try: + thumb_offset = int(metadata.get('ThumbOffset', ['-1'])[0]) + except: + thumb_offset = None + + cover_offset = int(metadata.get('CoverOffset', ['-1'])[0]) + if not CREATE_COVER_PAGE: + cover_offset = None + + for i in range(beg, end): + data = sect.loadSection(i) + type = data[0:4] + + # handle the basics first + if type in [b"FLIS", b"FCIS", b"FDST", b"DATP"]: + if DUMP: + fname = unicode_str(type) + "%05d" % i + if mh.isK8(): + fname += "_K8" + fname += '.dat' + outname= os.path.join(files.outdir, fname) + with open(pathof(outname), 'wb') as f: + f.write(data) + print("Dumping section {0:d} type {1:s} to file {2:s} ".format(i,unicode_str(type),outname)) + sect.setsectiondescription(i,"Type {0:s}".format(unicode_str(type))) + rscnames.append(None) + elif type == b"SRCS": + rscnames = processSRCS(i, files, rscnames, sect, data) + elif type == b"PAGE": + rscnames, pagemapproc = processPAGE(i, files, rscnames, sect, data, mh, pagemapproc) + elif type == b"CMET": + rscnames = processCMET(i, files, rscnames, sect, data) + elif type == b"FONT": + rscnames, obfuscate_data, rsc_ptr = processFONT(i, files, rscnames, sect, data, obfuscate_data, beg, rsc_ptr) + elif type == b"CRES": + rscnames, rsc_ptr = processCRES(i, files, rscnames, sect, data, beg, rsc_ptr, use_hd) + elif type == b"CONT": + rscnames = processCONT(i, files, rscnames, sect, data) + elif type == b"kind": + rscnames = processkind(i, files, rscnames, sect, data) + elif type == b'\xa0\xa0\xa0\xa0': + sect.setsectiondescription(i,"Empty_HD_Image/Resource_Placeholder") + rscnames.append(None) + rsc_ptr += 1 + elif type == b"RESC": + rscnames, k8resc = processRESC(i, files, rscnames, sect, data, k8resc) + elif data == EOF_RECORD: + sect.setsectiondescription(i,"End Of File") + rscnames.append(None) + elif data[0:8] == b"BOUNDARY": + sect.setsectiondescription(i,"BOUNDARY Marker") + rscnames.append(None) + else: + # if reached here should be an image ow treat as unknown + rscnames, rsc_ptr = processImage(i, files, rscnames, sect, data, beg, rsc_ptr, cover_offset, thumb_offset) + # done unpacking resources + + # Print Replica + if mh.isPrintReplica() and not k8only: + processPrintReplica(metadata, files, rscnames, mh) + continue + + # KF8 (Mobi 8) + if mh.isK8(): + processMobi8(mh, metadata, sect, files, rscnames, pagemapproc, k8resc, obfuscate_data, apnxfile, epubver) + + # Old Mobi (Mobi 7) + elif not k8only: + processMobi7(mh, metadata, sect, files, rscnames) + + # process any remaining unknown sections of the palm file + processUnknownSections(mh, sect, files, K8Boundary) + + return + + +def unpackBook(infile, outdir, apnxfile=None, epubver='2', use_hd=False, dodump=False, dowriteraw=False, dosplitcombos=False): + global DUMP + global WRITE_RAW_DATA + global SPLIT_COMBO_MOBIS + if DUMP or dodump: + DUMP = True + if WRITE_RAW_DATA or dowriteraw: + WRITE_RAW_DATA = True + if SPLIT_COMBO_MOBIS or dosplitcombos: + SPLIT_COMBO_MOBIS = True + + infile = unicode_str(infile) + outdir = unicode_str(outdir) + if apnxfile is not None: + apnxfile = unicode_str(apnxfile) + + files = fileNames(infile, outdir) + + # process the PalmDoc database 
header and verify it is a mobi
+    sect = Sectionizer(infile)
+    if sect.ident != b'BOOKMOBI' and sect.ident != b'TEXtREAd':
+        raise unpackException('Invalid file format')
+    if DUMP:
+        sect.dumppalmheader()
+    else:
+        print("Palm DB type: %s, %d sections." % (sect.ident.decode('utf-8'), sect.num_sections))
+
+    # scan sections to see if this is a compound mobi file (K8 format)
+    # and build a list of all mobi headers to process.
+    mhlst = []
+    mh = MobiHeader(sect, 0)
+    # if this is a mobi8-only file hasK8 here will be true
+    mhlst.append(mh)
+    K8Boundary = -1
+
+    if mh.isK8():
+        print("Unpacking a KF8 book...")
+        hasK8 = True
+    else:
+        # This is either a Mobipocket 7 or earlier, or a combi M7/KF8
+        # Find out which
+        hasK8 = False
+        for i in range(len(sect.sectionoffsets) - 1):
+            before, after = sect.sectionoffsets[i:i + 2]
+            if (after - before) == 8:
+                data = sect.loadSection(i)
+                if data == K8_BOUNDARY:
+                    sect.setsectiondescription(i, "Mobi/KF8 Boundary Section")
+                    mh = MobiHeader(sect, i + 1)
+                    hasK8 = True
+                    mhlst.append(mh)
+                    K8Boundary = i
+                    break
+        if hasK8:
+            print("Unpacking a Combination M{0:d}/KF8 book...".format(mh.version))
+            if SPLIT_COMBO_MOBIS:
+                # if this is a combination mobi7-mobi8 file split them up
+                mobisplit = mobi_split(infile)
+                if mobisplit.combo:
+                    outmobi7 = os.path.join(files.outdir, 'mobi7-' + files.getInputFileBasename() + '.mobi')
+                    outmobi8 = os.path.join(files.outdir, 'mobi8-' + files.getInputFileBasename() + '.azw3')
+                    with open(pathof(outmobi7), 'wb') as f:
+                        f.write(mobisplit.getResult7())
+                    with open(pathof(outmobi8), 'wb') as f:
+                        f.write(mobisplit.getResult8())
+        else:
+            print("Unpacking a Mobipocket {0:d} book...".format(mh.version))
+
+    if hasK8:
+        files.makeK8Struct()
+
+    process_all_mobi_headers(files, apnxfile, sect, mhlst, K8Boundary, False, epubver, use_hd)
+
+    if DUMP:
+        sect.dumpsectionsinfo()
+    return
+
+
+def usage(progname):
+    print("")
+    print("Description:")
+    print("  Unpacks an unencrypted Kindle/MobiPocket ebook to html and images")
+    print("  or an unencrypted Kindle/Print Replica ebook to PDF and images")
+    print("  into the specified output folder.")
+    print("Usage:")
+    print("  %s -r -s -p apnxfile -d -h --epub_version= infile [outdir]" % progname)
+    print("Options:")
+    print("    -h                 print this help message")
+    print("    -i                 use HD Images, if present, to overwrite reduced resolution images")
+    print("    -s                 split combination mobis into mobi7 and mobi8 ebooks")
+    print("    -p APNXFILE        path to an .apnx file associated with the azw3 input (optional)")
+    print("    --epub_version=    specify epub version to unpack to: 2, 3, A (for automatic) or ")
+    print("                       F (force to fit to epub2 definitions), default is 2")
+    print("    -d                 dump headers and other info to output and extra files")
+    print("    -r                 write raw data to the output folder")
+
+
+def main(argv=unicode_argv()):
+    global DUMP
+    global WRITE_RAW_DATA
+    global SPLIT_COMBO_MOBIS
+
+    print("KindleUnpack v0.83")
+    print("   Based on initial mobipocket version Copyright © 2009 Charles M. Hannum <root@ihack.net>")
+    print("   Extensive Extensions and Improvements Copyright © 2009-2020 ")
+    print("   by:  P. Durrant, K. Hendricks, S. Siebert, fandrieu, DiapDealer, nickredding, tkeo.")
+    print("   This program is free software: you can redistribute it and/or modify")
+    print("   it under the terms of the GNU General Public License as published by")
+    print("   the Free Software Foundation, version 3.")
+
+    progname = os.path.basename(argv[0])
+    try:
+        opts, args = getopt.getopt(argv[1:], "dhirsp:", ['epub_version='])
+    except getopt.GetoptError as err:
+        print(str(err))
+        usage(progname)
+        sys.exit(2)
+
+    if len(args) < 1:
+        usage(progname)
+        sys.exit(2)
+
+    apnxfile = None
+    epubver = '2'
+    use_hd = False
+
+    for o, a in opts:
+        if o == "-h":
+            usage(progname)
+            sys.exit(0)
+        if o == "-i":
+            use_hd = True
+        if o == "-d":
+            DUMP = True
+        if o == "-r":
+            WRITE_RAW_DATA = True
+        if o == "-s":
+            SPLIT_COMBO_MOBIS = True
+        if o == "-p":
+            apnxfile = a
+        if o == "--epub_version":
+            epubver = a
+
+    if len(args) > 1:
+        infile, outdir = args
+    else:
+        infile = args[0]
+        outdir = os.path.splitext(infile)[0]
+
+    infileext = os.path.splitext(infile)[1].upper()
+    if infileext not in ['.MOBI', '.PRC', '.AZW', '.AZW3', '.AZW4']:
+        print("Error: first parameter must be a Kindle/Mobipocket ebook or a Kindle/Print Replica ebook.")
+        return 1
+
+    try:
+        print('Unpacking Book...')
+        unpackBook(infile, outdir, apnxfile, epubver, use_hd)
+        print('Completed')
+
+    except ValueError as e:
+        print("Error: %s" % e)
+        print(traceback.format_exc())
+        return 1
+
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())
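# A minimal sketch of driving the unpacker programmatically instead of via the
# command line (the paths are hypothetical, and the import assumes the package
# layout introduced by this commit); unpackBook() is the entry point that the
# class interface mentioned in changelog 0.42 was built around.
from epy_reader.tools.KindleUnpack import kindleunpack

kindleunpack.unpackBook('/tmp/book.azw3', '/tmp/book_unpacked', epubver='2')
# For a KF8 book this should leave an epub-like tree under the output
# folder's mobi8/ directory; combination files also produce a mobi7/ tree.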
diff --git a/src/epy_reader/tools/KindleUnpack/mobi_cover.py b/src/epy_reader/tools/KindleUnpack/mobi_cover.py
new file mode 100644
index 0000000..3078ac4
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/mobi_cover.py
@@ -0,0 +1,238 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+from .compatibility_utils import unicode_str
+
+from .unipath import pathof
+import os
+import imghdr
+
+import struct
+# note: struct pack, unpack, unpack_from all require bytestring format
+# data all the way up to at least python 2.7.5, python 3 okay with bytestring
+
+USE_SVG_WRAPPER = True
+""" Set to True to use svg wrapper for default. """
+
+FORCE_DEFAULT_TITLE = False
+""" Set to True to force to use the default title. """
+
+COVER_PAGE_FILENAME = 'cover_page.xhtml'
+""" The name for the cover page. """
+
+DEFAULT_TITLE = 'Cover'
+""" The default title for the cover page. """
+
+MAX_WIDTH = 4096
+""" The max width for the svg cover page. """
+
+MAX_HEIGHT = 4096
+""" The max height for the svg cover page. """
+
+
+def get_image_type(imgname, imgdata=None):
+    imgtype = unicode_str(imghdr.what(pathof(imgname), imgdata))
+
+    # imghdr only checks for JFIF or Exif JPEG files. Apparently, there are some
+    # with only the magic JPEG bytes out there...
+    # ImageMagick handles those, so, do it too.
+    if imgtype is None:
+        if imgdata is None:
+            with open(pathof(imgname), 'rb') as f:
+                imgdata = f.read()
+        if imgdata[0:2] == b'\xFF\xD8':
+            # Get last non-null bytes
+            last = len(imgdata)
+            while (imgdata[last - 1:last] == b'\x00'):
+                last -= 1
+            # Be extra safe, check the trailing bytes, too.
+            if imgdata[last - 2:last] == b'\xFF\xD9':
+                imgtype = "jpeg"
+    return imgtype
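# A small illustration of the fallback above (a sketch; the byte string is a
# made-up minimal example, and it assumes get_image_type from this module is
# in scope): imghdr alone returns None for a JPEG lacking a JFIF/Exif marker,
# but the SOI/EOI magic bytes still identify it.
import imghdr
bare_jpeg = b'\xFF\xD8' + b'\x00' * 16 + b'\xFF\xD9'
assert imghdr.what(None, bare_jpeg) is None         # not JFIF/Exif
assert get_image_type(None, bare_jpeg) == 'jpeg'    # fallback sees SOI...EOI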
+
+
+def get_image_size(imgname, imgdata=None):
+    '''Determine the image type of imgname (or imgdata) and return its size.
+
+    Originally "Determine the image type of fhandle and return its size",
+    from draco.'''
+    if imgdata is None:
+        fhandle = open(pathof(imgname), 'rb')
+        head = fhandle.read(24)
+    else:
+        head = imgdata[0:24]
+    if len(head) != 24:
+        return
+
+    imgtype = get_image_type(imgname, imgdata)
+    if imgtype == 'png':
+        check = struct.unpack(b'>i', head[4:8])[0]
+        if check != 0x0d0a1a0a:
+            return
+        width, height = struct.unpack(b'>ii', head[16:24])
+    elif imgtype == 'gif':
+        width, height = struct.unpack(b'<HH', head[6:10])
+    elif imgtype == 'jpeg' and imgdata is None:
+        try:
+            fhandle.seek(0)  # Read 0xff next
+            size = 2
+            ftype = 0
+            while not 0xc0 <= ftype <= 0xcf:
+                fhandle.seek(size, 1)
+                byte = fhandle.read(1)
+                while ord(byte) == 0xff:
+                    byte = fhandle.read(1)
+                ftype = ord(byte)
+                size = struct.unpack(b'>H', fhandle.read(2))[0] - 2
+            # We are at a SOFn block
+            fhandle.seek(1, 1)  # Skip `precision' byte.
+            height, width = struct.unpack(b'>HH', fhandle.read(4))
+        except Exception:  # IGNORE:W0703
+            return
+    elif imgtype == 'jpeg' and imgdata is not None:
+        try:
+            pos = 0
+            size = 2
+            ftype = 0
+            while not 0xc0 <= ftype <= 0xcf:
+                pos += size
+                byte = imgdata[pos:pos + 1]
+                pos += 1
+                while ord(byte) == 0xff:
+                    byte = imgdata[pos:pos + 1]
+                    pos += 1
+                ftype = ord(byte)
+                size = struct.unpack(b'>H', imgdata[pos:pos + 2])[0] - 2
+                pos += 2
+            # We are at a SOFn block
+            pos += 1  # Skip `precision' byte.
+            height, width = struct.unpack(b'>HH', imgdata[pos:pos + 4])
+            pos += 4
+        except Exception:  # IGNORE:W0703
+            return
+    else:
+        return
+    return width, height
+
+
+# XXX experimental
+class CoverProcessor(object):
+
+    """Create a cover page."""
+
+    def __init__(self, files, metadata, rscnames, imgname=None, imgdata=None):
+        self.files = files
+        self.metadata = metadata
+        self.rscnames = rscnames
+        self.cover_page = COVER_PAGE_FILENAME
+        self.use_svg = USE_SVG_WRAPPER  # Use svg wrapper.
+        self.lang = metadata.get('Language', ['en'])[0]
+        # This should ensure that if the methods to find the cover image's
+        # dimensions should fail for any reason, the SVG routine will not be used.
+ [self.width, self.height] = (-1,-1) + if FORCE_DEFAULT_TITLE: + self.title = DEFAULT_TITLE + else: + self.title = metadata.get('Title', [DEFAULT_TITLE])[0] + + self.cover_image = None + if imgname is not None: + self.cover_image = imgname + elif 'CoverOffset' in metadata: + imageNumber = int(metadata['CoverOffset'][0]) + cover_image = self.rscnames[imageNumber] + if cover_image is not None: + self.cover_image = cover_image + else: + print('Warning: Cannot identify the cover image.') + if self.use_svg: + try: + if imgdata is None: + fname = os.path.join(files.imgdir, self.cover_image) + [self.width, self.height] = get_image_size(fname) + else: + [self.width, self.height] = get_image_size(None, imgdata) + except: + self.use_svg = False + width = self.width + height = self.height + if width < 0 or height < 0 or width > MAX_WIDTH or height > MAX_HEIGHT: + self.use_svg = False + return + + def getImageName(self): + return self.cover_image + + def getXHTMLName(self): + return self.cover_page + + def buildXHTML(self): + print('Building a cover page.') + files = self.files + cover_image = self.cover_image + title = self.title + lang = self.lang + + image_dir = os.path.normpath(os.path.relpath(files.k8images, files.k8text)) + image_path = os.path.join(image_dir, cover_image).replace('\\', '/') + + if not self.use_svg: + data = '' + data += '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html>' + data += '<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops"' + data += ' xml:lang="{:s}">\n'.format(lang) + data += '<head>\n<title>{:s}</title>\n'.format(title) + data += '<style type="text/css">\n' + data += 'body {\n margin: 0;\n padding: 0;\n text-align: center;\n}\n' + data += 'div {\n height: 100%;\n width: 100%;\n text-align: center;\n page-break-inside: avoid;\n}\n' + data += 'img {\n display: inline-block;\n height: 100%;\n margin: 0 auto;\n}\n' + data += '</style>\n</head>\n' + data += '<body><div>\n' + data += ' <img src="{:s}" alt=""/>\n'.format(image_path) + data += '</div></body>\n</html>' + else: + width = self.width + height = self.height + viewBox = "0 0 {0:d} {1:d}".format(width, height) + + data = '' + data += '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html>' + data += '<html xmlns="http://www.w3.org/1999/xhtml"' + data += ' xml:lang="{:s}">\n'.format(lang) + data += '<head>\n <title>{:s}</title>\n'.format(title) + data += '<style type="text/css">\n' + data += 'svg {padding: 0pt; margin:0pt}\n' + data += 'body { text-align: center; padding:0pt; margin: 0pt; }\n' + data += '</style>\n</head>\n' + data += '<body>\n <div>\n' + data += ' <svg xmlns="http://www.w3.org/2000/svg" height="100%" preserveAspectRatio="xMidYMid meet"' + data += ' version="1.1" viewBox="{0:s}" width="100%" xmlns:xlink="http://www.w3.org/1999/xlink">\n'.format(viewBox) + data += ' <image height="{0}" width="{1}" xlink:href="{2}"/>\n'.format(height, width, image_path) + data += ' </svg>\n' + data += ' </div>\n</body>\n</html>' + return data + + def writeXHTML(self): + files = self.files + cover_page = self.cover_page + + data = self.buildXHTML() + + outfile = os.path.join(files.k8text, cover_page) + if os.path.exists(pathof(outfile)): + print('Warning: {:s} already exists.'.format(cover_page)) + os.remove(pathof(outfile)) + with open(pathof(outfile), 'wb') as f: + f.write(data.encode('utf-8')) + return + + def guide_toxml(self): + files = self.files + text_dir = os.path.relpath(files.k8text, files.k8oebps) + data = '<reference type="cover" title="Cover" href="{:s}/{:s}" 
/>\n'.format( + text_dir, self.cover_page) + return data diff --git a/src/epy_reader/tools/KindleUnpack/mobi_dict.py b/src/epy_reader/tools/KindleUnpack/mobi_dict.py new file mode 100644 index 0000000..bfc2ea8 --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_dict.py @@ -0,0 +1,377 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import PY2, PY3, utf8_str, bstr, bchr + +if PY2: + range = xrange + array_format = b'B' +if PY3: + unichr = chr + array_format = "B" + +import array + +import struct +# note: struct pack, unpack, unpack_from all require bytestring format +# data all the way up to at least python 2.7.5, python 3 okay with bytestring + +from .mobi_index import getVariableWidthValue, readTagSection, getTagMap +from .mobi_utils import toHex + +DEBUG_DICT = False + +class InflectionData(object): + + def __init__(self, infldatas): + self.infldatas = infldatas + self.starts = [] + self.counts = [] + for idata in self.infldatas: + start, = struct.unpack_from(b'>L', idata, 0x14) + count, = struct.unpack_from(b'>L', idata, 0x18) + self.starts.append(start) + self.counts.append(count) + + def lookup(self, lookupvalue): + i = 0 + rvalue = lookupvalue + while rvalue >= self.counts[i]: + rvalue = rvalue - self.counts[i] + i += 1 + if i == len(self.counts): + print("Error: Problem with multiple inflections data sections") + return lookupvalue, self.starts[0], self.counts[0], self.infldatas[0] + return rvalue, self.starts[i], self.counts[i], self.infldatas[i] + + def offsets(self, value): + rvalue, start, count, data = self.lookup(value) + offset, = struct.unpack_from(b'>H', data, start + 4 + (2 * rvalue)) + if rvalue + 1 < count: + nextOffset, = struct.unpack_from(b'>H',data, start + 4 + (2 * (rvalue + 1))) + else: + nextOffset = None + return offset, nextOffset, data + + +class dictSupport(object): + + def __init__(self, mh, sect): + self.mh = mh + self.header = mh.header + self.sect = sect + self.metaOrthIndex = mh.metaOrthIndex + self.metaInflIndex = mh.metaInflIndex + + def parseHeader(self, data): + "read INDX header" + if not data[:4] == b'INDX': + print("Warning: index section is not INDX") + return False + words = ( + 'len', 'nul1', 'type', 'gen', 'start', 'count', 'code', + 'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc' + ) + num = len(words) + values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)]) + header = {} + for n in range(num): + header[words[n]] = values[n] + + ordt1 = None + ordt2 = None + + otype, oentries, op1, op2, otagx = struct.unpack_from(b'>LLLLL',data, 0xa4) + header['otype'] = otype + header['oentries'] = oentries + + if DEBUG_DICT: + print("otype %d, oentries %d, op1 %d, op2 %d, otagx %d" % (otype, oentries, op1, op2, otagx)) + + if header['code'] == 0xfdea or oentries > 0: + # some dictionaries seem to be codepage 65002 (0xFDEA) which seems + # to be some sort of strange EBCDIC utf-8 or 16 encoded strings + # So we need to look for them and store them away to process leading text + # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries + # we only ever seem to use the second but ... 
+ # + # if otype = 0, ORDT table uses 16 bit values as offsets into the table + # if otype = 1, ORDT table uses 8 bit values as offsets inot the table + + assert(data[op1:op1+4] == b'ORDT') + assert(data[op2:op2+4] == b'ORDT') + ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4) + ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4) + + if DEBUG_DICT: + print("parsed INDX header:") + for key in header: + print(key, "%x" % header[key],) + print("\n") + return header, ordt1, ordt2 + + def getPositionMap(self): + sect = self.sect + + positionMap = {} + + metaOrthIndex = self.metaOrthIndex + metaInflIndex = self.metaInflIndex + + decodeInflection = True + if metaOrthIndex != 0xFFFFFFFF: + print("Info: Document contains orthographic index, handle as dictionary") + if metaInflIndex == 0xFFFFFFFF: + decodeInflection = False + else: + metaInflIndexData = sect.loadSection(metaInflIndex) + + print("\nParsing metaInflIndexData") + midxhdr, mhordt1, mhordt2 = self.parseHeader(metaInflIndexData) + + metaIndexCount = midxhdr['count'] + idatas = [] + for j in range(metaIndexCount): + idatas.append(sect.loadSection(metaInflIndex + 1 + j)) + dinfl = InflectionData(idatas) + + inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount) + tagSectionStart = midxhdr['len'] + inflectionControlByteCount, inflectionTagTable = readTagSection(tagSectionStart, metaInflIndexData) + if DEBUG_DICT: + print("inflectionTagTable: %s" % inflectionTagTable) + if self.hasTag(inflectionTagTable, 0x07): + print("Error: Dictionary uses obsolete inflection rule scheme which is not yet supported") + decodeInflection = False + + data = sect.loadSection(metaOrthIndex) + + print("\nParsing metaOrthIndex") + idxhdr, hordt1, hordt2 = self.parseHeader(data) + + tagSectionStart = idxhdr['len'] + controlByteCount, tagTable = readTagSection(tagSectionStart, data) + orthIndexCount = idxhdr['count'] + print("orthIndexCount is", orthIndexCount) + if DEBUG_DICT: + print("orthTagTable: %s" % tagTable) + if hordt2 is not None: + print("orth entry uses ordt2 lookup table of type ", idxhdr['otype']) + hasEntryLength = self.hasTag(tagTable, 0x02) + if not hasEntryLength: + print("Info: Index doesn't contain entry length tags") + + print("Read dictionary index data") + for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount): + data = sect.loadSection(i) + hdrinfo, ordt1, ordt2 = self.parseHeader(data) + idxtPos = hdrinfo['start'] + entryCount = hdrinfo['count'] + idxPositions = [] + for j in range(entryCount): + pos, = struct.unpack_from(b'>H', data, idxtPos + 4 + (2 * j)) + idxPositions.append(pos) + # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!) 
+ idxPositions.append(idxtPos) + for j in range(entryCount): + startPos = idxPositions[j] + endPos = idxPositions[j+1] + textLength = ord(data[startPos:startPos+1]) + text = data[startPos+1:startPos+1+textLength] + if hordt2 is not None: + utext = u"" + if idxhdr['otype'] == 0: + pattern = b'>H' + inc = 2 + else: + pattern = b'>B' + inc = 1 + pos = 0 + while pos < textLength: + off, = struct.unpack_from(pattern, text, pos) + if off < len(hordt2): + utext += unichr(hordt2[off]) + else: + utext += unichr(off) + pos += inc + text = utext.encode('utf-8') + + tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos) + if 0x01 in tagMap: + if decodeInflection and 0x2a in tagMap: + inflectionGroups = self.getInflectionGroups(text, inflectionControlByteCount, inflectionTagTable, + dinfl, inflNameData, tagMap[0x2a]) + else: + inflectionGroups = b'' + assert len(tagMap[0x01]) == 1 + entryStartPosition = tagMap[0x01][0] + if hasEntryLength: + # The idx:entry attribute "scriptable" must be present to create entry length tags. + ml = b'<idx:entry scriptable="yes"><idx:orth value="' + text + b'">' + inflectionGroups + b'</idx:orth>' + if entryStartPosition in positionMap: + positionMap[entryStartPosition] = positionMap[entryStartPosition] + ml + else: + positionMap[entryStartPosition] = ml + assert len(tagMap[0x02]) == 1 + entryEndPosition = entryStartPosition + tagMap[0x02][0] + if entryEndPosition in positionMap: + positionMap[entryEndPosition] = b"</idx:entry>" + positionMap[entryEndPosition] + else: + positionMap[entryEndPosition] = b"</idx:entry>" + + else: + indexTags = b'<idx:entry>\n<idx:orth value="' + text + b'">\n' + inflectionGroups + b'</idx:entry>\n' + if entryStartPosition in positionMap: + positionMap[entryStartPosition] = positionMap[entryStartPosition] + indexTags + else: + positionMap[entryStartPosition] = indexTags + return positionMap + + def hasTag(self, tagTable, tag): + ''' + Test if tag table contains given tag. + + @param tagTable: The tag table. + @param tag: The tag to search. + @return: True if tag table contains given tag; False otherwise. + ''' + for currentTag, _, _, _ in tagTable: + if currentTag == tag: + return True + return False + + def getInflectionGroups(self, mainEntry, controlByteCount, tagTable, dinfl, inflectionNames, groupList): + ''' + Create string which contains the inflection groups with inflection rules as mobipocket tags. + + @param mainEntry: The word to inflect. + @param controlByteCount: The number of control bytes. + @param tagTable: The tag table. + @param data: The Inflection data object to properly select the right inflection data section to use + @param inflectionNames: The inflection rule name data. + @param groupList: The list of inflection groups to process. + @return: String with inflection groups and rules or empty string if required tags are not available. + ''' + result = b"" + for value in groupList: + offset, nextOffset, data = dinfl.offsets(value) + + # First byte seems to be always 0x00 and must be skipped. + assert ord(data[offset:offset+1]) == 0x00 + tagMap = getTagMap(controlByteCount, tagTable, data, offset + 1, nextOffset) + + # Make sure that the required tags are available. + if 0x05 not in tagMap: + print("Error: Required tag 0x05 not found in tagMap") + return "" + if 0x1a not in tagMap: + print("Error: Required tag 0x1a not found in tagMap") + return b'' + + result += b'<idx:infl>' + + for i in range(len(tagMap[0x05])): + + # Get name of inflection rule. 
+                value = tagMap[0x05][i]
+                consumed, textLength = getVariableWidthValue(inflectionNames, value)
+                inflectionName = inflectionNames[value + consumed:value + consumed + textLength]
+
+                # Get and apply inflection rule across possibly multiple inflection data sections
+                value = tagMap[0x1a][i]
+                rvalue, start, count, data = dinfl.lookup(value)
+                offset, = struct.unpack_from(b'>H', data, start + 4 + (2 * rvalue))
+                textLength = ord(data[offset:offset + 1])
+                inflection = self.applyInflectionRule(mainEntry, data, offset + 1, offset + 1 + textLength)
+                if inflection is not None:
+                    result += b'  <idx:iform name="' + inflectionName + b'" value="' + inflection + b'"/>'
+
+            result += b'</idx:infl>'
+        return result
+
+    def applyInflectionRule(self, mainEntry, inflectionRuleData, start, end):
+        '''
+        Apply inflection rule.
+
+        @param mainEntry: The word to inflect.
+        @param inflectionRuleData: The inflection rules.
+        @param start: The start position of the inflection rule to use.
+        @param end: The end position of the inflection rule to use.
+        @return: The string with the inflected word or None if an error occurs.
+        '''
+        mode = -1
+        byteArray = array.array(array_format, mainEntry)
+        position = len(byteArray)
+        for charOffset in range(start, end):
+            char = inflectionRuleData[charOffset:charOffset + 1]
+            abyte = ord(char)
+            if abyte >= 0x0a and abyte <= 0x13:
+                # Move cursor backwards
+                offset = abyte - 0x0a
+                if mode not in [0x02, 0x03]:
+                    mode = 0x02
+                    position = len(byteArray)
+                position -= offset
+            elif abyte > 0x13:
+                if mode == -1:
+                    print("Error: Unexpected first byte %i of inflection rule" % abyte)
+                    return None
+                elif position == -1:
+                    print("Error: Unexpected first byte %i of inflection rule" % abyte)
+                    return None
+                else:
+                    if mode == 0x01:
+                        # Insert at word start
+                        byteArray.insert(position, abyte)
+                        position += 1
+                    elif mode == 0x02:
+                        # Insert at word end
+                        byteArray.insert(position, abyte)
+                    elif mode == 0x03:
+                        # Delete at word end
+                        position -= 1
+                        deleted = byteArray.pop(position)
+                        if bchr(deleted) != char:
+                            if DEBUG_DICT:
+                                print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted)))
+                            print("Error: Delete operation of inflection rule failed")
+                            return None
+                    elif mode == 0x04:
+                        # Delete at word start
+                        deleted = byteArray.pop(position)
+                        if bchr(deleted) != char:
+                            if DEBUG_DICT:
+                                print("0x04: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted)))
+                            print("Error: Delete operation of inflection rule failed")
+                            return None
+                    else:
+                        print("Error: Inflection rule mode %x is not implemented" % mode)
+                        return None
+            elif abyte == 0x01:
+                # Insert at word start
+                if mode not in [0x01, 0x04]:
+                    position = 0
+                mode = abyte
+            elif abyte == 0x02:
+                # Insert at word end
+                if mode not in [0x02, 0x03]:
+                    position = len(byteArray)
+                mode = abyte
+            elif abyte == 0x03:
+                # Delete at word end
+                if mode not in [0x02, 0x03]:
+                    position = len(byteArray)
+                mode = abyte
+            elif abyte == 0x04:
+                # Delete at word start
+                if mode not in [0x01, 0x04]:
+                    position = 0
+                mode = abyte
+            else:
+                print("Error: Inflection rule mode %x is not implemented" % abyte)
+                return None
+        # tobytes(): array.tostring() was removed in Python 3.9
+        return utf8_str(byteArray.tobytes())
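# A worked example of the rule bytes handled above (a sketch; the rule value
# is hypothetical, and this assumes dictSupport from this module is in scope):
# 0x02 selects "insert at word end" mode, and since each literal byte is
# inserted at a fixed position, a suffix is stored reversed. The method never
# touches self, so it can be exercised standalone:
rule = b'\x02de'  # append "ed" to the base word ("ed" stored reversed)
assert dictSupport.applyInflectionRule(None, b'look', rule, 0, len(rule)) == b'looked'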
vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +DEBUG_USE_ORDERED_DICTIONARY = False # OrderedDict is supoorted >= python 2.7. +""" set to True to use OrderedDict for MobiHeader.metadata.""" + +if DEBUG_USE_ORDERED_DICTIONARY: + from collections import OrderedDict as dict_ +else: + dict_ = dict + +from .compatibility_utils import PY2, unicode_str, hexlify, bord + +if PY2: + range = xrange + +import struct +import uuid + +# import the mobiunpack support libraries +from .mobi_utils import getLanguage +from .mobi_uncompress import HuffcdicReader, PalmdocReader, UncompressedReader + +class unpackException(Exception): + pass + + +def sortedHeaderKeys(mheader): + hdrkeys = sorted(list(mheader.keys()), key=lambda akey: mheader[akey][0]) + return hdrkeys + + +# HD Containers have their own headers and their own EXTH +# this is just guesswork so far, making big assumption that +# metavalue key numbers remain the same in the CONT EXTH + +# Note: The layout of the CONT Header is still unknown +# so just deal with their EXTH sections for now + +def dump_contexth(cpage, extheader): + # determine text encoding + codec = 'windows-1252' + codec_map = { + 1252 : 'windows-1252', + 65001: 'utf-8', + } + if cpage in codec_map: + codec = codec_map[cpage] + if extheader == b'': + return + id_map_strings = { + 1 : 'Drm Server Id', + 2 : 'Drm Commerce Id', + 3 : 'Drm Ebookbase Book Id', + 4 : 'Drm Ebookbase Dep Id', + 100 : 'Creator', + 101 : 'Publisher', + 102 : 'Imprint', + 103 : 'Description', + 104 : 'ISBN', + 105 : 'Subject', + 106 : 'Published', + 107 : 'Review', + 108 : 'Contributor', + 109 : 'Rights', + 110 : 'SubjectCode', + 111 : 'Type', + 112 : 'Source', + 113 : 'ASIN', + # 114 : 'versionNumber', + 117 : 'Adult', + 118 : 'Retail-Price', + 119 : 'Retail-Currency', + 120 : 'TSC', + 122 : 'fixed-layout', + 123 : 'book-type', + 124 : 'orientation-lock', + 126 : 'original-resolution', + 127 : 'zero-gutter', + 128 : 'zero-margin', + 129 : 'MetadataResourceURI', + 132 : 'RegionMagnification', + 150 : 'LendingEnabled', + 200 : 'DictShortName', + 501 : 'cdeType', + 502 : 'last_update_time', + 503 : 'Updated_Title', + 504 : 'CDEContentKey', + 505 : 'AmazonContentReference', + 506 : 'Title-Language', + 507 : 'Title-Display-Direction', + 508 : 'Title-Pronunciation', + 509 : 'Title-Collation', + 510 : 'Secondary-Title', + 511 : 'Secondary-Title-Language', + 512 : 'Secondary-Title-Direction', + 513 : 'Secondary-Title-Pronunciation', + 514 : 'Secondary-Title-Collation', + 515 : 'Author-Language', + 516 : 'Author-Display-Direction', + 517 : 'Author-Pronunciation', + 518 : 'Author-Collation', + 519 : 'Author-Type', + 520 : 'Publisher-Language', + 521 : 'Publisher-Display-Direction', + 522 : 'Publisher-Pronunciation', + 523 : 'Publisher-Collation', + 524 : 'Content-Language-Tag', + 525 : 'primary-writing-mode', + 526 : 'NCX-Ingested-By-Software', + 527 : 'page-progression-direction', + 528 : 'override-kindle-fonts', + 529 : 'Compression-Upgraded', + 530 : 'Soft-Hyphens-In-Content', + 531 : 'Dictionary_In_Langague', + 532 : 'Dictionary_Out_Language', + 533 : 'Font_Converted', + 534 : 'Amazon_Creator_Info', + 535 : 'Creator-Build-Tag', + 536 : 'HD-Media-Containers-Info', # CONT_Header is 0, Ends with CONTAINER_BOUNDARY (or Asset_Type?) 
+ 538 : 'Resource-Container-Fidelity', + 539 : 'HD-Container-Mimetype', + 540 : 'Sample-For_Special-Purpose', + 541 : 'Kindletool-Operation-Information', + 542 : 'Container_Id', + 543 : 'Asset-Type', # FONT_CONTAINER, BW_CONTAINER, HD_CONTAINER + 544 : 'Unknown_544', + } + id_map_values = { + 114 : 'versionNumber', + 115 : 'sample', + 116 : 'StartOffset', + 121 : 'Mobi8-Boundary-Section', + 125 : 'Embedded-Record-Count', + 130 : 'Offline-Sample', + 131 : 'Metadata-Record-Offset', + 201 : 'CoverOffset', + 202 : 'ThumbOffset', + 203 : 'HasFakeCover', + 204 : 'Creator-Software', + 205 : 'Creator-Major-Version', + 206 : 'Creator-Minor-Version', + 207 : 'Creator-Build-Number', + 401 : 'Clipping-Limit', + 402 : 'Publisher-Limit', + 404 : 'Text-to-Speech-Disabled', + 406 : 'Rental-Expiration-Time', + } + id_map_hexstrings = { + 208 : 'Watermark_(hex)', + 209 : 'Tamper-Proof-Keys_(hex)', + 300 : 'Font-Signature_(hex)', + 403 : 'Unknown_(403)_(hex)', + 405 : 'Ownership-Type_(hex)', + 407 : 'Unknown_(407)_(hex)', + 420 : 'Multimedia-Content-Reference_(hex)', + 450 : 'Locations_Match_(hex)', + 451 : 'Full-Story-Length_(hex)', + 452 : 'Sample-Start_Location_(hex)', + 453 : 'Sample-End-Location_(hex)', + } + _length, num_items = struct.unpack(b'>LL', extheader[4:12]) + extheader = extheader[12:] + pos = 0 + for _ in range(num_items): + id, size = struct.unpack(b'>LL', extheader[pos:pos+8]) + content = extheader[pos + 8: pos + size] + if id in id_map_strings: + name = id_map_strings[id] + print('\n Key: "%s"\n Value: "%s"' % (name, content.decode(codec, errors='replace'))) + elif id in id_map_values: + name = id_map_values[id] + if size == 9: + value, = struct.unpack(b'B',content) + print('\n Key: "%s"\n Value: 0x%01x' % (name, value)) + elif size == 10: + value, = struct.unpack(b'>H',content) + print('\n Key: "%s"\n Value: 0x%02x' % (name, value)) + elif size == 12: + value, = struct.unpack(b'>L',content) + print('\n Key: "%s"\n Value: 0x%04x' % (name, value)) + else: + print("\nError: Value for %s has unexpected size of %s" % (name, size)) + elif id in id_map_hexstrings: + name = id_map_hexstrings[id] + print('\n Key: "%s"\n Value: 0x%s' % (name, hexlify(content))) + else: + print("\nWarning: Unknown metadata with id %s found" % id) + name = str(id) + ' (hex)' + print(' Key: "%s"\n Value: 0x%s' % (name, hexlify(content))) + pos += size + return + + +class MobiHeader: + # all values are packed in big endian format + palmdoc_header = { + 'compression_type' : (0x00, b'>H', 2), + 'fill0' : (0x02, b'>H', 2), + 'text_length' : (0x04, b'>L', 4), + 'text_records' : (0x08, b'>H', 2), + 'max_section_size' : (0x0a, b'>H', 2), + 'read_pos ' : (0x0c, b'>L', 4), + } + + mobi6_header = { + 'compression_type' : (0x00, b'>H', 2), + 'fill0' : (0x02, b'>H', 2), + 'text_length' : (0x04, b'>L', 4), + 'text_records' : (0x08, b'>H', 2), + 'max_section_size' : (0x0a, b'>H', 2), + 'crypto_type' : (0x0c, b'>H', 2), + 'fill1' : (0x0e, b'>H', 2), + 'magic' : (0x10, b'4s', 4), + 'header_length (from MOBI)' : (0x14, b'>L', 4), + 'type' : (0x18, b'>L', 4), + 'codepage' : (0x1c, b'>L', 4), + 'unique_id' : (0x20, b'>L', 4), + 'version' : (0x24, b'>L', 4), + 'metaorthindex' : (0x28, b'>L', 4), + 'metainflindex' : (0x2c, b'>L', 4), + 'index_names' : (0x30, b'>L', 4), + 'index_keys' : (0x34, b'>L', 4), + 'extra_index0' : (0x38, b'>L', 4), + 'extra_index1' : (0x3c, b'>L', 4), + 'extra_index2' : (0x40, b'>L', 4), + 'extra_index3' : (0x44, b'>L', 4), + 'extra_index4' : (0x48, b'>L', 4), + 'extra_index5' : (0x4c, b'>L', 4), + 
'first_nontext' : (0x50, b'>L', 4), + 'title_offset' : (0x54, b'>L', 4), + 'title_length' : (0x58, b'>L', 4), + 'language_code' : (0x5c, b'>L', 4), + 'dict_in_lang' : (0x60, b'>L', 4), + 'dict_out_lang' : (0x64, b'>L', 4), + 'min_version' : (0x68, b'>L', 4), + 'first_resc_offset' : (0x6c, b'>L', 4), + 'huff_offset' : (0x70, b'>L', 4), + 'huff_num' : (0x74, b'>L', 4), + 'huff_tbl_offset' : (0x78, b'>L', 4), + 'huff_tbl_len' : (0x7c, b'>L', 4), + 'exth_flags' : (0x80, b'>L', 4), + 'fill3_a' : (0x84, b'>L', 4), + 'fill3_b' : (0x88, b'>L', 4), + 'fill3_c' : (0x8c, b'>L', 4), + 'fill3_d' : (0x90, b'>L', 4), + 'fill3_e' : (0x94, b'>L', 4), + 'fill3_f' : (0x98, b'>L', 4), + 'fill3_g' : (0x9c, b'>L', 4), + 'fill3_h' : (0xa0, b'>L', 4), + 'unknown0' : (0xa4, b'>L', 4), + 'drm_offset' : (0xa8, b'>L', 4), + 'drm_count' : (0xac, b'>L', 4), + 'drm_size' : (0xb0, b'>L', 4), + 'drm_flags' : (0xb4, b'>L', 4), + 'fill4_a' : (0xb8, b'>L', 4), + 'fill4_b' : (0xbc, b'>L', 4), + 'first_content' : (0xc0, b'>H', 2), + 'last_content' : (0xc2, b'>H', 2), + 'unknown0' : (0xc4, b'>L', 4), + 'fcis_offset' : (0xc8, b'>L', 4), + 'fcis_count' : (0xcc, b'>L', 4), + 'flis_offset' : (0xd0, b'>L', 4), + 'flis_count' : (0xd4, b'>L', 4), + 'unknown1' : (0xd8, b'>L', 4), + 'unknown2' : (0xdc, b'>L', 4), + 'srcs_offset' : (0xe0, b'>L', 4), + 'srcs_count' : (0xe4, b'>L', 4), + 'unknown3' : (0xe8, b'>L', 4), + 'unknown4' : (0xec, b'>L', 4), + 'fill5' : (0xf0, b'>H', 2), + 'traildata_flags' : (0xf2, b'>H', 2), + 'ncx_index' : (0xf4, b'>L', 4), + 'unknown5' : (0xf8, b'>L', 4), + 'unknown6' : (0xfc, b'>L', 4), + 'datp_offset' : (0x100, b'>L', 4), + 'unknown7' : (0x104, b'>L', 4), + 'Unknown ' : (0x108, b'>L', 4), + 'Unknown ' : (0x10C, b'>L', 4), + 'Unknown ' : (0x110, b'>L', 4), + 'Unknown ' : (0x114, b'>L', 4), + 'Unknown ' : (0x118, b'>L', 4), + 'Unknown ' : (0x11C, b'>L', 4), + 'Unknown ' : (0x120, b'>L', 4), + 'Unknown ' : (0x124, b'>L', 4), + 'Unknown ' : (0x128, b'>L', 4), + 'Unknown ' : (0x12C, b'>L', 4), + 'Unknown ' : (0x130, b'>L', 4), + 'Unknown ' : (0x134, b'>L', 4), + 'Unknown ' : (0x138, b'>L', 4), + 'Unknown ' : (0x11C, b'>L', 4), + } + + mobi8_header = { + 'compression_type' : (0x00, b'>H', 2), + 'fill0' : (0x02, b'>H', 2), + 'text_length' : (0x04, b'>L', 4), + 'text_records' : (0x08, b'>H', 2), + 'max_section_size' : (0x0a, b'>H', 2), + 'crypto_type' : (0x0c, b'>H', 2), + 'fill1' : (0x0e, b'>H', 2), + 'magic' : (0x10, b'4s', 4), + 'header_length (from MOBI)' : (0x14, b'>L', 4), + 'type' : (0x18, b'>L', 4), + 'codepage' : (0x1c, b'>L', 4), + 'unique_id' : (0x20, b'>L', 4), + 'version' : (0x24, b'>L', 4), + 'metaorthindex' : (0x28, b'>L', 4), + 'metainflindex' : (0x2c, b'>L', 4), + 'index_names' : (0x30, b'>L', 4), + 'index_keys' : (0x34, b'>L', 4), + 'extra_index0' : (0x38, b'>L', 4), + 'extra_index1' : (0x3c, b'>L', 4), + 'extra_index2' : (0x40, b'>L', 4), + 'extra_index3' : (0x44, b'>L', 4), + 'extra_index4' : (0x48, b'>L', 4), + 'extra_index5' : (0x4c, b'>L', 4), + 'first_nontext' : (0x50, b'>L', 4), + 'title_offset' : (0x54, b'>L', 4), + 'title_length' : (0x58, b'>L', 4), + 'language_code' : (0x5c, b'>L', 4), + 'dict_in_lang' : (0x60, b'>L', 4), + 'dict_out_lang' : (0x64, b'>L', 4), + 'min_version' : (0x68, b'>L', 4), + 'first_resc_offset' : (0x6c, b'>L', 4), + 'huff_offset' : (0x70, b'>L', 4), + 'huff_num' : (0x74, b'>L', 4), + 'huff_tbl_offset' : (0x78, b'>L', 4), + 'huff_tbl_len' : (0x7c, b'>L', 4), + 'exth_flags' : (0x80, b'>L', 4), + 'fill3_a' : (0x84, b'>L', 4), + 'fill3_b' : (0x88, b'>L', 4), + 'fill3_c' 
: (0x8c, b'>L', 4), + 'fill3_d' : (0x90, b'>L', 4), + 'fill3_e' : (0x94, b'>L', 4), + 'fill3_f' : (0x98, b'>L', 4), + 'fill3_g' : (0x9c, b'>L', 4), + 'fill3_h' : (0xa0, b'>L', 4), + 'unknown0' : (0xa4, b'>L', 4), + 'drm_offset' : (0xa8, b'>L', 4), + 'drm_count' : (0xac, b'>L', 4), + 'drm_size' : (0xb0, b'>L', 4), + 'drm_flags' : (0xb4, b'>L', 4), + 'fill4_a' : (0xb8, b'>L', 4), + 'fill4_b' : (0xbc, b'>L', 4), + 'fdst_offset' : (0xc0, b'>L', 4), + 'fdst_flow_count' : (0xc4, b'>L', 4), + 'fcis_offset' : (0xc8, b'>L', 4), + 'fcis_count' : (0xcc, b'>L', 4), + 'flis_offset' : (0xd0, b'>L', 4), + 'flis_count' : (0xd4, b'>L', 4), + 'unknown1' : (0xd8, b'>L', 4), + 'unknown2' : (0xdc, b'>L', 4), + 'srcs_offset' : (0xe0, b'>L', 4), + 'srcs_count' : (0xe4, b'>L', 4), + 'unknown3' : (0xe8, b'>L', 4), + 'unknown4' : (0xec, b'>L', 4), + 'fill5' : (0xf0, b'>H', 2), + 'traildata_flags' : (0xf2, b'>H', 2), + 'ncx_index' : (0xf4, b'>L', 4), + 'fragment_index' : (0xf8, b'>L', 4), + 'skeleton_index' : (0xfc, b'>L', 4), + 'datp_offset' : (0x100, b'>L', 4), + 'guide_index' : (0x104, b'>L', 4), + 'Unknown ' : (0x108, b'>L', 4), + 'Unknown ' : (0x10C, b'>L', 4), + 'Unknown ' : (0x110, b'>L', 4), + 'Unknown ' : (0x114, b'>L', 4), + 'Unknown ' : (0x118, b'>L', 4), + 'Unknown ' : (0x11C, b'>L', 4), + 'Unknown ' : (0x120, b'>L', 4), + 'Unknown ' : (0x124, b'>L', 4), + 'Unknown ' : (0x128, b'>L', 4), + 'Unknown ' : (0x12C, b'>L', 4), + 'Unknown ' : (0x130, b'>L', 4), + 'Unknown ' : (0x134, b'>L', 4), + 'Unknown ' : (0x138, b'>L', 4), + 'Unknown ' : (0x11C, b'>L', 4), + } + + palmdoc_header_sorted_keys = sortedHeaderKeys(palmdoc_header) + mobi6_header_sorted_keys = sortedHeaderKeys(mobi6_header) + mobi8_header_sorted_keys = sortedHeaderKeys(mobi8_header) + + id_map_strings = { + 1 : 'Drm Server Id', + 2 : 'Drm Commerce Id', + 3 : 'Drm Ebookbase Book Id', + 4 : 'Drm Ebookbase Dep Id', + 100 : 'Creator', + 101 : 'Publisher', + 102 : 'Imprint', + 103 : 'Description', + 104 : 'ISBN', + 105 : 'Subject', + 106 : 'Published', + 107 : 'Review', + 108 : 'Contributor', + 109 : 'Rights', + 110 : 'SubjectCode', + 111 : 'Type', + 112 : 'Source', + 113 : 'ASIN', + # 114 : 'versionNumber', + 117 : 'Adult', + 118 : 'Retail-Price', + 119 : 'Retail-Currency', + 120 : 'TSC', + 122 : 'fixed-layout', + 123 : 'book-type', + 124 : 'orientation-lock', + 126 : 'original-resolution', + 127 : 'zero-gutter', + 128 : 'zero-margin', + 129 : 'MetadataResourceURI', + 132 : 'RegionMagnification', + 150 : 'LendingEnabled', + 200 : 'DictShortName', + 501 : 'cdeType', + 502 : 'last_update_time', + 503 : 'Updated_Title', + 504 : 'CDEContentKey', + 505 : 'AmazonContentReference', + 506 : 'Title-Language', + 507 : 'Title-Display-Direction', + 508 : 'Title-Pronunciation', + 509 : 'Title-Collation', + 510 : 'Secondary-Title', + 511 : 'Secondary-Title-Language', + 512 : 'Secondary-Title-Direction', + 513 : 'Secondary-Title-Pronunciation', + 514 : 'Secondary-Title-Collation', + 515 : 'Author-Language', + 516 : 'Author-Display-Direction', + 517 : 'Author-Pronunciation', + 518 : 'Author-Collation', + 519 : 'Author-Type', + 520 : 'Publisher-Language', + 521 : 'Publisher-Display-Direction', + 522 : 'Publisher-Pronunciation', + 523 : 'Publisher-Collation', + 524 : 'Content-Language-Tag', + 525 : 'primary-writing-mode', + 526 : 'NCX-Ingested-By-Software', + 527 : 'page-progression-direction', + 528 : 'override-kindle-fonts', + 529 : 'Compression-Upgraded', + 530 : 'Soft-Hyphens-In-Content', + 531 : 'Dictionary_In_Langague', + 532 : 'Dictionary_Out_Language', + 533 : 
'Font_Converted', + 534 : 'Amazon_Creator_Info', + 535 : 'Creator-Build-Tag', + 536 : 'HD-Media-Containers-Info', # CONT_Header is 0, Ends with CONTAINER_BOUNDARY (or Asset_Type?) + 538 : 'Resource-Container-Fidelity', + 539 : 'HD-Container-Mimetype', + 540 : 'Sample-For_Special-Purpose', + 541 : 'Kindletool-Operation-Information', + 542 : 'Container_Id', + 543 : 'Asset-Type', # FONT_CONTAINER, BW_CONTAINER, HD_CONTAINER + 544 : 'Unknown_544', + } + id_map_values = { + 114 : 'versionNumber', + 115 : 'sample', + 116 : 'StartOffset', + 121 : 'Mobi8-Boundary-Section', + 125 : 'Embedded-Record-Count', + 130 : 'Offline-Sample', + 131 : 'Metadata-Record-Offset', + 201 : 'CoverOffset', + 202 : 'ThumbOffset', + 203 : 'HasFakeCover', + 204 : 'Creator-Software', + 205 : 'Creator-Major-Version', + 206 : 'Creator-Minor-Version', + 207 : 'Creator-Build-Number', + 401 : 'Clipping-Limit', + 402 : 'Publisher-Limit', + 404 : 'Text-to-Speech-Disabled', + 406 : 'Rental-Expiration-Time', + } + id_map_hexstrings = { + 208 : 'Watermark_(hex)', + 209 : 'Tamper-Proof-Keys_(hex)', + 300 : 'Font-Signature_(hex)', + 403 : 'Unknown_(403)_(hex)', + 405 : 'Ownership-Type_(hex)', + 407 : 'Unknown_(407)_(hex)', + 420 : 'Multimedia-Content-Reference_(hex)', + 450 : 'Locations_Match_(hex)', + 451 : 'Full-Story-Length_(hex)', + 452 : 'Sample-Start_Location_(hex)', + 453 : 'Sample-End-Location_(hex)', + } + + def __init__(self, sect, sectNumber): + self.sect = sect + self.start = sectNumber + self.header = self.sect.loadSection(self.start) + if len(self.header)>20 and self.header[16:20] == b'MOBI': + self.sect.setsectiondescription(0,"Mobipocket Header") + self.palm = False + elif self.sect.ident == b'TEXtREAd': + self.sect.setsectiondescription(0, "PalmDOC Header") + self.palm = True + else: + raise unpackException('Unknown File Format') + + self.records, = struct.unpack_from(b'>H', self.header, 0x8) + + # set defaults in case this is a PalmDOC + self.title = self.sect.palmname.decode('latin-1', errors='replace') + self.length = len(self.header)-16 + self.type = 3 + self.codepage = 1252 + self.codec = 'windows-1252' + self.unique_id = 0 + self.version = 0 + self.hasExth = False + self.exth = b'' + self.exth_offset = self.length + 16 + self.exth_length = 0 + self.crypto_type = 0 + self.firstnontext = self.start+self.records + 1 + self.firstresource = self.start+self.records + 1 + self.ncxidx = 0xffffffff + self.metaOrthIndex = 0xffffffff + self.metaInflIndex = 0xffffffff + self.skelidx = 0xffffffff + self.fragidx = 0xffffffff + self.guideidx = 0xffffffff + self.fdst = 0xffffffff + self.mlstart = self.sect.loadSection(self.start+1)[:4] + self.rawSize = 0 + self.metadata = dict_() + + # set up for decompression/unpacking + self.compression, = struct.unpack_from(b'>H', self.header, 0x0) + if self.compression == 0x4448: + reader = HuffcdicReader() + huffoff, huffnum = struct.unpack_from(b'>LL', self.header, 0x70) + huffoff = huffoff + self.start + self.sect.setsectiondescription(huffoff,"Huffman Compression Seed") + reader.loadHuff(self.sect.loadSection(huffoff)) + for i in range(1, huffnum): + self.sect.setsectiondescription(huffoff+i,"Huffman CDIC Compression Seed %d" % i) + reader.loadCdic(self.sect.loadSection(huffoff+i)) + self.unpack = reader.unpack + elif self.compression == 2: + self.unpack = PalmdocReader().unpack + elif self.compression == 1: + self.unpack = UncompressedReader().unpack + else: + raise unpackException('invalid compression type: 0x%4x' % self.compression) + + if self.palm: + return + + self.length, 
self.type, self.codepage, self.unique_id, self.version = struct.unpack(b'>LLLLL', self.header[20:40]) + codec_map = { + 1252 : 'windows-1252', + 65001: 'utf-8', + } + if self.codepage in codec_map: + self.codec = codec_map[self.codepage] + + # title + toff, tlen = struct.unpack(b'>II', self.header[0x54:0x5c]) + tend = toff + tlen + self.title=self.header[toff:tend].decode(self.codec, errors='replace') + + exth_flag, = struct.unpack(b'>L', self.header[0x80:0x84]) + self.hasExth = exth_flag & 0x40 + self.exth_offset = self.length + 16 + self.exth_length = 0 + if self.hasExth: + self.exth_length, = struct.unpack_from(b'>L', self.header, self.exth_offset+4) + self.exth_length = ((self.exth_length + 3)>>2)<<2 # round to next 4 byte boundary + self.exth = self.header[self.exth_offset:self.exth_offset+self.exth_length] + + # parse the exth / metadata + self.parseMetaData() + + # self.mlstart = self.sect.loadSection(self.start+1) + # self.mlstart = self.mlstart[0:4] + self.crypto_type, = struct.unpack_from(b'>H', self.header, 0xC) + + # Start sector for additional files such as images, fonts, resources, etc + # Can be missing so fall back to default set previously + ofst, = struct.unpack_from(b'>L', self.header, 0x6C) + if ofst != 0xffffffff: + self.firstresource = ofst + self.start + ofst, = struct.unpack_from(b'>L', self.header, 0x50) + if ofst != 0xffffffff: + self.firstnontext = ofst + self.start + + if self.isPrintReplica(): + return + + if self.version < 8: + # Dictionary metaOrthIndex + self.metaOrthIndex, = struct.unpack_from(b'>L', self.header, 0x28) + if self.metaOrthIndex != 0xffffffff: + self.metaOrthIndex += self.start + + # Dictionary metaInflIndex + self.metaInflIndex, = struct.unpack_from(b'>L', self.header, 0x2C) + if self.metaInflIndex != 0xffffffff: + self.metaInflIndex += self.start + + # handle older headers without any ncxindex info and later + # specifically 0xe4 headers + if self.length + 16 < 0xf8: + return + + # NCX Index + self.ncxidx, = struct.unpack(b'>L', self.header[0xf4:0xf8]) + if self.ncxidx != 0xffffffff: + self.ncxidx += self.start + + # K8 specific Indexes + if self.start != 0 or self.version == 8: + # Index into <xml> file skeletons in RawML + self.skelidx, = struct.unpack_from(b'>L', self.header, 0xfc) + if self.skelidx != 0xffffffff: + self.skelidx += self.start + + # Index into <div> sections in RawML + self.fragidx, = struct.unpack_from(b'>L', self.header, 0xf8) + if self.fragidx != 0xffffffff: + self.fragidx += self.start + + # Index into Other files + self.guideidx, = struct.unpack_from(b'>L', self.header, 0x104) + if self.guideidx != 0xffffffff: + self.guideidx += self.start + + # dictionaries do not seem to use the same approach in K8's + # so disable them + self.metaOrthIndex = 0xffffffff + self.metaInflIndex = 0xffffffff + + # need to use the FDST record to find out how to properly unpack + # the rawML into pieces + # it is simply a table of start and end locations for each flow piece + self.fdst, = struct.unpack_from(b'>L', self.header, 0xc0) + self.fdstcnt, = struct.unpack_from(b'>L', self.header, 0xc4) + # if cnt is 1 or less, fdst section mumber can be garbage + if self.fdstcnt <= 1: + self.fdst = 0xffffffff + if self.fdst != 0xffffffff: + self.fdst += self.start + # setting of fdst section description properly handled in mobi_kf8proc + + def dump_exth(self): + # determine text encoding + codec=self.codec + if (not self.hasExth) or (self.exth_length) == 0 or (self.exth == b''): + return + num_items, = struct.unpack(b'>L', self.exth[8:12]) + pos = 
12 + print("Key Size Description Value") + for _ in range(num_items): + id, size = struct.unpack(b'>LL', self.exth[pos:pos+8]) + contentsize = size-8 + content = self.exth[pos + 8: pos + size] + if id in MobiHeader.id_map_strings: + exth_name = MobiHeader.id_map_strings[id] + print('{0: >3d} {1: >4d} {2: <30s} {3:s}'.format(id, contentsize, exth_name, content.decode(codec, errors='replace'))) + elif id in MobiHeader.id_map_values: + exth_name = MobiHeader.id_map_values[id] + if size == 9: + value, = struct.unpack(b'B',content) + print('{0:3d} byte {1:<30s} {2:d}'.format(id, exth_name, value)) + elif size == 10: + value, = struct.unpack(b'>H',content) + print('{0:3d} word {1:<30s} 0x{2:0>4X} ({2:d})'.format(id, exth_name, value)) + elif size == 12: + value, = struct.unpack(b'>L',content) + print('{0:3d} long {1:<30s} 0x{2:0>8X} ({2:d})'.format(id, exth_name, value)) + else: + print('{0: >3d} {1: >4d} {2: <30s} (0x{3:s})'.format(id, contentsize, "Bad size for "+exth_name, hexlify(content))) + elif id in MobiHeader.id_map_hexstrings: + exth_name = MobiHeader.id_map_hexstrings[id] + print('{0:3d} {1:4d} {2:<30s} 0x{3:s}'.format(id, contentsize, exth_name, hexlify(content))) + else: + exth_name = "Unknown EXTH ID {0:d}".format(id) + print("{0: >3d} {1: >4d} {2: <30s} 0x{3:s}".format(id, contentsize, exth_name, hexlify(content))) + pos += size + return + + def dumpheader(self): + # first 16 bytes are not part of the official mobiheader + # but we will treat it as such + # so section 0 is 16 (decimal) + self.length in total == at least 0x108 bytes for Mobi 8 headers + print("Dumping section %d, Mobipocket Header version: %d, total length %d" % (self.start,self.version, self.length+16)) + self.hdr = {} + # set it up for the proper header version + if self.version == 0: + self.mobi_header = MobiHeader.palmdoc_header + self.mobi_header_sorted_keys = MobiHeader.palmdoc_header_sorted_keys + elif self.version < 8: + self.mobi_header = MobiHeader.mobi6_header + self.mobi_header_sorted_keys = MobiHeader.mobi6_header_sorted_keys + else: + self.mobi_header = MobiHeader.mobi8_header + self.mobi_header_sorted_keys = MobiHeader.mobi8_header_sorted_keys + + # parse the header information + for key in self.mobi_header_sorted_keys: + (pos, format, tot_len) = self.mobi_header[key] + if pos < (self.length + 16): + val, = struct.unpack_from(format, self.header, pos) + self.hdr[key] = val + + if 'title_offset' in self.hdr: + title_offset = self.hdr['title_offset'] + title_length = self.hdr['title_length'] + else: + title_offset = 0 + title_length = 0 + if title_offset == 0: + title_offset = len(self.header) + title_length = 0 + self.title = self.sect.palmname.decode('latin-1', errors='replace') + else: + self.title = self.header[title_offset:title_offset+title_length].decode(self.codec, errors='replace') + # title record always padded with two nul bytes and then padded with nuls to next 4 byte boundary + title_length = ((title_length+2+3)>>2)<<2 + + self.extra1 = self.header[self.exth_offset+self.exth_length:title_offset] + self.extra2 = self.header[title_offset+title_length:] + + print("Mobipocket header from section %d" % self.start) + print(" Offset Value Hex Dec Description") + for key in self.mobi_header_sorted_keys: + (pos, format, tot_len) = self.mobi_header[key] + if pos < (self.length + 16): + if key != 'magic': + fmt_string = "0x{0:0>3X} ({0:3d}){1: >" + str(9-2*tot_len) +"s}0x{2:0>" + str(2*tot_len) + "X} {2:10d} {3:s}" + else: + self.hdr[key] = unicode_str(self.hdr[key]) + fmt_string = "0x{0:0>3X} 
({0:3d}){2:>11s} {3:s}" + print(fmt_string.format(pos, " ",self.hdr[key], key)) + print("") + + if self.exth_length > 0: + print("EXTH metadata, offset %d, padded length %d" % (self.exth_offset,self.exth_length)) + self.dump_exth() + print("") + + if len(self.extra1) > 0: + print("Extra data between EXTH and Title, length %d" % len(self.extra1)) + print(hexlify(self.extra1)) + print("") + + if title_length > 0: + print("Title in header at offset %d, padded length %d: '%s'" %(title_offset,title_length,self.title)) + print("") + + if len(self.extra2) > 0: + print("Extra data between Title and end of header, length %d" % len(self.extra2)) + print(hexlify(self.extra2)) + print("") + + def isPrintReplica(self): + return self.mlstart[0:4] == b"%MOP" + + def isK8(self): + return self.start != 0 or self.version == 8 + + def isEncrypted(self): + return self.crypto_type != 0 + + def hasNCX(self): + return self.ncxidx != 0xffffffff + + def isDictionary(self): + return self.metaOrthIndex != 0xffffffff + + def getncxIndex(self): + return self.ncxidx + + def decompress(self, data): + return self.unpack(data) + + def Language(self): + langcode = struct.unpack(b'!L', self.header[0x5c:0x60])[0] + langid = langcode & 0xFF + sublangid = (langcode >> 8) & 0xFF + return getLanguage(langid, sublangid) + + def DictInLanguage(self): + if self.isDictionary(): + langcode = struct.unpack(b'!L', self.header[0x60:0x64])[0] + langid = langcode & 0xFF + sublangid = (langcode >> 10) & 0xFF + if langid != 0: + return getLanguage(langid, sublangid) + return False + + def DictOutLanguage(self): + if self.isDictionary(): + langcode = struct.unpack(b'!L', self.header[0x64:0x68])[0] + langid = langcode & 0xFF + sublangid = (langcode >> 10) & 0xFF + if langid != 0: + return getLanguage(langid, sublangid) + return False + + def getRawML(self): + def getSizeOfTrailingDataEntry(data): + num = 0 + for v in data[-4:]: + if bord(v) & 0x80: + num = 0 + num = (num << 7) | (bord(v) & 0x7f) + return num + def trimTrailingDataEntries(data): + for _ in range(trailers): + num = getSizeOfTrailingDataEntry(data) + data = data[:-num] + if multibyte: + num = (ord(data[-1:]) & 3) + 1 + data = data[:-num] + return data + multibyte = 0 + trailers = 0 + if self.sect.ident == b'BOOKMOBI': + mobi_length, = struct.unpack_from(b'>L', self.header, 0x14) + mobi_version, = struct.unpack_from(b'>L', self.header, 0x68) + if (mobi_length >= 0xE4) and (mobi_version >= 5): + flags, = struct.unpack_from(b'>H', self.header, 0xF2) + multibyte = flags & 1 + while flags > 1: + if flags & 2: + trailers += 1 + flags = flags >> 1 + # get raw mobi markup languge + print("Unpacking raw markup language") + dataList = [] + # offset = 0 + for i in range(1, self.records+1): + data = trimTrailingDataEntries(self.sect.loadSection(self.start + i)) + dataList.append(self.unpack(data)) + if self.isK8(): + self.sect.setsectiondescription(self.start + i,"KF8 Text Section {0:d}".format(i)) + elif self.version == 0: + self.sect.setsectiondescription(self.start + i,"PalmDOC Text Section {0:d}".format(i)) + else: + self.sect.setsectiondescription(self.start + i,"Mobipocket Text Section {0:d}".format(i)) + rawML = b''.join(dataList) + self.rawSize = len(rawML) + return rawML + + # all metadata is stored in a dictionary with key and returns a *list* of values + # a list is used to allow for multiple creators, multiple contributors, etc + def parseMetaData(self): + def addValue(name, value): + if name not in self.metadata: + self.metadata[name] = [value] + else: + 
self.metadata[name].append(value) + + codec=self.codec + if self.hasExth: + extheader=self.exth + _length, num_items = struct.unpack(b'>LL', extheader[4:12]) + extheader = extheader[12:] + pos = 0 + for _ in range(num_items): + id, size = struct.unpack(b'>LL', extheader[pos:pos+8]) + content = extheader[pos + 8: pos + size] + if id in MobiHeader.id_map_strings: + name = MobiHeader.id_map_strings[id] + addValue(name, content.decode(codec, errors='replace')) + elif id in MobiHeader.id_map_values: + name = MobiHeader.id_map_values[id] + if size == 9: + value, = struct.unpack(b'B',content) + addValue(name, unicode_str(str(value))) + elif size == 10: + value, = struct.unpack(b'>H',content) + addValue(name, unicode_str(str(value))) + elif size == 12: + value, = struct.unpack(b'>L',content) + # handle special case of missing CoverOffset or missing ThumbOffset + if id == 201 or id == 202: + if value != 0xffffffff: + addValue(name, unicode_str(str(value))) + else: + addValue(name, unicode_str(str(value))) + else: + print("Warning: Bad key, size, value combination detected in EXTH ", id, size, hexlify(content)) + addValue(name, hexlify(content)) + elif id in MobiHeader.id_map_hexstrings: + name = MobiHeader.id_map_hexstrings[id] + addValue(name, hexlify(content)) + else: + name = unicode_str(str(id)) + ' (hex)' + addValue(name, hexlify(content)) + pos += size + + # add the basics to the metadata each as a list element + self.metadata['Language'] = [self.Language()] + self.metadata['Title'] = [unicode_str(self.title,self.codec)] + self.metadata['Codec'] = [self.codec] + self.metadata['UniqueID'] = [unicode_str(str(self.unique_id))] + # if no asin create one using a uuid + if 'ASIN' not in self.metadata: + self.metadata['ASIN'] = [unicode_str(str(uuid.uuid4()))] + # if no cdeType set it to "EBOK" + if 'cdeType' not in self.metadata: + self.metadata['cdeType'] = ['EBOK'] + + def getMetaData(self): + return self.metadata + + def describeHeader(self, DUMP): + print("Mobi Version:", self.version) + print("Codec:", self.codec) + print("Title:", self.title) + if 'Updated_Title' in self.metadata: + print("EXTH Title:", self.metadata['Updated_Title'][0]) + if self.compression == 0x4448: + print("Huffdic compression") + elif self.compression == 2: + print("Palmdoc compression") + elif self.compression == 1: + print("No compression") + if DUMP: + self.dumpheader() diff --git a/src/epy_reader/tools/KindleUnpack/mobi_html.py b/src/epy_reader/tools/KindleUnpack/mobi_html.py new file mode 100644 index 0000000..eda766c --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_html.py @@ -0,0 +1,439 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import PY2, utf8_str + +if PY2: + range = xrange + +import re +# note: re requites the pattern to be the exact same type as the data to be searched in python3 +# but u"" is not allowed for the pattern itself only b"" + +from .mobi_utils import fromBase32 + +class HTMLProcessor: + + def __init__(self, files, metadata, rscnames): + self.files = files + self.metadata = metadata + self.rscnames = rscnames + # for original style mobis, default to including all image files in the opf manifest + self.used = {} + for name in rscnames: + self.used[name] = 'used' + + def findAnchors(self, rawtext, indx_data, positionMap): + # process the raw text + # find anchors... 
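+        # Editor's note (illustration; not in the original source): MOBI-era
+        # links address a byte offset into the rawML rather than an id, e.g.
+        #     <a filepos=0000012345 >see note</a>
+        # Each target offset collected here gets an anchor spliced into the
+        # text via positionMap, roughly:
+        #     positionMap[12345] -> b'<a id="filepos12345" />'
+        # so that insertHREFS() can later rewrite the link itself into
+        #     <a href="#filepos12345" >see note</a>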
+ print("Find link anchors") + link_pattern = re.compile(br'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''', re.IGNORECASE) + # TEST NCX: merge in filepos from indx + pos_links = [int(m.group(1)) for m in link_pattern.finditer(rawtext)] + if indx_data: + pos_indx = [e['pos'] for e in indx_data if e['pos']>0] + pos_links = list(set(pos_links + pos_indx)) + + for position in pos_links: + if position in positionMap: + positionMap[position] = positionMap[position] + utf8_str('<a id="filepos%d" />' % position) + else: + positionMap[position] = utf8_str('<a id="filepos%d" />' % position) + + # apply dictionary metadata and anchors + print("Insert data into html") + pos = 0 + lastPos = len(rawtext) + dataList = [] + for end in sorted(positionMap.keys()): + if end == 0 or end > lastPos: + continue # something's up - can't put a tag in outside <html>...</html> + dataList.append(rawtext[pos:end]) + dataList.append(positionMap[end]) + pos = end + dataList.append(rawtext[pos:]) + srctext = b"".join(dataList) + rawtext = None + dataList = None + self.srctext = srctext + self.indx_data = indx_data + return srctext + + def insertHREFS(self): + srctext = self.srctext + rscnames = self.rscnames + metadata = self.metadata + + # put in the hrefs + print("Insert hrefs into html") + # There doesn't seem to be a standard, so search as best as we can + + link_pattern = re.compile(br'''<a([^>]*?)filepos=['"]{0,1}0*(\d+)['"]{0,1}([^>]*?)>''', re.IGNORECASE) + srctext = link_pattern.sub(br'''<a\1href="#filepos\2"\3>''', srctext) + + # remove empty anchors + print("Remove empty anchors from html") + srctext = re.sub(br"<a\s*/>",br"", srctext) + srctext = re.sub(br"<a\s*>\s*</a>",br"", srctext) + + # convert image references + print("Insert image references into html") + # split string into image tag pieces and other pieces + image_pattern = re.compile(br'''(<img.*?>)''', re.IGNORECASE) + image_index_pattern = re.compile(br'''recindex=['"]{0,1}([0-9]+)['"]{0,1}''', re.IGNORECASE) + srcpieces = image_pattern.split(srctext) + srctext = self.srctext = None + + # all odd pieces are image tags (nulls string on even pieces if no space between them in srctext) + for i in range(1, len(srcpieces), 2): + tag = srcpieces[i] + for m in image_index_pattern.finditer(tag): + imageNumber = int(m.group(1)) + imageName = rscnames[imageNumber-1] + if imageName is None: + print("Error: Referenced image %s was not recognized as a valid image" % imageNumber) + else: + replacement = b'src="Images/' + utf8_str(imageName) + b'"' + tag = image_index_pattern.sub(replacement, tag, 1) + srcpieces[i] = tag + srctext = b"".join(srcpieces) + + # add in character set meta into the html header if needed + if 'Codec' in metadata: + srctext = srctext[0:12]+b'<meta http-equiv="content-type" content="text/html; charset='+utf8_str(metadata.get('Codec')[0])+b'" />'+srctext[12:] + return srctext, self.used + + +class XHTMLK8Processor: + + def __init__(self, rscnames, k8proc): + self.rscnames = rscnames + self.k8proc = k8proc + self.used = {} + + def buildXHTML(self): + + # first need to update all links that are internal which + # are based on positions within the xhtml files **BEFORE** + # cutting and pasting any pieces into the xhtml text files + + # kindle:pos:fid:XXXX:off:YYYYYYYYYY (used for internal link within xhtml) + # XXXX is the offset in records into divtbl + # YYYYYYYYYYYY is a base32 number you add to the divtbl insertpos to get final position + + # pos:fid pattern + posfid_pattern = re.compile(br'''(<a.*?href=.*?>)''', re.IGNORECASE) + 
posfid_index_pattern = re.compile(br'''['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']''') + + parts = [] + print("Building proper xhtml for each file") + for i in range(self.k8proc.getNumberOfParts()): + part = self.k8proc.getPart(i) + [partnum, dir, filename, beg, end, aidtext] = self.k8proc.getPartInfo(i) + + # internal links + srcpieces = posfid_pattern.split(part) + for j in range(1, len(srcpieces),2): + tag = srcpieces[j] + if tag.startswith(b'<'): + for m in posfid_index_pattern.finditer(tag): + posfid = m.group(1) + offset = m.group(2) + filename, idtag = self.k8proc.getIDTagByPosFid(posfid, offset) + if idtag == b'': + replacement= b'"' + utf8_str(filename) + b'"' + else: + replacement = b'"' + utf8_str(filename) + b'#' + idtag + b'"' + tag = posfid_index_pattern.sub(replacement, tag, 1) + srcpieces[j] = tag + part = b"".join(srcpieces) + parts.append(part) + + # we are free to cut and paste as we see fit + # we can safely remove all of the Kindlegen generated aid tags + # change aid ids that are in k8proc.linked_aids to xhtml ids + find_tag_with_aid_pattern = re.compile(br'''(<[^>]*\said\s*=[^>]*>)''', re.IGNORECASE) + within_tag_aid_position_pattern = re.compile(br'''\said\s*=['"]([^'"]*)['"]''') + for i in range(len(parts)): + part = parts[i] + srcpieces = find_tag_with_aid_pattern.split(part) + for j in range(len(srcpieces)): + tag = srcpieces[j] + if tag.startswith(b'<'): + for m in within_tag_aid_position_pattern.finditer(tag): + try: + aid = m.group(1) + except IndexError: + aid = None + replacement = b'' + if aid in self.k8proc.linked_aids: + replacement = b' id="aid-' + aid + b'"' + tag = within_tag_aid_position_pattern.sub(replacement, tag, 1) + srcpieces[j] = tag + part = b"".join(srcpieces) + parts[i] = part + + # we can safely replace all of the Kindlegen generated data-AmznPageBreak tags + # with page-break-after style patterns + find_tag_with_AmznPageBreak_pattern = re.compile(br'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''', re.IGNORECASE) + within_tag_AmznPageBreak_position_pattern = re.compile(br'''\sdata-AmznPageBreak=['"]([^'"]*)['"]''') + for i in range(len(parts)): + part = parts[i] + srcpieces = find_tag_with_AmznPageBreak_pattern.split(part) + for j in range(len(srcpieces)): + tag = srcpieces[j] + if tag.startswith(b'<'): + srcpieces[j] = within_tag_AmznPageBreak_position_pattern.sub( + lambda m:b' style="page-break-after:' + m.group(1) + b'"', tag) + part = b"".join(srcpieces) + parts[i] = part + + # we have to handle substitutions for the flows pieces first as they may + # be inlined into the xhtml text + # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images) + # kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc) + # kindle:embed:XXXX (used for fonts) + + flows = [] + flows.append(None) + flowinfo = [] + flowinfo.append([None, None, None, None]) + + # regular expression search patterns + img_pattern = re.compile(br'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE) + img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE) + + tag_pattern = re.compile(br'''(<[^>]*>)''') + flow_pattern = re.compile(br'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE) + + url_pattern = re.compile(br'''(url\(.*?\))''', re.IGNORECASE) + url_img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)\?mime=image/[^\)]*["')]''', re.IGNORECASE) + font_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)["')]''', re.IGNORECASE) + url_css_index_pattern = 
re.compile(br'''kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*''', re.IGNORECASE) + url_svg_image_pattern = re.compile(br'''kindle:flow:([0-9|A-V]+)\?mime=image/svg\+xml[^\)]*''', re.IGNORECASE) + + for i in range(1, self.k8proc.getNumberOfFlows()): + [ftype, format, dir, filename] = self.k8proc.getFlowInfo(i) + flowpart = self.k8proc.getFlow(i) + + # links to raster image files from image tags + # image_pattern + srcpieces = img_pattern.split(flowpart) + for j in range(1, len(srcpieces),2): + tag = srcpieces[j] + if tag.startswith(b'<im'): + for m in img_index_pattern.finditer(tag): + imageNumber = fromBase32(m.group(1)) + imageName = self.rscnames[imageNumber-1] + if imageName is not None: + replacement = b'"../Images/' + utf8_str(imageName) + b'"' + self.used[imageName] = 'used' + tag = img_index_pattern.sub(replacement, tag, 1) + else: + print("Error: Referenced image %s was not recognized as a valid image in %s" % (imageNumber, tag)) + srcpieces[j] = tag + flowpart = b"".join(srcpieces) + + # replacements inside css url(): + srcpieces = url_pattern.split(flowpart) + for j in range(1, len(srcpieces),2): + tag = srcpieces[j] + + # process links to raster image files + for m in url_img_index_pattern.finditer(tag): + imageNumber = fromBase32(m.group(1)) + imageName = self.rscnames[imageNumber-1] + osep = m.group()[0:1] + csep = m.group()[-1:] + if imageName is not None: + replacement = osep + b'../Images/' + utf8_str(imageName) + csep + self.used[imageName] = 'used' + tag = url_img_index_pattern.sub(replacement, tag, 1) + else: + print("Error: Referenced image %s was not recognized as a valid image in %s" % (imageNumber, tag)) + + # process links to fonts + for m in font_index_pattern.finditer(tag): + fontNumber = fromBase32(m.group(1)) + fontName = self.rscnames[fontNumber-1] + osep = m.group()[0:1] + csep = m.group()[-1:] + if fontName is None: + print("Error: Referenced font %s was not recognized as a valid font in %s" % (fontNumber, tag)) + else: + replacement = osep + b'../Fonts/' + utf8_str(fontName) + csep + tag = font_index_pattern.sub(replacement, tag, 1) + self.used[fontName] = 'used' + + # process links to other css pieces + for m in url_css_index_pattern.finditer(tag): + num = fromBase32(m.group(1)) + [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num) + replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"' + tag = url_css_index_pattern.sub(replacement, tag, 1) + self.used[fnm] = 'used' + + # process links to svg images + for m in url_svg_image_pattern.finditer(tag): + num = fromBase32(m.group(1)) + [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num) + replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"' + tag = url_svg_image_pattern.sub(replacement, tag, 1) + self.used[fnm] = 'used' + + srcpieces[j] = tag + flowpart = b"".join(srcpieces) + + # store away in our own copy + flows.append(flowpart) + + # I do not think this case exists and even if it does exist, it needs to be done in a separate + # pass to prevent inlining a flow piece into another flow piece before the inserted one or the + # target one has been fully processed + + # but keep it around if it ends up we do need it + + # flow pattern not inside url() + # srcpieces = tag_pattern.split(flowpart) + # for j in range(1, len(srcpieces),2): + # tag = srcpieces[j] + # if tag.startswith(b'<'): + # for m in flow_pattern.finditer(tag): + # num = fromBase32(m.group(1)) + # [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num) + # flowtext = self.k8proc.getFlow(num) + # if fmt == b'inline': + # 
tag = flowtext + # else: + # replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"' + # tag = flow_pattern.sub(replacement, tag, 1) + # self.used[fnm] = 'used' + # srcpieces[j] = tag + # flowpart = b"".join(srcpieces) + + # now handle the main text xhtml parts + + # Handle the flow items in the XHTML text pieces + # kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc) + tag_pattern = re.compile(br'''(<[^>]*>)''') + flow_pattern = re.compile(br'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE) + for i in range(len(parts)): + part = parts[i] + [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i] + # flow pattern + srcpieces = tag_pattern.split(part) + for j in range(1, len(srcpieces),2): + tag = srcpieces[j] + if tag.startswith(b'<'): + for m in flow_pattern.finditer(tag): + num = fromBase32(m.group(1)) + if num > 0 and num < len(self.k8proc.flowinfo): + [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num) + flowpart = flows[num] + if fmt == b'inline': + tag = flowpart + else: + replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"' + tag = flow_pattern.sub(replacement, tag, 1) + self.used[fnm] = 'used' + else: + print("warning: ignoring non-existent flow link", tag, " value 0x%x" % num) + srcpieces[j] = tag + part = b''.join(srcpieces) + + # store away modified version + parts[i] = part + + # Handle any embedded raster images links in style= attributes urls + style_pattern = re.compile(br'''(<[a-zA-Z0-9]+\s[^>]*style\s*=\s*[^>]*>)''', re.IGNORECASE) + img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE) + + for i in range(len(parts)): + part = parts[i] + [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i] + + # replace urls in style attributes + srcpieces = style_pattern.split(part) + for j in range(1, len(srcpieces),2): + tag = srcpieces[j] + if b'kindle:embed' in tag: + for m in img_index_pattern.finditer(tag): + imageNumber = fromBase32(m.group(1)) + imageName = self.rscnames[imageNumber-1] + osep = m.group()[0:1] + csep = m.group()[-1:] + if imageName is not None: + replacement = osep + b'../Images/'+ utf8_str(imageName) + csep + self.used[imageName] = 'used' + tag = img_index_pattern.sub(replacement, tag, 1) + else: + print("Error: Referenced image %s in style url was not recognized in %s" % (imageNumber, tag)) + srcpieces[j] = tag + part = b"".join(srcpieces) + + # store away modified version + parts[i] = part + + # Handle any embedded raster images links in the xhtml text + # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images) + img_pattern = re.compile(br'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE) + img_index_pattern = re.compile(br'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''') + + for i in range(len(parts)): + part = parts[i] + [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i] + + # links to raster image files + # image_pattern + srcpieces = img_pattern.split(part) + for j in range(1, len(srcpieces),2): + tag = srcpieces[j] + if tag.startswith(b'<im'): + for m in img_index_pattern.finditer(tag): + imageNumber = fromBase32(m.group(1)) + imageName = self.rscnames[imageNumber-1] + if imageName is not None: + replacement = b'"../Images/' + utf8_str(imageName) + b'"' + self.used[imageName] = 'used' + tag = img_index_pattern.sub(replacement, tag, 1) + else: + print("Error: Referenced image %s was not recognized as a valid image in %s" % (imageNumber, tag)) + srcpieces[j] = tag + part = b"".join(srcpieces) + 
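+            # Editor's illustration (not in the original source): a style url
+            # such as
+            #     style="background-image: url('kindle:embed:0002?mime=image/jpeg')"
+            # carries a 1-based base-32 resource index, so fromBase32(b'0002') == 2
+            # selects rscnames[1] and the url is rewritten to point at
+            # '../Images/<that name>'.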
# store away modified version + parts[i] = part + + # finally perform any general cleanups needed to make valid XHTML + # these include: + # in svg tags replace "perserveaspectratio" attributes with "perserveAspectRatio" + # in svg tags replace "viewbox" attributes with "viewBox" + # in <li> remove value="XX" attributes since these are illegal + tag_pattern = re.compile(br'''(<[^>]*>)''') + li_value_pattern = re.compile(br'''\svalue\s*=\s*['"][^'"]*['"]''', re.IGNORECASE) + + for i in range(len(parts)): + part = parts[i] + [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i] + + # tag pattern + srcpieces = tag_pattern.split(part) + for j in range(1, len(srcpieces),2): + tag = srcpieces[j] + if tag.startswith(b'<svg') or tag.startswith(b'<SVG'): + tag = tag.replace(b'preserveaspectratio',b'preserveAspectRatio') + tag = tag.replace(b'viewbox',b'viewBox') + elif tag.startswith(b'<li ') or tag.startswith(b'<LI '): + tagpieces = li_value_pattern.split(tag) + tag = b"".join(tagpieces) + srcpieces[j] = tag + part = b"".join(srcpieces) + # store away modified version + parts[i] = part + + self.k8proc.setFlows(flows) + self.k8proc.setParts(parts) + + return self.used diff --git a/src/epy_reader/tools/KindleUnpack/mobi_index.py b/src/epy_reader/tools/KindleUnpack/mobi_index.py new file mode 100644 index 0000000..397aaf8 --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_index.py @@ -0,0 +1,276 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import PY2, bchr, bstr, bord +if PY2: + range = xrange + +import struct +# note: struct pack, unpack, unpack_from all require bytestring format +# data all the way up to at least python 2.7.5, python 3 okay with bytestring + +from .mobi_utils import toHex + +class MobiIndex: + + def __init__(self, sect, DEBUG=False): + self.sect = sect + self.DEBUG = DEBUG + + def getIndexData(self, idx, label="Unknown"): + sect = self.sect + outtbl = [] + ctoc_text = {} + if idx != 0xffffffff: + sect.setsectiondescription(idx,"{0} Main INDX section".format(label)) + data = sect.loadSection(idx) + idxhdr, hordt1, hordt2 = self.parseINDXHeader(data) + IndexCount = idxhdr['count'] + # handle the case of multiple sections used for CTOC + rec_off = 0 + off = idx + IndexCount + 1 + for j in range(idxhdr['nctoc']): + cdata = sect.loadSection(off + j) + sect.setsectiondescription(off+j, label + ' CTOC Data ' + str(j)) + ctocdict = self.readCTOC(cdata) + for k in ctocdict: + ctoc_text[k + rec_off] = ctocdict[k] + rec_off += 0x10000 + tagSectionStart = idxhdr['len'] + controlByteCount, tagTable = readTagSection(tagSectionStart, data) + if self.DEBUG: + print("ControlByteCount is", controlByteCount) + print("IndexCount is", IndexCount) + print("TagTable: %s" % tagTable) + for i in range(idx + 1, idx + 1 + IndexCount): + sect.setsectiondescription(i,"{0} Extra {1:d} INDX section".format(label,i-idx)) + data = sect.loadSection(i) + hdrinfo, ordt1, ordt2 = self.parseINDXHeader(data) + idxtPos = hdrinfo['start'] + entryCount = hdrinfo['count'] + if self.DEBUG: + print(idxtPos, entryCount) + # loop through to build up the IDXT position starts + idxPositions = [] + for j in range(entryCount): + pos, = struct.unpack_from(b'>H', data, idxtPos + 4 + (2 * j)) + idxPositions.append(pos) + # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!) 
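+            # Editor's sketch (assumed layout; not in the original source) of
+            # what is being walked here, for an Extra INDX section with
+            # entryCount == 3:
+            #     ... entry0 ... entry1 ... entry2 ... b'IDXT' >H >H >H padding
+            # The three big-endian >H words (read above at idxtPos + 4 + 2*j)
+            # are the entry start offsets, so with idxtPos appended as a
+            # sentinel each entry j spans data[idxPositions[j]:idxPositions[j+1]].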
+ idxPositions.append(idxtPos) + # for each entry in the IDXT build up the tagMap and any associated text + for j in range(entryCount): + startPos = idxPositions[j] + endPos = idxPositions[j+1] + textLength = ord(data[startPos:startPos+1]) + text = data[startPos+1:startPos+1+textLength] + if hordt2 is not None: + text = b''.join(bchr(hordt2[bord(x)]) for x in text) + tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos) + outtbl.append([text, tagMap]) + if self.DEBUG: + print(tagMap) + print(text) + return outtbl, ctoc_text + + def parseINDXHeader(self, data): + "read INDX header" + if not data[:4] == b'INDX': + print("Warning: index section is not INDX") + return False + words = ( + 'len', 'nul1', 'type', 'gen', 'start', 'count', 'code', + 'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc' + ) + num = len(words) + values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)]) + header = {} + for n in range(num): + header[words[n]] = values[n] + + ordt1 = None + ordt2 = None + + ocnt, oentries, op1, op2, otagx = struct.unpack_from(b'>LLLLL',data, 0xa4) + if header['code'] == 0xfdea or ocnt != 0 or oentries > 0: + # horribly hacked up ESP (sample) mobi books use two ORDT sections but never specify + # them in the proper place in the header. They seem to be codepage 65002 which seems + # to be some sort of strange EBCDIC utf-8 or 16 encoded strings + + # so we need to look for them and store them away to process leading text + # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries + # we only ever seem to use the seocnd but ... + assert(ocnt == 1) + assert(data[op1:op1+4] == b'ORDT') + assert(data[op2:op2+4] == b'ORDT') + ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4) + ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4) + + if self.DEBUG: + print("parsed INDX header:") + for n in words: + print(n, "%X" % header[n],) + print("") + return header, ordt1, ordt2 + + def readCTOC(self, txtdata): + # read all blocks from CTOC + ctoc_data = {} + offset = 0 + while offset<len(txtdata): + if PY2: + if txtdata[offset] == b'\0': + break + else: + if txtdata[offset] == 0: + break + idx_offs = offset + # first n bytes: name len as vwi + pos, ilen = getVariableWidthValue(txtdata, offset) + offset += pos + # <len> next bytes: name + name = txtdata[offset:offset+ilen] + offset += ilen + if self.DEBUG: + print("name length is ", ilen) + print(idx_offs, name) + ctoc_data[idx_offs] = name + return ctoc_data + + +def getVariableWidthValue(data, offset): + ''' + Decode variable width value from given bytes. + + @param data: The bytes to decode. + @param offset: The start offset into data. + @return: Tuple of consumed bytes count and decoded value. + ''' + value = 0 + consumed = 0 + finished = False + while not finished: + v = data[offset + consumed: offset + consumed + 1] + consumed += 1 + if ord(v) & 0x80: + finished = True + value = (value << 7) | (ord(v) & 0x7f) + return consumed, value + + +def readTagSection(start, data): + ''' + Read tag section from given data. + + @param start: The start position in the data. + @param data: The data to process. + @return: Tuple of control byte count and list of tag tuples. + ''' + controlByteCount = 0 + tags = [] + if data[start:start+4] == b"TAGX": + firstEntryOffset, = struct.unpack_from(b'>L', data, start + 0x04) + controlByteCount, = struct.unpack_from(b'>L', data, start + 0x08) + + # Skip the first 12 bytes already read above. 
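+        # Editor's illustration (doctest-style; not in the original source):
+        # a minimal TAGX block holding one real tag plus the end-flag entry,
+        # where each 4-byte entry is (tag, valuesPerEntry, bitmask, endFlag):
+        #     >>> blk = (b'TAGX' + struct.pack(b'>L', 20) + struct.pack(b'>L', 1)
+        #     ...        + bytes((0x01, 1, 0x01, 0)) + bytes((0x00, 0, 0x00, 1)))
+        #     >>> readTagSection(0, blk)
+        #     (1, [(1, 1, 1, 0), (0, 0, 0, 1)])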
+ for i in range(12, firstEntryOffset, 4): + pos = start + i + tags.append((ord(data[pos:pos+1]), ord(data[pos+1:pos+2]), ord(data[pos+2:pos+3]), ord(data[pos+3:pos+4]))) + return controlByteCount, tags + + +def countSetBits(value, bits=8): + ''' + Count the set bits in the given value. + + @param value: Integer value. + @param bits: The number of bits of the input value (defaults to 8). + @return: Number of set bits. + ''' + count = 0 + for _ in range(bits): + if value & 0x01 == 0x01: + count += 1 + value = value >> 1 + return count + + +def getTagMap(controlByteCount, tagTable, entryData, startPos, endPos): + ''' + Create a map of tags and values from the given byte section. + + @param controlByteCount: The number of control bytes. + @param tagTable: The tag table. + @param entryData: The data to process. + @param startPos: The starting position in entryData. + @param endPos: The end position in entryData or None if it is unknown. + @return: Hashmap of tag and list of values. + ''' + tags = [] + tagHashMap = {} + controlByteIndex = 0 + dataStart = startPos + controlByteCount + + for tag, valuesPerEntry, mask, endFlag in tagTable: + if endFlag == 0x01: + controlByteIndex += 1 + continue + cbyte = ord(entryData[startPos + controlByteIndex:startPos + controlByteIndex+1]) + if 0: + print("Control Byte Index %0x , Control Byte Value %0x" % (controlByteIndex, cbyte)) + + value = ord(entryData[startPos + controlByteIndex:startPos + controlByteIndex+1]) & mask + if value != 0: + if value == mask: + if countSetBits(mask) > 1: + # If all bits of masked value are set and the mask has more than one bit, a variable width value + # will follow after the control bytes which defines the length of bytes (NOT the value count!) + # which will contain the corresponding variable width values. + consumed, value = getVariableWidthValue(entryData, dataStart) + dataStart += consumed + tags.append((tag, None, value, valuesPerEntry)) + else: + tags.append((tag, 1, None, valuesPerEntry)) + else: + # Shift bits to get the masked value. + while mask & 0x01 == 0: + mask = mask >> 1 + value = value >> 1 + tags.append((tag, value, None, valuesPerEntry)) + for tag, valueCount, valueBytes, valuesPerEntry in tags: + values = [] + if valueCount is not None: + # Read valueCount * valuesPerEntry variable width values. + for _ in range(valueCount): + for _ in range(valuesPerEntry): + consumed, data = getVariableWidthValue(entryData, dataStart) + dataStart += consumed + values.append(data) + else: + # Convert valueBytes to variable width values. + totalConsumed = 0 + while totalConsumed < valueBytes: + # Does this work for valuesPerEntry != 1? + consumed, data = getVariableWidthValue(entryData, dataStart) + dataStart += consumed + totalConsumed += consumed + values.append(data) + if totalConsumed != valueBytes: + print("Error: Should consume %s bytes, but consumed %s" % (valueBytes, totalConsumed)) + tagHashMap[tag] = values + # Test that all bytes have been processed if endPos is given. + if endPos is not None and dataStart != endPos: + # The last entry might have some zero padding bytes, so complain only if non zero bytes are left. 
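+        # Editor's note (illustration; not in the original source): the values
+        # consumed above are forward-encoded variable width integers -- seven
+        # payload bits per byte, with the high bit set on the final byte:
+        #     >>> getVariableWidthValue(b'\x8a', 0)
+        #     (1, 10)
+        #     >>> getVariableWidthValue(b'\x0b\x86', 0)
+        #     (2, 1414)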
+ for char in entryData[dataStart:endPos]: + if bord(char) != 0: + print("Warning: There are unprocessed index bytes left: %s" % toHex(entryData[dataStart:endPos])) + if 0: + print("controlByteCount: %s" % controlByteCount) + print("tagTable: %s" % tagTable) + print("data: %s" % toHex(entryData[startPos:endPos])) + print("tagHashMap: %s" % tagHashMap) + break + + return tagHashMap diff --git a/src/epy_reader/tools/KindleUnpack/mobi_k8proc.py b/src/epy_reader/tools/KindleUnpack/mobi_k8proc.py new file mode 100644 index 0000000..5b8274e --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_k8proc.py @@ -0,0 +1,496 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import PY2, bstr, utf8_str + +if PY2: + range = xrange + +import os + +import struct +# note: struct pack, unpack, unpack_from all require bytestring format +# data all the way up to at least python 2.7.5, python 3 okay with bytestring + +import re +# note: re requites the pattern to be the exact same type as the data to be searched in python3 +# but u"" is not allowed for the pattern itself only b"" + +from .mobi_index import MobiIndex +from .mobi_utils import fromBase32 +from .unipath import pathof + +_guide_types = [b'cover',b'title-page',b'toc',b'index',b'glossary',b'acknowledgements', + b'bibliography',b'colophon',b'copyright-page',b'dedication', + b'epigraph',b'foreward',b'loi',b'lot',b'notes',b'preface',b'text'] + +# locate beginning and ending positions of tag with specific aid attribute +def locate_beg_end_of_tag(ml, aid): + pattern = utf8_str(r'''<[^>]*\said\s*=\s*['"]%s['"][^>]*>''' % aid) + aid_pattern = re.compile(pattern,re.IGNORECASE) + for m in re.finditer(aid_pattern, ml): + plt = m.start() + pgt = ml.find(b'>',plt+1) + return plt, pgt + return 0, 0 + + +# iterate over all tags in block in reverse order, i.e. 
last tag to first tag +def reverse_tag_iter(block): + end = len(block) + while True: + pgt = block.rfind(b'>', 0, end) + if pgt == -1: + break + plt = block.rfind(b'<', 0, pgt) + if plt == -1: + break + yield block[plt:pgt+1] + end = plt + + +class K8Processor: + + def __init__(self, mh, sect, files, debug=False): + self.sect = sect + self.files = files + self.mi = MobiIndex(sect) + self.mh = mh + self.skelidx = mh.skelidx + self.fragidx = mh.fragidx + self.guideidx = mh.guideidx + self.fdst = mh.fdst + self.flowmap = {} + self.flows = None + self.flowinfo = [] + self.parts = None + self.partinfo = [] + self.linked_aids = set() + self.fdsttbl = [0, 0xffffffff] + self.DEBUG = debug + + # read in and parse the FDST info which is very similar in format to the Palm DB section + # parsing except it provides offsets into the rawML file and not the Palm DB file + # this is needed to split up the final css, svg, etc flow section + # that can exist at the end of the rawML file + if self.fdst != 0xffffffff: + header = self.sect.loadSection(self.fdst) + if header[0:4] == b"FDST": + num_sections, = struct.unpack_from(b'>L', header, 0x08) + self.fdsttbl = struct.unpack_from(bstr('>%dL' % (num_sections*2)), header, 12)[::2] + (mh.rawSize, ) + sect.setsectiondescription(self.fdst,"KF8 FDST INDX") + if self.DEBUG: + print("\nFDST Section Map: %d sections" % num_sections) + for j in range(num_sections): + print("Section %d: 0x%08X - 0x%08X" % (j, self.fdsttbl[j],self.fdsttbl[j+1])) + else: + print("\nError: K8 Mobi with Missing FDST info") + + # read/process skeleton index info to create the skeleton table + skeltbl = [] + if self.skelidx != 0xffffffff: + # for i in range(2): + # fname = 'skel%04d.dat' % i + # data = self.sect.loadSection(self.skelidx + i) + # with open(pathof(fname), 'wb') as f: + # f.write(data) + outtbl, ctoc_text = self.mi.getIndexData(self.skelidx, "KF8 Skeleton") + fileptr = 0 + for [text, tagMap] in outtbl: + # file number, skeleton name, fragtbl record count, start position, length + skeltbl.append([fileptr, text, tagMap[1][0], tagMap[6][0], tagMap[6][1]]) + fileptr += 1 + self.skeltbl = skeltbl + if self.DEBUG: + print("\nSkel Table: %d entries" % len(self.skeltbl)) + print("table: filenum, skeleton name, frag tbl record count, start position, length") + for j in range(len(self.skeltbl)): + print(self.skeltbl[j]) + + # read/process the fragment index to create the fragment table + fragtbl = [] + if self.fragidx != 0xffffffff: + # for i in range(3): + # fname = 'frag%04d.dat' % i + # data = self.sect.loadSection(self.fragidx + i) + # with open(pathof(fname), 'wb') as f: + # f.write(data) + outtbl, ctoc_text = self.mi.getIndexData(self.fragidx, "KF8 Fragment") + for [text, tagMap] in outtbl: + # insert position, ctoc offset (aidtext), file number, sequence number, start position, length + ctocoffset = tagMap[2][0] + ctocdata = ctoc_text[ctocoffset] + fragtbl.append([int(text), ctocdata, tagMap[3][0], tagMap[4][0], tagMap[6][0], tagMap[6][1]]) + self.fragtbl = fragtbl + if self.DEBUG: + print("\nFragment Table: %d entries" % len(self.fragtbl)) + print("table: file position, link id text, file num, sequence number, start position, length") + for j in range(len(self.fragtbl)): + print(self.fragtbl[j]) + + # read / process guide index for guide elements of opf + guidetbl = [] + if self.guideidx != 0xffffffff: + # for i in range(3): + # fname = 'guide%04d.dat' % i + # data = self.sect.loadSection(self.guideidx + i) + # with open(pathof(fname), 'wb') as f: + # f.write(data) + outtbl, ctoc_text =
self.mi.getIndexData(self.guideidx, "KF8 Guide elements") + for [text, tagMap] in outtbl: + # ref_type, ref_title, frag number + ctocoffset = tagMap[1][0] + ref_title = ctoc_text[ctocoffset] + ref_type = text + fileno = None + if 3 in tagMap: + fileno = tagMap[3][0] + if 6 in tagMap: + fileno = tagMap[6][0] + guidetbl.append([ref_type, ref_title, fileno]) + self.guidetbl = guidetbl + if self.DEBUG: + print("\nGuide Table: %d entries" % len(self.guidetbl)) + print("table: ref_type, ref_title, fragtbl entry number") + for j in range(len(self.guidetbl)): + print(self.guidetbl[j]) + + def buildParts(self, rawML): + # now split the rawML into its flow pieces + self.flows = [] + for j in range(0, len(self.fdsttbl)-1): + start = self.fdsttbl[j] + end = self.fdsttbl[j+1] + self.flows.append(rawML[start:end]) + + # the first piece represents the xhtml text + text = self.flows[0] + self.flows[0] = b'' + + # walk the <skeleton> and fragment tables to build original source xhtml files + # *without* destroying any file position information needed for later href processing + # and create the final list of file separation start:stop points etc. in partinfo + if self.DEBUG: + print("\nRebuilding flow piece 0: the main body of the ebook") + self.parts = [] + self.partinfo = [] + fragptr = 0 + baseptr = 0 + cnt = 0 + filename = 'part%04d.xhtml' % cnt + for [skelnum, skelname, fragcnt, skelpos, skellen] in self.skeltbl: + baseptr = skelpos + skellen + skeleton = text[skelpos: baseptr] + aidtext = "0" + for i in range(fragcnt): + [insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[fragptr] + aidtext = idtext[12:-2] + if i == 0: + filename = 'part%04d.xhtml' % filenum + slice = text[baseptr: baseptr + length] + insertpos = insertpos - skelpos + head = skeleton[:insertpos] + tail = skeleton[insertpos:] + actual_inspos = insertpos + if (tail.find(b'>') < tail.find(b'<') or head.rfind(b'>') < head.rfind(b'<')): + # There is an incomplete tag in either the head or tail. + # This can happen for some badly formed KF8 files + print('The fragment table for %s has an incorrect insert position. Calculating manually.' % skelname) + bp, ep = locate_beg_end_of_tag(skeleton, aidtext) + if bp != ep: + actual_inspos = ep + 1 + startpos + if insertpos != actual_inspos: + print("fixed corrupt fragment table insert position", insertpos+skelpos, actual_inspos+skelpos) + insertpos = actual_inspos + self.fragtbl[fragptr][0] = actual_inspos + skelpos + skeleton = skeleton[0:insertpos] + slice + skeleton[insertpos:] + baseptr = baseptr + length + fragptr += 1 + cnt += 1 + self.parts.append(skeleton) + self.partinfo.append([skelnum, 'Text', filename, skelpos, baseptr, aidtext]) + + assembled_text = b''.join(self.parts) + if self.DEBUG: + outassembled = os.path.join(self.files.k8dir, 'assembled_text.dat') + with open(pathof(outassembled),'wb') as f: + f.write(assembled_text) + + # The primary css style sheet is typically stored next, followed by any + # snippets of code that were previously inlined in the + # original xhtml but have been stripped out and placed here. + # This can include local CDATA snippets and svg sections.
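+ # An illustrative sketch of the flowinfo entries built below (values made up): + # flowinfo[0] -> [None, None, None, None] (placeholder for the main xhtml text) + # flowinfo[1] -> [b'css', b'file', 'Styles', 'style0001.css'] (standalone stylesheet) + # flowinfo[2] -> [b'svg', b'inline', None, None] (svg containing an <image/> tag, inlined into the text)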
+ + # The problem is that for most browsers and ereaders, you cannot + # use <img src="imageXXXX.svg" /> to import any svg image that itself + # properly uses an <image/> tag to import some raster image - it + # should work according to the spec but does not for almost all browsers + # and ereaders, and it causes epub validation issues because those raster + # images are in the manifest but not in the xhtml text - since they are only + # referenced from an svg image + + # So we need to check the remaining flow pieces to see if they are css + # or svg images. If they are svg images, we must check if they have an <image /> + # and if so inline them into the xhtml text pieces. + + # there may be other sorts of pieces stored here but until we see one + # in the wild to reverse engineer we won't be able to tell + self.flowinfo.append([None, None, None, None]) + svg_tag_pattern = re.compile(br'''(<svg[^>]*>)''', re.IGNORECASE) + image_tag_pattern = re.compile(br'''(<image[^>]*>)''', re.IGNORECASE) + for j in range(1,len(self.flows)): + flowpart = self.flows[j] + nstr = '%04d' % j + m = re.search(svg_tag_pattern, flowpart) + if m is not None: + # svg + ptype = b'svg' + start = m.start() + m2 = re.search(image_tag_pattern, flowpart) + if m2 is not None: + pformat = b'inline' + pdir = None + fname = None + # strip off anything before <svg if inlining + flowpart = flowpart[start:] + else: + pformat = b'file' + pdir = "Images" + fname = 'svgimg' + nstr + '.svg' + else: + # search for CDATA and if it exists inline it + if flowpart.find(b'[CDATA[') >= 0: + ptype = b'css' + flowpart = b'<style type="text/css">\n' + flowpart + b'\n</style>\n' + pformat = b'inline' + pdir = None + fname = None + else: + # css - assume it is a standalone css file + ptype = b'css' + pformat = b'file' + pdir = "Styles" + fname = 'style' + nstr + '.css' + + self.flows[j] = flowpart + self.flowinfo.append([ptype, pformat, pdir, fname]) + + if self.DEBUG: + print("\nFlow Map: %d entries" % len(self.flowinfo)) + for fi in self.flowinfo: + print(fi) + print("\n") + + print("\nXHTML File Part Position Information: %d entries" % len(self.partinfo)) + for pi in self.partinfo: + print(pi) + + if False: # self.Debug: + # dump all of the locations of the aid tags used in TEXT + # find id links only inside of tags + # inside any < > pair find all "aid=' and return whatever is inside the quotes + # [^>]* means match any amount of chars except for '>' char + # [^'"] match any amount of chars except for the quote character + # \s* means match any amount of whitespace + print("\npositions of all aid= pieces") + id_pattern = re.compile(br'''<[^>]*\said\s*=\s*['"]([^'"]*)['"][^>]*>''',re.IGNORECASE) + for m in re.finditer(id_pattern, rawML): + [filename, partnum, start, end] = self.getFileInfo(m.start()) + [seqnum, idtext] = self.getFragTblInfo(m.start()) + value = fromBase32(m.group(1)) + print(" aid: %s value: %d at: %d -> part: %d, start: %d, end: %d" % (m.group(1), value, m.start(), partnum, start, end)) + print(" %s fragtbl entry %d" % (idtext, seqnum)) + + return + + # get fragment table entry information by pos + def getFragTblInfo(self, pos): + for j in range(len(self.fragtbl)): + [insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[j] + if pos >= insertpos and pos < (insertpos + length): + # the "in:" / "before:" prefixes flag whether pos falls inside or before this fragment (debug aid) + return seqnum, b'in: ' + idtext + if pos < insertpos: + return seqnum, b'before: ' + idtext + return None, None + + # get information about the part (file) that exists at pos in original rawML + def getFileInfo(self,
pos): + for [partnum, pdir, filename, start, end, aidtext] in self.partinfo: + if pos >= start and pos < end: + return filename, partnum, start, end + return None, None, None, None + + # accessor functions to properly protect the internal structure + def getNumberOfParts(self): + return len(self.parts) + + def getPart(self,i): + if i >= 0 and i < len(self.parts): + return self.parts[i] + return None + + def getPartInfo(self, i): + if i >= 0 and i < len(self.partinfo): + return self.partinfo[i] + return None + + def getNumberOfFlows(self): + return len(self.flows) + + def getFlow(self,i): + # note flows[0] is empty - it was all of the original text + if i > 0 and i < len(self.flows): + return self.flows[i] + return None + + def getFlowInfo(self,i): + # note flowinfo[0] is empty - it was all of the original text + if i > 0 and i < len(self.flowinfo): + return self.flowinfo[i] + return None + + def getIDTagByPosFid(self, posfid, offset): + # first convert kindle:pos:fid and offset info to position in file + # (fromBase32 can handle both string types on input) + row = fromBase32(posfid) + off = fromBase32(offset) + [insertpos, idtext, filenum, seqnm, startpos, length] = self.fragtbl[row] + pos = insertpos + off + fname, pn, skelpos, skelend = self.getFileInfo(pos) + if fname is None: + # pos does not exist + # default to skeleton pos instead + print("Link To Position", pos, "does not exist, retargeting to top of target") + pos = self.skeltbl[filenum][3] + fname, pn, skelpos, skelend = self.getFileInfo(pos) + # an existing "id=" or "name=" attribute must exist in original xhtml otherwise it would not have worked for linking. + # Amazon seems to have added its own additional "aid=" inside tags whose contents seem to represent + # some position information encoded into Base32 name. 
+ # so find the closest "id=" before the position by actually searching in that file + idtext = self.getIDTag(pos) + return fname, idtext + + def getIDTag(self, pos): + # find the first tag with a named anchor (name or id attribute) before pos + fname, pn, skelpos, skelend = self.getFileInfo(pos) + if pn is None and skelpos is None: + print("Error: getIDTag - no file contains ", pos) + # bail out with an empty anchor instead of crashing below + return b'' + textblock = self.parts[pn] + npos = pos - skelpos + # if npos is inside a tag then search all text before its end-of-tag marker + pgt = textblock.find(b'>',npos) + plt = textblock.find(b'<',npos) + if plt == npos or pgt < plt: + npos = pgt + 1 + # find id and name attributes only inside of tags + # use a reverse tag search since that is faster + # inside any < > pair find "id=" and "name=" attributes and return the value + # [^>]* means match any amount of chars except for '>' char + # [^'"] match any amount of chars except for the quote character + # \s* means match any amount of whitespace + textblock = textblock[0:npos] + id_pattern = re.compile(br'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE) + name_pattern = re.compile(br'''<[^>]*\sname\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE) + aid_pattern = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''') + for tag in reverse_tag_iter(textblock): + # any ids in the body should default to top of file + if tag[0:6] == b'<body ': + return b'' + if tag[0:6] != b'<meta ': + m = id_pattern.match(tag) or name_pattern.match(tag) + if m is not None: + return m.group(1) + m = aid_pattern.match(tag) + if m is not None: + self.linked_aids.add(m.group(1)) + return b'aid-' + m.group(1) + return b'' + + # do we need to do deep copying + def setParts(self, parts): + assert(len(parts) == len(self.parts)) + for i in range(len(parts)): + self.parts[i] = parts[i] + + # do we need to do deep copying + def setFlows(self, flows): + assert(len(flows) == len(self.flows)) + for i in range(len(flows)): + self.flows[i] = flows[i] + + # get information about the part (file) that exists at pos in original rawML + def getSkelInfo(self, pos): + for [partnum, pdir, filename, start, end, aidtext] in self.partinfo: + if pos >= start and pos < end: + return [partnum, pdir, filename, start, end, aidtext] + return [None, None, None, None, None, None] + + # fileno is actually a reference into fragtbl (a fragment) + def getGuideText(self): + guidetext = b'' + for [ref_type, ref_title, fileno] in self.guidetbl: + if ref_type == b'thumbimagestandard': + continue + if ref_type not in _guide_types and not ref_type.startswith(b'other.'): + if ref_type == b'start': + ref_type = b'text' + else: + ref_type = b'other.' + ref_type + [pos, idtext, filenum, seqnm, startpos, length] = self.fragtbl[fileno] + [pn, pdir, filename, skelpos, skelend, aidtext] = self.getSkelInfo(pos) + idtext = self.getIDTag(pos) + linktgt = filename.encode('utf-8') + if idtext != b'': + linktgt += b'#' + idtext + guidetext += b'<reference type="'+ref_type+b'" title="'+ref_title+b'" href="'+utf8_str(pdir)+b'/'+linktgt+b'" />\n' + # opf is encoded utf-8 so must convert any titles properly + guidetext = (guidetext.decode(self.mh.codec)).encode("utf-8") + return guidetext + + def getPageIDTag(self, pos): + # find the first tag with a named anchor (name or id attribute) before pos, + # but page map offsets need a little more leeway so if the offset points + # into a tag look for the next ending tag "/>" or "</" and start your search from there.
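+ # e.g. a page map offset landing on the 'p' of b'<p id="page12">' would otherwise truncate + # the search text mid-tag; extending npos to the next '/>' or '</' keeps the enclosing + # tag visible to the reverse search below (the id value 'page12' here is made up)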
+ fname, pn, skelpos, skelend = self.getFileInfo(pos) + if pn is None and skelpos is None: + print("Error: getPageIDTag - no file contains ", pos) + return b'' + textblock = self.parts[pn] + npos = pos - skelpos + # if npos is inside a tag then search all text before the next ending tag + pgt = textblock.find(b'>',npos) + plt = textblock.find(b'<',npos) + if plt == npos or pgt < plt: + # we are in a tag + # so find the first ending tag + pend1 = textblock.find(b'/>', npos) + pend2 = textblock.find(b'</', npos) + if pend1 != -1 and pend2 != -1: + pend = min(pend1, pend2) + else: + pend = max(pend1, pend2) + if pend != -1: + npos = pend + else: + npos = pgt + 1 + # find id and name attributes only inside of tags + # use a reverse tag search since that is faster + # inside any < > pair find "id=" and "name=" attributes and return the value + # [^>]* means match any amount of chars except for '>' char + # [^'"] match any amount of chars except for the quote character + # \s* means match any amount of whitespace + textblock = textblock[0:npos] + id_pattern = re.compile(br'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE) + name_pattern = re.compile(br'''<[^>]*\sname\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE) + for tag in reverse_tag_iter(textblock): + # any ids in the body should default to top of file + if tag[0:6] == b'<body ': + return b'' + if tag[0:6] != b'<meta ': + m = id_pattern.match(tag) or name_pattern.match(tag) + if m is not None: + return m.group(1) + return b'' diff --git a/src/epy_reader/tools/KindleUnpack/mobi_k8resc.py b/src/epy_reader/tools/KindleUnpack/mobi_k8resc.py new file mode 100644 index 0000000..1e58e84 --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_k8resc.py @@ -0,0 +1,271 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +DEBUG_USE_ORDERED_DICTIONARY = False # OrderedDict is supported >= python 2.7. +""" set to True to use OrderedDict for K8RESCProcessor.parsetag.tattr.""" + +if DEBUG_USE_ORDERED_DICTIONARY: + from collections import OrderedDict as dict_ +else: + dict_ = dict + +from .compatibility_utils import unicode_str + +from .mobi_utils import fromBase32 + +_OPF_PARENT_TAGS = ['xml', 'package', 'metadata', 'dc-metadata', + 'x-metadata', 'manifest', 'spine', 'tours', 'guide'] + +class K8RESCProcessor(object): + + def __init__(self, data, debug=False): + self._debug = debug + self.resc = None + self.opos = 0 + self.extrameta = [] + self.cover_name = None + self.spine_idrefs = {} + self.spine_order = [] + self.spine_pageattributes = {} + self.spine_ppd = None + # need3 indicates the book has fields which require epub3, + # but estimating the source epub version from those fields is difficult. + self.need3 = False + self.package_ver = None + self.extra_metadata = [] + self.refines_metadata = [] + self.extra_attributes = [] + # get header + start_pos = data.find(b'<') + self.resc_header = data[:start_pos] + # get resc data length + start = self.resc_header.find(b'=') + 1 + end = self.resc_header.find(b'&', start) + resc_size = 0 + if end > 0: + resc_size = fromBase32(self.resc_header[start:end]) + resc_rawbytes = len(data) - start_pos + if resc_rawbytes == resc_size: + self.resc_length = resc_size + else: + # Most RESC sections have a nul string at the tail but some do not.
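+ # (the header parsed above relies only on an '=...&' framing around the base32 length field, + # e.g. roughly b'RESC...=XXXX&...'; the exact key name in front of the '=' is not used)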
+ end_pos = data.find(b'\x00', start_pos) + if end_pos < 0: + self.resc_length = resc_rawbytes + else: + self.resc_length = end_pos - start_pos + if self.resc_length != resc_size: + print("Warning: RESC section length ({:d} bytes) does not match its declared size ({:d} bytes).".format(self.resc_length, resc_size)) + # now parse RESC after converting it to unicode from utf-8 + try: + self.resc = unicode_str(data[start_pos:start_pos+self.resc_length]) + except UnicodeDecodeError: + self.resc = unicode_str(data[start_pos:start_pos+self.resc_length], enc='latin-1') + self.parseData() + + def prepend_to_spine(self, key, idref, linear, properties): + self.spine_order = [key] + self.spine_order + self.spine_idrefs[key] = idref + attributes = {} + if linear is not None: + attributes['linear'] = linear + if properties is not None: + attributes['properties'] = properties + self.spine_pageattributes[key] = attributes + + # RESC tag iterator + def resc_tag_iter(self): + tcontent = last_tattr = None + prefix = [''] + while True: + text, tag = self.parseresc() + if text is None and tag is None: + break + if text is not None: + tcontent = text.rstrip(' \r\n') + else: # we have a tag + ttype, tname, tattr = self.parsetag(tag) + if ttype == 'begin': + tcontent = None + prefix.append(tname + '.') + if tname in _OPF_PARENT_TAGS: + yield ''.join(prefix), tname, tattr, tcontent + else: + last_tattr = tattr + else: # single or end + if ttype == 'end': + prefix.pop() + tattr = last_tattr + last_tattr = None + if tname in _OPF_PARENT_TAGS: + tname += '-end' + yield ''.join(prefix), tname, tattr, tcontent + tcontent = None + + # now parse the RESC to extract spine and extra metadata info + def parseData(self): + for prefix, tname, tattr, tcontent in self.resc_tag_iter(): + if self._debug: + print(" Parsing RESC: ", prefix, tname, tattr, tcontent) + if tname == 'package': + self.package_ver = tattr.get('version', '2.0') + package_prefix = tattr.get('prefix','') + if self.package_ver.startswith('3') or package_prefix.startswith('rendition'): + self.need3 = True + if tname == 'spine': + # also accept the historical misspelling of the attribute, just in case it appears in RESC data + self.spine_ppd = tattr.get('page-progression-direction', tattr.get('page-progession-direction', None)) + if self.spine_ppd is not None and self.spine_ppd == 'rtl': + self.need3 = True + if tname == 'itemref': + skelid = tattr.pop('skelid', None) + if skelid is None and len(self.spine_order) == 0: + # assume it was the removed initial coverpage + skelid = 'coverpage' + tattr['linear'] = 'no' + self.spine_order.append(skelid) + idref = tattr.pop('idref', None) + if idref is not None: + idref = 'x_' + idref + self.spine_idrefs[skelid] = idref + if 'id' in tattr: + del tattr['id'] + # tattr["id"] = 'x_' + tattr["id"] + if 'properties' in tattr: + self.need3 = True + self.spine_pageattributes[skelid] = tattr + if tname == 'meta' or tname.startswith('dc:'): + if 'refines' in tattr or 'property' in tattr: + self.need3 = True + if tattr.get('name','') == 'cover': + cover_name = tattr.get('content',None) + if cover_name is not None: + cover_name = 'x_' + cover_name + self.cover_name = cover_name + else: + self.extrameta.append([tname, tattr, tcontent]) + + # parse and return either leading text or the next tag + def parseresc(self): + p = self.opos + if p >= len(self.resc): + return None, None + if self.resc[p] != '<': + res = self.resc.find('<',p) + if res == -1 : + res = len(self.resc) + self.opos = res + return self.resc[p:res], None + # handle comment as a special case + if self.resc[p:p+4] == '<!--': + te = self.resc.find('-->',p+1) + if te != -1: + te = te+2 + else: + te =
self.resc.find('>',p+1) + ntb = self.resc.find('<',p+1) + if ntb != -1 and ntb < te: + self.opos = ntb + return self.resc[p:ntb], None + self.opos = te + 1 + return None, self.resc[p:te+1] + + # parses tag to identify: [tname, ttype, tattr] + # tname: tag name + # ttype: tag type ('begin', 'end' or 'single'); + # tattr: dictionary of tag atributes + def parsetag(self, s): + p = 1 + tname = None + ttype = None + tattr = dict_() + while s[p:p+1] == ' ' : + p += 1 + if s[p:p+1] == '/': + ttype = 'end' + p += 1 + while s[p:p+1] == ' ' : + p += 1 + b = p + while s[p:p+1] not in ('>', '/', ' ', '"', "'",'\r','\n') : + p += 1 + tname=s[b:p].lower() + # some special cases + if tname == '?xml': + tname = 'xml' + if tname == '!--': + ttype = 'single' + comment = s[p:-3].strip() + tattr['comment'] = comment + if ttype is None: + # parse any attributes of begin or single tags + while s.find('=',p) != -1 : + while s[p:p+1] == ' ' : + p += 1 + b = p + while s[p:p+1] != '=' : + p += 1 + aname = s[b:p].lower() + aname = aname.rstrip(' ') + p += 1 + while s[p:p+1] == ' ' : + p += 1 + if s[p:p+1] in ('"', "'") : + p = p + 1 + b = p + while s[p:p+1] not in ('"', "'"): + p += 1 + val = s[b:p] + p += 1 + else : + b = p + while s[p:p+1] not in ('>', '/', ' ') : + p += 1 + val = s[b:p] + tattr[aname] = val + if ttype is None: + ttype = 'begin' + if s.find('/',p) >= 0: + ttype = 'single' + return ttype, tname, tattr + + def taginfo_toxml(self, taginfo): + res = [] + tname, tattr, tcontent = taginfo + res.append('<' + tname) + if tattr is not None: + for key in tattr: + res.append(' ' + key + '="'+tattr[key]+'"') + if tcontent is not None: + res.append('>' + tcontent + '</' + tname + '>\n') + else: + res.append('/>\n') + return "".join(res) + + def hasSpine(self): + return len(self.spine_order) > 0 + + def needEPUB3(self): + return self.need3 + + def hasRefines(self): + for [tname, tattr, tcontent] in self.extrameta: + if 'refines' in tattr: + return True + return False + + def createMetadata(self, epubver): + for taginfo in self.extrameta: + tname, tattr, tcontent = taginfo + if 'refines' in tattr: + if epubver == 'F' and 'property' in tattr: + attr = ' id="%s" opf:%s="%s"\n' % (tattr['refines'], tattr['property'], tcontent) + self.extra_attributes.append(attr) + else: + tag = self.taginfo_toxml(taginfo) + self.refines_metadata.append(tag) + else: + tag = self.taginfo_toxml(taginfo) + self.extra_metadata.append(tag) diff --git a/src/epy_reader/tools/KindleUnpack/mobi_nav.py b/src/epy_reader/tools/KindleUnpack/mobi_nav.py new file mode 100644 index 0000000..16fb0be --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_nav.py @@ -0,0 +1,187 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import unicode_str +import os +from .unipath import pathof + +import re +# note: re requites the pattern to be the exact same type as the data to be searched in python3 +# but u"" is not allowed for the pattern itself only b"" + +DEBUG_NAV = False + +FORCE_DEFAULT_TITLE = False +""" Set to True to force to use the default title. """ + +NAVIGATION_FINENAME = 'nav.xhtml' +""" The name for the navigation document. """ + +DEFAULT_TITLE = 'Navigation' +""" The default title for the navigation document. 
""" + +class NAVProcessor(object): + + def __init__(self, files): + self.files = files + self.navname = NAVIGATION_FINENAME + + def buildLandmarks(self, guidetext): + header = '' + header += ' <nav epub:type="landmarks" id="landmarks" hidden="">\n' + header += ' <h2>Guide</h2>\n' + header += ' <ol>\n' + element = ' <li><a epub:type="{:s}" href="{:s}">{:s}</a></li>\n' + footer = '' + footer += ' </ol>\n' + footer += ' </nav>\n' + + type_map = { + 'cover' : 'cover', + 'title-page' : 'title-page', + # ?: 'frontmatter', + 'text' : 'bodymatter', + # ?: 'backmatter', + 'toc' : 'toc', + 'loi' : 'loi', + 'lot' : 'lot', + 'preface' : 'preface', + 'bibliography' : 'bibliography', + 'index' : 'index', + 'glossary' : 'glossary', + 'acknowledgements' : 'acknowledgements', + 'colophon' : None, + 'copyright-page' : None, + 'dedication' : None, + 'epigraph' : None, + 'foreword' : None, + 'notes' : None + } + + re_type = re.compile(r'\s+type\s*=\s*"(.*?)"', re.I) + re_title = re.compile(r'\s+title\s*=\s*"(.*?)"', re.I) + re_link = re.compile(r'\s+href\s*=\s*"(.*?)"', re.I) + dir_ = os.path.relpath(self.files.k8text, self.files.k8oebps).replace('\\', '/') + + data = '' + references = re.findall(r'<reference\s+.*?>', unicode_str(guidetext), re.I) + for reference in references: + mo_type = re_type.search(reference) + mo_title = re_title.search(reference) + mo_link = re_link.search(reference) + if mo_type is not None: + type_ = type_map.get(mo_type.group(1), None) + else: + type_ = None + if mo_title is not None: + title = mo_title.group(1) + else: + title = None + if mo_link is not None: + link = mo_link.group(1) + else: + link = None + + if type_ is not None and title is not None and link is not None: + link = os.path.relpath(link, dir_).replace('\\', '/') + data += element.format(type_, link, title) + if len(data) > 0: + return header + data + footer + else: + return '' + + def buildTOC(self, indx_data): + header = '' + header += ' <nav epub:type="toc" id="toc">\n' + header += ' <h1>Table of contents</h1>\n' + footer = ' </nav>\n' + + # recursive part + def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1): + if start>len(indx_data) or end>len(indx_data): + print("Warning (in buildTOC): missing INDX child entries", start, end, len(indx_data)) + return '' + if DEBUG_NAV: + print("recursINDX (in buildTOC) lvl %d from %d to %d" % (lvl, start, end)) + xhtml = '' + if start <= 0: + start = 0 + if end <= 0: + end = len(indx_data) + if lvl > max_lvl: + max_lvl = lvl + + indent1 = ' ' * (2 + lvl * 2) + indent2 = ' ' * (3 + lvl * 2) + xhtml += indent1 + '<ol>\n' + for i in range(start, end): + e = indx_data[i] + htmlfile = e['filename'] + desttag = e['idtag'] + text = e['text'] + if not e['hlvl'] == lvl: + continue + num += 1 + if desttag == '': + link = htmlfile + else: + link = '{:s}#{:s}'.format(htmlfile, desttag) + xhtml += indent2 + '<li>' + entry = '<a href="{:}">{:s}</a>'.format(link, text) + xhtml += entry + # recurs + if e['child1'] >= 0: + xhtml += '\n' + xhtmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1, + e['child1'], e['childn'] + 1) + xhtml += xhtmlrec + xhtml += indent2 + # close entry + xhtml += '</li>\n' + xhtml += indent1 + '</ol>\n' + return xhtml, max_lvl, num + + data, max_lvl, num = recursINDX() + if not len(indx_data) == num: + print("Warning (in buildTOC): different number of entries in NCX", len(indx_data), num) + return header + data + footer + + def buildNAV(self, ncx_data, guidetext, title, lang): + print("Building Navigation Document.") + if FORCE_DEFAULT_TITLE: + title = 
DEFAULT_TITLE + nav_header = '' + nav_header += '<?xml version="1.0" encoding="utf-8"?>\n<!DOCTYPE html>' + nav_header += '<html xmlns="http://www.w3.org/1999/xhtml"' + nav_header += ' xmlns:epub="http://www.idpf.org/2007/ops"' + nav_header += ' lang="{0:s}" xml:lang="{0:s}">\n'.format(lang) + nav_header += '<head>\n<title>{:s}</title>\n'.format(title) + nav_header += '<meta charset="UTF-8" />\n' + nav_header += '<style type="text/css">\n' + nav_header += 'nav#landmarks { display:none; }\n' + nav_header += 'ol { list-style-type: none; }' + nav_header += '</style>\n</head>\n<body>\n' + nav_footer = '</body>\n</html>\n' + + landmarks = self.buildLandmarks(guidetext) + toc = self.buildTOC(ncx_data) + + data = nav_header + data += landmarks + data += toc + data += nav_footer + return data + + def getNAVName(self): + return self.navname + + def writeNAV(self, ncx_data, guidetext, metadata): + # build the xhtml + # print("Write Navigation Document.") + xhtml = self.buildNAV(ncx_data, guidetext, metadata.get('Title')[0], metadata.get('Language')[0]) + fname = os.path.join(self.files.k8text, self.navname) + with open(pathof(fname), 'wb') as f: + f.write(xhtml.encode('utf-8')) diff --git a/src/epy_reader/tools/KindleUnpack/mobi_ncx.py b/src/epy_reader/tools/KindleUnpack/mobi_ncx.py new file mode 100644 index 0000000..60ef9a0 --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_ncx.py @@ -0,0 +1,275 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +import os +from .unipath import pathof +from .compatibility_utils import unescapeit + + +import re +# note: re requites the pattern to be the exact same type as the data to be searched in python3 +# but u"" is not allowed for the pattern itself only b"" + +from xml.sax.saxutils import escape as xmlescape + +from .mobi_utils import toBase32 +from .mobi_index import MobiIndex + +DEBUG_NCX = False + +class ncxExtract: + + def __init__(self, mh, files): + self.mh = mh + self.sect = self.mh.sect + self.files = files + self.isNCX = False + self.mi = MobiIndex(self.sect) + self.ncxidx = self.mh.ncxidx + self.indx_data = None + + def parseNCX(self): + indx_data = [] + tag_fieldname_map = { + 1: ['pos',0], + 2: ['len',0], + 3: ['noffs',0], + 4: ['hlvl',0], + 5: ['koffs',0], + 6: ['pos_fid',0], + 21: ['parent',0], + 22: ['child1',0], + 23: ['childn',0] + } + if self.ncxidx != 0xffffffff: + outtbl, ctoc_text = self.mi.getIndexData(self.ncxidx, "NCX") + if DEBUG_NCX: + print(ctoc_text) + print(outtbl) + num = 0 + for [text, tagMap] in outtbl: + tmp = { + 'name': text.decode('utf-8'), + 'pos': -1, + 'len': 0, + 'noffs': -1, + 'text' : "Unknown Text", + 'hlvl' : -1, + 'kind' : "Unknown Kind", + 'pos_fid' : None, + 'parent' : -1, + 'child1' : -1, + 'childn' : -1, + 'num' : num + } + for tag in tag_fieldname_map: + [fieldname, i] = tag_fieldname_map[tag] + if tag in tagMap: + fieldvalue = tagMap[tag][i] + if tag == 6: + pos_fid = toBase32(fieldvalue,4).decode('utf-8') + fieldvalue2 = tagMap[tag][i+1] + pos_off = toBase32(fieldvalue2,10).decode('utf-8') + fieldvalue = 'kindle:pos:fid:%s:off:%s' % (pos_fid, pos_off) + tmp[fieldname] = fieldvalue + if tag == 3: + toctext = ctoc_text.get(fieldvalue, 'Unknown Text') + toctext = toctext.decode(self.mh.codec) + tmp['text'] = toctext + if tag == 5: + kindtext = ctoc_text.get(fieldvalue, 'Unknown Kind') + kindtext = kindtext.decode(self.mh.codec) + tmp['kind'] = kindtext + 
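+ # at this point tmp is one fully resolved NCX entry; an illustrative (made up) example: + # {'name': '007', 'pos': 12345, 'len': 0, 'noffs': 268, 'text': 'Chapter 1', 'hlvl': 0, + # 'kind': 'chapter', 'pos_fid': 'kindle:pos:fid:0007:off:0000000000', 'parent': -1, + # 'child1': -1, 'childn': -1, 'num': 6}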
indx_data.append(tmp) + if DEBUG_NCX: + print("record number: ", num) + print("name: ", tmp['name'],) + print("position", tmp['pos']," length: ", tmp['len']) + print("text: ", tmp['text']) + print("kind: ", tmp['kind']) + print("heading level: ", tmp['hlvl']) + print("parent:", tmp['parent']) + print("first child: ",tmp['child1']," last child: ", tmp['childn']) + print("pos_fid is ", tmp['pos_fid']) + print("\n\n") + num += 1 + self.indx_data = indx_data + return indx_data + + def buildNCX(self, htmlfile, title, ident, lang): + indx_data = self.indx_data + + ncx_header = \ +'''<?xml version='1.0' encoding='utf-8'?> +<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="%s"> +<head> +<meta content="%s" name="dtb:uid"/> +<meta content="%d" name="dtb:depth"/> +<meta content="mobiunpack.py" name="dtb:generator"/> +<meta content="0" name="dtb:totalPageCount"/> +<meta content="0" name="dtb:maxPageNumber"/> +</head> +<docTitle> +<text>%s</text> +</docTitle> +<navMap> +''' + + ncx_footer = \ +''' </navMap> +</ncx> +''' + + ncx_entry = \ +'''<navPoint id="%s" playOrder="%d"> +<navLabel> +<text>%s</text> +</navLabel> +<content src="%s"/>''' + + # recursive part + def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1): + if start>len(indx_data) or end>len(indx_data): + print("Warning: missing INDX child entries", start, end, len(indx_data)) + return '' + if DEBUG_NCX: + print("recursINDX lvl %d from %d to %d" % (lvl, start, end)) + xml = '' + if start <= 0: + start = 0 + if end <= 0: + end = len(indx_data) + if lvl > max_lvl: + max_lvl = lvl + indent = ' ' * (2 + lvl) + + for i in range(start, end): + e = indx_data[i] + if not e['hlvl'] == lvl: + continue + # open entry + num += 1 + link = '%s#filepos%d' % (htmlfile, e['pos']) + tagid = 'np_%d' % num + entry = ncx_entry % (tagid, num, xmlescape(unescapeit(e['text'])), link) + entry = re.sub(re.compile('^', re.M), indent, entry, 0) + xml += entry + '\n' + # recurs + if e['child1']>=0: + xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1, + e['child1'], e['childn'] + 1) + xml += xmlrec + # close entry + xml += indent + '</navPoint>\n' + return xml, max_lvl, num + + body, max_lvl, num = recursINDX() + header = ncx_header % (lang, ident, max_lvl + 1, title) + ncx = header + body + ncx_footer + if not len(indx_data) == num: + print("Warning: different number of entries in NCX", len(indx_data), num) + return ncx + + def writeNCX(self, metadata): + # build the xml + self.isNCX = True + print("Write ncx") + # htmlname = os.path.basename(self.files.outbase) + # htmlname += '.html' + htmlname = 'book.html' + xml = self.buildNCX(htmlname, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language')[0]) + # write the ncx file + # ncxname = os.path.join(self.files.mobi7dir, self.files.getInputFileBasename() + '.ncx') + ncxname = os.path.join(self.files.mobi7dir, 'toc.ncx') + with open(pathof(ncxname), 'wb') as f: + f.write(xml.encode('utf-8')) + + def buildK8NCX(self, indx_data, title, ident, lang): + ncx_header = \ +'''<?xml version='1.0' encoding='utf-8'?> +<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="%s"> +<head> +<meta content="%s" name="dtb:uid"/> +<meta content="%d" name="dtb:depth"/> +<meta content="mobiunpack.py" name="dtb:generator"/> +<meta content="0" name="dtb:totalPageCount"/> +<meta content="0" name="dtb:maxPageNumber"/> +</head> +<docTitle> +<text>%s</text> +</docTitle> +<navMap> +''' + + ncx_footer = \ +''' </navMap> +</ncx> +''' + + ncx_entry = \ +'''<navPoint id="%s" 
playOrder="%d"> +<navLabel> +<text>%s</text> +</navLabel> +<content src="%s"/>''' + + # recursive part + def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1): + if start>len(indx_data) or end>len(indx_data): + print("Warning: missing INDX child entries", start, end, len(indx_data)) + return '' + if DEBUG_NCX: + print("recursINDX lvl %d from %d to %d" % (lvl, start, end)) + xml = '' + if start <= 0: + start = 0 + if end <= 0: + end = len(indx_data) + if lvl > max_lvl: + max_lvl = lvl + indent = ' ' * (2 + lvl) + + for i in range(start, end): + e = indx_data[i] + htmlfile = e['filename'] + desttag = e['idtag'] + if not e['hlvl'] == lvl: + continue + # open entry + num += 1 + if desttag == '': + link = 'Text/%s' % htmlfile + else: + link = 'Text/%s#%s' % (htmlfile, desttag) + tagid = 'np_%d' % num + entry = ncx_entry % (tagid, num, xmlescape(unescapeit(e['text'])), link) + entry = re.sub(re.compile('^', re.M), indent, entry, 0) + xml += entry + '\n' + # recurs + if e['child1']>=0: + xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1, + e['child1'], e['childn'] + 1) + xml += xmlrec + # close entry + xml += indent + '</navPoint>\n' + return xml, max_lvl, num + + body, max_lvl, num = recursINDX() + header = ncx_header % (lang, ident, max_lvl + 1, title) + ncx = header + body + ncx_footer + if not len(indx_data) == num: + print("Warning: different number of entries in NCX", len(indx_data), num) + return ncx + + def writeK8NCX(self, ncx_data, metadata): + # build the xml + self.isNCX = True + print("Write K8 ncx") + xml = self.buildK8NCX(ncx_data, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language')[0]) + bname = 'toc.ncx' + ncxname = os.path.join(self.files.k8oebps,bname) + with open(pathof(ncxname), 'wb') as f: + f.write(xml.encode('utf-8')) diff --git a/src/epy_reader/tools/KindleUnpack/mobi_opf.py b/src/epy_reader/tools/KindleUnpack/mobi_opf.py new file mode 100644 index 0000000..742d776 --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_opf.py @@ -0,0 +1,686 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import unicode_str, unescapeit +from .compatibility_utils import lzip + +from .unipath import pathof + +from xml.sax.saxutils import escape as xmlescape + +import os +import uuid +from datetime import datetime + +# In EPUB3, NCX and <guide> MAY exist in OPF, although the NCX is superseded +# by the Navigation Document and the <guide> is deprecated. Currently, EPUB3_WITH_NCX +# and EPUB3_WITH_GUIDE are set to True due to compatibility with epub2 reading systems. +# They might be change to set to False in the future. + +EPUB3_WITH_NCX = True # Do not set to False except for debug. +""" Set to True to create a toc.ncx when converting to epub3. """ + +EPUB3_WITH_GUIDE = True # Do not set to False except for debug. +""" Set to True to create a guide element in an opf when converting to epub3. """ + +EPUB_OPF = 'content.opf' +""" The name for the OPF of EPUB. """ + +TOC_NCX = 'toc.ncx' +""" The name for the TOC of EPUB2. """ + +NAVIGATION_DOCUMENT = 'nav.xhtml' +""" The name for the navigation document of EPUB3. """ + +BEGIN_INFO_ONLY = '<!-- BEGIN INFORMATION ONLY ' +""" The comment to indicate the beginning of metadata which will be ignored by kindlegen. """ + +END_INFO_ONLY = 'END INFORMATION ONLY -->' +""" The comment to indicate the end of metadata which will be ignored by kindlegen. 
""" + +EXTH_TITLE_FURIGANA = 'Title-Pronunciation' +""" The name for Title Furigana(similar to file-as) set by KDP. """ + +EXTH_CREATOR_FURIGANA = 'Author-Pronunciation' +""" The name for Creator Furigana(similar to file-as) set by KDP. """ + +EXTH_PUBLISHER_FURIGANA = 'Publisher-Pronunciation' +""" The name for Publisher Furigana(similar to file-as) set by KDP. """ + +EXTRA_ENTITIES = {'"': '"', "'": "'"} + +class OPFProcessor(object): + + def __init__(self, files, metadata, fileinfo, rscnames, hasNCX, mh, usedmap, pagemapxml='', guidetext='', k8resc=None, epubver='2'): + self.files = files + self.metadata = metadata + self.fileinfo = fileinfo + self.rscnames = rscnames + self.has_ncx = hasNCX + self.codec = mh.codec + self.isK8 = mh.isK8() + self.printReplica = mh.isPrintReplica() + self.guidetext = unicode_str(guidetext) + self.used = usedmap + self.k8resc = k8resc + self.covername = None + self.cover_id = 'cover_img' + if self.k8resc is not None and self.k8resc.cover_name is not None: + # update cover id info from RESC if available + self.cover_id = self.k8resc.cover_name + # Create a unique urn uuid + self.BookId = unicode_str(str(uuid.uuid4())) + self.pagemap = pagemapxml + + self.ncxname = None + self.navname = None + + # page-progression-direction is only set in spine + self.page_progression_direction = metadata.pop('page-progression-direction', [None])[0] + if 'rl' in metadata.get('primary-writing-mode', [''])[0]: + self.page_progression_direction = 'rtl' + self.epubver = epubver # the epub version set by user + self.target_epubver = epubver # the epub vertion set by user or detected automatically + if self.epubver == 'A': + self.target_epubver = self.autodetectEPUBVersion() + elif self.epubver == 'F': + self.target_epubver = '2' + elif self.epubver != '2' and self.epubver != '3': + self.target_epubver = '2' + + # id for rifine attributes + self.title_id = {} + self.creator_id = {} + self.publisher_id = {} + # extra attributes + self.title_attrib = {} + self.creator_attrib = {} + self.publisher_attrib = {} + self.extra_attributes = [] # for force epub2 option + # Create epub3 metadata from EXTH. + self.exth_solved_refines_metadata = [] + self.exth_refines_metadata = [] + self.exth_fixedlayout_metadata = [] + + self.defineRefinesID() + self.processRefinesMetadata() + if self.k8resc is not None: + # Create metadata in RESC section. + self.k8resc.createMetadata(epubver) + if self.target_epubver == "3": + self.createMetadataForFixedlayout() + + def escapeit(self, sval, EXTRAS=None): + # note, xmlescape and unescape do not work with utf-8 bytestrings + sval = unicode_str(sval) + if EXTRAS: + res = xmlescape(unescapeit(sval), EXTRAS) + else: + res = xmlescape(unescapeit(sval)) + return res + + def createMetaTag(self, data, property, content, refid=''): + refines = '' + if refid: + refines = ' refines="#%s"' % refid + data.append('<meta property="%s"%s>%s</meta>\n' % (property, refines, content)) + + def buildOPFMetadata(self, start_tag, has_obfuscated_fonts=False): + # convert from EXTH metadata format to target epub version metadata + # epub 3 will ignore <meta name="xxxx" content="yyyy" /> style metatags + # but allows them to be present for backwards compatibility + # instead the new format is + # <meta property="xxxx" id="iiii" ... 
> property_value</meta> + # and DCMES elements such as: + # <dc:blah id="iiii">value</dc:blah> + + metadata = self.metadata + k8resc = self.k8resc + + META_TAGS = ['Drm Server Id', 'Drm Commerce Id', 'Drm Ebookbase Book Id', 'ASIN', 'ThumbOffset', 'Fake Cover', + 'Creator Software', 'Creator Major Version', 'Creator Minor Version', 'Creator Build Number', + 'Watermark', 'Clipping Limit', 'Publisher Limit', 'Text to Speech Disabled', 'CDE Type', + 'Updated Title', 'Font Signature (hex)', 'Tamper Proof Keys (hex)',] + + # def handleTag(data, metadata, key, tag, ids={}): + def handleTag(data, metadata, key, tag, attrib={}): + '''Format metadata values. + + @param data: List of formatted metadata entries. + @param metadata: The metadata dictionary. + @param key: The key of the metadata value to handle. + @param tag: The opf tag that corresponds to the metadata value. + ###@param ids: The ids in tags for refines property of epub3. + @param attrib: The extra attribute for refines or opf prefixes. + ''' + if key in metadata: + for i, value in enumerate(metadata[key]): + closingTag = tag.split(" ")[0] + res = '<%s%s>%s</%s>\n' % (tag, attrib.get(i, ''), self.escapeit(value), closingTag) + data.append(res) + del metadata[key] + + # these are allowed but ignored by epub3 + def handleMetaPairs(data, metadata, key, name): + if key in metadata: + for value in metadata[key]: + res = '<meta name="%s" content="%s" />\n' % (name, self.escapeit(value, EXTRA_ENTITIES)) + data.append(res) + del metadata[key] + + data = [] + data.append(start_tag + '\n') + # Handle standard metadata + if 'Title' in metadata: + handleTag(data, metadata, 'Title', 'dc:title', self.title_attrib) + else: + data.append('<dc:title>Untitled</dc:title>\n') + handleTag(data, metadata, 'Language', 'dc:language') + if 'UniqueID' in metadata: + handleTag(data, metadata, 'UniqueID', 'dc:identifier id="uid"') + else: + # No unique ID in original, give it a generic one. + data.append('<dc:identifier id="uid">0</dc:identifier>\n') + + if self.target_epubver == '3': + # epub version 3 minimal metadata requires a dcterms:modified date tag + self.createMetaTag(data, 'dcterms:modified', datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")) + + if self.isK8 and has_obfuscated_fonts: + # Use the randomly generated urn:uuid so obfuscated fonts work. + # It doesn't need to be _THE_ unique identifier to work as a key + # for obfuscated fonts in Sigil, ADE and calibre. It just has + # to use the opf:scheme="UUID" and have the urn:uuid: prefix.
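+ # e.g. the epub3 branch below emits <dc:identifier>urn:uuid:0a1b2c3d-...</dc:identifier>, + # while the epub2 branch emits <dc:identifier opf:scheme="UUID">urn:uuid:0a1b2c3d-...</dc:identifier> + # (the uuid shown is illustrative)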
+ if self.target_epubver == '3': + data.append('<dc:identifier>urn:uuid:'+self.BookId+'</dc:identifier>\n') + else: + data.append('<dc:identifier opf:scheme="UUID">urn:uuid:'+self.BookId+'</dc:identifier>\n') + + handleTag(data, metadata, 'Creator', 'dc:creator', self.creator_attrib) + handleTag(data, metadata, 'Contributor', 'dc:contributor') + handleTag(data, metadata, 'Publisher', 'dc:publisher', self.publisher_attrib) + handleTag(data, metadata, 'Source', 'dc:source') + handleTag(data, metadata, 'Type', 'dc:type') + if self.target_epubver == '3': + if 'ISBN' in metadata: + for i, value in enumerate(metadata['ISBN']): + res = '<dc:identifier>urn:isbn:%s</dc:identifier>\n' % self.escapeit(value) + data.append(res) + else: + handleTag(data, metadata, 'ISBN', 'dc:identifier opf:scheme="ISBN"') + if 'Subject' in metadata: + if 'SubjectCode' in metadata: + codeList = metadata['SubjectCode'] + del metadata['SubjectCode'] + else: + codeList = None + for i in range(len(metadata['Subject'])): + if codeList and i < len(codeList): + data.append('<dc:subject BASICCode="'+codeList[i]+'">') + else: + data.append('<dc:subject>') + data.append(self.escapeit(metadata['Subject'][i])+'</dc:subject>\n') + del metadata['Subject'] + handleTag(data, metadata, 'Description', 'dc:description') + if self.target_epubver == '3': + if 'Published' in metadata: + for i, value in enumerate(metadata['Published']): + res = '<dc:date>%s</dc:date>\n' % self.escapeit(value) + data.append(res) + else: + handleTag(data, metadata, 'Published', 'dc:date opf:event="publication"') + handleTag(data, metadata, 'Rights', 'dc:rights') + + if self.epubver == 'F': + if self.extra_attributes or k8resc is not None and k8resc.extra_attributes: + data.append('<!-- THE FOLLOWINGS ARE REQUIRED TO INSERT INTO <dc:xxx> MANUALLY\n') + if self.extra_attributes: + data += self.extra_attributes + if k8resc is not None and k8resc.extra_attributes: + data += k8resc.extra_attributes + data.append('-->\n') + else: + # Append refines metadata. + if self.exth_solved_refines_metadata: + data.append('<!-- Refines MetaData from EXTH -->\n') + data += self.exth_solved_refines_metadata + if self.exth_refines_metadata or k8resc is not None and k8resc.refines_metadata: + data.append('<!-- THE FOLLOWINGS ARE REQUIRED TO EDIT IDS MANUALLY\n') + if self.exth_refines_metadata: + data += self.exth_refines_metadata + if k8resc is not None and k8resc.refines_metadata: + data += k8resc.refines_metadata + data.append('-->\n') + + # Append metadata in RESC section. + if k8resc is not None and k8resc.extra_metadata: + data.append('<!-- Extra MetaData from RESC\n') + data += k8resc.extra_metadata + data.append('-->\n') + + if 'CoverOffset' in metadata: + imageNumber = int(metadata['CoverOffset'][0]) + self.covername = self.rscnames[imageNumber] + if self.covername is None: + print("Error: Cover image %s was not recognized as a valid image" % imageNumber) + else: + # <meta name="cover"> is obsoleted in EPUB3, but kindlegen v2.9 requires it. 
+ data.append('<meta name="cover" content="' + self.cover_id + '" />\n') + self.used[self.covername] = 'used' + del metadata['CoverOffset'] + + handleMetaPairs(data, metadata, 'Codec', 'output encoding') + # handle kindlegen specifc tags + handleTag(data, metadata, 'DictInLanguage', 'DictionaryInLanguage') + handleTag(data, metadata, 'DictOutLanguage', 'DictionaryOutLanguage') + handleMetaPairs(data, metadata, 'RegionMagnification', 'RegionMagnification') + handleMetaPairs(data, metadata, 'book-type', 'book-type') + handleMetaPairs(data, metadata, 'zero-gutter', 'zero-gutter') + handleMetaPairs(data, metadata, 'zero-margin', 'zero-margin') + handleMetaPairs(data, metadata, 'primary-writing-mode', 'primary-writing-mode') + handleMetaPairs(data, metadata, 'fixed-layout', 'fixed-layout') + handleMetaPairs(data, metadata, 'orientation-lock', 'orientation-lock') + handleMetaPairs(data, metadata, 'original-resolution', 'original-resolution') + + # these are not allowed in epub2 or 3 so convert them to meta name content pairs + # perhaps these could better be mapped into the dcterms namespace instead + handleMetaPairs(data, metadata, 'Review', 'review') + handleMetaPairs(data, metadata, 'Imprint', 'imprint') + handleMetaPairs(data, metadata, 'Adult', 'adult') + handleMetaPairs(data, metadata, 'DictShortName', 'DictionaryVeryShortName') + + # these are needed by kobo books upon submission but not sure if legal metadata in epub2 or epub3 + if 'Price' in metadata and 'Currency' in metadata: + priceList = metadata['Price'] + currencyList = metadata['Currency'] + if len(priceList) != len(currencyList): + print("Error: found %s price entries, but %s currency entries.") + else: + for i in range(len(priceList)): + data.append('<SRP Currency="'+currencyList[i]+'">'+priceList[i]+'</SRP>\n') + del metadata['Price'] + del metadata['Currency'] + + if self.target_epubver == '3': + # Append metadata for EPUB3. 
+ if self.exth_fixedlayout_metadata: + data.append('<!-- EPUB3 MedaData converted from EXTH -->\n') + data += self.exth_fixedlayout_metadata + + # all that remains is extra EXTH info we will store inside a comment inside meta name/content pairs + # so it can not impact anything and will be automatically stripped out if found again in a RESC section + data.append(BEGIN_INFO_ONLY + '\n') + if 'ThumbOffset' in metadata: + imageNumber = int(metadata['ThumbOffset'][0]) + # Some bad books give image indexes that are 'out of range' + try: + imageName = self.rscnames[imageNumber] + except: + print('Number given for Cover Thumbnail is out of range: %s' % imageNumber) + imageName = None + if imageName is None: + print("Error: Cover Thumbnail image %s was not recognized as a valid image" % imageNumber) + else: + data.append('<meta name="Cover ThumbNail Image" content="'+ 'Images/'+imageName+'" />\n') + # self.used[imageName] = 'used' # thumbnail image is always generated by Kindlegen, so don't include in manifest + self.used[imageName] = 'not used' + del metadata['ThumbOffset'] + for metaName in META_TAGS: + if metaName in metadata: + for value in metadata[metaName]: + data.append('<meta name="'+metaName+'" content="'+self.escapeit(value, EXTRA_ENTITIES)+'" />\n') + del metadata[metaName] + for key in list(metadata.keys()): + for value in metadata[key]: + data.append('<meta name="'+key+'" content="'+self.escapeit(value, EXTRA_ENTITIES)+'" />\n') + del metadata[key] + data.append(END_INFO_ONLY + '\n') + data.append('</metadata>\n') + return data + + def buildOPFManifest(self, ncxname, navname=None): + # buildManifest for mobi7, azw4, epub2 and epub3. + k8resc = self.k8resc + cover_id = self.cover_id + hasK8RescSpine = k8resc is not None and k8resc.hasSpine() + self.ncxname = ncxname + self.navname = navname + + data = [] + data.append('<manifest>\n') + media_map = { + '.jpg' : 'image/jpeg', + '.jpeg' : 'image/jpeg', + '.png' : 'image/png', + '.gif' : 'image/gif', + '.svg' : 'image/svg+xml', + '.xhtml': 'application/xhtml+xml', + '.html' : 'text/html', # for mobi7 + '.pdf' : 'application/pdf', # for azw4(print replica textbook) + '.ttf' : 'application/x-font-ttf', + '.otf' : 'application/x-font-opentype', # replaced? 
+ '.css' : 'text/css', + # '.html' : 'text/x-oeb1-document', # for mobi7 + # '.otf' : 'application/vnd.ms-opentype', # [OpenType] OpenType fonts + # '.woff' : 'application/font-woff', # [WOFF] WOFF fonts + # '.smil' : 'application/smil+xml', # [MediaOverlays301] EPUB Media Overlay documents + # '.pls' : 'application/pls+xml', # [PLS] Text-to-Speech (TTS) Pronunciation lexicons + # '.mp3' : 'audio/mpeg', + # '.mp4' : 'video/mp4', + # '.js' : 'text/javascript', # not supported in K8 + } + spinerefs = [] + + idcnt = 0 + for [key,dir,fname] in self.fileinfo: + name, ext = os.path.splitext(fname) + ext = ext.lower() + media = media_map.get(ext) + ref = "item%d" % idcnt + if hasK8RescSpine: + if key is not None and key in k8resc.spine_idrefs: + ref = k8resc.spine_idrefs[key] + properties = '' + if dir != '': + fpath = dir + '/' + fname + else: + fpath = fname + data.append('<item id="{0:}" media-type="{1:}" href="{2:}" {3:}/>\n'.format(ref, media, fpath, properties)) + + if ext in ['.xhtml', '.html']: + spinerefs.append(ref) + idcnt += 1 + + for fname in self.rscnames: + if fname is not None: + if self.used.get(fname,'not used') == 'not used': + continue + name, ext = os.path.splitext(fname) + ext = ext.lower() + media = media_map.get(ext,ext[1:]) + properties = '' + if fname == self.covername: + ref = cover_id + if self.target_epubver == '3': + properties = 'properties="cover-image"' + else: + ref = "item%d" % idcnt + if ext == '.ttf' or ext == '.otf': + if self.isK8: # fonts are only used in Mobi 8 + fpath = 'Fonts/' + fname + data.append('<item id="{0:}" media-type="{1:}" href="{2:}" {3:}/>\n'.format(ref, media, fpath, properties)) + else: + fpath = 'Images/' + fname + data.append('<item id="{0:}" media-type="{1:}" href="{2:}" {3:}/>\n'.format(ref, media, fpath, properties)) + idcnt += 1 + + if self.target_epubver == '3' and navname is not None: + data.append('<item id="nav" media-type="application/xhtml+xml" href="Text/' + navname + '" properties="nav"/>\n') + if self.has_ncx and ncxname is not None: + data.append('<item id="ncx" media-type="application/x-dtbncx+xml" href="' + ncxname +'" />\n') + if self.pagemap != '': + data.append('<item id="map" media-type="application/oebs-page-map+xml" href="page-map.xml" />\n') + data.append('</manifest>\n') + return [data, spinerefs] + + def buildOPFSpine(self, spinerefs, isNCX): + # build spine + k8resc = self.k8resc + hasK8RescSpine = k8resc is not None and k8resc.hasSpine() + data = [] + ppd = '' + if self.isK8 and self.page_progression_direction is not None: + ppd = ' page-progression-direction="{:s}"'.format(self.page_progression_direction) + ncx = '' + if isNCX: + ncx = ' toc="ncx"' + map='' + if self.pagemap != '': + map = ' page-map="map"' + if self.epubver == 'F': + if ppd: + ppd = '<!--' + ppd + ' -->' + spine_start_tag = '<spine{1:s}{2:s}>{0:s}\n'.format(ppd, map, ncx) + else: + spine_start_tag = '<spine{0:s}{1:s}{2:s}>\n'.format(ppd, map, ncx) + data.append(spine_start_tag) + + if hasK8RescSpine: + for key in k8resc.spine_order: + idref = k8resc.spine_idrefs[key] + attribs = k8resc.spine_pageattributes[key] + tag = '<itemref idref="%s"' % idref + for aname, val in list(attribs.items()): + if self.epubver == 'F' and aname == 'properties': + continue + if val is not None: + tag += ' %s="%s"' % (aname, val) + tag += '/>' + if self.epubver == 'F' and 'properties' in attribs: + val = attribs['properties'] + if val is not None: + tag += '<!-- properties="%s" -->' % val + tag += '\n' + data.append(tag) + else: + start = 0 + # special case the 
created coverpage if need be + [key, dir, fname] = self.fileinfo[0] + if key is not None and key == "coverpage": + entry = spinerefs[start] + data.append('<itemref idref="%s" linear="no"/>\n' % entry) + start += 1 + for entry in spinerefs[start:]: + data.append('<itemref idref="' + entry + '"/>\n') + data.append('</spine>\n') + return data + + def buildMobi7OPF(self): + # Build an OPF for mobi7 and azw4. + print("Building an opf for mobi7/azw4.") + data = [] + data.append('<?xml version="1.0" encoding="utf-8"?>\n') + data.append('<package version="2.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="uid">\n') + metadata_tag = '<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">' + opf_metadata = self.buildOPFMetadata(metadata_tag) + data += opf_metadata + if self.has_ncx: + # ncxname = self.files.getInputFileBasename() + '.ncx' + ncxname = 'toc.ncx' + else: + ncxname = None + [opf_manifest, spinerefs] = self.buildOPFManifest(ncxname) + data += opf_manifest + opf_spine = self.buildOPFSpine(spinerefs, self.has_ncx) + data += opf_spine + data.append('<tours>\n</tours>\n') + if not self.printReplica: + guide ='<guide>\n' + self.guidetext + '</guide>\n' + data.append(guide) + data.append('</package>\n') + return ''.join(data) + + def buildEPUBOPF(self, has_obfuscated_fonts=False): + print("Building an opf for mobi8 using epub version: ", self.target_epubver) + if self.target_epubver == '2': + has_ncx = self.has_ncx + has_guide = True + ncxname = None + ncxname = TOC_NCX + navname = None + package = '<package version="2.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="uid">\n' + tours = '<tours>\n</tours>\n' + metadata_tag = '<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">' + else: + has_ncx = EPUB3_WITH_NCX + has_guide = EPUB3_WITH_GUIDE + ncxname = None + if has_ncx: + ncxname = TOC_NCX + navname = NAVIGATION_DOCUMENT + package = '<package version="3.0" xmlns="http://www.idpf.org/2007/opf" prefix="rendition: http://www.idpf.org/vocab/rendition/#" unique-identifier="uid">\n' + tours = '' + metadata_tag = '<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">' + + data = [] + data.append('<?xml version="1.0" encoding="utf-8"?>\n') + data.append(package) + opf_metadata = self.buildOPFMetadata(metadata_tag, has_obfuscated_fonts) + data += opf_metadata + [opf_manifest, spinerefs] = self.buildOPFManifest(ncxname, navname) + data += opf_manifest + opf_spine = self.buildOPFSpine(spinerefs, has_ncx) + data += opf_spine + data.append(tours) + if has_guide: + guide ='<guide>\n' + self.guidetext + '</guide>\n' + data.append(guide) + data.append('</package>\n') + return ''.join(data) + + def writeOPF(self, has_obfuscated_fonts=False): + if self.isK8: + data = self.buildEPUBOPF(has_obfuscated_fonts) + outopf = os.path.join(self.files.k8oebps, EPUB_OPF) + with open(pathof(outopf), 'wb') as f: + f.write(data.encode('utf-8')) + return self.BookId + else: + data = self.buildMobi7OPF() + outopf = os.path.join(self.files.mobi7dir, 'content.opf') + with open(pathof(outopf), 'wb') as f: + f.write(data.encode('utf-8')) + return 0 + + def getBookId(self): + return self.BookId + + def getNCXName(self): + return self.ncxname + + def getNAVName(self): + return self.navname + + def getEPUBVersion(self): + return self.target_epubver + + def hasNCX(self): + return self.ncxname is not None and self.has_ncx + + def hasNAV(self): + return self.navname is not None + + def autodetectEPUBVersion(self): + # Determine 
EPUB version from metadata and RESC. + metadata = self.metadata + k8resc = self.k8resc + epubver = '2' + if 'true' == metadata.get('fixed-layout', [''])[0].lower(): + epubver = '3' + elif metadata.get('orientation-lock', [''])[0].lower() in ['portrait', 'landscape']: + epubver = '3' + elif self.page_progression_direction == 'rtl': + epubver = '3' + elif EXTH_TITLE_FURIGANA in metadata: + epubver = '3' + elif EXTH_CREATOR_FURIGANA in metadata: + epubver = '3' + elif EXTH_PUBLISHER_FURIGANA in metadata: + epubver = '3' + elif k8resc is not None and k8resc.needEPUB3(): + epubver = '3' + return epubver + + def defineRefinesID(self): + # the following EXTH records are set by KDP. + # 'Title_Furigana_(508)' + # 'Creator_Furigana_(517)', + # 'Publisher_Furigana_(522)' + # It is difficult to find the correspondence between Title, Creator, Publisher + # and EXTH 508, 517, 522 if they have more than two values, since KDP does not seem to preserve the order of EXTH 508, 517 and 522. + # It is also difficult to find the correspondence between them and tags which have refine attributes in RESC. + # So manual editing is required. + metadata = self.metadata + + needRefinesId = False + if self.k8resc is not None: + needRefinesId = self.k8resc.hasRefines() + # Create ids for refine attributes + if (needRefinesId or EXTH_TITLE_FURIGANA in metadata) and 'Title' in metadata: + for i in range(len(metadata.get('Title'))): + self.title_id[i] = 'title%02d' % (i+1) + + if (needRefinesId or EXTH_CREATOR_FURIGANA in metadata) and 'Creator' in metadata: + for i in range(len(metadata.get('Creator'))): + self.creator_id[i] = 'creator%02d' % (i+1) + + if (needRefinesId or EXTH_PUBLISHER_FURIGANA in metadata) and 'Publisher' in metadata: + for i in range(len(metadata.get('Publisher'))): + self.publisher_id[i] = 'publisher%02d' % (i+1) + + def processRefinesMetadata(self): + # create refines metadata defined in epub3, or convert refines properties to opf: attributes for epub2. + metadata = self.metadata + + refines_list = [ + [EXTH_TITLE_FURIGANA, self.title_id, self.title_attrib, 'title00'], + [EXTH_CREATOR_FURIGANA, self.creator_id, self.creator_attrib, 'creator00'], + [EXTH_PUBLISHER_FURIGANA, self.publisher_id, self.publisher_attrib, 'publisher00'] + ] + + create_refines_metadata = False + for EXTH in lzip(*refines_list)[0]: + if EXTH in metadata: + create_refines_metadata = True + break + if create_refines_metadata: + for [EXTH, id, attrib, defaultid] in refines_list: + if self.target_epubver == '3': + for i, value in list(id.items()): + attrib[i] = ' id="%s"' % value + + if EXTH in metadata: + if len(metadata[EXTH]) == 1 and len(id) == 1: + self.createMetaTag(self.exth_solved_refines_metadata, 'file-as', metadata[EXTH][0], id[0]) + else: + for i, value in enumerate(metadata[EXTH]): + self.createMetaTag(self.exth_refines_metadata, 'file-as', value, id.get(i, defaultid)) + else: + if EXTH in metadata: + if len(metadata[EXTH]) == 1 and len(id) == 1: + attr = ' opf:file-as="%s"' % metadata[EXTH][0] + attrib[0] = attr + else: + for i, value in enumerate(metadata[EXTH]): + attr = ' id="#%s" opf:file-as="%s"\n' % (id.get(i, defaultid), value) + self.extra_attributes.append(attr) + + def createMetadataForFixedlayout(self): + # convert fixed layout to epub3 format if needed.
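For reference, the refines handling above produces two different shapes of markup depending on the target EPUB version. A minimal illustration with made-up values (the real code assembles these through createMetaTag() and the per-entry attribute lists, not literal strings like these):

creator, furigana, cid = 'Natsume Soseki', 'なつめそうせき', 'creator01'

# EPUB 3: a dc:creator element plus a separate refining meta element
epub3 = ('<dc:creator id="%s">%s</dc:creator>\n'
         '<meta refines="#%s" property="file-as">%s</meta>'
         % (cid, creator, cid, furigana))

# EPUB 2: the same information folded into an opf:file-as attribute
epub2 = '<dc:creator opf:file-as="%s">%s</dc:creator>' % (furigana, creator)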
+ metadata = self.metadata + + if 'fixed-layout' in metadata: + fixedlayout = metadata['fixed-layout'][0] + content = {'true' : 'pre-paginated'}.get(fixedlayout.lower(), 'reflowable') + self.createMetaTag(self.exth_fixedlayout_metadata, 'rendition:layout', content) + + if 'orientation-lock' in metadata: + content = metadata['orientation-lock'][0].lower() + if content == 'portrait' or content == 'landscape': + self.createMetaTag(self.exth_fixedlayout_metadata, 'rendition:orientation', content) + + # according to epub3 spec about correspondence with Amazon + # if 'original-resolution' is provided it needs to be converted to + # meta viewport property tag stored in the <head></head> of **each** + # xhtml page - so this tag would need to be handled by editing each part + # before reaching this routine + # we need to add support for this to the k8html routine + # if 'original-resolution' in metadata.keys(): + # resolution = metadata['original-resolution'][0].lower() + # width, height = resolution.split('x') + # if width.isdigit() and int(width) > 0 and height.isdigit() and int(height) > 0: + # viewport = 'width=%s, height=%s' % (width, height) + # self.createMetaTag(self.exth_fixedlayout_metadata, 'rendition:viewport', viewport) diff --git a/src/epy_reader/tools/KindleUnpack/mobi_pagemap.py b/src/epy_reader/tools/KindleUnpack/mobi_pagemap.py new file mode 100644 index 0000000..5228d4e --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_pagemap.py @@ -0,0 +1,158 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import PY2, unicode_str + +if PY2: + range = xrange + +import struct +# note: struct pack, unpack, unpack_from all require bytestring format +# data all the way up to at least python 2.7.5, python 3 okay with bytestring + +import re +# note: re requites the pattern to be the exact same type as the data to be searched in python3 +# but u"" is not allowed for the pattern itself only b"" + + +_TABLE = [('m', 1000), ('cm', 900), ('d', 500), ('cd', 400), ('c', 100), ('xc', 90), ('l', 50), ('xl', 40), ('x', 10), ('ix', 9), ('v', 5), ('iv', 4), ('i', 1)] + +def int_to_roman(i): + parts = [] + num = i + for letter, value in _TABLE: + while value <= num: + num -= value + parts.append(letter) + return ''.join(parts) + +def roman_to_int(s): + result = 0 + rnstr = s + for letter, value in _TABLE: + while rnstr.startswith(letter): + result += value + rnstr = rnstr[len(letter):] + return result + +_pattern = r'''\(([^\)]*)\)''' +_tup_pattern = re.compile(_pattern,re.IGNORECASE) + + +def _parseNames(numpages, data): + data = unicode_str(data) + pagenames = [] + pageMap = '' + for i in range(numpages): + pagenames.append(None) + for m in re.finditer(_tup_pattern, data): + tup = m.group(1) + if pageMap != '': + pageMap += ',' + pageMap += '(' + tup + ')' + spos, nametype, svalue = tup.split(",") + # print(spos, nametype, svalue) + if nametype == 'a' or nametype == 'r': + svalue = int(svalue) + spos = int(spos) + for i in range(spos - 1, numpages): + if nametype == 'r': + pname = int_to_roman(svalue) + svalue += 1 + elif nametype == 'a': + pname = "%s" % svalue + svalue += 1 + elif nametype == 'c': + sp = svalue.find('|') + if sp == -1: + pname = svalue + else: + pname = svalue[0:sp] + svalue = svalue[sp+1:] + else: + print("Error: unknown page numbering type", nametype) + pagenames[i] = pname + return pagenames, pageMap + + +class 
PageMapProcessor: + + def __init__(self, mh, data): + self.mh = mh + self.data = data + self.pagenames = [] + self.pageoffsets = [] + self.pageMap = '' + self.pm_len = 0 + self.pm_nn = 0 + self.pn_bits = 0 + self.pmoff = None + self.pmstr = '' + print("Extracting Page Map Information") + rev_len, = struct.unpack_from(b'>L', self.data, 0x10) + # skip over header, revision string length data, and revision string + ptr = 0x14 + rev_len + pm_1, self.pm_len, self.pm_nn, self.pm_bits = struct.unpack_from(b'>4H', self.data, ptr) + # print(pm_1, self.pm_len, self.pm_nn, self.pm_bits) + self.pmstr = self.data[ptr+8:ptr+8+self.pm_len] + self.pmoff = self.data[ptr+8+self.pm_len:] + offsize = b">L" + offwidth = 4 + if self.pm_bits == 16: + offsize = b">H" + offwidth = 2 + ptr = 0 + for i in range(self.pm_nn): + od, = struct.unpack_from(offsize, self.pmoff, ptr) + ptr += offwidth + self.pageoffsets.append(od) + self.pagenames, self.pageMap = _parseNames(self.pm_nn, self.pmstr) + + def getPageMap(self): + return self.pageMap + + def getNames(self): + return self.pagenames + + def getOffsets(self): + return self.pageoffsets + + # page-map.xml will be unicode but encoded to utf-8 immediately before being written to a file + def generateKF8PageMapXML(self, k8proc): + pagemapxml = '<page-map xmlns="http://www.idpf.org/2007/opf">\n' + for i in range(len(self.pagenames)): + pos = self.pageoffsets[i] + name = self.pagenames[i] + if name is not None and name != "": + [pn, dir, filename, skelpos, skelend, aidtext] = k8proc.getSkelInfo(pos) + idtext = unicode_str(k8proc.getPageIDTag(pos)) + linktgt = unicode_str(filename) + if idtext != '': + linktgt += '#' + idtext + pagemapxml += '<page name="%s" href="%s/%s" />\n' % (name, dir, linktgt) + pagemapxml += "</page-map>\n" + return pagemapxml + + def generateAPNX(self, apnx_meta): + if apnx_meta['format'] == 'MOBI_8': + content_header = '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","format":"%(format)s","fileRevisionId":"1","acr":"%(acr)s"}' %apnx_meta + else: + content_header = '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","fileRevisionId":"1"}' % apnx_meta + content_header = content_header.encode('utf-8') + page_header = '{"asin":"%(asin)s","pageMap":"%(pageMap)s"}' % apnx_meta + page_header = page_header.encode('utf-8') + apnx = struct.pack(b'>H',1) + struct.pack(b'>H',1) + apnx += struct.pack(b'>I', 12 + len(content_header)) + apnx += struct.pack(b'>I', len(content_header)) + apnx += content_header + apnx += struct.pack(b'>H', 1) + apnx += struct.pack(b'>H', len(page_header)) + apnx += struct.pack(b'>H', self.pm_nn) + apnx += struct.pack(b'>H', 32) + apnx += page_header + for page in self.pageoffsets: + apnx += struct.pack(b'>L', page) + return apnx diff --git a/src/epy_reader/tools/KindleUnpack/mobi_sectioner.py b/src/epy_reader/tools/KindleUnpack/mobi_sectioner.py new file mode 100644 index 0000000..81f62bb --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_sectioner.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import PY2, hexlify, bstr, bord, bchar + +import datetime + +if PY2: + range = xrange + +# note: struct pack, unpack, unpack_from all require bytestring format +# data all the way up to at least python 2.7.5, python 3 okay with bytestring +import struct + +from .unipath import pathof + +DUMP = False 
+""" Set to True to dump all possible information. """ + +class unpackException(Exception): + pass + + +def describe(data): + txtans = '' + hexans = hexlify(data) + for i in data: + if bord(i) < 32 or bord(i) > 127: + txtans += '?' + else: + txtans += bchar(i).decode('latin-1') + return '"' + txtans + '"' + ' 0x'+ hexans + +def datetimefrompalmtime(palmtime): + if palmtime > 0x7FFFFFFF: + pythondatetime = datetime.datetime(year=1904,month=1,day=1)+datetime.timedelta(seconds=palmtime) + else: + pythondatetime = datetime.datetime(year=1970,month=1,day=1)+datetime.timedelta(seconds=palmtime) + return pythondatetime + + +class Sectionizer: + + def __init__(self, filename): + self.data = b'' + with open(pathof(filename), 'rb') as f: + self.data = f.read() + self.palmheader = self.data[:78] + self.palmname = self.data[:32] + self.ident = self.palmheader[0x3C:0x3C+8] + self.num_sections, = struct.unpack_from(b'>H', self.palmheader, 76) + self.filelength = len(self.data) + sectionsdata = struct.unpack_from(bstr('>%dL' % (self.num_sections*2)), self.data, 78) + (self.filelength, 0) + self.sectionoffsets = sectionsdata[::2] + self.sectionattributes = sectionsdata[1::2] + self.sectiondescriptions = ["" for x in range(self.num_sections+1)] + self.sectiondescriptions[-1] = "File Length Only" + return + + def dumpsectionsinfo(self): + print("Section Offset Length UID Attribs Description") + for i in range(self.num_sections): + print("%3d %3X 0x%07X 0x%05X % 8d % 7d %s" % (i,i, self.sectionoffsets[i], self.sectionoffsets[ + i+1] - self.sectionoffsets[i], self.sectionattributes[i]&0xFFFFFF, (self.sectionattributes[i]>>24)&0xFF, self.sectiondescriptions[i])) + print("%3d %3X 0x%07X %s" % + (self.num_sections,self.num_sections, self.sectionoffsets[self.num_sections], self.sectiondescriptions[self.num_sections])) + + def setsectiondescription(self, section, description): + if section < len(self.sectiondescriptions): + self.sectiondescriptions[section] = description + else: + print("Section out of range: %d, description %s" % (section,description)) + + def dumppalmheader(self): + print("Palm Database Header") + print("Database name: " + repr(self.palmheader[:32])) + dbattributes, = struct.unpack_from(b'>H', self.palmheader, 32) + print("Bitfield attributes: 0x%0X" % dbattributes,) + if dbattributes != 0: + print(" (",) + if (dbattributes & 2): + print("Read-only; ",) + if (dbattributes & 4): + print("Dirty AppInfoArea; ",) + if (dbattributes & 8): + print("Needs to be backed up; ",) + if (dbattributes & 16): + print("OK to install over newer; ",) + if (dbattributes & 32): + print("Reset after installation; ",) + if (dbattributes & 64): + print("No copying by PalmPilot beaming; ",) + print(")") + else: + print("") + print("File version: %d" % struct.unpack_from(b'>H', self.palmheader, 34)[0]) + dbcreation, = struct.unpack_from(b'>L', self.palmheader, 36) + print("Creation Date: " + str(datetimefrompalmtime(dbcreation))+ (" (0x%0X)" % dbcreation)) + dbmodification, = struct.unpack_from(b'>L', self.palmheader, 40) + print("Modification Date: " + str(datetimefrompalmtime(dbmodification))+ (" (0x%0X)" % dbmodification)) + dbbackup, = struct.unpack_from(b'>L', self.palmheader, 44) + if dbbackup != 0: + print("Backup Date: " + str(datetimefrompalmtime(dbbackup))+ (" (0x%0X)" % dbbackup)) + print("Modification No.: %d" % struct.unpack_from(b'>L', self.palmheader, 48)[0]) + print("App Info offset: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 52)[0]) + print("Sort Info offset: 0x%0X" % struct.unpack_from(b'>L', 
self.palmheader, 56)[0]) + print("Type/Creator: %s/%s" % (repr(self.palmheader[60:64]), repr(self.palmheader[64:68]))) + print("Unique seed: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 68)[0]) + expectedzero, = struct.unpack_from(b'>L', self.palmheader, 72) + if expectedzero != 0: + print("Should be zero but isn't: %d" % struct.unpack_from(b'>L', self.palmheader, 72)[0]) + print("Number of sections: %d" % struct.unpack_from(b'>H', self.palmheader, 76)[0]) + return + + def loadSection(self, section): + before, after = self.sectionoffsets[section:section+2] + return self.data[before:after] diff --git a/src/epy_reader/tools/KindleUnpack/mobi_split.py b/src/epy_reader/tools/KindleUnpack/mobi_split.py new file mode 100755 index 0000000..3535029 --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_split.py @@ -0,0 +1,438 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +import struct +# note: struct pack, unpack, unpack_from all require bytestring format +# data all the way up to at least python 2.7.5, python 3 okay with bytestring + +from .unipath import pathof + + +# important pdb header offsets +unique_id_seed = 68 +number_of_pdb_records = 76 + +# important palmdoc header offsets +book_length = 4 +book_record_count = 8 +first_pdb_record = 78 + +# important rec0 offsets +length_of_book = 4 +mobi_header_base = 16 +mobi_header_length = 20 +mobi_type = 24 +mobi_version = 36 +first_non_text = 80 +title_offset = 84 +first_resc_record = 108 +first_content_index = 192 +last_content_index = 194 +kf8_fdst_index = 192 # for KF8 mobi headers +fcis_index = 200 +flis_index = 208 +srcs_index = 224 +srcs_count = 228 +primary_index = 244 +datp_index = 256 +huffoff = 112 +hufftbloff = 120 + +def getint(datain,ofs,sz=b'L'): + i, = struct.unpack_from(b'>'+sz,datain,ofs) + return i + +def writeint(datain,ofs,n,len=b'L'): + if len==b'L': + return datain[:ofs]+struct.pack(b'>L',n)+datain[ofs+4:] + else: + return datain[:ofs]+struct.pack(b'>H',n)+datain[ofs+2:] + +def getsecaddr(datain,secno): + nsec = getint(datain,number_of_pdb_records,b'H') + assert 0 <= secno < nsec, 'secno %d out of range (nsec=%d)'%(secno,nsec) + secstart = getint(datain,first_pdb_record+secno*8) + if secno == nsec-1: + secend = len(datain) + else: + secend = getint(datain,first_pdb_record+(secno+1)*8) + return secstart,secend + +def readsection(datain,secno): + secstart, secend = getsecaddr(datain,secno) + return datain[secstart:secend] + +def writesection(datain,secno,secdata): # overwrite, accounting for different length + # dataout = deletesectionrange(datain,secno, secno) + # return insertsection(dataout, secno, secdata) + datalst = [] + nsec = getint(datain,number_of_pdb_records,b'H') + zerosecstart,zerosecend = getsecaddr(datain,0) + secstart,secend = getsecaddr(datain,secno) + dif = len(secdata) - (secend - secstart) + datalst.append(datain[:unique_id_seed]) + datalst.append(struct.pack(b'>L',2*nsec+1)) + datalst.append(datain[unique_id_seed+4:number_of_pdb_records]) + datalst.append(struct.pack(b'>H',nsec)) + newstart = zerosecstart + for i in range(0,secno): + ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) + datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval)) + datalst.append(struct.pack(b'>L', secstart) + struct.pack(b'>L', (2*secno))) + for i in range(secno+1,nsec): + ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) +
ofs = ofs + dif + datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval)) + lpad = newstart - (first_pdb_record + 8*nsec) + if lpad > 0: + datalst.append(b'\0' * lpad) + datalst.append(datain[zerosecstart:secstart]) + datalst.append(secdata) + datalst.append(datain[secend:]) + dataout = b''.join(datalst) + return dataout + +def nullsection(datain,secno): # make it zero-length without deleting it + datalst = [] + nsec = getint(datain,number_of_pdb_records,b'H') + secstart, secend = getsecaddr(datain,secno) + zerosecstart, zerosecend = getsecaddr(datain, 0) + dif = secend-secstart + datalst.append(datain[:first_pdb_record]) + for i in range(0,secno+1): + ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) + datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval)) + for i in range(secno+1, nsec): + ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) + ofs = ofs - dif + datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval)) + lpad = zerosecstart - (first_pdb_record + 8*nsec) + if lpad > 0: + datalst.append(b'\0' * lpad) + datalst.append(datain[zerosecstart: secstart]) + datalst.append(datain[secend:]) + dataout = b''.join(datalst) + return dataout + +def deletesectionrange(datain,firstsec,lastsec): # delete a range of sections + datalst = [] + firstsecstart,firstsecend = getsecaddr(datain,firstsec) + lastsecstart,lastsecend = getsecaddr(datain,lastsec) + zerosecstart, zerosecend = getsecaddr(datain, 0) + dif = lastsecend - firstsecstart + 8*(lastsec-firstsec+1) + nsec = getint(datain,number_of_pdb_records,b'H') + datalst.append(datain[:unique_id_seed]) + datalst.append(struct.pack(b'>L',2*(nsec-(lastsec-firstsec+1))+1)) + datalst.append(datain[unique_id_seed+4:number_of_pdb_records]) + datalst.append(struct.pack(b'>H',nsec-(lastsec-firstsec+1))) + newstart = zerosecstart - 8*(lastsec-firstsec+1) + for i in range(0,firstsec): + ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) + ofs = ofs-8*(lastsec-firstsec+1) + datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval)) + for i in range(lastsec+1,nsec): + ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) + ofs = ofs - dif + flgval = 2*(i-(lastsec-firstsec+1)) + datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval)) + lpad = newstart - (first_pdb_record + 8*(nsec - (lastsec - firstsec + 1))) + if lpad > 0: + datalst.append(b'\0' * lpad) + datalst.append(datain[zerosecstart:firstsecstart]) + datalst.append(datain[lastsecend:]) + dataout = b''.join(datalst) + return dataout + +def insertsection(datain,secno,secdata): # insert a new section + datalst = [] + nsec = getint(datain,number_of_pdb_records,b'H') + # print("inserting secno" , secno, "into" ,nsec, "sections") + secstart,secend = getsecaddr(datain,secno) + zerosecstart,zerosecend = getsecaddr(datain,0) + dif = len(secdata) + datalst.append(datain[:unique_id_seed]) + datalst.append(struct.pack(b'>L',2*(nsec+1)+1)) + datalst.append(datain[unique_id_seed+4:number_of_pdb_records]) + datalst.append(struct.pack(b'>H',nsec+1)) + newstart = zerosecstart + 8 + for i in range(0,secno): + ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) + ofs += 8 + datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval)) + datalst.append(struct.pack(b'>L', secstart + 8) + struct.pack(b'>L', (2*secno))) + for i in range(secno,nsec): + ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) + ofs = ofs + dif + 8 + flgval = 2*(i+1) + 
datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval)) + lpad = newstart - (first_pdb_record + 8*(nsec + 1)) + if lpad > 0: + datalst.append(b'\0' * lpad) + datalst.append(datain[zerosecstart:secstart]) + datalst.append(secdata) + datalst.append(datain[secstart:]) + dataout = b''.join(datalst) + return dataout + + +def insertsectionrange(sectionsource,firstsec,lastsec,sectiontarget,targetsec): # insert a range of sections + # print("inserting secno" , firstsec, "to", lastsec, "into" ,targetsec, "sections") + # dataout = sectiontarget + # for idx in range(lastsec,firstsec-1,-1): + # dataout = insertsection(dataout,targetsec,readsection(sectionsource,idx)) + # return dataout + datalst = [] + nsec = getint(sectiontarget,number_of_pdb_records,b'H') + zerosecstart, zerosecend = getsecaddr(sectiontarget,0) + insstart, nul = getsecaddr(sectiontarget,targetsec) + nins = lastsec - firstsec + 1 + srcstart, nul = getsecaddr(sectionsource,firstsec) + nul, srcend = getsecaddr(sectionsource,lastsec) + newstart = zerosecstart + 8*nins + + datalst.append(sectiontarget[:unique_id_seed]) + datalst.append(struct.pack(b'>L',2*(nsec+nins)+1)) + datalst.append(sectiontarget[unique_id_seed+4:number_of_pdb_records]) + datalst.append(struct.pack(b'>H',nsec+nins)) + for i in range(0,targetsec): + ofs, flgval = struct.unpack_from(b'>2L',sectiontarget,first_pdb_record+i*8) + ofsnew = ofs + 8*nins + flgvalnew = flgval + datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L', flgvalnew)) + # print(ofsnew, flgvalnew, ofs, flgval) + srcstart0, nul = getsecaddr(sectionsource,firstsec) + for i in range(nins): + isrcstart, nul = getsecaddr(sectionsource,firstsec+i) + ofsnew = insstart + (isrcstart-srcstart0) + 8*nins + flgvalnew = 2*(targetsec+i) + datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L', flgvalnew)) + # print(ofsnew, flgvalnew) + dif = srcend - srcstart + for i in range(targetsec,nsec): + ofs, flgval = struct.unpack_from(b'>2L',sectiontarget,first_pdb_record+i*8) + ofsnew = ofs + dif + 8*nins + flgvalnew = 2*(i+nins) + datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L',flgvalnew)) + # print(ofsnew, flgvalnew, ofs, flgval) + lpad = newstart - (first_pdb_record + 8*(nsec + nins)) + if lpad > 0: + datalst.append(b'\0' * lpad) + datalst.append(sectiontarget[zerosecstart:insstart]) + datalst.append(sectionsource[srcstart:srcend]) + datalst.append(sectiontarget[insstart:]) + dataout = b''.join(datalst) + return dataout + +def get_exth_params(rec0): + ebase = mobi_header_base + getint(rec0,mobi_header_length) + elen = getint(rec0,ebase+4) + enum = getint(rec0,ebase+8) + return ebase,elen,enum + +def add_exth(rec0,exth_num,exth_bytes): + ebase,elen,enum = get_exth_params(rec0) + newrecsize = 8+len(exth_bytes) + newrec0 = rec0[0:ebase+4]+struct.pack(b'>L',elen+newrecsize)+struct.pack(b'>L',enum+1)+\ + struct.pack(b'>L',exth_num)+struct.pack(b'>L',newrecsize)+exth_bytes+rec0[ebase+12:] + newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)+newrecsize) + return newrec0 + +def read_exth(rec0,exth_num): + exth_values = [] + ebase,elen,enum = get_exth_params(rec0) + ebase = ebase+12 + while enum>0: + exth_id = getint(rec0,ebase) + if exth_id == exth_num: + # We might have multiple exths, so build a list. 
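As an aside, read_exth(), write_exth() and del_exth() above all walk the same fixed EXTH record layout. A standalone sketch of that traversal, assuming the usual <id:4><size:4><payload:size-8> record format that the code relies on, could be written as a generator:

import struct

def iter_exth(rec0, ebase, enum):
    # Records start 12 bytes past the EXTH header ('EXTH', length, count).
    pos = ebase + 12
    for _ in range(enum):
        exth_id, size = struct.unpack_from('>LL', rec0, pos)
        yield exth_id, rec0[pos + 8 : pos + size]
        pos += size

With a helper like this, collecting every EXTH 121 (KF8 boundary) value reduces to [v for i, v in iter_exth(rec0, ebase, enum) if i == 121], which is what read_exth() does with its while loop.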
+ exth_values.append(rec0[ebase+8:ebase+getint(rec0,ebase+4)]) + enum = enum-1 + ebase = ebase+getint(rec0,ebase+4) + return exth_values + +def write_exth(rec0,exth_num,exth_bytes): + ebase,elen,enum = get_exth_params(rec0) + ebase_idx = ebase+12 + enum_idx = enum + while enum_idx>0: + exth_id = getint(rec0,ebase_idx) + if exth_id == exth_num: + dif = len(exth_bytes)+8-getint(rec0,ebase_idx+4) + newrec0 = rec0 + if dif != 0: + newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)+dif) + return newrec0[:ebase+4]+struct.pack(b'>L',elen+len(exth_bytes)+8-getint(rec0,ebase_idx+4))+\ + struct.pack(b'>L',enum)+rec0[ebase+12:ebase_idx+4]+\ + struct.pack(b'>L',len(exth_bytes)+8)+exth_bytes+\ + rec0[ebase_idx+getint(rec0,ebase_idx+4):] + enum_idx = enum_idx-1 + ebase_idx = ebase_idx+getint(rec0,ebase_idx+4) + return rec0 + +def del_exth(rec0,exth_num): + ebase,elen,enum = get_exth_params(rec0) + ebase_idx = ebase+12 + enum_idx = 0 + while enum_idx < enum: + exth_id = getint(rec0,ebase_idx) + exth_size = getint(rec0,ebase_idx+4) + if exth_id == exth_num: + newrec0 = rec0 + newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)-exth_size) + newrec0 = newrec0[:ebase_idx]+newrec0[ebase_idx+exth_size:] + newrec0 = newrec0[0:ebase+4]+struct.pack(b'>L',elen-exth_size)+struct.pack(b'>L',enum-1)+newrec0[ebase+12:] + return newrec0 + enum_idx += 1 + ebase_idx = ebase_idx+exth_size + return rec0 + + +class mobi_split: + + def __init__(self, infile): + datain = b'' + with open(pathof(infile), 'rb') as f: + datain = f.read() + datain_rec0 = readsection(datain,0) + ver = getint(datain_rec0,mobi_version) + self.combo = (ver!=8) + if not self.combo: + return + exth121 = read_exth(datain_rec0,121) + if len(exth121) == 0: + self.combo = False + return + else: + # only pay attention to first exth121 + # (there should only be one) + datain_kf8, = struct.unpack_from(b'>L',exth121[0],0) + if datain_kf8 == 0xffffffff: + self.combo = False + return + datain_kfrec0 =readsection(datain,datain_kf8) + + # create the standalone mobi7 + num_sec = getint(datain,number_of_pdb_records,b'H') + # remove BOUNDARY up to but not including ELF record + self.result_file7 = deletesectionrange(datain,datain_kf8-1,num_sec-2) + # check if there are SRCS records and delete them + srcs = getint(datain_rec0,srcs_index) + num_srcs = getint(datain_rec0,srcs_count) + if srcs != 0xffffffff and num_srcs > 0: + self.result_file7 = deletesectionrange(self.result_file7,srcs,srcs+num_srcs-1) + datain_rec0 = writeint(datain_rec0,srcs_index,0xffffffff) + datain_rec0 = writeint(datain_rec0,srcs_count,0) + # reset the EXTH 121 KF8 Boundary meta data to 0xffffffff + datain_rec0 = write_exth(datain_rec0,121, struct.pack(b'>L', 0xffffffff)) + # datain_rec0 = del_exth(datain_rec0,121) + # datain_rec0 = del_exth(datain_rec0,534) + # don't remove the EXTH 125 KF8 Count of Resources, seems to be present in mobi6 files as well + # set the EXTH 129 KF8 Masthead / Cover Image string to the null string + datain_rec0 = write_exth(datain_rec0,129, b'') + # don't remove the EXTH 131 KF8 Unidentified Count, seems to be present in mobi6 files as well + + # need to reset flags stored in 0x80-0x83 + # old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050 + # Bit Flags + # 0x1000 = Bit 12 indicates if embedded fonts are used or not + # 0x0800 = means this Header points to *shared* images/resource/fonts ?? + # 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8? 
+ # 0x0040 = exth exists + # 0x0010 = Not sure but this is always set so far + fval, = struct.unpack_from(b'>L',datain_rec0, 0x80) + # need to remove flag 0x0800 for KindlePreviewer 2.8 and unset Bit 12 for embedded fonts + fval = fval & 0x07FF + datain_rec0 = datain_rec0[:0x80] + struct.pack(b'>L',fval) + datain_rec0[0x84:] + + self.result_file7 = writesection(self.result_file7,0,datain_rec0) + + # no need to replace kf8 style fcis with mobi 7 one + # fcis_secnum, = struct.unpack_from(b'>L',datain_rec0, 0xc8) + # if fcis_secnum != 0xffffffff: + # fcis_info = readsection(datain, fcis_secnum) + # text_len, = struct.unpack_from(b'>L', fcis_info, 0x14) + # new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00' + # new_fcis += struct.pack(b'>L',text_len) + # new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00' + # self.result_file7 = writesection(self.result_file7, fcis_secnum, new_fcis) + + firstimage = getint(datain_rec0,first_resc_record) + lastimage = getint(datain_rec0,last_content_index,b'H') + # print("Old First Image, last Image", firstimage,lastimage) + if lastimage == 0xffff: + # find the lowest of the next sections and copy up to that. + ofs_list = [(fcis_index,b'L'),(flis_index,b'L'),(datp_index,b'L'),(hufftbloff, b'L')] + for ofs,sz in ofs_list: + n = getint(datain_rec0,ofs,sz) + # print("n",n) + if n > 0 and n < lastimage: + lastimage = n-1 + print("First Image, last Image", firstimage,lastimage) + + # Try to null out FONT and RES, but leave the (empty) PDB record so image refs remain valid + for i in range(firstimage,lastimage): + imgsec = readsection(self.result_file7,i) + if imgsec[0:4] in [b'RESC',b'FONT']: + self.result_file7 = nullsection(self.result_file7,i) + + # mobi7 finished + + # create standalone mobi8 + self.result_file8 = deletesectionrange(datain,0,datain_kf8-1) + target = getint(datain_kfrec0,first_resc_record) + self.result_file8 = insertsectionrange(datain,firstimage,lastimage,self.result_file8,target) + datain_kfrec0 =readsection(self.result_file8,0) + + # Only keep the correct EXTH 116 StartOffset, KG 2.5 carries over the one from the mobi7 part, which then points at garbage in the mobi8 part, and confuses FW 3.4 + kf8starts = read_exth(datain_kfrec0,116) + # If we have multiple StartOffset, keep only the last one + kf8start_count = len(kf8starts) + while kf8start_count > 1: + kf8start_count -= 1 + datain_kfrec0 = del_exth(datain_kfrec0,116) + + # update the EXTH 125 KF8 Count of Images/Fonts/Resources + datain_kfrec0 = write_exth(datain_kfrec0,125,struct.pack(b'>L',lastimage-firstimage+1)) + + # need to reset flags stored in 0x80-0x83 + # old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050 + # standalone mobi8 with exth: 0x0050 + # Bit Flags + # 0x1000 = Bit 12 indicates if embedded fonts are used or not + # 0x0800 = means this Header points to *shared* images/resource/fonts ?? + # 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8? 
+ # 0x0040 = exth exists + # 0x0010 = Not sure but this is always set so far + fval, = struct.unpack_from('>L',datain_kfrec0, 0x80) + fval = fval & 0x1FFF + fval |= 0x0800 + datain_kfrec0 = datain_kfrec0[:0x80] + struct.pack(b'>L',fval) + datain_kfrec0[0x84:] + + # properly update other index pointers that have been shifted by the insertion of images + ofs_list = [(kf8_fdst_index,b'L'),(fcis_index,b'L'),(flis_index,b'L'),(datp_index,b'L'),(hufftbloff, b'L')] + for ofs,sz in ofs_list: + n = getint(datain_kfrec0,ofs,sz) + if n != 0xffffffff: + datain_kfrec0 = writeint(datain_kfrec0,ofs,n+lastimage-firstimage+1,sz) + self.result_file8 = writesection(self.result_file8,0,datain_kfrec0) + + # no need to replace kf8 style fcis with mobi 7 one + # fcis_secnum, = struct.unpack_from(b'>L',datain_kfrec0, 0xc8) + # if fcis_secnum != 0xffffffff: + # fcis_info = readsection(self.result_file8, fcis_secnum) + # text_len, = struct.unpack_from(b'>L', fcis_info, 0x14) + # new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00' + # new_fcis += struct.pack(b'>L',text_len) + # new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00' + # self.result_file8 = writesection(self.result_file8, fcis_secnum, new_fcis) + + # mobi8 finished + + def getResult8(self): + return self.result_file8 + + def getResult7(self): + return self.result_file7 diff --git a/src/epy_reader/tools/KindleUnpack/mobi_uncompress.py b/src/epy_reader/tools/KindleUnpack/mobi_uncompress.py new file mode 100644 index 0000000..c5fad85 --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_uncompress.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import PY2, bchr, lmap, bstr + +if PY2: + range = xrange + +import struct +# note: struct pack, unpack, unpack_from all require bytestring format +# data all the way up to at least python 2.7.5, python 3 okay with bytestring + + +class unpackException(Exception): + pass + +class UncompressedReader: + + def unpack(self, data): + return data + +class PalmdocReader: + + def unpack(self, i): + o, p = b'', 0 + while p < len(i): + # for python 3 must use slice since i[p] returns int while slice returns character + c = ord(i[p:p+1]) + p += 1 + if (c >= 1 and c <= 8): + o += i[p:p+c] + p += c + elif (c < 128): + o += bchr(c) + elif (c >= 192): + o += b' ' + bchr(c ^ 128) + else: + if p < len(i): + c = (c << 8) | ord(i[p:p+1]) + p += 1 + m = (c >> 3) & 0x07ff + n = (c & 7) + 3 + if (m > n): + o += o[-m:n-m] + else: + for _ in range(n): + # because of completely ass-backwards decision by python mainters for python 3 + # we must use slice for bytes as i[p] returns int while slice returns character + if m == 1: + o += o[-m:] + else: + o += o[-m:-m+1] + return o + +class HuffcdicReader: + q = struct.Struct(b'>Q').unpack_from + + def loadHuff(self, huff): + if huff[0:8] != b'HUFF\x00\x00\x00\x18': + raise unpackException('invalid huff header') + off1, off2 = struct.unpack_from(b'>LL', huff, 8) + + def dict1_unpack(v): + codelen, term, maxcode = v&0x1f, v&0x80, v>>8 + assert codelen != 0 + if codelen <= 8: + assert term + maxcode = ((maxcode + 1) << (32 - codelen)) - 1 + return (codelen, term, maxcode) + self.dict1 = lmap(dict1_unpack, struct.unpack_from(b'>256L', huff, off1)) + + dict2 = struct.unpack_from(b'>64L', huff, off2) + self.mincode, self.maxcode = (), () + for 
codelen, mincode in enumerate((0,) + dict2[0::2]): + self.mincode += (mincode << (32 - codelen), ) + for codelen, maxcode in enumerate((0,) + dict2[1::2]): + self.maxcode += (((maxcode + 1) << (32 - codelen)) - 1, ) + + self.dictionary = [] + + def loadCdic(self, cdic): + if cdic[0:8] != b'CDIC\x00\x00\x00\x10': + raise unpackException('invalid cdic header') + phrases, bits = struct.unpack_from(b'>LL', cdic, 8) + n = min(1<<bits, phrases-len(self.dictionary)) + h = struct.Struct(b'>H').unpack_from + def getslice(off): + blen, = h(cdic, 16+off) + slice = cdic[18+off:18+off+(blen&0x7fff)] + return (slice, blen&0x8000) + self.dictionary += lmap(getslice, struct.unpack_from(bstr('>%dH' % n), cdic, 16)) + + def unpack(self, data): + q = HuffcdicReader.q + + bitsleft = len(data) * 8 + data += b"\x00\x00\x00\x00\x00\x00\x00\x00" + pos = 0 + x, = q(data, pos) + n = 32 + + s = b'' + while True: + if n <= 0: + pos += 4 + x, = q(data, pos) + n += 32 + code = (x >> n) & ((1 << 32) - 1) + + codelen, term, maxcode = self.dict1[code >> 24] + if not term: + while code < self.mincode[codelen]: + codelen += 1 + maxcode = self.maxcode[codelen] + + n -= codelen + bitsleft -= codelen + if bitsleft < 0: + break + + r = (maxcode - code) >> (32 - codelen) + slice, flag = self.dictionary[r] + if not flag: + self.dictionary[r] = None + slice = self.unpack(slice) + self.dictionary[r] = (slice, 1) + s += slice + return s diff --git a/src/epy_reader/tools/KindleUnpack/mobi_utils.py b/src/epy_reader/tools/KindleUnpack/mobi_utils.py new file mode 100644 index 0000000..6791e0d --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_utils.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab +# flake8: noqa + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import PY2, text_type, bchr, bord + +import binascii + +if PY2: + range = xrange + +from itertools import cycle + +def getLanguage(langID, sublangID): + mobilangdict = { + 54 : {0 : 'af'}, # Afrikaans + 28 : {0 : 'sq'}, # Albanian + 1 : {0 : 'ar' , 5 : 'ar-dz' , 15 : 'ar-bh' , 3 : 'ar-eg' , 2 : 'ar-iq', 11 : 'ar-jo' , 13 : 'ar-kw' , 12 : 'ar-lb' , 4: 'ar-ly', + 6 : 'ar-ma' , 8 : 'ar-om' , 16 : 'ar-qa' , 1 : 'ar-sa' , 10 : 'ar-sy' , 7 : 'ar-tn' , 14 : 'ar-ae' , 9 : 'ar-ye'}, + # Arabic, Arabic (Algeria), Arabic (Bahrain), Arabic (Egypt), Arabic + # (Iraq), Arabic (Jordan), Arabic (Kuwait), Arabic (Lebanon), Arabic + # (Libya), Arabic (Morocco), Arabic (Oman), Arabic (Qatar), Arabic + # (Saudi Arabia), Arabic (Syria), Arabic (Tunisia), Arabic (United Arab + # Emirates), Arabic (Yemen) + 43 : {0 : 'hy'}, # Armenian + 77 : {0 : 'as'}, # Assamese + 44 : {0 : 'az'}, # "Azeri (IANA: Azerbaijani) + 45 : {0 : 'eu'}, # Basque + 35 : {0 : 'be'}, # Belarusian + 69 : {0 : 'bn'}, # Bengali + 2 : {0 : 'bg'}, # Bulgarian + 3 : {0 : 'ca'}, # Catalan + 4 : {0 : 'zh' , 3 : 'zh-hk' , 2 : 'zh-cn' , 4 : 'zh-sg' , 1 : 'zh-tw'}, + # Chinese, Chinese (Hong Kong), Chinese (PRC), Chinese (Singapore), Chinese (Taiwan) + 26 : {0 : 'hr', 3 : 'sr'}, # Croatian, Serbian + 5 : {0 : 'cs'}, # Czech + 6 : {0 : 'da'}, # Danish + 19 : {0: 'nl', 1 : 'nl' , 2 : 'nl-be'}, # Dutch / Flemish, Dutch (Belgium) + 9 : {0: 'en', 1 : 'en' , 3 : 'en-au' , 40 : 'en-bz' , 4 : 'en-ca' , 6 : 'en-ie' , 8 : 'en-jm' , 5 : 'en-nz' , 13 : 'en-ph' , + 7 : 'en-za' , 11 : 'en-tt' , 2 : 'en-gb', 1 : 'en-us' , 12 : 'en-zw'}, + # English, English (Australia), English (Belize), English (Canada), + # 
English (Ireland), English (Jamaica), English (New Zealand), English + # (Philippines), English (South Africa), English (Trinidad), English + # (United Kingdom), English (United States), English (Zimbabwe) + 37 : {0 : 'et'}, # Estonian + 56 : {0 : 'fo'}, # Faroese + 41 : {0 : 'fa'}, # Farsi / Persian + 11 : {0 : 'fi'}, # Finnish + 12 : {0 : 'fr', 1 : 'fr' , 2 : 'fr-be' , 3 : 'fr-ca' , 5 : 'fr-lu' , 6 : 'fr-mc' , 4 : 'fr-ch'}, + # French, French (Belgium), French (Canada), French (Luxembourg), French (Monaco), French (Switzerland) + 55 : {0 : 'ka'}, # Georgian + 7 : {0 : 'de', 1 : 'de' , 3 : 'de-at' , 5 : 'de-li' , 4 : 'de-lu' , 2 : 'de-ch'}, + # German, German (Austria), German (Liechtenstein), German (Luxembourg), German (Switzerland) + 8 : {0 : 'el'}, # Greek, Modern (1453-) + 71 : {0 : 'gu'}, # Gujarati + 13 : {0 : 'he'}, # Hebrew (also code 'iw'?) + 57 : {0 : 'hi'}, # Hindi + 14 : {0 : 'hu'}, # Hungarian + 15 : {0 : 'is'}, # Icelandic + 33 : {0 : 'id'}, # Indonesian + 16 : {0 : 'it', 1 : 'it' , 2 : 'it-ch'}, # Italian, Italian (Switzerland) + 17 : {0 : 'ja'}, # Japanese + 75 : {0 : 'kn'}, # Kannada + 63 : {0 : 'kk'}, # Kazakh + 87 : {0 : 'x-kok'}, # Konkani (real language code is 'kok'?) + 18 : {0 : 'ko'}, # Korean + 38 : {0 : 'lv'}, # Latvian + 39 : {0 : 'lt'}, # Lithuanian + 47 : {0 : 'mk'}, # Macedonian + 62 : {0 : 'ms'}, # Malay + 76 : {0 : 'ml'}, # Malayalam + 58 : {0 : 'mt'}, # Maltese + 78 : {0 : 'mr'}, # Marathi + 97 : {0 : 'ne'}, # Nepali + 20 : {0 : 'no'}, # Norwegian + 72 : {0 : 'or'}, # Oriya + 21 : {0 : 'pl'}, # Polish + 22 : {0 : 'pt', 2 : 'pt' , 1 : 'pt-br'}, # Portuguese, Portuguese (Brazil) + 70 : {0 : 'pa'}, # Punjabi + 23 : {0 : 'rm'}, # "Rhaeto-Romanic" (IANA: Romansh) + 24 : {0 : 'ro'}, # Romanian + 25 : {0 : 'ru'}, # Russian + 59 : {0 : 'sz'}, # "Sami (Lappish)" (not an IANA language code) + # IANA code for "Northern Sami" is 'se' + # 'SZ' is the IANA region code for Swaziland + 79 : {0 : 'sa'}, # Sanskrit + 27 : {0 : 'sk'}, # Slovak + 36 : {0 : 'sl'}, # Slovenian + 46 : {0 : 'sb'}, # "Sorbian" (not an IANA language code) + # 'SB' is IANA region code for 'Solomon Islands' + # Lower Sorbian = 'dsb' + # Upper Sorbian = 'hsb' + # Sorbian Languages = 'wen' + 10 : {0 : 'es' , 4 : 'es' , 44 : 'es-ar' , 64 : 'es-bo' , 52 : 'es-cl' , 36 : 'es-co' , 20 : 'es-cr' , 28 : 'es-do' , + 48 : 'es-ec' , 68 : 'es-sv' , 16 : 'es-gt' , 72 : 'es-hn' , 8 : 'es-mx' , 76 : 'es-ni' , 24 : 'es-pa' , + 60 : 'es-py' , 40 : 'es-pe' , 80 : 'es-pr' , 56 : 'es-uy' , 32 : 'es-ve'}, + # Spanish, Spanish (Mobipocket bug?), Spanish (Argentina), Spanish + # (Bolivia), Spanish (Chile), Spanish (Colombia), Spanish (Costa Rica), + # Spanish (Dominican Republic), Spanish (Ecuador), Spanish (El + # Salvador), Spanish (Guatemala), Spanish (Honduras), Spanish (Mexico), + # Spanish (Nicaragua), Spanish (Panama), Spanish (Paraguay), Spanish + # (Peru), Spanish (Puerto Rico), Spanish (Uruguay), Spanish (Venezuela) + 48 : {0 : 'sx'}, # "Sutu" (not an IANA language code) + # "Sutu" is another name for "Southern Sotho"? 
+ # IANA code for "Southern Sotho" is 'st' + 65 : {0 : 'sw'}, # Swahili + 29 : {0 : 'sv' , 1 : 'sv' , 8 : 'sv-fi'}, # Swedish, Swedish (Finland) + 73 : {0 : 'ta'}, # Tamil + 68 : {0 : 'tt'}, # Tatar + 74 : {0 : 'te'}, # Telugu + 30 : {0 : 'th'}, # Thai + 49 : {0 : 'ts'}, # Tsonga + 50 : {0 : 'tn'}, # Tswana + 31 : {0 : 'tr'}, # Turkish + 34 : {0 : 'uk'}, # Ukrainian + 32 : {0 : 'ur'}, # Urdu + 67 : {0 : 'uz', 2 : 'uz'}, # Uzbek + 42 : {0 : 'vi'}, # Vietnamese + 52 : {0 : 'xh'}, # Xhosa + 53 : {0 : 'zu'}, # Zulu + } + lang = "en" + if langID in mobilangdict: + subdict = mobilangdict[langID] + lang = subdict[0] + if sublangID in subdict: + lang = subdict[sublangID] + return lang + + +def toHex(byteList): + return binascii.hexlify(byteList) + +# returns base32 bytestring +def toBase32(value, npad=4): + digits = b'0123456789ABCDEFGHIJKLMNOPQRSTUV' + num_string=b'' + current = value + while current != 0: + next, remainder = divmod(current, 32) + rem_string = digits[remainder:remainder+1] + num_string = rem_string + num_string + current=next + if num_string == b'': + num_string = b'0' + pad = npad - len(num_string) + if pad > 0: + num_string = b'0' * pad + num_string + return num_string + + +# converts base32 string to value +def fromBase32(str_num): + if isinstance(str_num, text_type): + str_num = str_num.encode('latin-1') + scalelst = [1,32,1024,32768,1048576,33554432,1073741824,34359738368] + value = 0 + j = 0 + n = len(str_num) + scale = 0 + for i in range(n): + c = str_num[n-i-1:n-i] + if c in b'0123456789': + v = ord(c) - ord(b'0') + else: + v = ord(c) - ord(b'A') + 10 + if j < len(scalelst): + scale = scalelst[j] + else: + scale = scale * 32 + j += 1 + if v != 0: + value = value + (v * scale) + return value + + +# note: if decode a bytestring using 'latin-1' (or any other 0-255 encoding) +# in place of ascii you will get a byte to half-word or integer +# one to one mapping of values from 0 - 255 + +def mangle_fonts(encryption_key, data): + if isinstance(encryption_key, text_type): + encryption_key = encryption_key.encode('latin-1') + crypt = data[:1024] + key = cycle(iter(map(bord, encryption_key))) + # encrypt = ''.join([chr(ord(x)^key.next()) for x in crypt]) + encrypt = b''.join([bchr(bord(x)^next(key)) for x in crypt]) + return encrypt + data[1024:] diff --git a/src/epy_reader/tools/KindleUnpack/mobiml2xhtml.py b/src/epy_reader/tools/KindleUnpack/mobiml2xhtml.py new file mode 100755 index 0000000..94fc671 --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobiml2xhtml.py @@ -0,0 +1,527 @@ +#! 
/usr/bin/python +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + + +# this program works in concert with the output from KindleUnpack + +''' +Convert from Mobi ML to XHTML +''' + +from __future__ import division, absolute_import, print_function + +import os +import sys +import re + +SPECIAL_HANDLING_TAGS = { + '?xml' : ('xmlheader', -1), + '!--' : ('comment', -3), + '!DOCTYPE' : ('doctype', -1), +} + +SPECIAL_HANDLING_TYPES = ['xmlheader', 'doctype', 'comment'] + +SELF_CLOSING_TAGS = ['br' , 'hr', 'input', 'img', 'image', 'meta', 'spacer', 'link', 'frame', 'base', 'col', 'reference'] + +class MobiMLConverter(object): + + PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE) + IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex') + + def __init__(self, filename): + self.base_css_rules = 'blockquote { margin: 0em 0em 0em 1.25em }\n' + self.base_css_rules += 'p { margin: 0em }\n' + self.base_css_rules += '.bold { font-weight: bold }\n' + self.base_css_rules += '.italic { font-style: italic }\n' + self.base_css_rules += '.mbp_pagebreak { page-break-after: always; margin: 0; display: block }\n' + self.tag_css_rules = {} + self.tag_css_rule_cnt = 0 + self.path = [] + self.filename = filename + self.wipml = open(self.filename, 'r').read() + self.pos = 0 + self.opfname = self.filename.rsplit('.',1)[0] + '.opf' + self.opos = 0 + self.meta = '' + self.cssname = os.path.join(os.path.dirname(self.filename),'styles.css') + self.current_font_size = 3 + self.font_history = [] + + def cleanup_html(self): + self.wipml = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.wipml) + self.wipml = self.wipml.replace('\r\n', '\n') + self.wipml = self.wipml.replace('> <', '>\n<') + self.wipml = self.wipml.replace('<mbp: ', '<mbp:') + # self.wipml = re.sub(r'<?xml[^>]*>', '', self.wipml) + self.wipml = self.wipml.replace('<br></br>','<br/>') + + def replace_page_breaks(self): + self.wipml = self.PAGE_BREAK_PAT.sub( + '<div class="mbp_pagebreak" />', + self.wipml) + + # parse leading text of ml and tag + def parseml(self): + p = self.pos + if p >= len(self.wipml): + return None + if self.wipml[p] != '<': + res = self.wipml.find('<',p) + if res == -1 : + res = len(self.wipml) + self.pos = res + return self.wipml[p:res], None + # handle comment as a special case to deal with multi-line comments + if self.wipml[p:p+4] == '<!--': + te = self.wipml.find('-->',p+1) + if te != -1: + te = te+2 + else : + te = self.wipml.find('>',p+1) + ntb = self.wipml.find('<',p+1) + if ntb != -1 and ntb < te: + self.pos = ntb + return self.wipml[p:ntb], None + self.pos = te + 1 + return None, self.wipml[p:te+1] + + # parses string version of tag to identify its name, + # its type 'begin', 'end' or 'single', + # plus build a hashtable of its attributes + # code is written to handle the possiblity of very poor formating + def parsetag(self, s): + p = 1 + # get the tag name + tname = None + ttype = None + tattr = {} + while s[p:p+1] == ' ' : + p += 1 + if s[p:p+1] == '/': + ttype = 'end' + p += 1 + while s[p:p+1] == ' ' : + p += 1 + b = p + while s[p:p+1] not in ('>', '/', ' ', '"', "'", "\r", "\n") : + p += 1 + tname=s[b:p].lower() + if tname == '!doctype': + tname = '!DOCTYPE' + # special cases + if tname in SPECIAL_HANDLING_TAGS: + ttype, backstep = SPECIAL_HANDLING_TAGS[tname] + tattr['special'] = s[p:backstep] + if ttype is None: + # parse any attributes + while s.find('=',p) != -1 : + while s[p:p+1] == ' ' : + p += 1 + b = p + while s[p:p+1] != '=' : + p += 1 + aname = s[b:p].lower() + 
aname = aname.rstrip(' ') + p += 1 + while s[p:p+1] == ' ' : + p += 1 + if s[p:p+1] in ('"', "'") : + p = p + 1 + b = p + while s[p:p+1] not in ('"', "'") : + p += 1 + val = s[b:p] + p += 1 + else : + b = p + while s[p:p+1] not in ('>', '/', ' ') : + p += 1 + val = s[b:p] + tattr[aname] = val + # label beginning and single tags + if ttype is None: + ttype = 'begin' + if s.find(' /',p) >= 0: + ttype = 'single_ext' + elif s.find('/',p) >= 0: + ttype = 'single' + return ttype, tname, tattr + + # main routine to convert from mobi markup language to html + def processml(self): + + # are these really needed + html_done = False + head_done = False + body_done = False + + skip = False + + htmlstr = '' + self.replace_page_breaks() + self.cleanup_html() + + # now parse the cleaned up ml into standard xhtml + while True: + + r = self.parseml() + if not r: + break + + text, tag = r + + if text: + if not skip: + htmlstr += text + + if tag: + ttype, tname, tattr = self.parsetag(tag) + + # If we run into a DTD or xml declarations inside the body ... bail. + if tname in SPECIAL_HANDLING_TAGS and tname != 'comment' and body_done: + htmlstr += '\n</body></html>' + break + + # make sure self-closing tags actually self-close + if ttype == 'begin' and tname in SELF_CLOSING_TAGS: + ttype = 'single' + + # make sure any end tags of self-closing tags are discarded + if ttype == 'end' and tname in SELF_CLOSING_TAGS: + continue + + # remove embedded guide and refernces from old mobis + if tname in ('guide', 'ncx', 'reference') and ttype in ('begin', 'single', 'single_ext'): + tname = 'removeme:{0}'.format(tname) + tattr = None + if tname in ('guide', 'ncx', 'reference', 'font', 'span') and ttype == 'end': + if self.path[-1] == 'removeme:{0}'.format(tname): + tname = 'removeme:{0}'.format(tname) + tattr = None + + # Get rid of font tags that only have a color attribute. + if tname == 'font' and ttype in ('begin', 'single', 'single_ext'): + if 'color' in tattr and len(tattr) == 1: + tname = 'removeme:{0}'.format(tname) + tattr = None + + # Get rid of empty spans in the markup. 
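The tag-stream manipulations here all operate on the (ttype, tname, tattr) triples that parsetag() produces. A condensed, self-contained sketch (not the converter's code verbatim) of the two normalizations just applied, under the assumption that tags renamed to 'removeme:...' arrive as balanced begin/end pairs:

SELF_CLOSING = ('br', 'hr', 'img', 'meta', 'link', 'base', 'col')

def normalize(tags):
    out, skip = [], False
    for ttype, tname, tattr in tags:
        if tname in SELF_CLOSING:
            if ttype == 'end':
                continue          # discard stray end tags of void elements
            ttype = 'single'      # force void elements to self-close
        if tname.startswith('removeme:'):
            skip = (ttype == 'begin')   # drop the pair and everything between
            continue
        if not skip:
            out.append((ttype, tname, tattr))
    return out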
+ if tname == 'span' and ttype in ('begin', 'single', 'single_ext') and not len(tattr): + tname = 'removeme:{0}'.format(tname) + + # need to handle fonts outside of the normal methods + # so fonts tags won't be added to the self.path since we keep track + # of font tags separately with self.font_history + if tname == 'font' and ttype == 'begin': + # check for nested font start tags + if len(self.font_history) > 0 : + # inject a font end tag + taginfo = ('end', 'font', None) + htmlstr += self.processtag(taginfo) + self.font_history.append((ttype, tname, tattr)) + # handle the current font start tag + taginfo = (ttype, tname, tattr) + htmlstr += self.processtag(taginfo) + continue + + # check for nested font tags and unnest them + if tname == 'font' and ttype == 'end': + self.font_history.pop() + # handle this font end tag + taginfo = ('end', 'font', None) + htmlstr += self.processtag(taginfo) + # check if we were nested + if len(self.font_history) > 0: + # inject a copy of the most recent font start tag from history + taginfo = self.font_history[-1] + htmlstr += self.processtag(taginfo) + continue + + # keep track of nesting path + if ttype == 'begin': + self.path.append(tname) + elif ttype == 'end': + if tname != self.path[-1]: + print('improper nesting: ', self.path, tname, ttype) + if tname not in self.path: + # handle case of end tag with no beginning by injecting empty begin tag + taginfo = ('begin', tname, None) + htmlstr += self.processtag(taginfo) + print(" - fixed by injecting empty start tag ", tname) + self.path.append(tname) + elif len(self.path) > 1 and tname == self.path[-2]: + # handle case of dangling missing end + taginfo = ('end', self.path[-1], None) + htmlstr += self.processtag(taginfo) + print(" - fixed by injecting end tag ", self.path[-1]) + self.path.pop() + self.path.pop() + + if tname == 'removeme:{0}'.format(tname): + if ttype in ('begin', 'single', 'single_ext'): + skip = True + else: + skip = False + else: + taginfo = (ttype, tname, tattr) + htmlstr += self.processtag(taginfo) + + # handle potential issue of multiple html, head, and body sections + if tname == 'html' and ttype == 'begin' and not html_done: + htmlstr += '\n' + html_done = True + + if tname == 'head' and ttype == 'begin' and not head_done: + htmlstr += '\n' + # also add in metadata and style link tags + htmlstr += self.meta + htmlstr += '<link href="styles.css" rel="stylesheet" type="text/css" />\n' + head_done = True + + if tname == 'body' and ttype == 'begin' and not body_done: + htmlstr += '\n' + body_done = True + + # handle issue of possibly missing html, head, and body tags + # I have not seen this but the original did something like this so ... 
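The improper-nesting repair above covers the two cases seen in practice: an end tag with no matching begin, and a dangling child one level up. A more general form of the same stack-based idea, written here only to make the invariant explicit (this is a hypothetical generalization, not what the converter does), would close every intervening tag:

def close_tag(path, tname):
    # Return the taginfo tuples to emit for </tname>, repairing the
    # open-tag stack `path` along the way.
    out = []
    if tname not in path:
        out.append(('begin', tname, None))   # end with no begin: fabricate one
        path.append(tname)
    while path[-1] != tname:
        out.append(('end', path[-1], None))  # close dangling children first
        path.pop()
    path.pop()
    out.append(('end', tname, None))
    return out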
+ if not body_done: + htmlstr = '<body>\n' + htmlstr + '</body>\n' + if not head_done: + headstr = '<head>\n' + headstr += self.meta + headstr += '<link href="styles.css" rel="stylesheet" type="text/css" />\n' + headstr += '</head>\n' + htmlstr = headstr + htmlstr + if not html_done: + htmlstr = '<html>\n' + htmlstr + '</html>\n' + + # finally add DOCTYPE info + htmlstr = '<?xml version="1.0"?>\n<!DOCTYPE HTML PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n' + htmlstr + + css = self.base_css_rules + for cls, rule in self.tag_css_rules.items(): + css += '.%s { %s }\n' % (cls, rule) + + return (htmlstr, css, self.cssname) + + def ensure_unit(self, raw, unit='px'): + if re.search(r'\d+$', raw) is not None: + raw += unit + return raw + + # flatten possibly modified tag back to string + def taginfo_tostring(self, taginfo): + (ttype, tname, tattr) = taginfo + if ttype is None or tname is None: + return '' + if ttype == 'end': + return '</%s>' % tname + if ttype in SPECIAL_HANDLING_TYPES and tattr is not None and 'special' in tattr: + info = tattr['special'] + if ttype == 'comment': + return '<%s %s-->' % (tname, info) + else: + return '<%s %s>' % (tname, info) + res = [] + res.append('<%s' % tname) + if tattr is not None: + for key in tattr: + res.append(' %s="%s"' % (key, tattr[key])) + if ttype == 'single': + res.append('/>') + elif ttype == 'single_ext': + res.append(' />') + else : + res.append('>') + return "".join(res) + + # routines to convert from mobi ml tags atributes to xhtml attributes and styles + def processtag(self, taginfo): + # Converting mobi font sizes to numerics + size_map = { + 'xx-small': '1', + 'x-small': '2', + 'small': '3', + 'medium': '4', + 'large': '5', + 'x-large': '6', + 'xx-large': '7', + } + + size_to_em_map = { + '1': '.65em', + '2': '.75em', + '3': '1em', + '4': '1.125em', + '5': '1.25em', + '6': '1.5em', + '7': '2em', + } + + # current tag to work on + (ttype, tname, tattr) = taginfo + if not tattr: + tattr = {} + + styles = [] + + if tname is None or tname.startswith('removeme'): + return '' + + # have not seen an example of this yet so keep it here to be safe + # until this is better understood + if tname in ('country-region', 'place', 'placetype', 'placename', + 'state', 'city', 'street', 'address', 'content'): + tname = 'div' if tname == 'content' else 'span' + for key in tattr: + tattr.pop(key) + + # handle general case of style, height, width, bgcolor in any tag + if 'style' in tattr: + style = tattr.pop('style').strip() + if style: + styles.append(style) + + if 'align' in tattr: + align = tattr.pop('align').strip() + if align: + if tname in ('table', 'td', 'tr'): + pass + else: + styles.append('text-align: %s' % align) + + if 'height' in tattr: + height = tattr.pop('height').strip() + if height and '<' not in height and '>' not in height and re.search(r'\d+', height): + if tname in ('table', 'td', 'tr'): + pass + elif tname == 'img': + tattr['height'] = height + else: + styles.append('margin-top: %s' % self.ensure_unit(height)) + + if 'width' in tattr: + width = tattr.pop('width').strip() + if width and re.search(r'\d+', width): + if tname in ('table', 'td', 'tr'): + pass + elif tname == 'img': + tattr['width'] = width + else: + styles.append('text-indent: %s' % self.ensure_unit(width)) + if width.startswith('-'): + styles.append('margin-left: %s' % self.ensure_unit(width[1:])) + + if 'bgcolor' in tattr: + # no proprietary html allowed + if tname == 'div': + del tattr['bgcolor'] + + elif tname == 'font': + # Change 
font tags to span tags + tname = 'span' + if ttype in ('begin', 'single', 'single_ext'): + # move the face attribute to css font-family + if 'face' in tattr: + face = tattr.pop('face').strip() + styles.append('font-family: "%s"' % face) + + # Monitor the constantly changing font sizes, change them to ems and move + # them to css. The following will work for 'flat' font tags, but nested font tags + # will cause things to go wonky. Need to revert to the parent font tag's size + # when a closing tag is encountered. + if 'size' in tattr: + sz = tattr.pop('size').strip().lower() + try: + float(sz) + except ValueError: + if sz in size_map: + sz = size_map[sz] + else: + if sz.startswith('-') or sz.startswith('+'): + sz = self.current_font_size + float(sz) + if sz > 7: + sz = 7 + elif sz < 1: + sz = 1 + sz = str(int(sz)) + styles.append('font-size: %s' % size_to_em_map[sz]) + self.current_font_size = int(sz) + + elif tname == 'img': + for attr in ('width', 'height'): + if attr in tattr: + val = tattr[attr] + if val.lower().endswith('em'): + try: + nval = float(val[:-2]) + nval *= 16 * (168.451/72) # Assume this was set using the Kindle profile + tattr[attr] = "%dpx"%int(nval) + except: + del tattr[attr] + elif val.lower().endswith('%'): + del tattr[attr] + + # convert the anchor tags + if 'filepos-id' in tattr: + tattr['id'] = tattr.pop('filepos-id') + if 'name' in tattr and tattr['name'] != tattr['id']: + tattr['name'] = tattr['id'] + + if 'filepos' in tattr: + filepos = tattr.pop('filepos') + try: + tattr['href'] = "#filepos%d" % int(filepos) + except ValueError: + pass + + if styles: + ncls = None + rule = '; '.join(styles) + for sel, srule in self.tag_css_rules.items(): + if srule == rule: + ncls = sel + break + if ncls is None: + self.tag_css_rule_cnt += 1 + ncls = 'rule_%d' % self.tag_css_rule_cnt + self.tag_css_rules[ncls] = rule + cls = tattr.get('class', '') + cls = cls + (' ' if cls else '') + ncls + tattr['class'] = cls + + # convert updated tag back to string representation + if len(tattr) == 0: + tattr = None + taginfo = (ttype, tname, tattr) + return self.taginfo_tostring(taginfo) + +''' main only left in for testing outside of plugin ''' + +def main(argv=sys.argv): + if len(argv) != 2: + return 1 + else: + infile = argv[1] + + try: + print('Converting Mobi Markup Language to XHTML') + mlc = MobiMLConverter(infile) + print('Processing ...') + htmlstr, css, cssname = mlc.processml() + outname = infile.rsplit('.',1)[0] + '_converted.html' + open(outname, 'w').write(htmlstr) + open(cssname, 'w').write(css) + print('Completed') + print('XHTML version of book can be found at: ' + outname) + + except ValueError as e: + print("Error: %s" % e) + return 1 + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/epy_reader/tools/KindleUnpack/unipath.py b/src/epy_reader/tools/KindleUnpack/unipath.py new file mode 100755 index 0000000..2416279 --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/unipath.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +# Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this list of +# conditions and the following disclaimer. +# +# 2. 
diff --git a/src/epy_reader/tools/KindleUnpack/unipath.py b/src/epy_reader/tools/KindleUnpack/unipath.py
new file mode 100755
index 0000000..2416279
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/unipath.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+# Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without modification,
+# are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this list of
+#    conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice, this list
+#    of conditions and the following disclaimer in the documentation and/or other materials
+#    provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
+# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+from .compatibility_utils import PY2, text_type, binary_type
+
+import sys
+import os
+
+# utility routines to convert all paths to full unicode
+
+# Under Python 2, if given a bytestring, try to convert it to unicode using sys.getfilesystemencoding()
+# Under Python 3, if given bytes, decode to str using the filesystem encoding
+
+# Mac OS X and Windows will happily support full unicode paths
+# Linux can support full unicode paths but allows arbitrary byte paths which may be inconsistent with unicode
+
+fsencoding = sys.getfilesystemencoding()
+
+def pathof(s, enc=fsencoding):
+    if s is None:
+        return None
+    if isinstance(s, text_type):
+        return s
+    if isinstance(s, binary_type):
+        try:
+            return s.decode(enc)
+        except UnicodeDecodeError:
+            pass
+    return s
+
+def exists(s):
+    return os.path.exists(pathof(s))
+
+def isfile(s):
+    return os.path.isfile(pathof(s))
+
+def isdir(s):
+    return os.path.isdir(pathof(s))
+
+def mkdir(s):
+    return os.mkdir(pathof(s))
+
+def listdir(s):
+    rv = []
+    for file in os.listdir(pathof(s)):
+        rv.append(pathof(file))
+    return rv
+
+def getcwd():
+    if PY2:
+        return os.getcwdu()
+    return os.getcwd()
+
+def walk(top):
+    top = pathof(top)
+    rv = []
+    for base, dnames, names in os.walk(top):
+        base = pathof(base)
+        for name in names:
+            name = pathof(name)
+            rv.append(relpath(os.path.join(base, name), top))
+    return rv
+
+def relpath(path, start=None):
+    return os.path.relpath(pathof(path), pathof(start))
+
+def abspath(path):
+    return os.path.abspath(pathof(path))
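pathof normalizes str-or-bytes paths so the rest of KindleUnpack can assume text. On Python 3 alone, the standard library covers the same ground directly; a sketch of the modern equivalent, shown for comparison only (pathof_py3 is a hypothetical name, not part of this patch):

    import os

    def pathof_py3(s):
        """Coerce a str-or-bytes path to str using the filesystem encoding."""
        if s is None:
            return None
        # os.fsdecode applies sys.getfilesystemencoding() with the
        # surrogateescape handler, so undecodable bytes round-trip
        # instead of raising like bare .decode() does.
        return os.fsdecode(s)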
""" + +import os + +import re +# note: re requites the pattern to be the exact same type as the data to be searched in python3 +# but u"" is not allowed for the pattern itself only b"" + +import zipfile +import binascii +from .mobi_utils import mangle_fonts + +class unpackException(Exception): + pass + +class ZipInfo(zipfile.ZipInfo): + + def __init__(self, *args, **kwargs): + if 'compress_type' in kwargs: + compress_type = kwargs.pop('compress_type') + super(ZipInfo, self).__init__(*args, **kwargs) + self.compress_type = compress_type + +class fileNames: + + def __init__(self, infile, outdir): + self.infile = infile + self.outdir = outdir + if not unipath.exists(self.outdir): + unipath.mkdir(self.outdir) + self.mobi7dir = os.path.join(self.outdir,'mobi7') + if not unipath.exists(self.mobi7dir): + unipath.mkdir(self.mobi7dir) + self.imgdir = os.path.join(self.mobi7dir, 'Images') + if not unipath.exists(self.imgdir): + unipath.mkdir(self.imgdir) + self.hdimgdir = os.path.join(self.outdir,'HDImages') + if not unipath.exists(self.hdimgdir): + unipath.mkdir(self.hdimgdir) + self.outbase = os.path.join(self.outdir, os.path.splitext(os.path.split(infile)[1])[0]) + + def getInputFileBasename(self): + return os.path.splitext(os.path.basename(self.infile))[0] + + def makeK8Struct(self): + self.k8dir = os.path.join(self.outdir,'mobi8') + if not unipath.exists(self.k8dir): + unipath.mkdir(self.k8dir) + self.k8metainf = os.path.join(self.k8dir,'META-INF') + if not unipath.exists(self.k8metainf): + unipath.mkdir(self.k8metainf) + self.k8oebps = os.path.join(self.k8dir,'OEBPS') + if not unipath.exists(self.k8oebps): + unipath.mkdir(self.k8oebps) + self.k8images = os.path.join(self.k8oebps,'Images') + if not unipath.exists(self.k8images): + unipath.mkdir(self.k8images) + self.k8fonts = os.path.join(self.k8oebps,'Fonts') + if not unipath.exists(self.k8fonts): + unipath.mkdir(self.k8fonts) + self.k8styles = os.path.join(self.k8oebps,'Styles') + if not unipath.exists(self.k8styles): + unipath.mkdir(self.k8styles) + self.k8text = os.path.join(self.k8oebps,'Text') + if not unipath.exists(self.k8text): + unipath.mkdir(self.k8text) + + # recursive zip creation support routine + def zipUpDir(self, myzip, tdir, localname): + currentdir = tdir + if localname != "": + currentdir = os.path.join(currentdir,localname) + list = unipath.listdir(currentdir) + for file in list: + afilename = file + localfilePath = os.path.join(localname, afilename) + realfilePath = os.path.join(currentdir,file) + if unipath.isfile(realfilePath): + myzip.write(pathof(realfilePath), pathof(localfilePath), zipfile.ZIP_DEFLATED) + elif unipath.isdir(realfilePath): + self.zipUpDir(myzip, tdir, localfilePath) + + def makeEPUB(self, usedmap, obfuscate_data, uid): + bname = os.path.join(self.k8dir, self.getInputFileBasename() + '.epub') + # Create an encryption key for Adobe font obfuscation + # based on the epub's uid + if isinstance(uid,text_type): + uid = uid.encode('ascii') + if obfuscate_data: + key = re.sub(br'[^a-fA-F0-9]', b'', uid) + key = binascii.unhexlify((key + key)[:32]) + + # copy over all images and fonts that are actually used in the ebook + # and remove all font files from mobi7 since not supported + imgnames = unipath.listdir(self.imgdir) + for name in imgnames: + if usedmap.get(name,'not used') == 'used': + filein = os.path.join(self.imgdir,name) + if name.endswith(".ttf"): + fileout = os.path.join(self.k8fonts,name) + elif name.endswith(".otf"): + fileout = os.path.join(self.k8fonts,name) + elif name.endswith(".failed"): + 
+    def makeEPUB(self, usedmap, obfuscate_data, uid):
+        bname = os.path.join(self.k8dir, self.getInputFileBasename() + '.epub')
+        # Create an encryption key for Adobe font obfuscation
+        # based on the epub's uid
+        if isinstance(uid, text_type):
+            uid = uid.encode('ascii')
+        if obfuscate_data:
+            key = re.sub(br'[^a-fA-F0-9]', b'', uid)
+            key = binascii.unhexlify((key + key)[:32])
+
+        # copy over all images and fonts that are actually used in the ebook
+        # and remove all font files from mobi7 since they are not supported there
+        imgnames = unipath.listdir(self.imgdir)
+        for name in imgnames:
+            if usedmap.get(name, 'not used') == 'used':
+                filein = os.path.join(self.imgdir, name)
+                if name.endswith(".ttf"):
+                    fileout = os.path.join(self.k8fonts, name)
+                elif name.endswith(".otf"):
+                    fileout = os.path.join(self.k8fonts, name)
+                elif name.endswith(".failed"):
+                    fileout = os.path.join(self.k8fonts, name)
+                else:
+                    fileout = os.path.join(self.k8images, name)
+                data = b''
+                with open(pathof(filein), 'rb') as f:
+                    data = f.read()
+                if obfuscate_data:
+                    if name in obfuscate_data:
+                        data = mangle_fonts(key, data)
+                with open(pathof(fileout), 'wb') as f:
+                    f.write(data)
+                if name.endswith(".ttf") or name.endswith(".otf"):
+                    os.remove(pathof(filein))
+
+        # opf file name is hard coded to "content.opf"
+        container = '<?xml version="1.0" encoding="UTF-8"?>\n'
+        container += '<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">\n'
+        container += '    <rootfiles>\n'
+        container += '<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>'
+        container += '    </rootfiles>\n</container>\n'
+        fileout = os.path.join(self.k8metainf, 'container.xml')
+        with open(pathof(fileout), 'wb') as f:
+            f.write(container.encode('utf-8'))
+
+        if obfuscate_data:
+            encryption = '<encryption xmlns="urn:oasis:names:tc:opendocument:xmlns:container" \
+xmlns:enc="http://www.w3.org/2001/04/xmlenc#" xmlns:deenc="http://ns.adobe.com/digitaleditions/enc">\n'
+            for font in obfuscate_data:
+                encryption += '  <enc:EncryptedData>\n'
+                encryption += '    <enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>\n'
+                encryption += '    <enc:CipherData>\n'
+                encryption += '      <enc:CipherReference URI="OEBPS/Fonts/' + font + '"/>\n'
+                encryption += '    </enc:CipherData>\n'
+                encryption += '  </enc:EncryptedData>\n'
+            encryption += '</encryption>\n'
+            fileout = os.path.join(self.k8metainf, 'encryption.xml')
+            with open(pathof(fileout), 'wb') as f:
+                f.write(encryption.encode('utf-8'))
+
+        # ready to build the epub
+        self.outzip = zipfile.ZipFile(pathof(bname), 'w')
+
+        # add the mimetype file uncompressed
+        mimetype = b'application/epub+zip'
+        fileout = os.path.join(self.k8dir, 'mimetype')
+        with open(pathof(fileout), 'wb') as f:
+            f.write(mimetype)
+        nzinfo = ZipInfo('mimetype', compress_type=zipfile.ZIP_STORED)
+        nzinfo.external_attr = 0o600 << 16  # make this a normal file
+        self.outzip.writestr(nzinfo, mimetype)
+        self.zipUpDir(self.outzip, self.k8dir, 'META-INF')
+        self.zipUpDir(self.outzip, self.k8dir, 'OEBPS')
+        self.outzip.close()
diff --git a/src/epy_reader/tools/__init__.py b/src/epy_reader/tools/__init__.py
new file mode 100644
index 0000000..d97cba1
--- /dev/null
+++ b/src/epy_reader/tools/__init__.py
@@ -0,0 +1,3 @@
+__all__ = ["unpack_kindle_book"]
+
+from epy_reader.tools.KindleUnpack.kindleunpack import unpackBook as unpack_kindle_book
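makeEPUB encodes the one packaging rule EPUB readers check first: mimetype must be the archive's first entry and must be stored uncompressed, so the media type is readable at a fixed byte offset. The ZipInfo subclass above exists because the stock zipfile.ZipInfo constructor does not accept a compress_type keyword. A minimal standalone sketch of that shell (hypothetical helper; content strings elided):

    import zipfile

    def make_epub_shell(path):
        with zipfile.ZipFile(path, 'w') as z:
            # First entry, uncompressed: zipfile.ZipInfo defaults to
            # ZIP_STORED, so the raw bytes "mimetype" + media type sit
            # at a fixed offset near the start of the archive.
            z.writestr(zipfile.ZipInfo('mimetype'), b'application/epub+zip')
            z.writestr('META-INF/container.xml', b'<?xml version="1.0"?>...')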
diff --git a/src/epy_reader/utils.py b/src/epy_reader/utils.py
new file mode 100644
index 0000000..5bba7f6
--- /dev/null
+++ b/src/epy_reader/utils.py
@@ -0,0 +1,377 @@
+import curses
+import os
+import re
+import sys
+import textwrap
+from functools import wraps
+from typing import List, Mapping, Optional, Sequence, Tuple, Union
+
+from epy_reader.ebooks import URL, Azw, Ebook, Epub, FictionBook, Mobi
+from epy_reader.lib import is_url, tuple_subtract
+from epy_reader.models import Key, LettersCount, NoUpdate, ReadingState, TextStructure, TocEntry
+from epy_reader.parser import parse_html
+from epy_reader.speakers import SpeakerBaseModel, SpeakerMimic, SpeakerPico
+
+
+def get_ebook_obj(filepath: str) -> Ebook:
+    file_ext = os.path.splitext(filepath)[1].lower()
+    if is_url(filepath):
+        return URL(filepath)
+    elif file_ext in {".epub", ".epub3"}:
+        return Epub(filepath)
+    elif file_ext == ".fb2":
+        return FictionBook(filepath)
+    elif file_ext == ".mobi":
+        return Mobi(filepath)
+    elif file_ext in {".azw", ".azw3"}:
+        return Azw(filepath)
+    else:
+        sys.exit("ERROR: Format not supported. (Supported: epub, epub3, fb2, mobi, azw, azw3, or a URL)")
+
+
+def safe_curs_set(state: int) -> None:
+    try:
+        curses.curs_set(state)
+    except curses.error:
+        return
+
+
+def find_current_content_index(
+    toc_entries: Tuple[TocEntry, ...], toc_secid: Mapping[str, int], index: int, y: int
+) -> int:
+    ntoc = 0
+    for n, toc_entry in enumerate(toc_entries):
+        if toc_entry.content_index <= index:
+            if y >= toc_secid.get(toc_entry.section, 0):  # type: ignore
+                ntoc = n
+    return ntoc
+
+
+def pgup(current_row: int, window_height: int, counter: int = 1) -> int:
+    if current_row >= window_height * counter:
+        return current_row - window_height * counter
+    else:
+        return 0
+
+
+def pgdn(current_row: int, total_lines: int, window_height: int, counter: int = 1) -> int:
+    if current_row + (window_height * counter) <= total_lines - window_height:
+        return current_row + (window_height * counter)
+    else:
+        current_row = total_lines - window_height
+        if current_row < 0:
+            return 0
+        return current_row
+
+
+def pgend(total_lines: int, window_height: int) -> int:
+    if total_lines - window_height >= 0:
+        return total_lines - window_height
+    else:
+        return 0
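These three helpers clamp scrolling so the viewport never runs past either end of the buffer. A quick worked check of the boundaries (values are illustrative):

    # 100 text lines in a 30-row window:
    assert pgdn(0, 100, 30) == 30    # page forward
    assert pgdn(60, 100, 30) == 70   # clamped so the last window stays full
    assert pgup(70, 30) == 40        # page back
    assert pgend(100, 30) == 70      # last full window starts at line 70
    assert pgend(20, 30) == 0        # shorter than a window: stay at top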
+def choice_win(allowdel=False):
+    """
+    Conjure an options window by wrapping a window function
+    that returns a tuple in the form of
+    (title, list_to_choose, initial_active_index, windows_key_to_toggle),
+    and return a tuple of (returned_key, chosen_index, chosen_index_to_delete).
+    """
+
+    def inner_f(listgen):
+        @wraps(listgen)
+        def wrapper(self, *args, **kwargs):
+            rows, cols = self.screen.getmaxyx()
+            hi, wi = rows - 4, cols - 4
+            Y, X = 2, 2
+            chwin = curses.newwin(hi, wi, Y, X)
+            if self.is_color_supported:
+                chwin.bkgd(self.screen.getbkgd())
+
+            title, ch_list, index, key = listgen(self, *args, **kwargs)
+
+            if len(title) > cols - 8:
+                title = title[: cols - 8]
+
+            chwin.box()
+            chwin.keypad(True)
+            chwin.addstr(1, 2, title)
+            chwin.addstr(2, 2, "-" * len(title))
+            if allowdel:
+                chwin.addstr(3, 2, "HINT: Press 'd' to delete.")
+            key_chwin = 0
+
+            totlines = len(ch_list)
+            chwin.refresh()
+            pad = curses.newpad(totlines, wi - 2)
+            if self.is_color_supported:
+                pad.bkgd(self.screen.getbkgd())
+
+            pad.keypad(True)
+
+            padhi = rows - 5 - Y - 4 + 1 - (1 if allowdel else 0)
+            # padhi = rows - 5 - Y - 4 + 1 - 1
+            y = 0
+            if index in range(padhi // 2, totlines - padhi // 2):
+                y = index - padhi // 2 + 1
+            span = []
+
+            for n, i in enumerate(ch_list):
+                # strs = " " + str(n+1).rjust(d) + " " + i[0]
+                # remove newlines from choice entries;
+                # these mostly happen in the FictionBook (.fb2) format
+                strs = " " + i.replace("\n", " ")
+                strs = strs[0 : wi - 3]
+                pad.addstr(n, 0, strs)
+                span.append(len(strs))
+
+            countstring = ""
+            while key_chwin not in self.keymap.Quit + key:
+                if countstring == "":
+                    count = 1
+                else:
+                    count = int(countstring)
+                if key_chwin in tuple(Key(i) for i in range(48, 58)):  # i.e., key is a numeral
+                    countstring = countstring + key_chwin.char
+                else:
+                    if key_chwin in self.keymap.ScrollUp + self.keymap.PageUp:
+                        index -= count
+                        if index < 0:
+                            index = 0
+                    elif key_chwin in self.keymap.ScrollDown or key_chwin in self.keymap.PageDown:
+                        index += count
+                        if index + 1 >= totlines:
+                            index = totlines - 1
+                    elif key_chwin in self.keymap.Follow:
+                        chwin.clear()
+                        chwin.refresh()
+                        return None, index, None
+                    elif key_chwin in self.keymap.BeginningOfCh:
+                        index = 0
+                    elif key_chwin in self.keymap.EndOfCh:
+                        index = totlines - 1
+                    elif key_chwin == Key("D") and allowdel:
+                        return None, (0 if index == 0 else index - 1), index
+                        # chwin.redrawwin()
+                        # chwin.refresh()
+                    elif key_chwin == Key("d") and allowdel:
+                        resk, resp, _ = self.show_win_options(
+                            "Delete '{}'?".format(ch_list[index]),
+                            ["(Y)es", "(N)o"],
+                            0,
+                            (Key("n"),),
+                        )
+                        if resk is not None:
+                            key_chwin = resk
+                            continue
+                        elif resp == 0:
+                            return None, (0 if index == 0 else index - 1), index
+                        chwin.redrawwin()
+                        chwin.refresh()
+                    elif key_chwin in {Key(i) for i in ["Y", "y", "N", "n"]} and ch_list == [
+                        "(Y)es",
+                        "(N)o",
+                    ]:
+                        if key_chwin in {Key("Y"), Key("y")}:
+                            return None, 0, None
+                        else:
+                            return None, 1, None
+                    elif key_chwin in tuple_subtract(self._win_keys, key):
+                        chwin.clear()
+                        chwin.refresh()
+                        return key_chwin, index, None
+                    countstring = ""
+
+                while index not in range(y, y + padhi):
+                    if index < y:
+                        y -= 1
+                    else:
+                        y += 1
+
+                for n in range(totlines):
+                    att = curses.A_REVERSE if index == n else curses.A_NORMAL
+                    pre = ">>" if index == n else "  "
+                    pad.addstr(n, 0, pre)
+                    pad.chgat(n, 0, span[n], pad.getbkgd() | att)
+
+                pad.refresh(y, 0, Y + 4 + (1 if allowdel else 0), X + 4, rows - 5, cols - 6)
+                # pad.refresh(y, 0, Y+5, X+4, rows - 5, cols - 6)
+                key_chwin = Key(chwin.getch())
+                if key_chwin == Key(curses.KEY_MOUSE):
+                    mouse_event = curses.getmouse()
+                    if mouse_event[4] == curses.BUTTON4_PRESSED:
+                        key_chwin = self.keymap.ScrollUp[0]
+                    elif mouse_event[4] == 2097152:
+                        # 2097152 == BUTTON5_PRESSED (scroll down); the named
+                        # constant is missing from some curses builds
+                        key_chwin = self.keymap.ScrollDown[0]
+                    elif mouse_event[4] == curses.BUTTON1_DOUBLE_CLICKED:
+                        if (
+                            mouse_event[2] >= 6
+                            and mouse_event[2] < rows - 4
+                            and mouse_event[2] < 6 + totlines
+                        ):
+                            index = mouse_event[2] - 6 + y
+                            key_chwin = self.keymap.Follow[0]
+                    elif (
+                        mouse_event[4] == curses.BUTTON1_CLICKED
+                        and mouse_event[2] >= 6
+                        and mouse_event[2] < rows - 4
+                        and mouse_event[2] < 6 + totlines
+                    ):
+                        if index == mouse_event[2] - 6 + y:
+                            key_chwin = self.keymap.Follow[0]
+                            continue
+                        index = mouse_event[2] - 6 + y
+                    elif mouse_event[4] == curses.BUTTON3_CLICKED:
+                        key_chwin = self.keymap.Quit[0]
+
+            chwin.clear()
+            chwin.refresh()
+            return None, None, None
+
+        return wrapper
+
+    return inner_f
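choice_win is a decorator factory: allowdel is captured first, then the decorated list generator is wrapped in the curses plumbing. The shape, reduced to its skeleton (illustrative names; the usage below is hypothetical, not from this diff):

    from functools import wraps

    def choice_win_skeleton(allowdel=False):
        def inner_f(listgen):
            @wraps(listgen)
            def wrapper(self, *args, **kwargs):
                title, ch_list, index, key = listgen(self, *args, **kwargs)
                # ... draw the window, run the key loop ...
                return None, index, None  # (returned_key, chosen_index, to_delete)
            return wrapper
        return inner_f

    # usage:
    # @choice_win_skeleton(allowdel=True)
    # def bookmarks_list(self):
    #     return "Bookmarks", self.bookmark_names, 0, (Key("B"),)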
+def text_win(textfunc):
+    @wraps(textfunc)
+    def wrapper(self, *args, **kwargs) -> Union[NoUpdate, Key]:
+        rows, cols = self.screen.getmaxyx()
+        hi, wi = rows - 4, cols - 4
+        Y, X = 2, 2
+        textw = curses.newwin(hi, wi, Y, X)
+        if self.is_color_supported:
+            textw.bkgd(self.screen.getbkgd())
+
+        title, raw_texts, key = textfunc(self, *args, **kwargs)
+
+        if len(title) > cols - 8:
+            title = title[: cols - 8]
+
+        texts = []
+        for i in raw_texts.splitlines():
+            texts += textwrap.wrap(i, wi - 6, drop_whitespace=False)
+
+        textw.box()
+        textw.keypad(True)
+        textw.addstr(1, 2, title)
+        textw.addstr(2, 2, "-" * len(title))
+        key_textw: Union[NoUpdate, Key] = NoUpdate()
+
+        totlines = len(texts)
+
+        pad = curses.newpad(totlines, wi - 2)
+        if self.is_color_supported:
+            pad.bkgd(self.screen.getbkgd())
+
+        pad.keypad(True)
+        for n, i in enumerate(texts):
+            pad.addstr(n, 0, i)
+        y = 0
+        textw.refresh()
+        pad.refresh(y, 0, Y + 4, X + 4, rows - 5, cols - 6)
+        padhi = rows - 8 - Y
+
+        while key_textw not in self.keymap.Quit + key:
+            if key_textw in self.keymap.ScrollUp and y > 0:
+                y -= 1
+            elif key_textw in self.keymap.ScrollDown and y < totlines - hi + 6:
+                y += 1
+            elif key_textw in self.keymap.PageUp:
+                y = pgup(y, padhi)
+            elif key_textw in self.keymap.PageDown:
+                y = pgdn(y, totlines, padhi)
+            elif key_textw in self.keymap.BeginningOfCh:
+                y = 0
+            elif key_textw in self.keymap.EndOfCh:
+                y = pgend(totlines, padhi)
+            elif key_textw in tuple_subtract(self._win_keys, key):
+                textw.clear()
+                textw.refresh()
+                return key_textw
+            pad.refresh(y, 0, 6, 5, rows - 5, cols - 5)
+            key_textw = Key(textw.getch())
+
+        textw.clear()
+        textw.refresh()
+        return NoUpdate()
+
+    return wrapper
+
+
+def merge_text_structures(
+    text_structure_first: TextStructure, text_structure_second: TextStructure
+) -> TextStructure:
+    return TextStructure(
+        text_lines=text_structure_first.text_lines + text_structure_second.text_lines,
+        image_maps={**text_structure_first.image_maps, **text_structure_second.image_maps},
+        section_rows={**text_structure_first.section_rows, **text_structure_second.section_rows},
+        formatting=text_structure_first.formatting + text_structure_second.formatting,
+    )
+
+
+def construct_relative_reading_state(
+    abs_reading_state: ReadingState, totlines_per_content: Sequence[int]
+) -> ReadingState:
+    """
+    :param abs_reading_state: ReadingState absolute to the whole book when Setting.Seamless==True
+    :param totlines_per_content: sequence of total lines per book content
+    :return: new ReadingState relative to each content of the book
+    """
+    index = 0
+    cumulative_contents_lines = 0
+    all_contents_lines = sum(totlines_per_content)
+    # for n, content_lines in enumerate(totlines_per_content):
+    #     cumulative_contents_lines += content_lines
+    #     if cumulative_contents_lines > abs_reading_state.row:
+    #         return
+    while True:
+        content_lines = totlines_per_content[index]
+        cumulative_contents_lines += content_lines
+        if cumulative_contents_lines > abs_reading_state.row:
+            break
+        index += 1
+
+    return ReadingState(
+        content_index=index,
+        textwidth=abs_reading_state.textwidth,
+        row=abs_reading_state.row - cumulative_contents_lines + content_lines,
+        rel_pctg=abs_reading_state.rel_pctg
+        - ((cumulative_contents_lines - content_lines) / all_contents_lines)
+        if abs_reading_state.rel_pctg
+        else None,
+        section=abs_reading_state.section,
+    )
+
+
+def count_letters(ebook: Ebook) -> LettersCount:
+    per_content_counts: List[int] = []
+    cumulative_counts: List[int] = []
+    # assert isinstance(ebook.contents, tuple)
+    for i in ebook.contents:
+        content = ebook.get_raw_text(i)
+        src_lines = parse_html(content)
+        assert isinstance(src_lines, tuple)
+        cumulative_counts.append(sum(per_content_counts))
+        per_content_counts.append(sum(len(re.sub(r"\s", "", j)) for j in src_lines))
+
+    return LettersCount(all=sum(per_content_counts), cumulative=tuple(cumulative_counts))
+
+
+def count_letters_parallel(ebook: Ebook, child_conn) -> None:
+    child_conn.send(count_letters(ebook))
+    child_conn.close()
+
+
+def construct_speaker(
+    preferred: Optional[str] = None, args: Optional[List[str]] = None
+) -> Optional[SpeakerBaseModel]:
+    available_speakers = [SpeakerMimic, SpeakerPico]
+    sorted_speakers = (
+        sorted(available_speakers, key=lambda x: int(x.cmd == preferred), reverse=True)
+        if preferred
+        else available_speakers
+    )
+    speaker = next((speaker for speaker in sorted_speakers if speaker.available), None)
+    return speaker(args or []) if speaker else None
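The arithmetic in construct_relative_reading_state is easier to see with concrete numbers. A minimal standalone sketch of the same walk (plain tuples instead of epy's ReadingState, so the names here are illustrative, not the package's API):

    def absolute_row_to_relative(abs_row, totlines_per_content):
        """Map a row in the seamlessly joined book to (content_index, row_in_content)."""
        cumulative = 0
        for index, content_lines in enumerate(totlines_per_content):
            cumulative += content_lines
            if cumulative > abs_row:
                # rewind to the start of this content to get the local offset
                return index, abs_row - (cumulative - content_lines)
        raise IndexError("row is past the end of the book")

    # e.g. three chapters of 100, 50 and 200 lines: absolute row 120
    # falls 20 lines into the second chapter (index 1)
    assert absolute_row_to_relative(120, [100, 50, 200]) == (1, 20)

construct_speaker, for its part, relies on Python's stable sort: int(x.cmd == preferred) is 1 only for the preferred backend, so reverse=True moves it to the front while leaving the relative order of the remaining speakers untouched.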