diff options
author | Benawi Adha <benawiadha@gmail.com> | 2022-10-02 21:22:38 +0700 |
---|---|---|
committer | Benawi Adha <benawiadha@gmail.com> | 2022-10-02 21:22:38 +0700 |
commit | 258c30d2e088cd4ab091a53794da3f93af79915d (patch) | |
tree | f49340bf565deb20c730358af74a01bcc231de53 /src/epy_reader/ebooks | |
parent | d43533f01d9d5baf5f78b71f832641382bd5962a (diff) | |
download | epy-258c30d2e088cd4ab091a53794da3f93af79915d.tar.gz |
Major refactor: break down epy.py script
into package project structure for easier
development
Squashed commit of the following:
commit 01309b961a4ab32394bff0d90949b57435dfda47
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sun Oct 2 21:15:04 2022 +0700
Fix missing objects
commit aab2e773c30b255c81b1250b3b20967d5da40338
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sun Oct 2 21:09:31 2022 +0700
Update README.md
commit d4e98926bcd9b00ce0410ad71249d24e6315abc5
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sun Oct 2 21:07:28 2022 +0700
Add keywords in pyproject.toml
commit 432055af8245560a3ff2e046aef0b4e87da44930
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sun Oct 2 21:04:34 2022 +0700
Bump version and deprecate setup.py
commit 51dd15aab8f8ff5996f822f8378e813f0b9fb80d
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sun Oct 2 20:56:38 2022 +0700
Formatting
commit 81fb35e3b6fa0e27d79ef1da77202ed81eb99500
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sun Oct 2 20:55:08 2022 +0700
Fix speakers module
commit 3b852e7c59b38d5a28520038e35f50a95270d2f1
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 20:52:46 2022 +0700
Fix circular import
commit 061e8a2649dabacd28a9e2f972559475316c654c
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 20:39:27 2022 +0700
Run formatting
commit abc2d0ab156992c63dc04745d14a69679a60accb
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 20:39:00 2022 +0700
Update isort and black config in pyproject
commit 5dc2e41bab5b997bd719bdc1561eb51ba0c17a83
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 20:31:00 2022 +0700
Add app Config
commit ed485a2ea8281585bf86dc5772f0c6dd9c803cc4
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 20:23:02 2022 +0700
Update debugpy script
commit 68b0553dd4d63eb4b847132c68ea4018587fa8ec
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 20:14:11 2022 +0700
Connect reader to main script
commit 63c3dd176f18a784a4ed2e88aa72b13d1c2b0990
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 20:11:17 2022 +0700
Implement reader
commit ce5eec8fb4e1db3870a16a07541365cd777d6c4c
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 19:29:49 2022 +0700
Fix script in pyproject.toml
commit 941e8e49f1593731fb582d92084206772b3f0442
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 19:28:39 2022 +0700
Rename modules
commit 5a3e7f766aee774c09b3b5336f3a2968e9cb1d0c
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 19:28:20 2022 +0700
Rename tool method
commit 3c0503ff475cb7eff8b12d3be0bda7a38efe1072
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 19:27:03 2022 +0700
Add ebooks lib
commit b5f71c3296a7d6f36454f6e1cbe84e15a45092ee
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 17:25:11 2022 +0700
Initial reorganization
Diffstat (limited to 'src/epy_reader/ebooks')
-rw-r--r-- | src/epy_reader/ebooks/__init__.py | 15 | ||||
-rw-r--r-- | src/epy_reader/ebooks/azw.py | 26 | ||||
-rw-r--r-- | src/epy_reader/ebooks/base.py | 48 | ||||
-rw-r--r-- | src/epy_reader/ebooks/epub.py | 202 | ||||
-rw-r--r-- | src/epy_reader/ebooks/fictionbook.py | 76 | ||||
-rw-r--r-- | src/epy_reader/ebooks/mobi.py | 69 | ||||
-rw-r--r-- | src/epy_reader/ebooks/url.py | 49 |
7 files changed, 485 insertions, 0 deletions
# NOTE: the sections below are the seven modules added under
# src/epy_reader/ebooks/ by this commit.  Each "# ====" banner marks the start
# of a separate file of the package; they are shown back to back here.

# =============================================================================
# src/epy_reader/ebooks/__init__.py  (new file)
# =============================================================================
__all__ = [
    "Ebook",
    "Epub",
    "FictionBook",
    "Mobi",
    "Azw",
    "URL",
]

from epy_reader.ebooks.azw import Azw
from epy_reader.ebooks.base import Ebook
from epy_reader.ebooks.epub import Epub
from epy_reader.ebooks.fictionbook import FictionBook
from epy_reader.ebooks.mobi import Mobi
from epy_reader.ebooks.url import URL

# =============================================================================
# src/epy_reader/ebooks/azw.py  (new file)
# =============================================================================
import contextlib
import os
import shutil
import tempfile
import zipfile

from epy_reader.ebooks.epub import Epub
from epy_reader.tools import unpack_kindle_book


class Azw(Epub):
    """Kindle book that is unpacked to a temporary EPUB and then read as an Epub."""

    def __init__(self, fileepub: str) -> None:
        """Record the book path and create the temporary unpack directory.

        Nothing is unpacked here; ``self.file`` is only set by ``initialize()``.
        """
        self.path = os.path.abspath(fileepub)
        self.tmpdir = tempfile.mkdtemp(prefix="epy-")
        basename, _ = os.path.splitext(os.path.basename(self.path))
        # Location where the unpacked EPUB is expected inside the temp dir.
        self.tmpepub = os.path.join(self.tmpdir, "mobi8", basename + ".epub")

    def initialize(self) -> None:
        """Unpack the book into ``self.tmpdir`` and run the regular Epub initialization."""
        # Silence anything the unpacker prints to stdout.
        with contextlib.redirect_stdout(None):
            unpack_kindle_book(self.path, self.tmpdir, epubver="A", use_hd=True)
        self.file = zipfile.ZipFile(self.tmpepub, "r")
        Epub.initialize(self)

    def cleanup(self) -> None:
        """Close the unpacked archive (if it was opened) and delete the temp directory."""
        # The archive lives inside tmpdir; close it first so the directory can
        # be removed on platforms that refuse to delete files that are open.
        archive = getattr(self, "file", None)
        if isinstance(archive, zipfile.ZipFile):
            archive.close()
        shutil.rmtree(self.tmpdir)


# =============================================================================
# src/epy_reader/ebooks/base.py  (new file)
# =============================================================================
import xml.etree.ElementTree as ET
from typing import Tuple, Union

from epy_reader.models import BookMetadata, TocEntry


class Ebook:
    """Interface shared by every ebook backend.

    Subclasses override all methods; the implementations here only raise
    ``NotImplementedError``.  ``path``, ``contents`` and ``toc_entries`` are
    plain read/write properties backed by private attributes.
    """

    def __init__(self, fileepub: str):
        raise NotImplementedError("Ebook.__init__() not implemented")

    @property
    def path(self) -> str:
        """Location of the book as set by the subclass."""
        return self._path

    @path.setter
    def path(self, value: str) -> None:
        self._path = value

    @property
    def contents(self) -> Union[Tuple[str, ...], Tuple[ET.Element, ...]]:
        """Ordered content items: either path strings or XML elements."""
        return self._contents

    @contents.setter
    def contents(self, value: Union[Tuple[str, ...], Tuple[ET.Element, ...]]) -> None:
        self._contents = value

    @property
    def toc_entries(self) -> Tuple[TocEntry, ...]:
        """Table-of-contents entries of the book."""
        return self._toc_entries

    @toc_entries.setter
    def toc_entries(self, value: Tuple[TocEntry, ...]) -> None:
        self._toc_entries = value

    def get_meta(self) -> BookMetadata:
        raise NotImplementedError("Ebook.get_meta() not implemented")

    def initialize(self) -> None:
        raise NotImplementedError("Ebook.initialize() not implemented")

    def get_raw_text(self, content: Union[str, ET.Element]) -> str:
        raise NotImplementedError("Ebook.get_raw_text() not implemented")

    def get_img_bytestr(self, impath: str) -> Tuple[str, bytes]:
        raise NotImplementedError("Ebook.get_img_bytestr() not implemented")

    def cleanup(self) -> None:
        raise NotImplementedError("Ebook.cleanup() not implemented")


# =============================================================================
# src/epy_reader/ebooks/epub.py  (new file)
# =============================================================================
import dataclasses
import os
import xml.etree.ElementTree as ET
import zipfile
import zlib
from typing import Dict, List, Optional, Sequence, Tuple, Union
from urllib.parse import unquote, urljoin

from epy_reader.ebooks.base import Ebook
from epy_reader.models import BookMetadata, TocEntry


# TODO: to be deprecated
DEBUG = False


class Epub(Ebook):
    """EPUB backend that reads everything straight from the zip archive."""

    # Prefix -> namespace URI map used by every ElementTree query in this class.
    NAMESPACE = {
        "DAISY": "http://www.daisy.org/z3986/2005/ncx/",
        "OPF": "http://www.idpf.org/2007/opf",
        "CONT": "urn:oasis:names:tc:opendocument:xmlns:container",
        "XHTML": "http://www.w3.org/1999/xhtml",
        "EPUB": "http://www.idpf.org/2007/ops",
        # Dublin Core
        "DC": "http://purl.org/dc/elements/1.1/",
    }

    def __init__(self, fileepub: str):
        """Open ``fileepub`` as a zip archive; parsing is deferred to ``initialize()``."""
        self.path: str = os.path.abspath(fileepub)
        self.file: Union[zipfile.ZipFile, str] = zipfile.ZipFile(fileepub, "r")

        # populate these attributes
        # by calling self.initialize()
        self.root_filepath: str
        self.root_dirpath: str

    def get_meta(self) -> BookMetadata:
        """Parse the package document and return its Dublin Core metadata.

        Requires ``initialize()`` to have set ``self.root_filepath``.
        """
        assert isinstance(self.file, zipfile.ZipFile)
        # NOTE: the original author found self.file.read(self.root_filepath)
        # problematic here, hence parsing from the opened member instead.
        with self.file.open(self.root_filepath) as opf_file:
            content_opf = ET.parse(opf_file)
        return Epub._get_metadata(content_opf)

    @staticmethod
    def _get_metadata(content_opf: ET.ElementTree) -> BookMetadata:
        """Build a ``BookMetadata`` from the ``DC:<field>`` elements found in the OPF.

        Only fields that have a matching element are passed to the constructor.
        """
        metadata: Dict[str, Optional[str]] = {}
        for field in dataclasses.fields(BookMetadata):
            element = content_opf.find(f".//DC:{field.name}", Epub.NAMESPACE)
            if element is not None:
                metadata[field.name] = element.text

        return BookMetadata(**metadata)

    @staticmethod
    def _get_contents(content_opf: ET.ElementTree) -> Tuple[str, ...]:
        """Return the unquoted manifest hrefs in spine (reading) order.

        Manifest items that are the NCX or the EPUB3 nav document are excluded.
        Spine references without a matching manifest item are skipped.
        """
        manifests: List[Tuple[str, str]] = []
        for manifest_elem in content_opf.findall("OPF:manifest/*", Epub.NAMESPACE):
            # Skip the table-of-contents documents (EPUB2 NCX / EPUB3 nav).
            if (
                manifest_elem.get("media-type") != "application/x-dtbncx+xml"
                and manifest_elem.get("properties") != "nav"
            ):
                manifest_id = manifest_elem.get("id")
                assert manifest_id is not None
                manifest_href = manifest_elem.get("href")
                assert manifest_href is not None
                manifests.append((manifest_id, manifest_href))

        spines: List[str] = []
        for spine_elem in content_opf.findall("OPF:spine/*", Epub.NAMESPACE):
            idref = spine_elem.get("idref")
            assert idref is not None
            spines.append(idref)

        contents: List[str] = []
        for spine in spines:
            for manifest in manifests:
                if spine == manifest[0]:
                    contents.append(unquote(manifest[1]))
                    # Each manifest item is consumed at most once.
                    manifests.remove(manifest)
                    # The break is required: ``manifests`` was just mutated,
                    # so the inner iteration must not continue.
                    break

        return tuple(contents)

    @staticmethod
    def _get_tocs(toc: ET.Element, version: str, contents: Sequence[str]) -> Tuple[TocEntry, ...]:
        """Convert a parsed TOC document into ``TocEntry`` objects.

        Args:
            toc: Root element of the NCX (version "1.0"/"2.0") or nav ("3.0") document.
            version: Package version string; any other value yields no entries.
            contents: Content hrefs as returned by ``_get_contents``; an entry
                whose target is not in this sequence is dropped.

        Returns:
            The entries collected so far.  An ``AttributeError`` raised while
            walking the document is swallowed unless ``DEBUG`` is set.
        """
        # Bound before the ``try`` so the final ``return`` can never hit an
        # unbound local when an error interrupts the walk early.
        toc_entries: List[TocEntry] = []
        is_ncx = version in {"1.0", "2.0"}
        try:
            if is_ncx:
                nav_points = toc.findall("DAISY:navMap//DAISY:navPoint", Epub.NAMESPACE)
            elif version == "3.0":
                nav_points = toc.findall(
                    "XHTML:body//XHTML:nav[@EPUB:type='toc']//XHTML:a", Epub.NAMESPACE
                )
            else:
                # Unknown version: nothing to walk (previously an unbound name).
                nav_points = []

            for nav_point in nav_points:
                name: Optional[str]
                if is_ncx:
                    src_elem = nav_point.find("DAISY:content", Epub.NAMESPACE)
                    assert src_elem is not None
                    src = src_elem.get("src")

                    name_elem = nav_point.find("DAISY:navLabel/DAISY:text", Epub.NAMESPACE)
                    assert name_elem is not None
                    name = name_elem.text
                else:
                    # EPUB3: the <a> element itself carries the target and label.
                    src = nav_point.get("href")
                    name = "".join(nav_point.itertext())

                assert src is not None
                src_id = src.split("#")

                try:
                    idx = contents.index(unquote(src_id[0]))
                except ValueError:
                    # Target is not part of the spine contents.
                    continue

                # NOTE: skip empty label
                if name is not None:
                    toc_entries.append(
                        TocEntry(
                            label=name,
                            content_index=idx,
                            section=src_id[1] if len(src_id) == 2 else None,
                        )
                    )
        except AttributeError:
            # TODO:
            if DEBUG:
                raise

        return tuple(toc_entries)

    def initialize(self) -> None:
        """Locate the package document and populate ``contents`` and ``toc_entries``.

        Raises:
            RuntimeError: If the package version is not "1.0", "2.0" or "3.0".
        """
        assert isinstance(self.file, zipfile.ZipFile)

        with self.file.open("META-INF/container.xml") as container_file:
            container = ET.parse(container_file)
        rootfile_elem = container.find("CONT:rootfiles/CONT:rootfile", Epub.NAMESPACE)
        assert rootfile_elem is not None
        self.root_filepath = rootfile_elem.attrib["full-path"]
        root_dir = os.path.dirname(self.root_filepath)
        # Zip member names use "/" regardless of platform.
        self.root_dirpath = root_dir + "/" if root_dir != "" else ""

        with self.file.open(self.root_filepath) as opf_file:
            content_opf = ET.parse(opf_file)
        version = content_opf.getroot().get("version")

        contents = Epub._get_contents(content_opf)
        self.contents = tuple(urljoin(self.root_dirpath, content) for content in contents)

        if version in {"1.0", "2.0"}:
            relative_toc = content_opf.find(
                "OPF:manifest/*[@media-type='application/x-dtbncx+xml']", Epub.NAMESPACE
            )
        elif version == "3.0":
            relative_toc = content_opf.find("OPF:manifest/*[@properties='nav']", Epub.NAMESPACE)
        else:
            raise RuntimeError(f"Unsupported Epub version: {version}")
        assert relative_toc is not None
        relative_toc_path = relative_toc.get("href")
        assert relative_toc_path is not None
        # Manifest hrefs may be percent-encoded while zip member names are not;
        # decode the same way _get_contents does for content hrefs.
        toc_path = self.root_dirpath + unquote(relative_toc_path)
        with self.file.open(toc_path) as toc_file:
            toc = ET.parse(toc_file).getroot()
        # Relative (not root-joined) hrefs are passed so TOC targets can be matched.
        self.toc_entries = Epub._get_tocs(toc, version, contents)

    def get_raw_text(self, content_path: Union[str, ET.Element]) -> str:
        """Read the archive member ``content_path`` and decode it as UTF-8."""
        assert isinstance(self.file, zipfile.ZipFile)
        assert isinstance(content_path, str)

        # None means "retry forever"; kept from the original behaviour.
        max_tries: Optional[int] = None  # 1 if DEBUG else None

        # use try-except block to catch
        # zlib.error: Error -3 while decompressing data: invalid distance too far back
        # seems like caused by multiprocessing
        tries = 0
        while True:
            try:
                with self.file.open(content_path) as member:
                    content = member.read()
                break
            except zlib.error:
                tries += 1
                if max_tries is not None and tries >= max_tries:
                    raise

        return content.decode("utf-8")

    def get_img_bytestr(self, impath: str) -> Tuple[str, bytes]:
        """Return ``impath`` together with the raw bytes of that archive member."""
        assert isinstance(self.file, zipfile.ZipFile)
        return impath, self.file.read(impath)

    def cleanup(self) -> None:
        """Close the underlying zip archive, if one is open."""
        if isinstance(self.file, zipfile.ZipFile):
            self.file.close()


# =============================================================================
# src/epy_reader/ebooks/fictionbook.py  (new file)
# =============================================================================
import base64
import os
import xml.etree.ElementTree as ET
from typing import List, Tuple, Union

from epy_reader.ebooks import Ebook
from epy_reader.models import BookMetadata, TocEntry


class FictionBook(Ebook):
    """FictionBook (FB2) backend; content items are XML elements of the body."""

    NAMESPACE = {"FB2": "http://www.gribuser.ru/xml/fictionbook/2.0"}

    def __init__(self, filefb: str):
        """Record the file location; the document is parsed by ``initialize()``."""
        self.path = os.path.abspath(filefb)
        self.file = filefb

        # populate these attribute
        # by calling self.initialize()
        self.root: ET.Element

    def get_meta(self) -> BookMetadata:
        """Collect title, author, date and identifier from the parsed document.

        The author is "<first-name> <last-name>" when both have text, otherwise
        whichever one is available.
        """
        title_elem = self.root.find(".//FB2:book-title", FictionBook.NAMESPACE)
        first_name_elem = self.root.find(".//FB2:first-name", FictionBook.NAMESPACE)
        last_name_elem = self.root.find(".//FB2:last-name", FictionBook.NAMESPACE)
        date_elem = self.root.find(".//FB2:date", FictionBook.NAMESPACE)
        identifier_elem = self.root.find(".//FB2:id", FictionBook.NAMESPACE)

        first_name = first_name_elem.text if first_name_elem is not None else None
        last_name = last_name_elem.text if last_name_elem is not None else None
        if first_name and last_name:
            author = f"{first_name} {last_name}"
        elif last_name_elem is not None and not first_name:
            author = last_name
        else:
            # Also covers an empty <last-name/>, which used to yield "First None".
            author = first_name

        return BookMetadata(
            title=title_elem.text if title_elem is not None else None,
            creator=author,
            date=date_elem.text if date_elem is not None else None,
            identifier=identifier_elem.text if identifier_elem is not None else None,
        )

    def initialize(self) -> None:
        """Parse the file and populate ``root``, ``contents`` and ``toc_entries``."""
        self.root = ET.parse(self.file).getroot()

        self.contents = tuple(self.root.findall("FB2:body/*", FictionBook.NAMESPACE))

        # TODO
        # One TOC entry per body child that has a direct <title> element.
        toc_entries: List[TocEntry] = []
        for index, section in enumerate(self.contents):
            title = section.find("FB2:title", FictionBook.NAMESPACE)
            if title is not None:
                toc_entries.append(
                    TocEntry(label="".join(title.itertext()), content_index=index, section=None)
                )
        self.toc_entries = tuple(toc_entries)

    def get_raw_text(self, node: Union[str, ET.Element]) -> str:
        """Serialize ``node`` as HTML text with the "ns1:" prefix stripped."""
        assert isinstance(node, ET.Element)
        # Registering the FB2 namespace as the default keeps prefixes out of the output.
        ET.register_namespace("", "http://www.gribuser.ru/xml/fictionbook/2.0")
        return ET.tostring(node, encoding="utf8", method="html").decode("utf-8").replace("ns1:", "")

    def get_img_bytestr(self, imgid: str) -> Tuple[str, bytes]:
        """Return ``("<id>.<subtype>", bytes)`` for the embedded base64 image ``imgid``.

        ``imgid`` may carry "#" characters (as in an href); they are removed.
        """
        # TODO: test if image works
        imgid = imgid.replace("#", "")
        img_elem = self.root.find(f"*[@id='{imgid}']")
        assert img_elem is not None
        imgtype = img_elem.get("content-type")
        img_elem_text = img_elem.text
        assert imgtype is not None
        assert img_elem_text is not None
        return imgid + "." + imgtype.split("/")[1], base64.b64decode(img_elem_text)

    def cleanup(self) -> None:
        """Nothing to release: the file is only read during ``initialize()``."""
        return


# =============================================================================
# src/epy_reader/ebooks/mobi.py  (new file)
# =============================================================================
import contextlib
import os
import shutil
import tempfile
import xml.etree.ElementTree as ET
from typing import Tuple, Union

from epy_reader.ebooks.epub import Epub
from epy_reader.models import BookMetadata
from epy_reader.tools import unpack_kindle_book


class Mobi(Epub):
    """Mobi backend: the book is unpacked to a temp directory and read from disk."""

    def __init__(self, filemobi: str):
        """Record the book path and create the temp directory (stored in ``self.file``)."""
        self.path = os.path.abspath(filemobi)
        self.file = tempfile.mkdtemp(prefix="epy-")

        # populate these attribute
        # by calling self.initialize()
        self.root_filepath: str
        self.root_dirpath: str

    def get_meta(self) -> BookMetadata:
        """Parse the unpacked ``content.opf`` and return its Dublin Core metadata."""
        # Parse by path: ElementTree then reads bytes and honours the XML
        # declaration instead of depending on the locale's text encoding.
        content_opf = ET.parse(os.path.join(self.root_dirpath, "content.opf"))
        return Epub._get_metadata(content_opf)

    def initialize(self) -> None:
        """Unpack the book and populate ``contents`` and ``toc_entries``."""
        assert isinstance(self.file, str)

        # Silence anything the unpacker prints to stdout.
        with contextlib.redirect_stdout(None):
            unpack_kindle_book(self.path, self.file, epubver="A", use_hd=True)
            # TODO: add cleanup here

        self.root_dirpath = os.path.join(self.file, "mobi7")
        self.toc_path = os.path.join(self.root_dirpath, "toc.ncx")
        # The unpacked toc.ncx is handled with the EPUB2 (NCX) code path.
        version = "2.0"

        content_opf = ET.parse(os.path.join(self.root_dirpath, "content.opf"))

        contents = Epub._get_contents(content_opf)
        self.contents = tuple(os.path.join(self.root_dirpath, content) for content in contents)

        toc = ET.parse(self.toc_path).getroot()
        # Relative (not root-joined) hrefs are passed so TOC targets can be matched.
        self.toc_entries = Epub._get_tocs(toc, version, contents)

    def get_raw_text(self, content_path: Union[str, ET.Element]) -> str:
        """Read the file at ``content_path`` as UTF-8 text."""
        assert isinstance(content_path, str)
        with open(content_path, encoding="utf8") as f:
            return f.read()

    def get_img_bytestr(self, impath: str) -> Tuple[str, bytes]:
        """Return ``impath`` and the bytes of that image under the unpacked root."""
        # TODO: test on windows
        # if impath "Images/asdf.png" is problematic
        # normpath handles cross-platform separators
        image_abspath = os.path.normpath(os.path.join(self.root_dirpath, impath))
        with open(image_abspath, "rb") as f:
            src = f.read()
        return impath, src

    def cleanup(self) -> None:
        """Delete the temporary unpack directory."""
        assert isinstance(self.file, str)
        shutil.rmtree(self.file)


# =============================================================================
# src/epy_reader/ebooks/url.py  (new file)
# =============================================================================
from pathlib import PurePosixPath
from typing import Tuple
from urllib.error import HTTPError, URLError
from urllib.parse import urljoin, urlparse
from urllib.request import Request, urlopen

from epy_reader import __version__
from epy_reader.ebooks import Ebook
from epy_reader.lib import is_url
from epy_reader.models import BookMetadata


class URL(Ebook):
    """Single web page presented as a one-item book with no table of contents."""

    # Headers sent with every request made by this class.
    _header = {
        "User-Agent": f"epy/v{__version__}",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.8",
    }

    def __init__(self, url: str):
        """Store the URL; the page is fetched by ``initialize()``."""
        self.path = url
        self.file = url
        # Single placeholder item; get_raw_text() ignores its argument.
        self.contents = ("_",)
        self.toc_entries = tuple()

        # populate this attribute
        # by calling self.initialize()
        self.html: str

    def get_meta(self) -> BookMetadata:
        """Return an empty ``BookMetadata``."""
        return BookMetadata()

    def initialize(self) -> None:
        """Download the page at ``self.path`` into ``self.html``.

        The body is decoded with the charset declared in the response headers
        (UTF-8 when none is declared); if that fails it is decoded as UTF-8
        with undecodable bytes replaced.

        Raises:
            urllib.error.HTTPError, urllib.error.URLError: Propagated from ``urlopen``.
        """
        with urlopen(Request(self.path, headers=URL._header)) as response:
            raw = response.read()
            charset = response.headers.get_content_charset() or "utf-8"
        try:
            self.html = raw.decode(charset)
        except (LookupError, UnicodeDecodeError):
            # Unknown or wrong charset label: fall back to lossy UTF-8.
            self.html = raw.decode("utf-8", errors="replace")

    def get_raw_text(self, _) -> str:
        """Return the downloaded HTML; the argument is ignored."""
        return self.html

    def get_img_bytestr(self, src: str) -> Tuple[str, bytes]:
        """Download the image ``src`` and return ``(file name, bytes)``.

        A ``src`` that is not a URL by itself is resolved against ``self.path``.
        """
        image_url = src if is_url(src) else urljoin(self.path, src)
        # TODO: catch error on request
        with urlopen(Request(image_url, headers=URL._header)) as response:
            byte_str = response.read()
        return PurePosixPath(urlparse(src).path).name, byte_str

    def cleanup(self) -> None:
        """Nothing to release."""
        return