import dataclasses import os import xml.etree.ElementTree as ET import zipfile import zlib from typing import Dict, List, Optional, Sequence, Tuple, Union from urllib.parse import unquote, urljoin from epy_reader.ebooks.base import Ebook from epy_reader.models import BookMetadata, TocEntry # TODO: to be deprecated DEBUG = False class Epub(Ebook): NAMESPACE = { "DAISY": "http://www.daisy.org/z3986/2005/ncx/", "OPF": "http://www.idpf.org/2007/opf", "CONT": "urn:oasis:names:tc:opendocument:xmlns:container", "XHTML": "http://www.w3.org/1999/xhtml", "EPUB": "http://www.idpf.org/2007/ops", # Dublin Core "DC": "http://purl.org/dc/elements/1.1/", } def __init__(self, fileepub: str): self.path: str = os.path.abspath(fileepub) self.file: Union[zipfile.ZipFile, str] = zipfile.ZipFile(fileepub, "r") # populate these attributes # by calling self.initialize() self.root_filepath: str self.root_dirpath: str def get_meta(self) -> BookMetadata: assert isinstance(self.file, zipfile.ZipFile) # why self.file.read(self.root_filepath) problematic # content_opf = ET.fromstring(self.file.open(self.root_filepath).read()) content_opf = ET.parse(self.file.open(self.root_filepath)) return Epub._get_metadata(content_opf) @staticmethod def _get_metadata(content_opf: ET.ElementTree) -> BookMetadata: metadata: Dict[str, Optional[str]] = {} for field in dataclasses.fields(BookMetadata): element = content_opf.find(f".//DC:{field.name}", Epub.NAMESPACE) if element is not None: metadata[field.name] = element.text return BookMetadata(**metadata) @staticmethod def _get_contents(content_opf: ET.ElementTree) -> Tuple[str, ...]: # cont = ET.parse(self.file.open(self.root_filepath)).getroot() manifests: List[Tuple[str, str]] = [] for manifest_elem in content_opf.findall("OPF:manifest/*", Epub.NAMESPACE): # EPUB3 # if manifest_elem.get("id") != "ncx" and manifest_elem.get("properties") != "nav": if ( manifest_elem.get("media-type") != "application/x-dtbncx+xml" and manifest_elem.get("properties") != "nav" ): manifest_id = manifest_elem.get("id") assert manifest_id is not None manifest_href = manifest_elem.get("href") assert manifest_href is not None manifests.append((manifest_id, manifest_href)) spines: List[str] = [] contents: List[str] = [] for spine_elem in content_opf.findall("OPF:spine/*", Epub.NAMESPACE): idref = spine_elem.get("idref") assert idref is not None spines.append(idref) for spine in spines: for manifest in manifests: if spine == manifest[0]: # book_contents.append(root_dirpath + unquote(manifest[1])) contents.append(unquote(manifest[1])) manifests.remove(manifest) # TODO: test is break necessary break return tuple(contents) @staticmethod def _get_tocs(toc: ET.Element, version: str, contents: Sequence[str]) -> Tuple[TocEntry, ...]: try: # EPUB3 if version in {"1.0", "2.0"}: navPoints = toc.findall("DAISY:navMap//DAISY:navPoint", Epub.NAMESPACE) elif version == "3.0": navPoints = toc.findall( "XHTML:body//XHTML:nav[@EPUB:type='toc']//XHTML:a", Epub.NAMESPACE ) toc_entries: List[TocEntry] = [] for navPoint in navPoints: if version in {"1.0", "2.0"}: src_elem = navPoint.find("DAISY:content", Epub.NAMESPACE) assert src_elem is not None src = src_elem.get("src") name_elem = navPoint.find("DAISY:navLabel/DAISY:text", Epub.NAMESPACE) assert name_elem is not None name = name_elem.text elif version == "3.0": src_elem = navPoint assert src_elem is not None src = src_elem.get("href") name = "".join(list(navPoint.itertext())) assert src is not None src_id = src.split("#") try: idx = contents.index(unquote(src_id[0])) except ValueError: continue # assert name is not None # NOTE: skip empty label if name is not None: toc_entries.append( TocEntry( label=name, content_index=idx, section=src_id[1] if len(src_id) == 2 else None, ) ) except AttributeError as e: # TODO: if DEBUG: raise e return tuple(toc_entries) def initialize(self) -> None: assert isinstance(self.file, zipfile.ZipFile) container = ET.parse(self.file.open("META-INF/container.xml")) rootfile_elem = container.find("CONT:rootfiles/CONT:rootfile", Epub.NAMESPACE) assert rootfile_elem is not None self.root_filepath = rootfile_elem.attrib["full-path"] self.root_dirpath = ( os.path.dirname(self.root_filepath) + "/" if os.path.dirname(self.root_filepath) != "" else "" ) content_opf = ET.parse(self.file.open(self.root_filepath)) version = content_opf.getroot().get("version") contents = Epub._get_contents(content_opf) self.contents = tuple(urljoin(self.root_dirpath, content) for content in contents) if version in {"1.0", "2.0"}: # "OPF:manifest/*[@id='ncx']" relative_toc = content_opf.find( "OPF:manifest/*[@media-type='application/x-dtbncx+xml']", Epub.NAMESPACE ) elif version == "3.0": relative_toc = content_opf.find("OPF:manifest/*[@properties='nav']", Epub.NAMESPACE) else: raise RuntimeError(f"Unsupported Epub version: {version}") assert relative_toc is not None relative_toc_path = relative_toc.get("href") assert relative_toc_path is not None toc_path = self.root_dirpath + relative_toc_path toc = ET.parse(self.file.open(toc_path)).getroot() self.toc_entries = Epub._get_tocs(toc, version, contents) # *self.contents (absolute path) def get_raw_text(self, content_path: Union[str, ET.Element]) -> str: assert isinstance(self.file, zipfile.ZipFile) assert isinstance(content_path, str) max_tries: Optional[int] = None # 1 if DEBUG else None # use try-except block to catch # zlib.error: Error -3 while decompressing data: invalid distance too far back # seems like caused by multiprocessing tries = 0 while True: try: content = self.file.open(content_path).read() break except zlib.error as e: tries += 1 if max_tries is not None and tries >= max_tries: raise e return content.decode("utf-8") def get_img_bytestr(self, impath: str) -> Tuple[str, bytes]: assert isinstance(self.file, zipfile.ZipFile) return impath, self.file.read(impath) def cleanup(self) -> None: pass