Fixed book metadata

author: benadha <benawiadha@gmail.com> 2022-01-07 20:42:33 +0700
committer: benadha <benawiadha@gmail.com> 2022-01-07 20:42:33 +0700
commit: 8c6d3c6d2b131d13455ec9833eab83f6ce942fd0 (patch)
tree: 124d50801af5430013eaa0c76b132a948083c6aa /epy.py
parent: cd3d7d0814f579116c4593d67fa6fe6beda23994 (diff)
download: epy-8c6d3c6d2b131d13455ec9833eab83f6ce942fd0.tar.gz
1 files changed, 86 insertions, 51 deletions
diff --git a/epy.py b/epy.py
index 0ade925..1b2b0b2 100755
--- a/epy.py
+++ b/epy.py
@@ -197,6 +197,19 @@ SPEAKERS: List[Type[SpeakerBaseModel]] = [SpeakerMimic, SpeakerPico]
 
 
 @dataclass(frozen=True)
+class BookMetadata:
+    title: Optional[str] = None
+    creator: Optional[str] = None
+    description: Optional[str] = None
+    publisher: Optional[str] = None
+    date: Optional[str] = None
+    language: Optional[str] = None
+    format: Optional[str] = None
+    identifier: Optional[str] = None
+    source: Optional[str] = None
+
+
+@dataclass(frozen=True)
 class ReadingState:
     """
     Data model for reading state.
@@ -475,7 +488,7 @@ class Ebook:
     def toc_entries(self, value: Tuple[TocEntry, ...]) -> None:
         self._toc_entries = value
 
-    def get_meta(self) -> Tuple[Tuple[str, str], ...]:
+    def get_meta(self) -> BookMetadata:
         raise NotImplementedError("Ebook.get_meta() not implemented")
 
     def initialize(self) -> None:
@@ -492,12 +505,14 @@ class Ebook:
 
 
 class Epub(Ebook):
-    NS = {
+    NAMESPACE = {
         "DAISY": "http://www.daisy.org/z3986/2005/ncx/",
         "OPF": "http://www.idpf.org/2007/opf",
         "CONT": "urn:oasis:names:tc:opendocument:xmlns:container",
         "XHTML": "http://www.w3.org/1999/xhtml",
         "EPUB": "http://www.idpf.org/2007/ops",
+        # Dublin Core
+        "DC": "http://purl.org/dc/elements/1.1/",
     }
 
     def __init__(self, fileepub: str):
@@ -509,22 +524,28 @@ class Epub(Ebook):
         self.root_filepath: str
         self.root_dirpath: str
 
-    def get_meta(self) -> Tuple[Tuple[str, str], ...]:
+    def get_meta(self) -> BookMetadata:
         assert isinstance(self.file, zipfile.ZipFile)
-
-        meta: List[Tuple[str, str]] = []
         # why self.file.read(self.root_filepath) problematic
-        cont = ET.fromstring(self.file.open(self.root_filepath).read())
-        for i in cont.findall("OPF:metadata/*", self.NS):
-            if i.text is not None:
-                meta.append((re.sub("{.*?}", "", i.tag), i.text))
-        return tuple(meta)
+        # content_opf = ET.fromstring(self.file.open(self.root_filepath).read())
+        content_opf = ET.parse(self.file.open(self.root_filepath))
+        return Epub._get_metadata(content_opf)
+
+    @staticmethod
+    def _get_metadata(content_opf: ET.ElementTree) -> BookMetadata:
+        metadata: Dict[str, Optional[str]] = {}
+        for field in dataclasses.fields(BookMetadata):
+            element = content_opf.find(f".//DC:{field.name}", Epub.NAMESPACE)
+            if element is not None:
+                metadata[field.name] = element.text
+
+        return BookMetadata(**metadata)
 
     @staticmethod
-    def get_contents(content_opf: ET.ElementTree) -> Tuple[str, ...]:
+    def _get_contents(content_opf: ET.ElementTree) -> Tuple[str, ...]:
         # cont = ET.parse(self.file.open(self.root_filepath)).getroot()
         manifests: List[Tuple[str, str]] = []
-        for manifest_elem in content_opf.findall("OPF:manifest/*", Epub.NS):
+        for manifest_elem in content_opf.findall("OPF:manifest/*", Epub.NAMESPACE):
             # EPUB3
             # if manifest_elem.get("id") != "ncx" and manifest_elem.get("properties") != "nav":
             if (
@@ -539,7 +560,7 @@ class Epub(Ebook):
 
         spines: List[str] = []
         contents: List[str] = []
-        for spine_elem in content_opf.findall("OPF:spine/*", Epub.NS):
+        for spine_elem in content_opf.findall("OPF:spine/*", Epub.NAMESPACE):
             idref = spine_elem.get("idref")
             assert idref is not None
             spines.append(idref)
@@ -555,22 +576,24 @@ class Epub(Ebook):
         return tuple(contents)
 
     @staticmethod
-    def get_tocs(toc: ET.Element, version: str, contents: Sequence[str]) -> Tuple[TocEntry, ...]:
+    def _get_tocs(toc: ET.Element, version: str, contents: Sequence[str]) -> Tuple[TocEntry, ...]:
         try:
             # EPUB3
             if version in {"1.0", "2.0"}:
-                navPoints = toc.findall("DAISY:navMap//DAISY:navPoint", Epub.NS)
+                navPoints = toc.findall("DAISY:navMap//DAISY:navPoint", Epub.NAMESPACE)
             elif version == "3.0":
-                navPoints = toc.findall("XHTML:body//XHTML:nav[@EPUB:type='toc']//XHTML:a", Epub.NS)
+                navPoints = toc.findall(
+                    "XHTML:body//XHTML:nav[@EPUB:type='toc']//XHTML:a", Epub.NAMESPACE
+                )
 
             toc_entries: List[TocEntry] = []
             for navPoint in navPoints:
                 if version in {"1.0", "2.0"}:
-                    src_elem = navPoint.find("DAISY:content", Epub.NS)
+                    src_elem = navPoint.find("DAISY:content", Epub.NAMESPACE)
                     assert src_elem is not None
                     src = src_elem.get("src")
 
-                    name_elem = navPoint.find("DAISY:navLabel/DAISY:text", Epub.NS)
+                    name_elem = navPoint.find("DAISY:navLabel/DAISY:text", Epub.NAMESPACE)
                     assert name_elem is not None
                     name = name_elem.text
                 elif version == "3.0":
@@ -608,7 +631,7 @@ class Epub(Ebook):
         assert isinstance(self.file, zipfile.ZipFile)
 
         container = ET.parse(self.file.open("META-INF/container.xml"))
-        rootfile_elem = container.find("CONT:rootfiles/CONT:rootfile", self.NS)
+        rootfile_elem = container.find("CONT:rootfiles/CONT:rootfile", Epub.NAMESPACE)
         assert rootfile_elem is not None
         self.root_filepath = rootfile_elem.attrib["full-path"]
         self.root_dirpath = (
@@ -620,16 +643,16 @@ class Epub(Ebook):
         content_opf = ET.parse(self.file.open(self.root_filepath))
         version = content_opf.getroot().get("version")
 
-        contents = Epub.get_contents(content_opf)
+        contents = Epub._get_contents(content_opf)
         self.contents = tuple(self.root_dirpath + content for content in contents)
 
         if version in {"1.0", "2.0"}:
             # "OPF:manifest/*[@id='ncx']"
             relative_toc = content_opf.find(
-                "OPF:manifest/*[@media-type='application/x-dtbncx+xml']", self.NS
+                "OPF:manifest/*[@media-type='application/x-dtbncx+xml']", Epub.NAMESPACE
             )
         elif version == "3.0":
-            relative_toc = content_opf.find("OPF:manifest/*[@properties='nav']", self.NS)
+            relative_toc = content_opf.find("OPF:manifest/*[@properties='nav']", Epub.NAMESPACE)
         else:
             raise RuntimeError(f"Unsupported Epub version: {version}")
         assert relative_toc is not None
@@ -637,7 +660,7 @@ class Epub(Ebook):
         assert relative_toc_path is not None
         toc_path = self.root_dirpath + relative_toc_path
         toc = ET.parse(self.file.open(toc_path)).getroot()
-        self.toc_entries = Epub.get_tocs(toc, version, contents)  # *self.contents (absolute path)
+        self.toc_entries = Epub._get_tocs(toc, version, contents)  # *self.contents (absolute path)
 
     def get_raw_text(self, content_path: Union[str, ET.Element]) -> str:
         assert isinstance(self.file, zipfile.ZipFile)
@@ -678,15 +701,11 @@ class Mobi(Epub):
         self.root_filepath: str
         self.root_dirpath: str
 
-    def get_meta(self) -> Tuple[Tuple[str, str], ...]:
-        meta: List[Tuple[str, str]] = []
+    def get_meta(self) -> BookMetadata:
         # why self.file.read(self.root_filepath) problematic
         with open(os.path.join(self.root_dirpath, "content.opf")) as f:
-            cont = ET.parse(f).getroot()
-        for i in cont.findall("OPF:metadata/*", self.NS):
-            if i.text is not None:
-                meta.append((re.sub("{.*?}", "", i.tag), i.text))
-        return tuple(meta)
+            content_opf = ET.parse(f)  # .getroot()
+        return Epub._get_metadata(content_opf)
 
     def initialize(self) -> None:
         assert isinstance(self.file, str)
@@ -698,12 +717,12 @@ class Mobi(Epub):
         with open(os.path.join(self.root_dirpath, "content.opf")) as f:
             content_opf = ET.parse(f)  # .getroot()
 
-        contents = Epub.get_contents(content_opf)
+        contents = Epub._get_contents(content_opf)
         self.contents = tuple(os.path.join(self.root_dirpath, content) for content in contents)
 
         with open(self.toc_path) as f:
             toc = ET.parse(f).getroot()
-        self.toc_entries = Epub.get_tocs(toc, version, contents)  # *self.contents (absolute path)
+        self.toc_entries = Epub._get_tocs(toc, version, contents)  # *self.contents (absolute path)
 
     def get_raw_text(self, content_path: Union[str, ET.Element]) -> str:
         assert isinstance(content_path, str)
@@ -739,7 +758,7 @@ class Azw3(Epub):
 
 
 class FictionBook(Ebook):
-    NS = {"FB2": "http://www.gribuser.ru/xml/fictionbook/2.0"}
+    NAMESPACE = {"FB2": "http://www.gribuser.ru/xml/fictionbook/2.0"}
 
     def __init__(self, filefb: str):
         self.path = os.path.abspath(filefb)
@@ -749,22 +768,37 @@ class FictionBook(Ebook):
         # by calling self.initialize()
         self.root: ET.Element
 
-    def get_meta(self) -> Tuple[Tuple[str, str], ...]:
-        desc = self.root.find("FB2:description", self.NS)
-        assert desc is not None
-        alltags = desc.findall("*/*")
-        return tuple((re.sub("{.*?}", "", i.tag), " ".join(i.itertext())) for i in alltags)
+    def get_meta(self) -> BookMetadata:
+        title_elem = self.root.find(".//FB2:book-title", FictionBook.NAMESPACE)
+        first_name_elem = self.root.find(".//FB2:first-name", FictionBook.NAMESPACE)
+        last_name_elem = self.root.find(".//FB2:last-name", FictionBook.NAMESPACE)
+        date_elem = self.root.find(".//FB2:date", FictionBook.NAMESPACE)
+        identifier_elem = self.root.find(".//FB2:id", FictionBook.NAMESPACE)
+
+        author = first_name_elem.text if first_name_elem is not None else None
+        if last_name_elem is not None:
+            if author is not None and author != "":
+                author += f" {last_name_elem.text}"
+            else:
+                author = last_name_elem.text
+
+        return BookMetadata(
+            title=title_elem.text if title_elem is not None else None,
+            creator=author,
+            date=date_elem.text if date_elem is not None else None,
+            identifier=identifier_elem.text if identifier_elem is not None else None,
+        )
 
     def initialize(self) -> None:
         cont = ET.parse(self.file)
         self.root = cont.getroot()
 
-        self.contents = tuple(self.root.findall("FB2:body/*", self.NS))
+        self.contents = tuple(self.root.findall("FB2:body/*", FictionBook.NAMESPACE))
 
         # TODO
         toc_entries: List[TocEntry] = []
         for n, i in enumerate(self.contents):
-            title = i.find("FB2:title", self.NS)
+            title = i.find("FB2:title", FictionBook.NAMESPACE)
             if title is not None:
                 toc_entries.append(
                     TocEntry(label="".join(title.itertext()), content_index=n, section=None)
@@ -801,8 +835,8 @@ class HTMLtoLines(HTMLParser):
     ital = {"i", "em"}
     bold = {"b", "strong"}
     # hide = {"script", "style", "head", ", "sub}
-    sup_lookup = "⁰¹²³⁴⁵⁶⁷⁸⁹"
-    sub_lookup = "₀₁₂₃₄₅₆₇₈₉"
+    # sup_lookup = "⁰¹²³⁴⁵⁶⁷⁸⁹"
+    # sub_lookup = "₀₁₂₃₄₅₆₇₈₉"
 
     attr_bold = curses.A_BOLD
     try:
@@ -917,9 +951,7 @@ class HTMLtoLines(HTMLParser):
         return spans
 
     @staticmethod
-    def _group_spans_by_row(
-        blocks: Sequence[TextSpan]
-    ) -> Mapping[int, List[TextSpan]]:
+    def _group_spans_by_row(blocks: Sequence[TextSpan]) -> Mapping[int, List[TextSpan]]:
         groups: Dict[int, List[TextSpan]] = {}
         for block in blocks:
             row = block.start.row
@@ -1685,7 +1717,7 @@ def construct_relative_reading_state(
 
 
 def get_ebook_obj(filepath: str) -> Ebook:
-    file_ext = os.path.splitext(filepath)[1]
+    file_ext = os.path.splitext(filepath)[1].lower()
     if file_ext == ".epub":
         return Epub(filepath)
     elif file_ext == ".fb2":
@@ -2245,11 +2277,14 @@ class Reader:
         mdata = "[File Info]\nPATH: {}\nSIZE: {} MB\n \n[Book Info]\n".format(
             self.ebook.path, round(os.path.getsize(self.ebook.path) / 1024 ** 2, 2)
         )
-        for i in self.ebook.get_meta():
-            data = re.sub("<[^>]*>", "", i[1])
-            mdata += i[0].upper() + ": " + data + "\n"
-            # data = re.sub("\t", "", data)
-            # mdata += textwrap.wrap(i[0].upper() + ": " + data, wi - 6)
+
+        book_metadata = self.ebook.get_meta()
+        for field in dataclasses.fields(book_metadata):
+            value = getattr(book_metadata, field.name)
+            if value:
+                value = unescape(re.sub("<[^>]*>", "", value))
+                mdata += f"{field.name.title()}: {value}\n"
+
         return "Metadata", mdata, self.keymap.Metadata
 
     @text_win
author	benadha <benawiadha@gmail.com>	2022-01-07 20:42:33 +0700
committer	benadha <benawiadha@gmail.com>	2022-01-07 20:42:33 +0700
commit	8c6d3c6d2b131d13455ec9833eab83f6ce942fd0 (patch)
tree	124d50801af5430013eaa0c76b132a948083c6aa /epy.py
parent	cd3d7d0814f579116c4593d67fa6fe6beda23994 (diff)
download	epy-8c6d3c6d2b131d13455ec9833eab83f6ce942fd0.tar.gz