1 files changed, 421 insertions, 0 deletions
diff --git a/src/epy_reader/parser.py b/src/epy_reader/parser.py
new file mode 100644
index 0000000..6eced00
--- /dev/null
+++ b/src/epy_reader/parser.py
@@ -0,0 +1,421 @@
+import curses
+import dataclasses
+import re
+import textwrap
+from html import unescape
+from html.parser import HTMLParser
+from typing import Dict, List, Mapping, Optional, Sequence, Set, Tuple, Union
+from urllib.parse import unquote
+
+from epy_reader.models import CharPos, InlineStyle, TextMark, TextSpan, TextStructure
+
+
+class HTMLtoLines(HTMLParser):
+    """
+    Flatten an (X)HTML document into paragraphs of plain text.
+
+    Feeding markup fills ``self.text`` with one string per paragraph and
+    records, by paragraph index, which of them are headings, indented
+    blocks, bullets, preformatted blocks or images, plus the start/end
+    positions of italic and bold runs. ``get_structured_text`` wraps the
+    paragraphs to a width and maps those records onto the wrapped rows.
+    """
+
+    para = {"p", "div"}
+    inde = {"q", "dt", "dd", "blockquote"}
+    pref = {"pre"}
+    bull = {"li"}
+    hide = {"script", "style", "head"}
+    ital = {"i", "em"}
+    bold = {"b", "strong"}
+
+    attr_bold = curses.A_BOLD
+    # curses.A_ITALIC may be missing: fall back to underline, then to normal
+    try:
+        attr_italic = curses.A_ITALIC
+    except AttributeError:
+        try:
+            attr_italic = curses.A_UNDERLINE
+        except AttributeError:
+            attr_italic = curses.A_NORMAL
+
+    @staticmethod
+    def _mark_to_spans(text: Sequence[str], marks: Sequence[TextMark]) -> List[TextSpan]:
+        """
+        Convert text marks in line of text to per line text span.
+        Keeping duplicate spans; marks failing ``is_valid()`` are skipped.
+
+        :param text: paragraphs that the rows of the marks index into
+        :param marks: start/end positions recorded while parsing
+        :return: one span per paragraph touched by each mark, in mark order
+        """
+        spans: List[TextSpan] = []
+        for mark in marks:
+            if not mark.is_valid():
+                continue
+            # mypy issue, should be handled by mark.is_valid()
+            assert mark.end is not None
+            first, last = mark.start, mark.end
+            if first.row == last.row:
+                spans.append(TextSpan(start=first, n_letters=last.col - first.col))
+                continue
+            # first paragraph: from the mark start to the end of the paragraph
+            spans.append(TextSpan(start=first, n_letters=len(text[first.row]) - first.col))
+            # paragraphs in between are covered entirely
+            for row in range(first.row + 1, last.row):
+                spans.append(TextSpan(start=CharPos(row=row, col=0), n_letters=len(text[row])))
+            # last paragraph: from its first column up to the mark end
+            spans.append(TextSpan(start=CharPos(row=last.row, col=0), n_letters=last.col))
+
+        return spans
+
+    @staticmethod
+    def _adjust_wrapped_spans(
+        wrapped_lines: Sequence[str],
+        span: TextSpan,
+        *,
+        line_adjustment: int = 0,
+        left_adjustment: int = 0,
+    ) -> List[TextSpan]:
+        """
+        Adjust text span to wrapped lines.
+        Not perfect, but should be good enough considering
+        the limitation on commandline interface.
+
+        :param wrapped_lines: rows produced by wrapping one paragraph
+        :param span: span given in columns of the unwrapped paragraph
+        :param line_adjustment: row number of the first wrapped line
+        :param left_adjustment: columns added to the start of every span
+        :return: spans addressed in rows/columns of the wrapped text
+        """
+        current_row = line_adjustment
+        start_col = span.start.col
+        end_col = start_col + span.n_letters
+
+        prev = 0  # chars length before current line
+        spans: List[TextSpan] = []
+        for n, line in enumerate(wrapped_lines):
+            # + 1 compensates textwrap.wrap(*args, replace_whitespace=True, drop_whitespace=True)
+            line_len = len(line) + 1
+            current = prev + line_len  # chars length before next line
+            starts_here = prev <= start_col < current
+            ends_here = prev <= end_col < current
+            row = current_row + n
+
+            # -:unmarked *:marked
+            # |------*****--------|
+            if starts_here and ends_here:
+                col, n_letters = start_col - prev + left_adjustment, span.n_letters
+            # |----------*********|
+            elif starts_here:
+                # -1: dropped whitespace
+                col, n_letters = start_col - prev + left_adjustment, current - start_col - 1
+            # |********-----------|
+            elif ends_here:
+                # +1: dropped whitespace
+                col, n_letters = left_adjustment, end_col - prev + 1
+            # |*******************|
+            elif start_col <= prev < end_col and start_col <= current < end_col:
+                # -1: dropped whitespace
+                col, n_letters = left_adjustment, line_len - 1
+            elif prev > end_col:
+                break
+            else:
+                prev = current
+                continue
+
+            spans.append(TextSpan(start=CharPos(row=row, col=col), n_letters=n_letters))
+            prev = current
+
+        return spans
+
+    @staticmethod
+    def _group_spans_by_row(blocks: Sequence[TextSpan]) -> Mapping[int, List[TextSpan]]:
+        """Group spans by the row they start on, keeping their order within a row."""
+        groups: Dict[int, List[TextSpan]] = {}
+        for block in blocks:
+            groups.setdefault(block.start.row, []).append(block)
+        return groups
+
+    def __init__(self, sects=None):
+        """
+        :param sects: ids whose paragraph should be remembered in ``sectsindex``;
+                      None (the default) means ``{""}``, i.e. no lookup at all
+        """
+        HTMLParser.__init__(self)
+        self.text = [""]  # paragraphs collected so far, the last one is still open
+        # state flags, switched by the start/end tag of the matching element
+        self.ishead = False
+        self.isinde = False
+        self.isbull = False
+        self.ispref = False
+        self.ishidden = False
+        # indexes into self.text of the paragraphs of each kind
+        self.idhead = set()
+        self.idinde = set()
+        self.idbull = set()
+        self.idpref = set()
+        self.idimgs = set()
+        # a None default instead of a set literal: no object shared between instances
+        self.sects = {""} if sects is None else sects
+        self.sectsindex = {}  # {paragraph index: section id}
+        self.italic_marks: List[TextMark] = []
+        self.bold_marks: List[TextMark] = []
+        self.imgs: Dict[int, str] = dict()  # {paragraph index: image path}
+
+    def _cursor(self) -> CharPos:
+        """Return the position right after the last character collected so far."""
+        return CharPos(row=len(self.text) - 1, col=len(self.text[-1]))
+
+    def _open_mark(self, marks: List[TextMark]) -> None:
+        """Start a mark at the cursor, unless the latest mark is not ``is_valid()`` yet."""
+        if not marks or marks[-1].is_valid():
+            marks.append(TextMark(start=self._cursor()))
+
+    def _close_mark(self, marks: List[TextMark]) -> None:
+        """End the latest mark at the cursor; a closing tag without any mark is ignored."""
+        if marks:  # a stray </i> or </b> used to raise IndexError here
+            marks[-1] = dataclasses.replace(marks[-1], end=self._cursor())
+
+    def _add_images(self, tag, attrs, *, trailing_blank: bool) -> None:
+        """
+        Append an "[IMAGE]" paragraph for every source attribute of an image tag.
+
+        :param tag: "img" (source in ``src``) or "image" (source in ``*href``)
+        :param attrs: (name, value) pairs given by HTMLParser
+        :param trailing_blank: also open a new empty paragraph after the image
+        """
+        for name, value in attrs:
+            is_source = name == "src" if tag == "img" else name.endswith("href")
+            # HTMLParser reports a valueless attribute (<img src>) as None,
+            # which unquote() cannot handle: such an attribute is skipped
+            if not is_source or value is None:
+                continue
+            this_line = len(self.text)
+            self.idimgs.add(this_line)
+            self.imgs[this_line] = unquote(value)
+            self.text.append("[IMAGE]")
+            if trailing_blank:
+                self.text.append("")
+
+    def _record_sections(self, attrs) -> None:
+        """Map the current paragraph to each ``id`` attribute found in ``self.sects``."""
+        if self.sects == {""}:
+            return
+        for name, value in attrs:
+            if name == "id" and value in self.sects:
+                self.sectsindex[len(self.text) - 1] = value
+
+    def handle_starttag(self, tag, attrs):
+        """Switch on the state of the opened element; record images, marks and ids."""
+        if re.match("h[1-6]", tag) is not None:
+            self.ishead = True
+        elif tag in self.inde:
+            self.isinde = True
+        elif tag in self.pref:
+            self.ispref = True
+        elif tag in self.bull:
+            self.isbull = True
+        elif tag in self.hide:
+            self.ishidden = True
+        elif tag == "sup":
+            self.text[-1] += "^{"
+        elif tag == "sub":
+            self.text[-1] += "_{"
+        # NOTE: "img" and "image"
+        # In HTML, both are startendtag (no need endtag)
+        # but in XHTML both need endtag
+        elif tag in {"img", "image"}:
+            # the paragraph after the image is opened by handle_endtag
+            self._add_images(tag, attrs, trailing_blank=False)
+        # formatting
+        elif tag in self.ital:
+            self._open_mark(self.italic_marks)
+        elif tag in self.bold:
+            self._open_mark(self.bold_marks)
+        self._record_sections(attrs)
+
+    def handle_startendtag(self, tag, attrs):
+        """Handle self-closing tags: ``<br/>`` breaks the paragraph, images are added."""
+        if tag == "br":
+            self.text += [""]
+        elif tag in {"img", "image"}:
+            self._add_images(tag, attrs, trailing_blank=True)
+        # sometimes attribute "id" is inside "startendtag"
+        # especially html from mobi module (kindleunpack fork)
+        self._record_sections(attrs)
+
+    def handle_endtag(self, tag):
+        """Close the element: end its paragraph, clear its state flag or end its mark."""
+        if re.match("h[1-6]", tag) is not None:
+            self.text.append("")
+            self.text.append("")
+            self.ishead = False
+        elif tag in self.para:
+            self.text.append("")
+        elif tag in self.hide:
+            self.ishidden = False
+        elif tag in self.inde:
+            if self.text[-1] != "":
+                self.text.append("")
+            self.isinde = False
+        elif tag in self.pref:
+            if self.text[-1] != "":
+                self.text.append("")
+            self.ispref = False
+        elif tag in self.bull:
+            if self.text[-1] != "":
+                self.text.append("")
+            self.isbull = False
+        elif tag in {"sub", "sup"}:
+            self.text[-1] += "}"
+        elif tag in {"img", "image"}:
+            self.text.append("")
+        # formatting
+        elif tag in self.ital:
+            self._close_mark(self.italic_marks)
+        elif tag in self.bold:
+            self._close_mark(self.bold_marks)
+
+    def handle_data(self, raw):
+        """Append visible text to the open paragraph and tag the paragraph's kind."""
+        if not raw or self.ishidden:
+            return
+        # no leading whitespace at the very start of a paragraph
+        tmp = raw.lstrip() if self.text[-1] == "" else raw
+        if self.ispref:
+            line = unescape(tmp)
+        else:
+            # outside <pre> every run of whitespace collapses into one space
+            line = unescape(re.sub(r"\s+", " ", tmp))
+        self.text[-1] += line
+        this_line = len(self.text) - 1
+        if self.ishead:
+            self.idhead.add(this_line)
+        elif self.isbull:
+            self.idbull.add(this_line)
+        elif self.isinde:
+            self.idinde.add(this_line)
+        elif self.ispref:
+            self.idpref.add(this_line)
+
+    def get_structured_text(
+        self, textwidth: Optional[int] = 0, starting_line: int = 0
+    ) -> Union[Tuple[str, ...], TextStructure]:
+        """
+        Return the parsed text, wrapped to ``textwidth`` when one is given.
+
+        :param textwidth: width to wrap to; 0 or None returns the raw paragraphs
+        :param starting_line: offset added to every row reported for sections,
+                              images and formatting
+        :return: Tuple[str, ...] of paragraphs if textwidth is falsy, else TextStructure
+        """
+        if not textwidth:
+            return tuple(self.text)
+
+        text: List[str] = []
+        images: Dict[int, str] = dict()  # {line_num: path/in/zip}
+        sect: Dict[str, int] = dict()  # {section_id: line_num}
+        formatting: List[InlineStyle] = []
+
+        italic_spans: List[TextSpan] = HTMLtoLines._mark_to_spans(self.text, self.italic_marks)
+        bold_spans: List[TextSpan] = HTMLtoLines._mark_to_spans(self.text, self.bold_marks)
+        italic_groups = HTMLtoLines._group_spans_by_row(italic_spans)
+        bold_groups = HTMLtoLines._group_spans_by_row(bold_spans)
+
+        for n, line in enumerate(self.text):
+            startline = len(text)
+            if n in self.sectsindex:
+                sect[self.sectsindex[n]] = starting_line + len(text)
+            if n in self.idhead:
+                text += [line.center(textwidth)] + [""]
+                formatting += [
+                    InlineStyle(
+                        row=starting_line + i, col=0, n_letters=len(text[i]), attr=self.attr_bold
+                    )
+                    for i in range(startline, len(text))
+                ]
+            elif n in self.idinde:
+                text += ["   " + i for i in textwrap.wrap(line, textwidth - 3)] + [""]
+            elif n in self.idbull:
+                tmp = textwrap.wrap(line, textwidth - 3)
+                # the bullet goes on the first wrapped line only, decided by position:
+                # comparing by content also marked later lines with the same text
+                text += [(" - " if k == 0 else "   ") + i for k, i in enumerate(tmp)] + [""]
+            elif n in self.idpref:
+                wraptmp = []
+                for tmp_line in line.splitlines():
+                    wraptmp += textwrap.wrap(tmp_line, textwidth - 6)
+                text += ["   " + i for i in wraptmp] + [""]
+            elif n in self.idimgs:
+                row = starting_line + len(text)
+                centered = line.center(textwidth)
+                images[row] = self.imgs[n]
+                formatting.append(
+                    InlineStyle(row=row, col=0, n_letters=len(centered), attr=self.attr_bold)
+                )
+                text += [centered, ""]
+            else:
+                text += textwrap.wrap(line, textwidth) + [""]
+
+            endline = len(text)
+
+            left_adjustment = 3 if n in self.idbull or n in self.idinde else 0
+
+            # italic first, then bold: map the paragraph's spans onto its wrapped rows
+            for groups, attr in ((italic_groups, self.attr_italic), (bold_groups, self.attr_bold)):
+                for spans in groups.get(n, []):
+                    wrapped_spans = HTMLtoLines._adjust_wrapped_spans(
+                        text[startline:endline],
+                        spans,
+                        line_adjustment=startline,
+                        left_adjustment=left_adjustment,
+                    )
+                    for span in wrapped_spans:
+                        formatting.append(
+                            InlineStyle(
+                                row=starting_line + span.start.row,
+                                col=span.start.col,
+                                n_letters=span.n_letters,
+                                attr=attr,
+                            )
+                        )
+
+        # chapter suffix
+        text += ["***".center(textwidth)]
+
+        return TextStructure(
+            text_lines=tuple(text),
+            image_maps=images,
+            section_rows=sect,
+            formatting=tuple(formatting),
+        )
+
+
+def parse_html(
+    html_src: str,
+    *,
+    textwidth: Optional[int] = None,
+    section_ids: Optional[Set[str]] = None,
+    starting_line: int = 0,
+) -> Union[Tuple[str, ...], TextStructure]:
+    """
+    Run html_src through HTMLtoLines and return the text it extracted.
+
+    :param html_src: html markup to convert
+    :param textwidth: width to wrap the text to; when None (or 0) the
+                      unwrapped paragraphs are returned instead
+    :param section_ids: ids to look for inside html tag attr;
+                        None or an empty set looks for nothing
+    :param starting_line: offset added to every row number reported in the
+                          TextStructure (sections, images, formatting)
+    :return: Tuple[str, ...] if textwidth not given else TextStructure
+    """
+    # a fresh set per call; an empty set matches no id attribute
+    wanted_ids = section_ids or set()
+
+    converter = HTMLtoLines(wanted_ids)
+    converter.feed(html_src)
+    converter.close()
+
+    return converter.get_structured_text(textwidth, starting_line)