path: root/src/epy_reader/parser.py



import curses
import dataclasses
import re
import textwrap
from html import unescape
from html.parser import HTMLParser
from typing import Dict, List, Mapping, Optional, Sequence, Set, Tuple, Union
from urllib.parse import unquote

from epy_reader.models import CharPos, InlineStyle, TextMark, TextSpan, TextStructure


class HTMLtoLines(HTMLParser):
    """
    Flatten an (X)HTML document into plain text lines for terminal display.

    While being fed, the parser accumulates one string per paragraph in
    ``self.text`` and remembers, by paragraph index, which paragraphs are
    headings, indented blocks, bullets, preformatted text or images, where
    requested section ids were seen, and where italic/bold runs start and end.
    ``get_structured_text`` then returns either those raw paragraphs or a
    wrapped rendering of them together with the collected metadata.
    """

    # Tag families, grouped by how they influence the produced text.
    para = {"p", "div"}
    inde = {"q", "dt", "dd", "blockquote"}
    pref = {"pre"}
    bull = {"li"}
    hide = {"script", "style", "head"}
    ital = {"i", "em"}
    bold = {"b", "strong"}

    # Heading tags are exactly h1..h6; compiled once and matched in full so
    # that tags which merely start with "h1".."h6" are not treated as headings.
    _HEADING_RE = re.compile(r"h[1-6]")

    attr_bold = curses.A_BOLD
    # Not every curses build exposes A_ITALIC; degrade to underline, then to
    # no attribute at all.
    try:
        attr_italic = curses.A_ITALIC
    except AttributeError:
        try:
            attr_italic = curses.A_UNDERLINE
        except AttributeError:
            attr_italic = curses.A_NORMAL

    @staticmethod
    def _mark_to_spans(text: Sequence[str], marks: Sequence[TextMark]) -> List[TextSpan]:
        """
        Convert text marks into per-row text spans.

        A mark that starts and ends on the same row yields one span; a mark
        crossing rows yields one span for the rest of the first row, one span
        for every complete row in between and one span for the head of the
        last row. Marks whose ``is_valid()`` is false are skipped.
        Duplicate spans are kept.

        :param text: the unwrapped rows the marks refer to
        :param marks: marks to convert
        :return: spans in the order of their marks
        """
        spans: List[TextSpan] = []
        for mark in marks:
            if not mark.is_valid():
                continue
            # mypy issue, should be handled by mark.is_valid()
            assert mark.end is not None

            first_row, last_row = mark.start.row, mark.end.row
            if first_row == last_row:
                spans.append(TextSpan(start=mark.start, n_letters=mark.end.col - mark.start.col))
                continue

            # tail of the first row
            spans.append(
                TextSpan(start=mark.start, n_letters=len(text[first_row]) - mark.start.col)
            )
            # rows completely covered by the mark
            spans.extend(
                TextSpan(start=CharPos(row=row, col=0), n_letters=len(text[row]))
                for row in range(first_row + 1, last_row)
            )
            # head of the last row
            spans.append(TextSpan(start=CharPos(row=last_row, col=0), n_letters=mark.end.col))

        return spans

    @staticmethod
    def _adjust_wrapped_spans(
        wrapped_lines: Sequence[str],
        span: TextSpan,
        *,
        line_adjustment: int = 0,
        left_adjustment: int = 0,
    ) -> List[TextSpan]:
        """
        Adjust a text span of one unwrapped row to the wrapped lines of that row.

        Not perfect, but should be good enough considering
        the limitation on commandline interface.

        :param wrapped_lines: the lines the row was wrapped into
        :param span: span in columns of the unwrapped row (its row is ignored)
        :param line_adjustment: row number given to the first wrapped line
        :param left_adjustment: columns added to every resulting start column
        :return: one span per wrapped line touched by ``span``
        """
        first_row = line_adjustment
        start_col = span.start.col
        end_col = start_col + span.n_letters

        prev = 0  # chars length before current line
        spans: List[TextSpan] = []
        for n, line in enumerate(wrapped_lines):
            # + 1 compensates textwrap.wrap(*args, replace_whitespace=True, drop_whitespace=True)
            line_len = len(line) + 1
            current = prev + line_len  # chars length before next line

            starts_here = prev <= start_col < current
            ends_here = prev <= end_col < current

            # -:unmarked *:marked
            # |------*****--------|
            if starts_here and ends_here:
                spans.append(
                    TextSpan(
                        start=CharPos(row=first_row + n, col=start_col - prev + left_adjustment),
                        n_letters=span.n_letters,
                    )
                )

            # |----------*********|
            elif starts_here:
                spans.append(
                    TextSpan(
                        start=CharPos(row=first_row + n, col=start_col - prev + left_adjustment),
                        n_letters=current - start_col - 1,  # -1: dropped whitespace
                    )
                )

            # |********-----------|
            elif ends_here:
                spans.append(
                    TextSpan(
                        start=CharPos(row=first_row + n, col=0 + left_adjustment),
                        n_letters=end_col - prev + 1,  # +1: dropped whitespace
                    )
                )

            # |*******************|
            elif start_col <= prev < end_col and start_col <= current < end_col:
                spans.append(
                    TextSpan(
                        start=CharPos(row=first_row + n, col=0 + left_adjustment),
                        n_letters=line_len - 1,  # -1: dropped whitespace
                    )
                )

            # this line starts after the span ended: nothing more to emit
            elif prev > end_col:
                break

            prev = current

        return spans

    @staticmethod
    def _group_spans_by_row(blocks: Sequence[TextSpan]) -> Mapping[int, List[TextSpan]]:
        """Group spans by the row of their start position, keeping input order."""
        groups: Dict[int, List[TextSpan]] = {}
        for block in blocks:
            groups.setdefault(block.start.row, []).append(block)
        return groups

    def __init__(self, sects: Optional[Set[str]] = None):
        """
        :param sects: ids to look for in tag ``id`` attributes. ``None``
                      (the default) and ``{""}`` both disable the lookup.
        """
        super().__init__()
        self.text = [""]  # one entry per paragraph; the last one is still being filled
        # which kind of element the parser is currently inside of
        self.ishead = False
        self.isinde = False
        self.isbull = False
        self.ispref = False
        self.ishidden = False
        # indexes into self.text, by kind of paragraph
        self.idhead = set()
        self.idinde = set()
        self.idbull = set()
        self.idpref = set()
        self.idimgs = set()
        # a fresh set per instance instead of a shared mutable default argument
        self.sects = {""} if sects is None else sects
        self.sectsindex = {}  # {index into self.text: section id}
        self.italic_marks: List[TextMark] = []
        self.bold_marks: List[TextMark] = []
        self.imgs: Dict[int, str] = dict()  # {index into self.text: unquoted image path}

    def _cursor(self) -> CharPos:
        """Return the position right after the last character collected so far."""
        return CharPos(row=len(self.text) - 1, col=len(self.text[-1]))

    def _open_mark(self, marks: List[TextMark]) -> None:
        """Start a new mark at the cursor unless the last one is still open."""
        # NOTE(review): is_valid() is presumably true once a mark has an end - confirm in models
        if not marks or marks[-1].is_valid():
            marks.append(TextMark(start=self._cursor()))

    def _close_mark(self, marks: List[TextMark]) -> None:
        """End the last mark at the cursor; a closing tag without any mark is ignored."""
        if not marks:
            # stray closing tag such as "</i>" without "<i>": nothing to close
            return
        marks[-1] = dataclasses.replace(marks[-1], end=self._cursor())

    def _append_images(
        self, tag: str, attrs: Sequence[Tuple[str, Optional[str]]], *, trailing_blank: bool
    ) -> None:
        """
        Append an "[IMAGE]" row for every source attribute of an img/image tag.

        :param trailing_blank: also append the empty row that ends the image
                               paragraph (used when no end tag will follow)
        """
        for name, value in attrs:
            is_source = (tag == "img" and name == "src") or (
                tag == "image" and name.endswith("href")
            )
            # HTMLParser reports attributes without a value (<img src>) as None
            if not is_source or value is None:
                continue
            this_line = len(self.text)
            self.idimgs.add(this_line)
            self.imgs[this_line] = unquote(value)
            self.text.append("[IMAGE]")
            if trailing_blank:
                self.text.append("")

    def _register_sections(self, attrs: Sequence[Tuple[str, Optional[str]]]) -> None:
        """Remember the current row for every ``id`` attribute that is a wanted section."""
        if self.sects == {""}:
            return
        for name, value in attrs:
            if name == "id" and value in self.sects:
                self.sectsindex[len(self.text) - 1] = value

    def handle_starttag(self, tag, attrs):
        """Update parser state for an opening tag and record section ids."""
        if self._HEADING_RE.fullmatch(tag) is not None:
            self.ishead = True
        elif tag in self.inde:
            self.isinde = True
        elif tag in self.pref:
            self.ispref = True
        elif tag in self.bull:
            self.isbull = True
        elif tag in self.hide:
            self.ishidden = True
        elif tag == "sup":
            self.text[-1] += "^{"
        elif tag == "sub":
            self.text[-1] += "_{"
        # NOTE: "img" and "image"
        # In HTML, both are startendtag (no need endtag)
        # but in XHTML both need endtag
        elif tag in {"img", "image"}:
            self._append_images(tag, attrs, trailing_blank=False)
        # formatting
        elif tag in self.ital:
            self._open_mark(self.italic_marks)
        elif tag in self.bold:
            self._open_mark(self.bold_marks)
        self._register_sections(attrs)

    def handle_startendtag(self, tag, attrs):
        """Handle self-closing tags: line breaks, images and section ids."""
        if tag == "br":
            self.text += [""]
        elif tag in {"img", "image"}:
            self._append_images(tag, attrs, trailing_blank=True)
        # sometimes attribute "id" is inside "startendtag"
        # especially html from mobi module (kindleunpack fork)
        self._register_sections(attrs)

    def handle_endtag(self, tag):
        """Close the paragraph, state flag or formatting mark opened by ``tag``."""
        if self._HEADING_RE.fullmatch(tag) is not None:
            self.text.append("")
            self.text.append("")
            self.ishead = False
        elif tag in self.para:
            self.text.append("")
        elif tag in self.hide:
            self.ishidden = False
        elif tag in self.inde:
            if self.text[-1] != "":
                self.text.append("")
            self.isinde = False
        elif tag in self.pref:
            if self.text[-1] != "":
                self.text.append("")
            self.ispref = False
        elif tag in self.bull:
            if self.text[-1] != "":
                self.text.append("")
            self.isbull = False
        elif tag in {"sub", "sup"}:
            self.text[-1] += "}"
        elif tag in {"img", "image"}:
            self.text.append("")
        # formatting
        elif tag in self.ital:
            self._close_mark(self.italic_marks)
        elif tag in self.bold:
            self._close_mark(self.bold_marks)

    def handle_data(self, raw):
        """
        Append character data to the current paragraph and classify that paragraph.

        Data inside hidden elements is dropped. Outside of preformatted
        elements every run of whitespace is collapsed into a single space.
        """
        if not raw or self.ishidden:
            return

        # a paragraph never starts with whitespace
        tmp = raw.lstrip() if self.text[-1] == "" else raw
        # HTMLParser (convert_charrefs=True by default) hands over data with
        # character references already decoded; decoding again would turn a
        # literal "&lt;" written as "&amp;lt;" into "<".
        line = tmp if self.ispref else re.sub(r"\s+", " ", tmp)
        self.text[-1] += line

        row = len(self.text) - 1
        if self.ishead:
            self.idhead.add(row)
        elif self.isbull:
            self.idbull.add(row)
        elif self.isinde:
            self.idinde.add(row)
        elif self.ispref:
            self.idpref.add(row)

    def get_structured_text(
        self, textwidth: Optional[int] = 0, starting_line: int = 0
    ) -> Union[Tuple[str, ...], TextStructure]:
        """
        Return the collected text, optionally wrapped to ``textwidth``.

        :param textwidth: width to wrap to; when falsy the unwrapped
                          paragraphs are returned as a tuple of str
        :param starting_line: added to every row number reported in the
                              image map, section rows and formatting
        :return: Tuple[str, ...] if textwidth is falsy else TextStructure
        """
        if not textwidth:
            return tuple(self.text)

        text: List[str] = []
        images: Dict[int, str] = dict()  # {line_num: path/in/zip}
        sect: Dict[str, int] = dict()  # {section_id: line_num}
        formatting: List[InlineStyle] = []

        italic_groups = HTMLtoLines._group_spans_by_row(
            HTMLtoLines._mark_to_spans(self.text, self.italic_marks)
        )
        bold_groups = HTMLtoLines._group_spans_by_row(
            HTMLtoLines._mark_to_spans(self.text, self.bold_marks)
        )
        # italics of a row are emitted before its bolds
        styled_groups = ((italic_groups, self.attr_italic), (bold_groups, self.attr_bold))
        # rows rendered with a three column prefix; built once, not per row
        prefixed_rows = self.idbull | self.idinde

        for n, line in enumerate(self.text):
            startline = len(text)
            if n in self.sectsindex:
                sect[self.sectsindex[n]] = starting_line + startline

            if n in self.idhead:
                text += [line.center(textwidth), ""]
                formatting += [
                    InlineStyle(
                        row=starting_line + i, col=0, n_letters=len(text[i]), attr=self.attr_bold
                    )
                    for i in range(startline, len(text))
                ]
            elif n in self.idinde:
                text += ["   " + part for part in textwrap.wrap(line, textwidth - 3)] + [""]
            elif n in self.idbull:
                wrapped = textwrap.wrap(line, textwidth - 3)
                # only the first wrapped line carries the bullet; chosen by
                # position so a repeated line does not get a second bullet
                text += [
                    (" - " if idx == 0 else "   ") + part for idx, part in enumerate(wrapped)
                ] + [""]
            elif n in self.idpref:
                for pre_line in line.splitlines():
                    text += ["   " + part for part in textwrap.wrap(pre_line, textwidth - 6)]
                text.append("")
            elif n in self.idimgs:
                images[starting_line + len(text)] = self.imgs[n]
                text.append(line.center(textwidth))
                formatting.append(
                    InlineStyle(
                        row=starting_line + len(text) - 1,
                        col=0,
                        n_letters=len(text[-1]),
                        attr=self.attr_bold,
                    )
                )
                text.append("")
            else:
                text += textwrap.wrap(line, textwidth) + [""]

            wrapped_block = text[startline:]
            left_adjustment = 3 if n in prefixed_rows else 0

            for groups, attr in styled_groups:
                for row_span in groups.get(n, []):
                    adjusted = HTMLtoLines._adjust_wrapped_spans(
                        wrapped_block,
                        row_span,
                        line_adjustment=startline,
                        left_adjustment=left_adjustment,
                    )
                    formatting.extend(
                        InlineStyle(
                            row=starting_line + span.start.row,
                            col=span.start.col,
                            n_letters=span.n_letters,
                            attr=attr,
                        )
                        for span in adjusted
                    )

        # chapter suffix
        text += ["***".center(textwidth)]

        return TextStructure(
            text_lines=tuple(text),
            image_maps=images,
            section_rows=sect,
            formatting=tuple(formatting),
        )


def parse_html(
    html_src: str,
    *,
    textwidth: Optional[int] = None,
    section_ids: Optional[Set[str]] = None,
    starting_line: int = 0,
) -> Union[Tuple[str, ...], TextStructure]:
    """
    Feed an html string to ``HTMLtoLines`` and return the text it extracted.

    :param html_src: html markup to parse
    :param textwidth: width the returned text is wrapped to; when falsy
                      (None or 0) the unwrapped paragraphs are returned
    :param section_ids: ids to look for in the ``id`` attribute of html tags;
                        a falsy value is replaced by an empty set
    :param starting_line: forwarded to ``HTMLtoLines.get_structured_text``,
                          which adds it to the row numbers it reports
    :return: Tuple[str, ...] if textwidth is falsy else TextStructure
    """
    wanted_ids = section_ids or set()

    html_parser = HTMLtoLines(wanted_ids)
    html_parser.feed(html_src)
    html_parser.close()

    return html_parser.get_structured_text(textwidth, starting_line)