diff options
Diffstat (limited to 'src/epy_reader/parser.py')
-rw-r--r-- | src/epy_reader/parser.py | 421 |
1 files changed, 421 insertions, 0 deletions
diff --git a/src/epy_reader/parser.py b/src/epy_reader/parser.py new file mode 100644 index 0000000..6eced00 --- /dev/null +++ b/src/epy_reader/parser.py @@ -0,0 +1,421 @@ +import curses +import dataclasses +import re +import textwrap +from html import unescape +from html.parser import HTMLParser +from typing import Dict, List, Mapping, Optional, Sequence, Set, Tuple, Union +from urllib.parse import unquote + +from epy_reader.models import CharPos, InlineStyle, TextMark, TextSpan, TextStructure + + +class HTMLtoLines(HTMLParser): + para = {"p", "div"} + inde = {"q", "dt", "dd", "blockquote"} + pref = {"pre"} + bull = {"li"} + hide = {"script", "style", "head"} + ital = {"i", "em"} + bold = {"b", "strong"} + # hide = {"script", "style", "head", ", "sub} + # sup_lookup = "⁰¹²³⁴⁵⁶⁷⁸⁹" + # sub_lookup = "₀₁₂₃₄₅₆₇₈₉" + + attr_bold = curses.A_BOLD + try: + attr_italic = curses.A_ITALIC + except AttributeError: + try: + attr_italic = curses.A_UNDERLINE + except AttributeError: + attr_italic = curses.A_NORMAL + + @staticmethod + def _mark_to_spans(text: Sequence[str], marks: Sequence[TextMark]) -> List[TextSpan]: + """ + Convert text marks in line of text to per line text span. + Keeping duplicate spans. + """ + spans: List[TextSpan] = [] + for mark in marks: + if mark.is_valid(): + # mypy issue, should be handled by mark.is_valid() + assert mark.end is not None + if mark.start.row == mark.end.row: + spans.append( + TextSpan(start=mark.start, n_letters=mark.end.col - mark.start.col) + ) + else: + spans.append( + TextSpan( + start=mark.start, n_letters=len(text[mark.start.row]) - mark.start.col + ) + ) + for nth_line in range(mark.start.row + 1, mark.end.row): + spans.append( + TextSpan( + start=CharPos(row=nth_line, col=0), n_letters=len(text[nth_line]) + ) + ) + spans.append( + TextSpan(start=CharPos(row=mark.end.row, col=0), n_letters=mark.end.col) + ) + + return spans # list(set(spans)) + + @staticmethod + def _adjust_wrapped_spans( + wrapped_lines: Sequence[str], + span: TextSpan, + *, + line_adjustment: int = 0, + left_adjustment: int = 0, + ) -> List[TextSpan]: + """ + Adjust text span to wrapped lines. + Not perfect, but should be good enough considering + the limitation on commandline interface. + """ + + # current_row = span.start.row + line_adjustment + current_row = line_adjustment + start_col = span.start.col + end_col = start_col + span.n_letters + + prev = 0 # chars length before current line + spans: List[TextSpan] = [] + for n, line in enumerate(wrapped_lines): + # + 1 compensates textwrap.wrap(*args, replace_whitespace=True, drop_whitespace=True) + line_len = len(line) + 1 + current = prev + line_len # chars length before next line + + # -:unmarked *:marked + # |------*****--------| + if start_col in range(prev, current) and end_col in range(prev, current): + spans.append( + TextSpan( + start=CharPos(row=current_row + n, col=start_col - prev + left_adjustment), + n_letters=span.n_letters, + ) + ) + + # |----------*********| + elif start_col in range(prev, current): + spans.append( + TextSpan( + start=CharPos(row=current_row + n, col=start_col - prev + left_adjustment), + n_letters=current - start_col - 1, # -1: dropped whitespace + ) + ) + + # |********-----------| + elif end_col in range(prev, current): + spans.append( + TextSpan( + start=CharPos(row=current_row + n, col=0 + left_adjustment), + n_letters=end_col - prev + 1, # +1: dropped whitespace + ) + ) + + # |*******************| + elif prev in range(start_col, end_col) and current in range(start_col, end_col): + spans.append( + TextSpan( + start=CharPos(row=current_row + n, col=0 + left_adjustment), + n_letters=line_len - 1, # -1: dropped whitespace + ) + ) + + elif prev > end_col: + break + + prev = current + + return spans + + @staticmethod + def _group_spans_by_row(blocks: Sequence[TextSpan]) -> Mapping[int, List[TextSpan]]: + groups: Dict[int, List[TextSpan]] = {} + for block in blocks: + row = block.start.row + if row in groups: + groups[row].append(block) + else: + groups[row] = [block] + return groups + + def __init__(self, sects={""}): + HTMLParser.__init__(self) + self.text = [""] + self.ishead = False + self.isinde = False + self.isbull = False + self.ispref = False + self.ishidden = False + self.idhead = set() + self.idinde = set() + self.idbull = set() + self.idpref = set() + self.idimgs = set() + self.sects = sects + self.sectsindex = {} + self.italic_marks: List[TextMark] = [] + self.bold_marks: List[TextMark] = [] + self.imgs: Dict[int, str] = dict() + + def handle_starttag(self, tag, attrs): + if re.match("h[1-6]", tag) is not None: + self.ishead = True + elif tag in self.inde: + self.isinde = True + elif tag in self.pref: + self.ispref = True + elif tag in self.bull: + self.isbull = True + elif tag in self.hide: + self.ishidden = True + elif tag == "sup": + self.text[-1] += "^{" + elif tag == "sub": + self.text[-1] += "_{" + # NOTE: "img" and "image" + # In HTML, both are startendtag (no need endtag) + # but in XHTML both need endtag + elif tag in {"img", "image"}: + for i in attrs: + if (tag == "img" and i[0] == "src") or (tag == "image" and i[0].endswith("href")): + this_line = len(self.text) + self.idimgs.add(this_line) + self.imgs[this_line] = unquote(i[1]) + self.text.append("[IMAGE]") + # formatting + elif tag in self.ital: + if len(self.italic_marks) == 0 or self.italic_marks[-1].is_valid(): + char_pos = CharPos(row=len(self.text) - 1, col=len(self.text[-1])) + self.italic_marks.append(TextMark(start=char_pos)) + elif tag in self.bold: + if len(self.bold_marks) == 0 or self.bold_marks[-1].is_valid(): + char_pos = CharPos(row=len(self.text) - 1, col=len(self.text[-1])) + self.bold_marks.append(TextMark(start=char_pos)) + if self.sects != {""}: + for i in attrs: + if i[0] == "id" and i[1] in self.sects: + # self.text[-1] += " (#" + i[1] + ") " + # self.sectsindex.append([len(self.text), i[1]]) + self.sectsindex[len(self.text) - 1] = i[1] + + def handle_startendtag(self, tag, attrs): + if tag == "br": + self.text += [""] + elif tag in {"img", "image"}: + for i in attrs: + # if (tag == "img" and i[0] == "src")\ + # or (tag == "image" and i[0] == "xlink:href"): + if (tag == "img" and i[0] == "src") or (tag == "image" and i[0].endswith("href")): + this_line = len(self.text) + self.idimgs.add(this_line) + self.imgs[this_line] = unquote(i[1]) + self.text.append("[IMAGE]") + self.text.append("") + # sometimes attribute "id" is inside "startendtag" + # especially html from mobi module (kindleunpack fork) + if self.sects != {""}: + for i in attrs: + if i[0] == "id" and i[1] in self.sects: + # self.text[-1] += " (#" + i[1] + ") " + self.sectsindex[len(self.text) - 1] = i[1] + + def handle_endtag(self, tag): + if re.match("h[1-6]", tag) is not None: + self.text.append("") + self.text.append("") + self.ishead = False + elif tag in self.para: + self.text.append("") + elif tag in self.hide: + self.ishidden = False + elif tag in self.inde: + if self.text[-1] != "": + self.text.append("") + self.isinde = False + elif tag in self.pref: + if self.text[-1] != "": + self.text.append("") + self.ispref = False + elif tag in self.bull: + if self.text[-1] != "": + self.text.append("") + self.isbull = False + elif tag in {"sub", "sup"}: + self.text[-1] += "}" + elif tag in {"img", "image"}: + self.text.append("") + # formatting + elif tag in self.ital: + char_pos = CharPos(row=len(self.text) - 1, col=len(self.text[-1])) + last_mark = self.italic_marks[-1] + self.italic_marks[-1] = dataclasses.replace(last_mark, end=char_pos) + elif tag in self.bold: + char_pos = CharPos(row=len(self.text) - 1, col=len(self.text[-1])) + last_mark = self.bold_marks[-1] + self.bold_marks[-1] = dataclasses.replace(last_mark, end=char_pos) + + def handle_data(self, raw): + if raw and not self.ishidden: + if self.text[-1] == "": + tmp = raw.lstrip() + else: + tmp = raw + if self.ispref: + line = unescape(tmp) + else: + line = unescape(re.sub(r"\s+", " ", tmp)) + self.text[-1] += line + if self.ishead: + self.idhead.add(len(self.text) - 1) + elif self.isbull: + self.idbull.add(len(self.text) - 1) + elif self.isinde: + self.idinde.add(len(self.text) - 1) + elif self.ispref: + self.idpref.add(len(self.text) - 1) + + def get_structured_text( + self, textwidth: Optional[int] = 0, starting_line: int = 0 + ) -> Union[Tuple[str, ...], TextStructure]: + + if not textwidth: + return tuple(self.text) + + text: List[str] = [] + images: Dict[int, str] = dict() # {line_num: path/in/zip} + sect: Dict[str, int] = dict() # {section_id: line_num} + formatting: List[InlineStyle] = [] + + italic_spans: List[TextSpan] = HTMLtoLines._mark_to_spans(self.text, self.italic_marks) + bold_spans: List[TextSpan] = HTMLtoLines._mark_to_spans(self.text, self.bold_marks) + italic_groups = HTMLtoLines._group_spans_by_row(italic_spans) + bold_groups = HTMLtoLines._group_spans_by_row(bold_spans) + + for n, line in enumerate(self.text): + + startline = len(text) + # findsect = re.search(r"(?<= \(#).*?(?=\) )", line) + # if findsect is not None and findsect.group() in self.sects: + # line = line.replace(" (#" + findsect.group() + ") ", "") + # # line = line.replace(" (#" + findsect.group() + ") ", " "*(5+len(findsect.group()))) + # sect[findsect.group()] = len(text) + if n in self.sectsindex.keys(): + sect[self.sectsindex[n]] = starting_line + len(text) + if n in self.idhead: + # text += [line.rjust(textwidth // 2 + len(line) // 2)] + [""] + text += [line.center(textwidth)] + [""] + formatting += [ + InlineStyle( + row=starting_line + i, col=0, n_letters=len(text[i]), attr=self.attr_bold + ) + for i in range(startline, len(text)) + ] + elif n in self.idinde: + text += [" " + i for i in textwrap.wrap(line, textwidth - 3)] + [""] + elif n in self.idbull: + tmp = textwrap.wrap(line, textwidth - 3) + text += [" - " + i if i == tmp[0] else " " + i for i in tmp] + [""] + elif n in self.idpref: + tmp = line.splitlines() + wraptmp = [] + for tmp_line in tmp: + wraptmp += [i for i in textwrap.wrap(tmp_line, textwidth - 6)] + text += [" " + i for i in wraptmp] + [""] + elif n in self.idimgs: + images[starting_line + len(text)] = self.imgs[n] + text += [line.center(textwidth)] + formatting += [ + InlineStyle( + row=starting_line + len(text) - 1, + col=0, + n_letters=len(text[-1]), + attr=self.attr_bold, + ) + ] + text += [""] + else: + text += textwrap.wrap(line, textwidth) + [""] + + endline = len(text) # -1 + + left_adjustment = 3 if n in self.idbull | self.idinde else 0 + + for spans in italic_groups.get(n, []): + italics = HTMLtoLines._adjust_wrapped_spans( + text[startline:endline], + spans, + line_adjustment=startline, + left_adjustment=left_adjustment, + ) + for span in italics: + formatting.append( + InlineStyle( + row=starting_line + span.start.row, + col=span.start.col, + n_letters=span.n_letters, + attr=self.attr_italic, + ) + ) + + for spans in bold_groups.get(n, []): + bolds = HTMLtoLines._adjust_wrapped_spans( + text[startline:endline], + spans, + line_adjustment=startline, + left_adjustment=left_adjustment, + ) + for span in bolds: + formatting.append( + InlineStyle( + row=starting_line + span.start.row, + col=span.start.col, + n_letters=span.n_letters, + attr=self.attr_bold, + ) + ) + + # chapter suffix + text += ["***".center(textwidth)] + + return TextStructure( + text_lines=tuple(text), + image_maps=images, + section_rows=sect, + formatting=tuple(formatting), + ) + + +def parse_html( + html_src: str, + *, + textwidth: Optional[int] = None, + section_ids: Optional[Set[str]] = None, + starting_line: int = 0, +) -> Union[Tuple[str, ...], TextStructure]: + """ + Parse html string into TextStructure + + :param html_src: html str to parse + :param textwidth: textwidth to count max length of returned TextStructure + if None given, sequence of text as paragraph is returned + :param section_ids: set of section ids to look for inside html tag attr + :return: Tuple[str, ...] if textwidth not given else TextStructure + """ + if not section_ids: + section_ids = set() + + parser = HTMLtoLines(section_ids) + # try: + parser.feed(html_src) + parser.close() + # except: + # pass + + return parser.get_structured_text(textwidth, starting_line) |