import curses
import dataclasses
import re
import textwrap
from html import unescape
from html.parser import HTMLParser
from typing import Dict, List, Mapping, Optional, Sequence, Set, Tuple, Union
from urllib.parse import unquote

from epy_reader.models import CharPos, InlineStyle, TextMark, TextSpan, TextStructure


class HTMLtoLines(HTMLParser):
    """
    Parse an html document into a list of paragraph-like text lines.

    While feeding, the parser records which lines are headings, indented
    blocks, bullets, preformatted blocks or images, where requested section
    ids appear, and where italic/bold runs start and end.
    `get_structured_text()` then wraps those lines to a given width.
    """

    para = {"p", "div"}
    inde = {"q", "dt", "dd", "blockquote"}
    pref = {"pre"}
    bull = {"li"}
    hide = {"script", "style", "head"}
    ital = {"i", "em"}
    bold = {"b", "strong"}

    attr_bold = curses.A_BOLD
    # Fall back gracefully when the curses build has no italic support.
    try:
        attr_italic = curses.A_ITALIC
    except AttributeError:
        try:
            attr_italic = curses.A_UNDERLINE
        except AttributeError:
            attr_italic = curses.A_NORMAL

    # Heading tags are exactly h1..h6.
    _HEADING_RE = re.compile(r"h[1-6]")
    # Left indent of indented/bulleted/preformatted lines. Its width must match
    # the bullet prefix " - " and the column shift applied to style spans.
    _INDENT = " " * 3

    @staticmethod
    def _mark_to_spans(text: Sequence[str], marks: Sequence[TextMark]) -> List[TextSpan]:
        """
        Convert text marks in line of text to per line text span.
        Keeping duplicate spans.

        :param text: the unwrapped lines the marks refer to
        :param marks: marks to convert, marks failing `is_valid()` are skipped
        :return: one span per line touched by each mark
        """
        spans: List[TextSpan] = []
        for mark in marks:
            if not mark.is_valid():
                continue
            # mypy issue, should be handled by mark.is_valid()
            assert mark.end is not None

            if mark.start.row == mark.end.row:
                spans.append(TextSpan(start=mark.start, n_letters=mark.end.col - mark.start.col))
                continue

            # first line: from the mark start up to the end of that line
            spans.append(
                TextSpan(start=mark.start, n_letters=len(text[mark.start.row]) - mark.start.col)
            )
            # lines in between are covered entirely
            for nth_line in range(mark.start.row + 1, mark.end.row):
                spans.append(
                    TextSpan(start=CharPos(row=nth_line, col=0), n_letters=len(text[nth_line]))
                )
            # last line: from its beginning up to the mark end
            spans.append(TextSpan(start=CharPos(row=mark.end.row, col=0), n_letters=mark.end.col))

        return spans

    @staticmethod
    def _adjust_wrapped_spans(
        wrapped_lines: Sequence[str],
        span: TextSpan,
        *,
        line_adjustment: int = 0,
        left_adjustment: int = 0,
    ) -> List[TextSpan]:
        """
        Adjust text span to wrapped lines.
        Not perfect, but should be good enough considering
        the limitation on commandline interface.

        :param wrapped_lines: lines produced by wrapping one unwrapped line
        :param span: span expressed in columns of the unwrapped line
        :param line_adjustment: row of the first wrapped line
        :param left_adjustment: columns added to every resulting span start
        :return: spans expressed in wrapped rows and columns
        """
        current_row = line_adjustment
        start_col = span.start.col
        end_col = start_col + span.n_letters

        prev = 0  # chars length before current line
        spans: List[TextSpan] = []
        for n, line in enumerate(wrapped_lines):
            # + 1 compensates textwrap.wrap(*args, replace_whitespace=True, drop_whitespace=True)
            line_len = len(line) + 1
            current = prev + line_len  # chars length before next line

            starts_here = prev <= start_col < current
            ends_here = prev <= end_col < current

            # -:unmarked *:marked
            # |------*****--------|
            if starts_here and ends_here:
                spans.append(
                    TextSpan(
                        start=CharPos(row=current_row + n, col=start_col - prev + left_adjustment),
                        n_letters=span.n_letters,
                    )
                )
            # |----------*********|
            elif starts_here:
                spans.append(
                    TextSpan(
                        start=CharPos(row=current_row + n, col=start_col - prev + left_adjustment),
                        n_letters=current - start_col - 1,  # -1: dropped whitespace
                    )
                )
            # |********-----------|
            elif ends_here:
                spans.append(
                    TextSpan(
                        start=CharPos(row=current_row + n, col=0 + left_adjustment),
                        n_letters=end_col - prev + 1,  # +1: dropped whitespace
                    )
                )
            # |*******************|
            elif start_col <= prev < end_col and start_col <= current < end_col:
                spans.append(
                    TextSpan(
                        start=CharPos(row=current_row + n, col=0 + left_adjustment),
                        n_letters=line_len - 1,  # -1: dropped whitespace
                    )
                )
            elif prev > end_col:
                # the span ended on an earlier line, nothing left to map
                break

            prev = current

        return spans

    @staticmethod
    def _group_spans_by_row(blocks: Sequence[TextSpan]) -> Mapping[int, List[TextSpan]]:
        """Group spans by the row they start on, keeping their order."""
        groups: Dict[int, List[TextSpan]] = {}
        for block in blocks:
            groups.setdefault(block.start.row, []).append(block)
        return groups

    def __init__(self, sects: Optional[Set[str]] = None):
        """
        :param sects: section ids to look for in "id" attributes,
            omitted or {""} disables the lookup
        """
        super().__init__()
        self.text = [""]
        self.ishead = False
        self.isinde = False
        self.isbull = False
        self.ispref = False
        self.ishidden = False
        self.idhead = set()
        self.idinde = set()
        self.idbull = set()
        self.idpref = set()
        self.idimgs = set()
        # None sentinel instead of a mutable default shared between instances
        self.sects = {""} if sects is None else sects
        self.sectsindex = {}
        self.italic_marks: List[TextMark] = []
        self.bold_marks: List[TextMark] = []
        self.imgs: Dict[int, str] = dict()

    def _cursor(self) -> CharPos:
        """Return the position right after the last character parsed so far."""
        return CharPos(row=len(self.text) - 1, col=len(self.text[-1]))

    def _open_mark(self, marks: List[TextMark]) -> None:
        """Start a new mark at the cursor unless the last one is still open."""
        if not marks or marks[-1].is_valid():
            marks.append(TextMark(start=self._cursor()))

    def _close_mark(self, marks: List[TextMark]) -> None:
        """Set the end of the last mark to the cursor."""
        if not marks:
            # closing tag without any opening tag (malformed html): ignore
            return
        marks[-1] = dataclasses.replace(marks[-1], end=self._cursor())

    def _add_images(self, tag, attrs, *, self_closing: bool) -> None:
        """
        Append an "[IMAGE]" line for every image source attribute in attrs.

        :param self_closing: also append the empty line that would otherwise
            be added when the matching end tag is handled
        """
        for name, value in attrs:
            is_source = (tag == "img" and name == "src") or (
                tag == "image" and name.endswith("href")
            )
            # value is None for an attribute written without a value
            if not is_source or value is None:
                continue
            this_line = len(self.text)
            self.idimgs.add(this_line)
            self.imgs[this_line] = unquote(value)
            self.text.append("[IMAGE]")
            if self_closing:
                self.text.append("")

    def _record_section_ids(self, attrs) -> None:
        """Remember the current line for every requested section id in attrs."""
        if self.sects == {""}:
            return
        for name, value in attrs:
            if name == "id" and value in self.sects:
                self.sectsindex[len(self.text) - 1] = value

    def handle_starttag(self, tag, attrs):
        """Update block state, image lines and style marks for an opening tag."""
        if self._HEADING_RE.fullmatch(tag):
            self.ishead = True
        elif tag in self.inde:
            self.isinde = True
        elif tag in self.pref:
            self.ispref = True
        elif tag in self.bull:
            self.isbull = True
        elif tag in self.hide:
            self.ishidden = True
        elif tag == "sup":
            self.text[-1] += "^{"
        elif tag == "sub":
            self.text[-1] += "_{"
        # NOTE: "img" and "image"
        # In HTML, both are startendtag (no need endtag)
        # but in XHTML both need endtag
        elif tag in {"img", "image"}:
            self._add_images(tag, attrs, self_closing=False)
        # formatting
        elif tag in self.ital:
            self._open_mark(self.italic_marks)
        elif tag in self.bold:
            self._open_mark(self.bold_marks)

        self._record_section_ids(attrs)

    def handle_startendtag(self, tag, attrs):
        """Handle self closing tags: line breaks and images."""
        if tag == "br":
            self.text += [""]
        elif tag in {"img", "image"}:
            self._add_images(tag, attrs, self_closing=True)

        # sometimes attribute "id" is inside "startendtag"
        # especially html from mobi module (kindleunpack fork)
        self._record_section_ids(attrs)

    def handle_endtag(self, tag):
        """Close the block or style mark opened by the matching start tag."""
        if self._HEADING_RE.fullmatch(tag):
            self.text.append("")
            self.text.append("")
            self.ishead = False
        elif tag in self.para:
            self.text.append("")
        elif tag in self.hide:
            self.ishidden = False
        elif tag in self.inde:
            if self.text[-1] != "":
                self.text.append("")
            self.isinde = False
        elif tag in self.pref:
            if self.text[-1] != "":
                self.text.append("")
            self.ispref = False
        elif tag in self.bull:
            if self.text[-1] != "":
                self.text.append("")
            self.isbull = False
        elif tag in {"sub", "sup"}:
            self.text[-1] += "}"
        elif tag in {"img", "image"}:
            self.text.append("")
        # formatting
        elif tag in self.ital:
            self._close_mark(self.italic_marks)
        elif tag in self.bold:
            self._close_mark(self.bold_marks)

    def handle_data(self, raw):
        """Append text to the current line and tag the line with its block kind."""
        if not raw or self.ishidden:
            return

        # a fresh line never starts with whitespace
        tmp = raw.lstrip() if self.text[-1] == "" else raw
        # NOTE(review): HTMLParser defaults to convert_charrefs=True, so data
        # presumably arrives with character references already converted and
        # unescape() is a second pass - confirm before removing it.
        if self.ispref:
            line = unescape(tmp)
        else:
            # outside of preformatted blocks runs of whitespace collapse
            line = unescape(re.sub(r"\s+", " ", tmp))
        self.text[-1] += line

        row = len(self.text) - 1
        if self.ishead:
            self.idhead.add(row)
        elif self.isbull:
            self.idbull.add(row)
        elif self.isinde:
            self.idinde.add(row)
        elif self.ispref:
            self.idpref.add(row)

    def get_structured_text(
        self, textwidth: Optional[int] = 0, starting_line: int = 0
    ) -> Union[Tuple[str, ...], TextStructure]:
        """
        Build the final text out of the parsed lines.

        :param textwidth: width to wrap the lines to,
            if falsy the unwrapped lines are returned as a tuple
        :param starting_line: offset added to every row stored in the returned
            image map, section rows and formatting
        :return: Tuple[str, ...] if textwidth not given else TextStructure
        """
        if not textwidth:
            return tuple(self.text)

        text: List[str] = []
        images: Dict[int, str] = dict()  # {line_num: path/in/zip}
        sect: Dict[str, int] = dict()  # {section_id: line_num}
        formatting: List[InlineStyle] = []

        indent = self._INDENT
        italic_groups = self._group_spans_by_row(
            self._mark_to_spans(self.text, self.italic_marks)
        )
        bold_groups = self._group_spans_by_row(self._mark_to_spans(self.text, self.bold_marks))
        # italics are emitted before bolds for every line
        styled_groups = (
            (italic_groups, self.attr_italic),
            (bold_groups, self.attr_bold),
        )

        for n, line in enumerate(self.text):
            startline = len(text)

            if n in self.sectsindex:
                sect[self.sectsindex[n]] = starting_line + len(text)

            if n in self.idhead:
                text += [line.center(textwidth), ""]
                formatting += [
                    InlineStyle(
                        row=starting_line + i, col=0, n_letters=len(text[i]), attr=self.attr_bold
                    )
                    for i in range(startline, len(text))
                ]
            elif n in self.idinde:
                wrapped = textwrap.wrap(line, textwidth - len(indent))
                text += [indent + part for part in wrapped] + [""]
            elif n in self.idbull:
                wrapped = textwrap.wrap(line, textwidth - len(indent))
                # Only the first wrapped line gets the bullet. Decide by index:
                # a continuation line may have the same text as the first one.
                text += [
                    (" - " if idx == 0 else indent) + part for idx, part in enumerate(wrapped)
                ] + [""]
            elif n in self.idpref:
                wrapped = []
                for pre_line in line.splitlines():
                    # the indent is mirrored as right margin
                    wrapped += textwrap.wrap(pre_line, textwidth - 2 * len(indent))
                text += [indent + part for part in wrapped] + [""]
            elif n in self.idimgs:
                images[starting_line + len(text)] = self.imgs[n]
                text += [line.center(textwidth)]
                formatting += [
                    InlineStyle(
                        row=starting_line + len(text) - 1,
                        col=0,
                        n_letters=len(text[-1]),
                        attr=self.attr_bold,
                    )
                ]
                text += [""]
            else:
                text += textwrap.wrap(line, textwidth) + [""]

            endline = len(text)

            # style spans on bulleted/indented lines shift right by the prefix width
            shifted = n in self.idbull or n in self.idinde
            left_adjustment = len(indent) if shifted else 0
            for groups, attr in styled_groups:
                for line_span in groups.get(n, []):
                    adjusted = self._adjust_wrapped_spans(
                        text[startline:endline],
                        line_span,
                        line_adjustment=startline,
                        left_adjustment=left_adjustment,
                    )
                    formatting.extend(
                        InlineStyle(
                            row=starting_line + span.start.row,
                            col=span.start.col,
                            n_letters=span.n_letters,
                            attr=attr,
                        )
                        for span in adjusted
                    )

        # chapter suffix
        text += ["***".center(textwidth)]

        return TextStructure(
            text_lines=tuple(text),
            image_maps=images,
            section_rows=sect,
            formatting=tuple(formatting),
        )


def parse_html(
    html_src: str,
    *,
    textwidth: Optional[int] = None,
    section_ids: Optional[Set[str]] = None,
    starting_line: int = 0,
) -> Union[Tuple[str, ...], TextStructure]:
    """
    Parse html string into TextStructure

    :param html_src: html str to parse
    :param textwidth: textwidth to count max length of returned TextStructure
        if None given, sequence of text as paragraph is returned
    :param section_ids: set of section ids to look for inside html tag attr
    :param starting_line: offset added to the rows stored in the returned TextStructure
    :return: Tuple[str, ...] if textwidth not given else TextStructure
    """
    if not section_ids:
        section_ids = set()

    parser = HTMLtoLines(section_ids)
    parser.feed(html_src)
    parser.close()

    return parser.get_structured_text(textwidth, starting_line)