aboutsummaryrefslogtreecommitdiffstats
path: root/src/epy_reader/parser.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/epy_reader/parser.py')
-rw-r--r--src/epy_reader/parser.py421
1 files changed, 421 insertions, 0 deletions
diff --git a/src/epy_reader/parser.py b/src/epy_reader/parser.py
new file mode 100644
index 0000000..6eced00
--- /dev/null
+++ b/src/epy_reader/parser.py
@@ -0,0 +1,421 @@
+import curses
+import dataclasses
+import re
+import textwrap
+from html import unescape
+from html.parser import HTMLParser
+from typing import Dict, List, Mapping, Optional, Sequence, Set, Tuple, Union
+from urllib.parse import unquote
+
+from epy_reader.models import CharPos, InlineStyle, TextMark, TextSpan, TextStructure
+
+
+class HTMLtoLines(HTMLParser):
+ para = {"p", "div"}
+ inde = {"q", "dt", "dd", "blockquote"}
+ pref = {"pre"}
+ bull = {"li"}
+ hide = {"script", "style", "head"}
+ ital = {"i", "em"}
+ bold = {"b", "strong"}
+ # hide = {"script", "style", "head", ", "sub}
+ # sup_lookup = "⁰¹²³⁴⁵⁶⁷⁸⁹"
+ # sub_lookup = "₀₁₂₃₄₅₆₇₈₉"
+
+ attr_bold = curses.A_BOLD
+ try:
+ attr_italic = curses.A_ITALIC
+ except AttributeError:
+ try:
+ attr_italic = curses.A_UNDERLINE
+ except AttributeError:
+ attr_italic = curses.A_NORMAL
+
+ @staticmethod
+ def _mark_to_spans(text: Sequence[str], marks: Sequence[TextMark]) -> List[TextSpan]:
+ """
+ Convert text marks in line of text to per line text span.
+ Keeping duplicate spans.
+ """
+ spans: List[TextSpan] = []
+ for mark in marks:
+ if mark.is_valid():
+ # mypy issue, should be handled by mark.is_valid()
+ assert mark.end is not None
+ if mark.start.row == mark.end.row:
+ spans.append(
+ TextSpan(start=mark.start, n_letters=mark.end.col - mark.start.col)
+ )
+ else:
+ spans.append(
+ TextSpan(
+ start=mark.start, n_letters=len(text[mark.start.row]) - mark.start.col
+ )
+ )
+ for nth_line in range(mark.start.row + 1, mark.end.row):
+ spans.append(
+ TextSpan(
+ start=CharPos(row=nth_line, col=0), n_letters=len(text[nth_line])
+ )
+ )
+ spans.append(
+ TextSpan(start=CharPos(row=mark.end.row, col=0), n_letters=mark.end.col)
+ )
+
+ return spans # list(set(spans))
+
+ @staticmethod
+ def _adjust_wrapped_spans(
+ wrapped_lines: Sequence[str],
+ span: TextSpan,
+ *,
+ line_adjustment: int = 0,
+ left_adjustment: int = 0,
+ ) -> List[TextSpan]:
+ """
+ Adjust text span to wrapped lines.
+ Not perfect, but should be good enough considering
+ the limitation on commandline interface.
+ """
+
+ # current_row = span.start.row + line_adjustment
+ current_row = line_adjustment
+ start_col = span.start.col
+ end_col = start_col + span.n_letters
+
+ prev = 0 # chars length before current line
+ spans: List[TextSpan] = []
+ for n, line in enumerate(wrapped_lines):
+ # + 1 compensates textwrap.wrap(*args, replace_whitespace=True, drop_whitespace=True)
+ line_len = len(line) + 1
+ current = prev + line_len # chars length before next line
+
+ # -:unmarked *:marked
+ # |------*****--------|
+ if start_col in range(prev, current) and end_col in range(prev, current):
+ spans.append(
+ TextSpan(
+ start=CharPos(row=current_row + n, col=start_col - prev + left_adjustment),
+ n_letters=span.n_letters,
+ )
+ )
+
+ # |----------*********|
+ elif start_col in range(prev, current):
+ spans.append(
+ TextSpan(
+ start=CharPos(row=current_row + n, col=start_col - prev + left_adjustment),
+ n_letters=current - start_col - 1, # -1: dropped whitespace
+ )
+ )
+
+ # |********-----------|
+ elif end_col in range(prev, current):
+ spans.append(
+ TextSpan(
+ start=CharPos(row=current_row + n, col=0 + left_adjustment),
+ n_letters=end_col - prev + 1, # +1: dropped whitespace
+ )
+ )
+
+ # |*******************|
+ elif prev in range(start_col, end_col) and current in range(start_col, end_col):
+ spans.append(
+ TextSpan(
+ start=CharPos(row=current_row + n, col=0 + left_adjustment),
+ n_letters=line_len - 1, # -1: dropped whitespace
+ )
+ )
+
+ elif prev > end_col:
+ break
+
+ prev = current
+
+ return spans
+
+ @staticmethod
+ def _group_spans_by_row(blocks: Sequence[TextSpan]) -> Mapping[int, List[TextSpan]]:
+ groups: Dict[int, List[TextSpan]] = {}
+ for block in blocks:
+ row = block.start.row
+ if row in groups:
+ groups[row].append(block)
+ else:
+ groups[row] = [block]
+ return groups
+
+ def __init__(self, sects={""}):
+ HTMLParser.__init__(self)
+ self.text = [""]
+ self.ishead = False
+ self.isinde = False
+ self.isbull = False
+ self.ispref = False
+ self.ishidden = False
+ self.idhead = set()
+ self.idinde = set()
+ self.idbull = set()
+ self.idpref = set()
+ self.idimgs = set()
+ self.sects = sects
+ self.sectsindex = {}
+ self.italic_marks: List[TextMark] = []
+ self.bold_marks: List[TextMark] = []
+ self.imgs: Dict[int, str] = dict()
+
+ def handle_starttag(self, tag, attrs):
+ if re.match("h[1-6]", tag) is not None:
+ self.ishead = True
+ elif tag in self.inde:
+ self.isinde = True
+ elif tag in self.pref:
+ self.ispref = True
+ elif tag in self.bull:
+ self.isbull = True
+ elif tag in self.hide:
+ self.ishidden = True
+ elif tag == "sup":
+ self.text[-1] += "^{"
+ elif tag == "sub":
+ self.text[-1] += "_{"
+ # NOTE: "img" and "image"
+ # In HTML, both are startendtag (no need endtag)
+ # but in XHTML both need endtag
+ elif tag in {"img", "image"}:
+ for i in attrs:
+ if (tag == "img" and i[0] == "src") or (tag == "image" and i[0].endswith("href")):
+ this_line = len(self.text)
+ self.idimgs.add(this_line)
+ self.imgs[this_line] = unquote(i[1])
+ self.text.append("[IMAGE]")
+ # formatting
+ elif tag in self.ital:
+ if len(self.italic_marks) == 0 or self.italic_marks[-1].is_valid():
+ char_pos = CharPos(row=len(self.text) - 1, col=len(self.text[-1]))
+ self.italic_marks.append(TextMark(start=char_pos))
+ elif tag in self.bold:
+ if len(self.bold_marks) == 0 or self.bold_marks[-1].is_valid():
+ char_pos = CharPos(row=len(self.text) - 1, col=len(self.text[-1]))
+ self.bold_marks.append(TextMark(start=char_pos))
+ if self.sects != {""}:
+ for i in attrs:
+ if i[0] == "id" and i[1] in self.sects:
+ # self.text[-1] += " (#" + i[1] + ") "
+ # self.sectsindex.append([len(self.text), i[1]])
+ self.sectsindex[len(self.text) - 1] = i[1]
+
+ def handle_startendtag(self, tag, attrs):
+ if tag == "br":
+ self.text += [""]
+ elif tag in {"img", "image"}:
+ for i in attrs:
+ # if (tag == "img" and i[0] == "src")\
+ # or (tag == "image" and i[0] == "xlink:href"):
+ if (tag == "img" and i[0] == "src") or (tag == "image" and i[0].endswith("href")):
+ this_line = len(self.text)
+ self.idimgs.add(this_line)
+ self.imgs[this_line] = unquote(i[1])
+ self.text.append("[IMAGE]")
+ self.text.append("")
+ # sometimes attribute "id" is inside "startendtag"
+ # especially html from mobi module (kindleunpack fork)
+ if self.sects != {""}:
+ for i in attrs:
+ if i[0] == "id" and i[1] in self.sects:
+ # self.text[-1] += " (#" + i[1] + ") "
+ self.sectsindex[len(self.text) - 1] = i[1]
+
+ def handle_endtag(self, tag):
+ if re.match("h[1-6]", tag) is not None:
+ self.text.append("")
+ self.text.append("")
+ self.ishead = False
+ elif tag in self.para:
+ self.text.append("")
+ elif tag in self.hide:
+ self.ishidden = False
+ elif tag in self.inde:
+ if self.text[-1] != "":
+ self.text.append("")
+ self.isinde = False
+ elif tag in self.pref:
+ if self.text[-1] != "":
+ self.text.append("")
+ self.ispref = False
+ elif tag in self.bull:
+ if self.text[-1] != "":
+ self.text.append("")
+ self.isbull = False
+ elif tag in {"sub", "sup"}:
+ self.text[-1] += "}"
+ elif tag in {"img", "image"}:
+ self.text.append("")
+ # formatting
+ elif tag in self.ital:
+ char_pos = CharPos(row=len(self.text) - 1, col=len(self.text[-1]))
+ last_mark = self.italic_marks[-1]
+ self.italic_marks[-1] = dataclasses.replace(last_mark, end=char_pos)
+ elif tag in self.bold:
+ char_pos = CharPos(row=len(self.text) - 1, col=len(self.text[-1]))
+ last_mark = self.bold_marks[-1]
+ self.bold_marks[-1] = dataclasses.replace(last_mark, end=char_pos)
+
+ def handle_data(self, raw):
+ if raw and not self.ishidden:
+ if self.text[-1] == "":
+ tmp = raw.lstrip()
+ else:
+ tmp = raw
+ if self.ispref:
+ line = unescape(tmp)
+ else:
+ line = unescape(re.sub(r"\s+", " ", tmp))
+ self.text[-1] += line
+ if self.ishead:
+ self.idhead.add(len(self.text) - 1)
+ elif self.isbull:
+ self.idbull.add(len(self.text) - 1)
+ elif self.isinde:
+ self.idinde.add(len(self.text) - 1)
+ elif self.ispref:
+ self.idpref.add(len(self.text) - 1)
+
+ def get_structured_text(
+ self, textwidth: Optional[int] = 0, starting_line: int = 0
+ ) -> Union[Tuple[str, ...], TextStructure]:
+
+ if not textwidth:
+ return tuple(self.text)
+
+ text: List[str] = []
+ images: Dict[int, str] = dict() # {line_num: path/in/zip}
+ sect: Dict[str, int] = dict() # {section_id: line_num}
+ formatting: List[InlineStyle] = []
+
+ italic_spans: List[TextSpan] = HTMLtoLines._mark_to_spans(self.text, self.italic_marks)
+ bold_spans: List[TextSpan] = HTMLtoLines._mark_to_spans(self.text, self.bold_marks)
+ italic_groups = HTMLtoLines._group_spans_by_row(italic_spans)
+ bold_groups = HTMLtoLines._group_spans_by_row(bold_spans)
+
+ for n, line in enumerate(self.text):
+
+ startline = len(text)
+ # findsect = re.search(r"(?<= \(#).*?(?=\) )", line)
+ # if findsect is not None and findsect.group() in self.sects:
+ # line = line.replace(" (#" + findsect.group() + ") ", "")
+ # # line = line.replace(" (#" + findsect.group() + ") ", " "*(5+len(findsect.group())))
+ # sect[findsect.group()] = len(text)
+ if n in self.sectsindex.keys():
+ sect[self.sectsindex[n]] = starting_line + len(text)
+ if n in self.idhead:
+ # text += [line.rjust(textwidth // 2 + len(line) // 2)] + [""]
+ text += [line.center(textwidth)] + [""]
+ formatting += [
+ InlineStyle(
+ row=starting_line + i, col=0, n_letters=len(text[i]), attr=self.attr_bold
+ )
+ for i in range(startline, len(text))
+ ]
+ elif n in self.idinde:
+ text += [" " + i for i in textwrap.wrap(line, textwidth - 3)] + [""]
+ elif n in self.idbull:
+ tmp = textwrap.wrap(line, textwidth - 3)
+ text += [" - " + i if i == tmp[0] else " " + i for i in tmp] + [""]
+ elif n in self.idpref:
+ tmp = line.splitlines()
+ wraptmp = []
+ for tmp_line in tmp:
+ wraptmp += [i for i in textwrap.wrap(tmp_line, textwidth - 6)]
+ text += [" " + i for i in wraptmp] + [""]
+ elif n in self.idimgs:
+ images[starting_line + len(text)] = self.imgs[n]
+ text += [line.center(textwidth)]
+ formatting += [
+ InlineStyle(
+ row=starting_line + len(text) - 1,
+ col=0,
+ n_letters=len(text[-1]),
+ attr=self.attr_bold,
+ )
+ ]
+ text += [""]
+ else:
+ text += textwrap.wrap(line, textwidth) + [""]
+
+ endline = len(text) # -1
+
+ left_adjustment = 3 if n in self.idbull | self.idinde else 0
+
+ for spans in italic_groups.get(n, []):
+ italics = HTMLtoLines._adjust_wrapped_spans(
+ text[startline:endline],
+ spans,
+ line_adjustment=startline,
+ left_adjustment=left_adjustment,
+ )
+ for span in italics:
+ formatting.append(
+ InlineStyle(
+ row=starting_line + span.start.row,
+ col=span.start.col,
+ n_letters=span.n_letters,
+ attr=self.attr_italic,
+ )
+ )
+
+ for spans in bold_groups.get(n, []):
+ bolds = HTMLtoLines._adjust_wrapped_spans(
+ text[startline:endline],
+ spans,
+ line_adjustment=startline,
+ left_adjustment=left_adjustment,
+ )
+ for span in bolds:
+ formatting.append(
+ InlineStyle(
+ row=starting_line + span.start.row,
+ col=span.start.col,
+ n_letters=span.n_letters,
+ attr=self.attr_bold,
+ )
+ )
+
+ # chapter suffix
+ text += ["***".center(textwidth)]
+
+ return TextStructure(
+ text_lines=tuple(text),
+ image_maps=images,
+ section_rows=sect,
+ formatting=tuple(formatting),
+ )
+
+
+def parse_html(
+ html_src: str,
+ *,
+ textwidth: Optional[int] = None,
+ section_ids: Optional[Set[str]] = None,
+ starting_line: int = 0,
+) -> Union[Tuple[str, ...], TextStructure]:
+ """
+ Parse html string into TextStructure
+
+ :param html_src: html str to parse
+ :param textwidth: textwidth to count max length of returned TextStructure
+ if None given, sequence of text as paragraph is returned
+ :param section_ids: set of section ids to look for inside html tag attr
+ :return: Tuple[str, ...] if textwidth not given else TextStructure
+ """
+ if not section_ids:
+ section_ids = set()
+
+ parser = HTMLtoLines(section_ids)
+ # try:
+ parser.feed(html_src)
+ parser.close()
+ # except:
+ # pass
+
+ return parser.get_structured_text(textwidth, starting_line)