diff options
author | benadha <benawiadha@gmail.com> | 2022-01-08 10:30:31 +0700 |
---|---|---|
committer | benadha <benawiadha@gmail.com> | 2022-01-08 10:30:31 +0700 |
commit | c0bfec3da1c55ae11a991a89938261c4e93ce795 (patch) | |
tree | ead3835a551887f11426e45326166cdbd31c9146 /epy_extras/KindleUnpack/mobiml2xhtml.py | |
parent | 7e4a230a8336844cb344301870b87e7bea1691a6 (diff) | |
download | epy-c0bfec3da1c55ae11a991a89938261c4e93ce795.tar.gz |
Initial prepackaging kindleunpack with epy
Diffstat (limited to 'epy_extras/KindleUnpack/mobiml2xhtml.py')
-rwxr-xr-x | epy_extras/KindleUnpack/mobiml2xhtml.py | 527 |
1 files changed, 527 insertions, 0 deletions
diff --git a/epy_extras/KindleUnpack/mobiml2xhtml.py b/epy_extras/KindleUnpack/mobiml2xhtml.py new file mode 100755 index 0000000..94fc671 --- /dev/null +++ b/epy_extras/KindleUnpack/mobiml2xhtml.py @@ -0,0 +1,527 @@ +#! /usr/bin/python +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + + +# this program works in concert with the output from KindleUnpack + +''' +Convert from Mobi ML to XHTML +''' + +from __future__ import division, absolute_import, print_function + +import os +import sys +import re + +SPECIAL_HANDLING_TAGS = { + '?xml' : ('xmlheader', -1), + '!--' : ('comment', -3), + '!DOCTYPE' : ('doctype', -1), +} + +SPECIAL_HANDLING_TYPES = ['xmlheader', 'doctype', 'comment'] + +SELF_CLOSING_TAGS = ['br' , 'hr', 'input', 'img', 'image', 'meta', 'spacer', 'link', 'frame', 'base', 'col', 'reference'] + +class MobiMLConverter(object): + + PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE) + IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex') + + def __init__(self, filename): + self.base_css_rules = 'blockquote { margin: 0em 0em 0em 1.25em }\n' + self.base_css_rules += 'p { margin: 0em }\n' + self.base_css_rules += '.bold { font-weight: bold }\n' + self.base_css_rules += '.italic { font-style: italic }\n' + self.base_css_rules += '.mbp_pagebreak { page-break-after: always; margin: 0; display: block }\n' + self.tag_css_rules = {} + self.tag_css_rule_cnt = 0 + self.path = [] + self.filename = filename + self.wipml = open(self.filename, 'r').read() + self.pos = 0 + self.opfname = self.filename.rsplit('.',1)[0] + '.opf' + self.opos = 0 + self.meta = '' + self.cssname = os.path.join(os.path.dirname(self.filename),'styles.css') + self.current_font_size = 3 + self.font_history = [] + + def cleanup_html(self): + self.wipml = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.wipml) + self.wipml = self.wipml.replace('\r\n', '\n') + self.wipml = self.wipml.replace('> <', '>\n<') + self.wipml = self.wipml.replace('<mbp: ', '<mbp:') + # self.wipml = re.sub(r'<?xml[^>]*>', '', self.wipml) + self.wipml = self.wipml.replace('<br></br>','<br/>') + + def replace_page_breaks(self): + self.wipml = self.PAGE_BREAK_PAT.sub( + '<div class="mbp_pagebreak" />', + self.wipml) + + # parse leading text of ml and tag + def parseml(self): + p = self.pos + if p >= len(self.wipml): + return None + if self.wipml[p] != '<': + res = self.wipml.find('<',p) + if res == -1 : + res = len(self.wipml) + self.pos = res + return self.wipml[p:res], None + # handle comment as a special case to deal with multi-line comments + if self.wipml[p:p+4] == '<!--': + te = self.wipml.find('-->',p+1) + if te != -1: + te = te+2 + else : + te = self.wipml.find('>',p+1) + ntb = self.wipml.find('<',p+1) + if ntb != -1 and ntb < te: + self.pos = ntb + return self.wipml[p:ntb], None + self.pos = te + 1 + return None, self.wipml[p:te+1] + + # parses string version of tag to identify its name, + # its type 'begin', 'end' or 'single', + # plus build a hashtable of its attributes + # code is written to handle the possiblity of very poor formating + def parsetag(self, s): + p = 1 + # get the tag name + tname = None + ttype = None + tattr = {} + while s[p:p+1] == ' ' : + p += 1 + if s[p:p+1] == '/': + ttype = 'end' + p += 1 + while s[p:p+1] == ' ' : + p += 1 + b = p + while s[p:p+1] not in ('>', '/', ' ', '"', "'", "\r", "\n") : + p += 1 + tname=s[b:p].lower() + if tname == '!doctype': + tname = '!DOCTYPE' + # special cases + if tname in SPECIAL_HANDLING_TAGS: + ttype, backstep = SPECIAL_HANDLING_TAGS[tname] + tattr['special'] = s[p:backstep] + if ttype is None: + # parse any attributes + while s.find('=',p) != -1 : + while s[p:p+1] == ' ' : + p += 1 + b = p + while s[p:p+1] != '=' : + p += 1 + aname = s[b:p].lower() + aname = aname.rstrip(' ') + p += 1 + while s[p:p+1] == ' ' : + p += 1 + if s[p:p+1] in ('"', "'") : + p = p + 1 + b = p + while s[p:p+1] not in ('"', "'") : + p += 1 + val = s[b:p] + p += 1 + else : + b = p + while s[p:p+1] not in ('>', '/', ' ') : + p += 1 + val = s[b:p] + tattr[aname] = val + # label beginning and single tags + if ttype is None: + ttype = 'begin' + if s.find(' /',p) >= 0: + ttype = 'single_ext' + elif s.find('/',p) >= 0: + ttype = 'single' + return ttype, tname, tattr + + # main routine to convert from mobi markup language to html + def processml(self): + + # are these really needed + html_done = False + head_done = False + body_done = False + + skip = False + + htmlstr = '' + self.replace_page_breaks() + self.cleanup_html() + + # now parse the cleaned up ml into standard xhtml + while True: + + r = self.parseml() + if not r: + break + + text, tag = r + + if text: + if not skip: + htmlstr += text + + if tag: + ttype, tname, tattr = self.parsetag(tag) + + # If we run into a DTD or xml declarations inside the body ... bail. + if tname in SPECIAL_HANDLING_TAGS and tname != 'comment' and body_done: + htmlstr += '\n</body></html>' + break + + # make sure self-closing tags actually self-close + if ttype == 'begin' and tname in SELF_CLOSING_TAGS: + ttype = 'single' + + # make sure any end tags of self-closing tags are discarded + if ttype == 'end' and tname in SELF_CLOSING_TAGS: + continue + + # remove embedded guide and refernces from old mobis + if tname in ('guide', 'ncx', 'reference') and ttype in ('begin', 'single', 'single_ext'): + tname = 'removeme:{0}'.format(tname) + tattr = None + if tname in ('guide', 'ncx', 'reference', 'font', 'span') and ttype == 'end': + if self.path[-1] == 'removeme:{0}'.format(tname): + tname = 'removeme:{0}'.format(tname) + tattr = None + + # Get rid of font tags that only have a color attribute. + if tname == 'font' and ttype in ('begin', 'single', 'single_ext'): + if 'color' in tattr and len(tattr) == 1: + tname = 'removeme:{0}'.format(tname) + tattr = None + + # Get rid of empty spans in the markup. + if tname == 'span' and ttype in ('begin', 'single', 'single_ext') and not len(tattr): + tname = 'removeme:{0}'.format(tname) + + # need to handle fonts outside of the normal methods + # so fonts tags won't be added to the self.path since we keep track + # of font tags separately with self.font_history + if tname == 'font' and ttype == 'begin': + # check for nested font start tags + if len(self.font_history) > 0 : + # inject a font end tag + taginfo = ('end', 'font', None) + htmlstr += self.processtag(taginfo) + self.font_history.append((ttype, tname, tattr)) + # handle the current font start tag + taginfo = (ttype, tname, tattr) + htmlstr += self.processtag(taginfo) + continue + + # check for nested font tags and unnest them + if tname == 'font' and ttype == 'end': + self.font_history.pop() + # handle this font end tag + taginfo = ('end', 'font', None) + htmlstr += self.processtag(taginfo) + # check if we were nested + if len(self.font_history) > 0: + # inject a copy of the most recent font start tag from history + taginfo = self.font_history[-1] + htmlstr += self.processtag(taginfo) + continue + + # keep track of nesting path + if ttype == 'begin': + self.path.append(tname) + elif ttype == 'end': + if tname != self.path[-1]: + print('improper nesting: ', self.path, tname, ttype) + if tname not in self.path: + # handle case of end tag with no beginning by injecting empty begin tag + taginfo = ('begin', tname, None) + htmlstr += self.processtag(taginfo) + print(" - fixed by injecting empty start tag ", tname) + self.path.append(tname) + elif len(self.path) > 1 and tname == self.path[-2]: + # handle case of dangling missing end + taginfo = ('end', self.path[-1], None) + htmlstr += self.processtag(taginfo) + print(" - fixed by injecting end tag ", self.path[-1]) + self.path.pop() + self.path.pop() + + if tname == 'removeme:{0}'.format(tname): + if ttype in ('begin', 'single', 'single_ext'): + skip = True + else: + skip = False + else: + taginfo = (ttype, tname, tattr) + htmlstr += self.processtag(taginfo) + + # handle potential issue of multiple html, head, and body sections + if tname == 'html' and ttype == 'begin' and not html_done: + htmlstr += '\n' + html_done = True + + if tname == 'head' and ttype == 'begin' and not head_done: + htmlstr += '\n' + # also add in metadata and style link tags + htmlstr += self.meta + htmlstr += '<link href="styles.css" rel="stylesheet" type="text/css" />\n' + head_done = True + + if tname == 'body' and ttype == 'begin' and not body_done: + htmlstr += '\n' + body_done = True + + # handle issue of possibly missing html, head, and body tags + # I have not seen this but the original did something like this so ... + if not body_done: + htmlstr = '<body>\n' + htmlstr + '</body>\n' + if not head_done: + headstr = '<head>\n' + headstr += self.meta + headstr += '<link href="styles.css" rel="stylesheet" type="text/css" />\n' + headstr += '</head>\n' + htmlstr = headstr + htmlstr + if not html_done: + htmlstr = '<html>\n' + htmlstr + '</html>\n' + + # finally add DOCTYPE info + htmlstr = '<?xml version="1.0"?>\n<!DOCTYPE HTML PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n' + htmlstr + + css = self.base_css_rules + for cls, rule in self.tag_css_rules.items(): + css += '.%s { %s }\n' % (cls, rule) + + return (htmlstr, css, self.cssname) + + def ensure_unit(self, raw, unit='px'): + if re.search(r'\d+$', raw) is not None: + raw += unit + return raw + + # flatten possibly modified tag back to string + def taginfo_tostring(self, taginfo): + (ttype, tname, tattr) = taginfo + if ttype is None or tname is None: + return '' + if ttype == 'end': + return '</%s>' % tname + if ttype in SPECIAL_HANDLING_TYPES and tattr is not None and 'special' in tattr: + info = tattr['special'] + if ttype == 'comment': + return '<%s %s-->' % (tname, info) + else: + return '<%s %s>' % (tname, info) + res = [] + res.append('<%s' % tname) + if tattr is not None: + for key in tattr: + res.append(' %s="%s"' % (key, tattr[key])) + if ttype == 'single': + res.append('/>') + elif ttype == 'single_ext': + res.append(' />') + else : + res.append('>') + return "".join(res) + + # routines to convert from mobi ml tags atributes to xhtml attributes and styles + def processtag(self, taginfo): + # Converting mobi font sizes to numerics + size_map = { + 'xx-small': '1', + 'x-small': '2', + 'small': '3', + 'medium': '4', + 'large': '5', + 'x-large': '6', + 'xx-large': '7', + } + + size_to_em_map = { + '1': '.65em', + '2': '.75em', + '3': '1em', + '4': '1.125em', + '5': '1.25em', + '6': '1.5em', + '7': '2em', + } + + # current tag to work on + (ttype, tname, tattr) = taginfo + if not tattr: + tattr = {} + + styles = [] + + if tname is None or tname.startswith('removeme'): + return '' + + # have not seen an example of this yet so keep it here to be safe + # until this is better understood + if tname in ('country-region', 'place', 'placetype', 'placename', + 'state', 'city', 'street', 'address', 'content'): + tname = 'div' if tname == 'content' else 'span' + for key in tattr: + tattr.pop(key) + + # handle general case of style, height, width, bgcolor in any tag + if 'style' in tattr: + style = tattr.pop('style').strip() + if style: + styles.append(style) + + if 'align' in tattr: + align = tattr.pop('align').strip() + if align: + if tname in ('table', 'td', 'tr'): + pass + else: + styles.append('text-align: %s' % align) + + if 'height' in tattr: + height = tattr.pop('height').strip() + if height and '<' not in height and '>' not in height and re.search(r'\d+', height): + if tname in ('table', 'td', 'tr'): + pass + elif tname == 'img': + tattr['height'] = height + else: + styles.append('margin-top: %s' % self.ensure_unit(height)) + + if 'width' in tattr: + width = tattr.pop('width').strip() + if width and re.search(r'\d+', width): + if tname in ('table', 'td', 'tr'): + pass + elif tname == 'img': + tattr['width'] = width + else: + styles.append('text-indent: %s' % self.ensure_unit(width)) + if width.startswith('-'): + styles.append('margin-left: %s' % self.ensure_unit(width[1:])) + + if 'bgcolor' in tattr: + # no proprietary html allowed + if tname == 'div': + del tattr['bgcolor'] + + elif tname == 'font': + # Change font tags to span tags + tname = 'span' + if ttype in ('begin', 'single', 'single_ext'): + # move the face attribute to css font-family + if 'face' in tattr: + face = tattr.pop('face').strip() + styles.append('font-family: "%s"' % face) + + # Monitor the constantly changing font sizes, change them to ems and move + # them to css. The following will work for 'flat' font tags, but nested font tags + # will cause things to go wonky. Need to revert to the parent font tag's size + # when a closing tag is encountered. + if 'size' in tattr: + sz = tattr.pop('size').strip().lower() + try: + float(sz) + except ValueError: + if sz in size_map: + sz = size_map[sz] + else: + if sz.startswith('-') or sz.startswith('+'): + sz = self.current_font_size + float(sz) + if sz > 7: + sz = 7 + elif sz < 1: + sz = 1 + sz = str(int(sz)) + styles.append('font-size: %s' % size_to_em_map[sz]) + self.current_font_size = int(sz) + + elif tname == 'img': + for attr in ('width', 'height'): + if attr in tattr: + val = tattr[attr] + if val.lower().endswith('em'): + try: + nval = float(val[:-2]) + nval *= 16 * (168.451/72) # Assume this was set using the Kindle profile + tattr[attr] = "%dpx"%int(nval) + except: + del tattr[attr] + elif val.lower().endswith('%'): + del tattr[attr] + + # convert the anchor tags + if 'filepos-id' in tattr: + tattr['id'] = tattr.pop('filepos-id') + if 'name' in tattr and tattr['name'] != tattr['id']: + tattr['name'] = tattr['id'] + + if 'filepos' in tattr: + filepos = tattr.pop('filepos') + try: + tattr['href'] = "#filepos%d" % int(filepos) + except ValueError: + pass + + if styles: + ncls = None + rule = '; '.join(styles) + for sel, srule in self.tag_css_rules.items(): + if srule == rule: + ncls = sel + break + if ncls is None: + self.tag_css_rule_cnt += 1 + ncls = 'rule_%d' % self.tag_css_rule_cnt + self.tag_css_rules[ncls] = rule + cls = tattr.get('class', '') + cls = cls + (' ' if cls else '') + ncls + tattr['class'] = cls + + # convert updated tag back to string representation + if len(tattr) == 0: + tattr = None + taginfo = (ttype, tname, tattr) + return self.taginfo_tostring(taginfo) + +''' main only left in for testing outside of plugin ''' + +def main(argv=sys.argv): + if len(argv) != 2: + return 1 + else: + infile = argv[1] + + try: + print('Converting Mobi Markup Language to XHTML') + mlc = MobiMLConverter(infile) + print('Processing ...') + htmlstr, css, cssname = mlc.processml() + outname = infile.rsplit('.',1)[0] + '_converted.html' + open(outname, 'w').write(htmlstr) + open(cssname, 'w').write(css) + print('Completed') + print('XHTML version of book can be found at: ' + outname) + + except ValueError as e: + print("Error: %s" % e) + return 1 + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) |