#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

# this program works in concert with the output from KindleUnpack

'''
Convert from Mobi ML to XHTML
'''

from __future__ import division, absolute_import, print_function

import os
import sys
import re

# Tags that are copied through verbatim instead of being parsed for
# attributes.  Each entry maps the tag name to (tag type, backstep) where
# backstep is the negative slice index at which the tag's payload ends.
SPECIAL_HANDLING_TAGS = {
    '?xml'     : ('xmlheader', -1),
    '!--'      : ('comment', -3),
    '!DOCTYPE' : ('doctype', -1),
}

SPECIAL_HANDLING_TYPES = ['xmlheader', 'doctype', 'comment']

# Tags that are always written as self-closing; their end tags are discarded.
SELF_CLOSING_TAGS = ['br' , 'hr', 'input', 'img', 'image', 'meta', 'spacer', 'link', 'frame', 'base', 'col', 'reference']


class MobiMLConverter(object):
    '''Convert a Mobi markup file into an XHTML string plus a stylesheet.

    Typical use is ``MobiMLConverter(filename).processml()``.
    '''

    PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
    IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')

    # named font sizes -> numeric font size
    _FONT_SIZE_NAMES = {
        'xx-small': '1',
        'x-small': '2',
        'small': '3',
        'medium': '4',
        'large': '5',
        'x-large': '6',
        'xx-large': '7',
    }

    # numeric font size -> css font-size value
    _FONT_SIZE_EMS = {
        '1': '.65em',
        '2': '.75em',
        '3': '1em',
        '4': '1.125em',
        '5': '1.25em',
        '6': '1.5em',
        '7': '2em',
    }

    # tags that are replaced by a bare span (or div for 'content')
    _PLACE_TAGS = ('country-region', 'place', 'placetype', 'placename',
                   'state', 'city', 'street', 'address', 'content')

    _TABLE_TAGS = ('table', 'td', 'tr')
    _OPENING_TYPES = ('begin', 'single', 'single_ext')

    def __init__(self, filename):
        '''Read the markup in filename and reset all conversion state.

        filename -- path of the Mobi markup file to convert; the opf name
                    and the stylesheet path are derived from it.
        '''
        self.base_css_rules = 'blockquote { margin: 0em 0em 0em 1.25em }\n'
        self.base_css_rules += 'p { margin: 0em }\n'
        self.base_css_rules += '.bold { font-weight: bold }\n'
        self.base_css_rules += '.italic { font-style: italic }\n'
        self.base_css_rules += '.mbp_pagebreak { page-break-after: always; margin: 0; display: block }\n'
        # generated css class name -> css rule text
        self.tag_css_rules = {}
        self.tag_css_rule_cnt = 0
        # stack of the names of the currently open (non-font) tags
        self.path = []
        self.filename = filename
        # close the input file as soon as it has been read
        with open(self.filename, 'r') as mlfile:
            self.wipml = mlfile.read()
        # current parse position inside self.wipml
        self.pos = 0
        self.opfname = self.filename.rsplit('.',1)[0] + '.opf'
        self.opos = 0
        # extra markup inserted into the generated head section
        self.meta = ''
        self.cssname = os.path.join(os.path.dirname(self.filename),'styles.css')
        self.current_font_size = 3
        # stack of the taginfo tuples of the currently open font tags
        self.font_history = []

    def cleanup_html(self):
        '''Normalise the raw markup in self.wipml before it is parsed.'''
        # NOTE(review): the markup inside the literals of this method was
        # stripped from the reviewed source; the values below are a
        # reconstruction - confirm against the upstream file.
        self.wipml = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.wipml)
        self.wipml = self.wipml.replace('\r\n', '\n')
        self.wipml = self.wipml.replace('> <', '>\n<')
        self.wipml = self.wipml.replace('<mbp: ', '<mbp:')
        self.wipml = self.wipml.replace('<br></br>', '<br/>')

    def replace_page_breaks(self):
        '''Replace each run of mbp:pagebreak tags by one page-break div.'''
        # NOTE(review): replacement literal reconstructed (it was stripped
        # from the reviewed source); the class name matches the
        # .mbp_pagebreak rule in base_css_rules.
        self.wipml = self.PAGE_BREAK_PAT.sub(
            '<div class="mbp_pagebreak" />',
            self.wipml)

    # parse leading text of ml and tag
    def parseml(self):
        '''Return the next piece of self.wipml and advance self.pos.

        Returns None once the input is exhausted, (text, None) for a run of
        plain text and (None, tag) for a complete tag or comment.
        '''
        ml = self.wipml
        p = self.pos
        if p >= len(ml):
            return None
        if ml[p] != '<':
            res = ml.find('<', p)
            if res == -1:
                res = len(ml)
            self.pos = res
            return ml[p:res], None
        # handle comment as a special case to deal with multi-line comments
        if ml[p:p+4] == '<!--':
            te = ml.find('-->', p+1)
            if te == -1:
                # unterminated comment: hand the remainder back as text.
                # Falling through with te == -1 would reset self.pos to 0
                # and make the caller loop forever.
                self.pos = len(ml)
                return ml[p:], None
            te += 2
        else:
            te = ml.find('>', p+1)
            ntb = ml.find('<', p+1)
            if te == -1:
                # unterminated tag: treat it as text up to the next '<'
                end = ntb if ntb != -1 else len(ml)
                self.pos = end
                return ml[p:end], None
            if ntb != -1 and ntb < te:
                # a new tag starts before this one closes, so this '<'
                # was really just text
                self.pos = ntb
                return ml[p:ntb], None
        self.pos = te + 1
        return None, ml[p:te+1]

    # parses string version of tag to identify its name,
    # its type 'begin', 'end' or 'single',
    # plus build a hashtable of its attributes
    # code is written to handle the possibility of very poor formatting
    def parsetag(self, s):
        '''Split the tag string s into (ttype, tname, tattr).

        ttype is 'begin', 'end', 'single', 'single_ext' or one of
        SPECIAL_HANDLING_TYPES; tname is the lower-cased tag name (except
        '!DOCTYPE'); tattr maps lower-cased attribute names to raw values.
        For special tags tattr only holds the verbatim payload under the
        key 'special'.
        '''
        n = len(s)
        p = 1
        # get the tag name
        ttype = None
        tattr = {}
        while s[p:p+1] == ' ':
            p += 1
        if s[p:p+1] == '/':
            ttype = 'end'
            p += 1
            while s[p:p+1] == ' ':
                p += 1
        b = p
        while p < n and s[p] not in ('>', '/', ' ', '"', "'", "\r", "\n"):
            p += 1
        tname = s[b:p].lower()
        if tname == '!doctype':
            tname = '!DOCTYPE'
        # special cases
        if tname in SPECIAL_HANDLING_TAGS:
            ttype, backstep = SPECIAL_HANDLING_TAGS[tname]
            tattr['special'] = s[p:backstep]
        if ttype is None:
            # parse any attributes
            while s.find('=', p) != -1:
                while s[p:p+1] == ' ':
                    p += 1
                b = p
                while s[p:p+1] != '=':
                    p += 1
                # strip() also drops newlines or tabs that separated the
                # attribute from whatever preceded it
                aname = s[b:p].strip().lower()
                p += 1
                while s[p:p+1] == ' ':
                    p += 1
                quote = s[p:p+1]
                if quote in ('"', "'"):
                    p += 1
                    b = p
                    # only the same quote character closes the value, and
                    # the scan must stop at the end of the tag
                    while p < n and s[p] != quote:
                        p += 1
                    val = s[b:p]
                    if p >= n:
                        # unterminated quote: drop the tag's closing '>'
                        val = val.rstrip('>')
                    p += 1
                else:
                    b = p
                    while p < n and s[p] not in ('>', '/', ' '):
                        p += 1
                    val = s[b:p]
                tattr[aname] = val
        # label beginning and single tags
        if ttype is None:
            ttype = 'begin'
            if s.find(' /', p) >= 0:
                ttype = 'single_ext'
            elif s.find('/', p) >= 0:
                ttype = 'single'
        return ttype, tname, tattr

    # main routine to convert from mobi markup language to html
    def processml(self):
        '''Convert self.wipml to XHTML.

        Returns (htmlstr, css, cssname): the XHTML document, the text of
        the stylesheet and the path the stylesheet should be written to.
        '''
        html_done = False
        head_done = False
        body_done = False

        out = []
        self.replace_page_breaks()
        self.cleanup_html()

        # now parse the cleaned up ml into standard xhtml
        while True:
            r = self.parseml()
            if not r:
                break
            text, tag = r

            if text:
                # NOTE(review): the original guarded this with a 'skip' flag
                # that was only set when tname == 'removeme:' + tname, which
                # can never be true; text inside removed tags has therefore
                # always been kept and still is.
                out.append(text)

            if not tag:
                continue

            ttype, tname, tattr = self.parsetag(tag)

            # If we run into a DTD or xml declarations inside the body ... bail.
            # Comments are identified by their type: their tname is '!--'.
            if tname in SPECIAL_HANDLING_TAGS and ttype != 'comment' and body_done:
                # NOTE(review): closing markup reconstructed (stripped from
                # the reviewed source) - confirm.
                out.append('\n</body></html>')
                break

            # make sure self-closing tags actually self-close
            if ttype == 'begin' and tname in SELF_CLOSING_TAGS:
                ttype = 'single'

            # make sure any end tags of self-closing tags are discarded
            if ttype == 'end' and tname in SELF_CLOSING_TAGS:
                continue

            # remove embedded guide and references from old mobis
            if tname in ('guide', 'ncx', 'reference') and ttype in self._OPENING_TYPES:
                tname = 'removeme:{0}'.format(tname)
                tattr = None
            if tname in ('guide', 'ncx', 'reference', 'font', 'span') and ttype == 'end':
                # self.path may be empty when the end tag has no start tag
                if self.path and self.path[-1] == 'removeme:{0}'.format(tname):
                    tname = 'removeme:{0}'.format(tname)
                    tattr = None

            # Get rid of font tags that only have a color attribute.
            if tname == 'font' and ttype in self._OPENING_TYPES:
                if 'color' in tattr and len(tattr) == 1:
                    tname = 'removeme:{0}'.format(tname)
                    tattr = None

            # Get rid of empty spans in the markup.
            if tname == 'span' and ttype in self._OPENING_TYPES and not tattr:
                tname = 'removeme:{0}'.format(tname)

            # need to handle fonts outside of the normal methods
            # so fonts tags won't be added to the self.path since we keep track
            # of font tags separately with self.font_history
            if tname == 'font' and ttype == 'begin':
                # check for nested font start tags
                if self.font_history:
                    # inject a font end tag
                    out.append(self.processtag(('end', 'font', None)))
                self.font_history.append((ttype, tname, tattr))
                # handle the current font start tag
                out.append(self.processtag((ttype, tname, tattr)))
                continue

            # check for nested font tags and unnest them
            if tname == 'font' and ttype == 'end':
                if not self.font_history:
                    # stray end tag without an open font tag: discard it
                    # rather than emit an unbalanced end tag
                    continue
                self.font_history.pop()
                # handle this font end tag
                out.append(self.processtag(('end', 'font', None)))
                # check if we were nested
                if self.font_history:
                    # inject a copy of the most recent font start tag from history
                    out.append(self.processtag(self.font_history[-1]))
                continue

            # keep track of nesting path
            if ttype == 'begin':
                self.path.append(tname)
            elif ttype == 'end':
                if not self.path or tname != self.path[-1]:
                    print('improper nesting: ', self.path, tname, ttype)
                    if tname not in self.path:
                        # handle case of end tag with no beginning by injecting empty begin tag
                        out.append(self.processtag(('begin', tname, None)))
                        print(" - fixed by injecting empty start tag ", tname)
                        self.path.append(tname)
                    elif len(self.path) > 1 and tname == self.path[-2]:
                        # handle case of dangling missing end
                        out.append(self.processtag(('end', self.path[-1], None)))
                        print(" - fixed by injecting end tag ", self.path[-1])
                        self.path.pop()
                self.path.pop()

            # processtag returns '' for every 'removeme:' tag
            out.append(self.processtag((ttype, tname, tattr)))

            # handle potential issue of multiple html, head, and body sections
            if tname == 'html' and ttype == 'begin' and not html_done:
                out.append('\n')
                html_done = True

            if tname == 'head' and ttype == 'begin' and not head_done:
                out.append('\n')
                # also add in metadata and style link tags
                out.append(self.meta)
                # NOTE(review): link markup reconstructed (stripped from the
                # reviewed source) - confirm.
                out.append('<link href="styles.css" rel="stylesheet" type="text/css" />\n')
                head_done = True

            if tname == 'body' and ttype == 'begin' and not body_done:
                out.append('\n')
                body_done = True

        htmlstr = ''.join(out)

        # handle issue of possibly missing html, head, and body tags
        # NOTE(review): the wrapper and DOCTYPE literals below were stripped
        # from the reviewed source and are reconstructed - confirm the exact
        # markup (in particular the DOCTYPE) against the upstream file.
        if not body_done:
            htmlstr = '<body>\n' + htmlstr + '</body>\n'
        if not head_done:
            headstr = '<head>\n'
            headstr += self.meta
            headstr += '<link href="styles.css" rel="stylesheet" type="text/css" />\n'
            headstr += '</head>\n'
            htmlstr = headstr + htmlstr
        if not html_done:
            htmlstr = '<html>\n' + htmlstr + '</html>\n'

        # finally add DOCTYPE info
        htmlstr = ('<?xml version="1.0"?>\n'
                   '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" '
                   '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n') + htmlstr

        css = self.base_css_rules
        for cls, rule in self.tag_css_rules.items():
            css += '.%s { %s }\n' % (cls, rule)

        return (htmlstr, css, self.cssname)

    def ensure_unit(self, raw, unit='px'):
        '''Append unit to raw when raw ends in a digit; return the result.'''
        if re.search(r'\d+$', raw) is not None:
            raw += unit
        return raw

    # flatten possibly modified tag back to string
    def taginfo_tostring(self, taginfo):
        '''Return the markup for the (ttype, tname, tattr) tuple taginfo.

        Returns '' when the type or the name is None.  Attribute values are
        written as they are, without any escaping.
        '''
        (ttype, tname, tattr) = taginfo
        if ttype is None or tname is None:
            return ''
        if ttype == 'end':
            return '</%s>' % tname
        if ttype in SPECIAL_HANDLING_TYPES and tattr is not None and 'special' in tattr:
            info = tattr['special']
            if ttype == 'comment':
                return '<%s %s-->' % (tname, info)
            else:
                return '<%s %s>' % (tname, info)
        res = []
        res.append('<%s' % tname)
        if tattr is not None:
            for key in tattr:
                res.append(' %s="%s"' % (key, tattr[key]))
        if ttype == 'single':
            res.append('/>')
        elif ttype == 'single_ext':
            res.append(' />')
        else:
            res.append('>')
        return "".join(res)

    def _font_size_level(self, raw):
        '''Return the font size (1..7) for the size attribute raw, or None.

        raw may be a size name, an absolute number or a number with a
        leading sign, which is taken relative to self.current_font_size.
        None is returned when raw is neither a known name nor a number.
        '''
        if raw in self._FONT_SIZE_NAMES:
            return int(self._FONT_SIZE_NAMES[raw])
        try:
            value = float(raw)
        except ValueError:
            return None
        if raw.startswith('-') or raw.startswith('+'):
            value += self.current_font_size
        # clamp first so that nan and inf never reach int()
        return int(min(7, max(1, value)))

    # routines to convert from mobi ml tags attributes to xhtml attributes and styles
    def processtag(self, taginfo):
        '''Convert one Mobi tag to XHTML and return it as a string.

        taginfo is a (ttype, tname, tattr) tuple.  Presentational attributes
        are moved into css rules stored in self.tag_css_rules and referenced
        through a generated class.  A non-empty tattr dict is modified in
        place; processml relies on this when it replays a font start tag.
        Returns '' for tags without a name and for 'removeme' tags.
        '''
        (ttype, tname, tattr) = taginfo
        if not tattr:
            tattr = {}

        styles = []

        if tname is None or tname.startswith('removeme'):
            return ''

        # have not seen an example of this yet so keep it here to be safe
        # until this is better understood
        if tname in self._PLACE_TAGS:
            tname = 'div' if tname == 'content' else 'span'
            # popping keys while iterating the dict raises RuntimeError
            tattr.clear()

        # handle general case of style, height, width, bgcolor in any tag
        if 'style' in tattr:
            style = tattr.pop('style').strip()
            if style:
                styles.append(style)

        if 'align' in tattr:
            align = tattr.pop('align').strip()
            if align and tname not in self._TABLE_TAGS:
                styles.append('text-align: %s' % align)

        if 'height' in tattr:
            height = tattr.pop('height').strip()
            if height and '<' not in height and '>' not in height and re.search(r'\d+', height):
                if tname in self._TABLE_TAGS:
                    pass
                elif tname == 'img':
                    tattr['height'] = height
                else:
                    styles.append('margin-top: %s' % self.ensure_unit(height))

        if 'width' in tattr:
            width = tattr.pop('width').strip()
            if width and re.search(r'\d+', width):
                if tname in self._TABLE_TAGS:
                    pass
                elif tname == 'img':
                    tattr['width'] = width
                else:
                    styles.append('text-indent: %s' % self.ensure_unit(width))
                    if width.startswith('-'):
                        styles.append('margin-left: %s' % self.ensure_unit(width[1:]))

        # no proprietary html allowed
        if 'bgcolor' in tattr and tname == 'div':
            del tattr['bgcolor']

        # The font and img handling is deliberately independent of the
        # bgcolor test: a font start tag carrying bgcolor must still become
        # a span, because its end tag always does.
        if tname == 'font':
            # Change font tags to span tags
            tname = 'span'
            if ttype in self._OPENING_TYPES:
                # move the face attribute to css font-family
                if 'face' in tattr:
                    face = tattr.pop('face').strip()
                    styles.append('font-family: "%s"' % face)

                # change the font size to ems and move it to css; sizes
                # that cannot be understood are dropped
                if 'size' in tattr:
                    level = self._font_size_level(tattr.pop('size').strip().lower())
                    if level is not None:
                        styles.append('font-size: %s' % self._FONT_SIZE_EMS[str(level)])
                        self.current_font_size = level

        elif tname == 'img':
            for attr in ('width', 'height'):
                if attr in tattr:
                    val = tattr[attr]
                    if val.lower().endswith('em'):
                        try:
                            nval = float(val[:-2])
                            nval *= 16 * (168.451/72)  # Assume this was set using the Kindle profile
                            tattr[attr] = "%dpx" % int(nval)
                        except (ValueError, OverflowError):
                            # not a usable number: drop the attribute
                            del tattr[attr]
                    elif val.lower().endswith('%'):
                        del tattr[attr]

        # convert the anchor tags
        if 'filepos-id' in tattr:
            tattr['id'] = tattr.pop('filepos-id')
            if 'name' in tattr and tattr['name'] != tattr['id']:
                tattr['name'] = tattr['id']

        if 'filepos' in tattr:
            filepos = tattr.pop('filepos')
            try:
                tattr['href'] = "#filepos%d" % int(filepos)
            except ValueError:
                pass

        if styles:
            rule = '; '.join(styles)
            # reuse the class of an identical rule if there is one
            ncls = None
            for sel, srule in self.tag_css_rules.items():
                if srule == rule:
                    ncls = sel
                    break
            if ncls is None:
                self.tag_css_rule_cnt += 1
                ncls = 'rule_%d' % self.tag_css_rule_cnt
                self.tag_css_rules[ncls] = rule
            cls = tattr.get('class', '')
            cls = cls + (' ' if cls else '') + ncls
            tattr['class'] = cls

        # convert updated tag back to string representation
        if len(tattr) == 0:
            tattr = None
        taginfo = (ttype, tname, tattr)
        return self.taginfo_tostring(taginfo)


# main only left in for testing outside of plugin
def main(argv=sys.argv):
    '''Convert the Mobi markup file named by argv[1].

    Writes <name>_converted.html next to the input and the stylesheet to the
    path reported by the converter.  Returns 0 on success and 1 when the
    argument count is wrong or a ValueError is raised.
    '''
    if len(argv) != 2:
        return 1
    infile = argv[1]

    try:
        print('Converting Mobi Markup Language to XHTML')
        mlc = MobiMLConverter(infile)
        print('Processing ...')
        htmlstr, css, cssname = mlc.processml()
        outname = infile.rsplit('.',1)[0] + '_converted.html'
        # context managers make sure both outputs are flushed and closed
        with open(outname, 'w') as htmlfile:
            htmlfile.write(htmlstr)
        with open(cssname, 'w') as cssfile:
            cssfile.write(css)
        print('Completed')
        print('XHTML version of book can be found at: ' + outname)

    except ValueError as e:
        print("Error: %s" % e)
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())