aboutsummaryrefslogtreecommitdiffstats
path: root/src/epy_reader/tools/KindleUnpack/mobiml2xhtml.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/epy_reader/tools/KindleUnpack/mobiml2xhtml.py')
-rwxr-xr-xsrc/epy_reader/tools/KindleUnpack/mobiml2xhtml.py527
1 files changed, 527 insertions, 0 deletions
diff --git a/src/epy_reader/tools/KindleUnpack/mobiml2xhtml.py b/src/epy_reader/tools/KindleUnpack/mobiml2xhtml.py
new file mode 100755
index 0000000..94fc671
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/mobiml2xhtml.py
@@ -0,0 +1,527 @@
+#! /usr/bin/python
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+
+# this program works in concert with the output from KindleUnpack
+
+'''
+Convert from Mobi ML to XHTML
+'''
+
+from __future__ import division, absolute_import, print_function
+
+import os
+import sys
+import re
+
+SPECIAL_HANDLING_TAGS = {
+ '?xml' : ('xmlheader', -1),
+ '!--' : ('comment', -3),
+ '!DOCTYPE' : ('doctype', -1),
+}
+
+SPECIAL_HANDLING_TYPES = ['xmlheader', 'doctype', 'comment']
+
+SELF_CLOSING_TAGS = ['br' , 'hr', 'input', 'img', 'image', 'meta', 'spacer', 'link', 'frame', 'base', 'col', 'reference']
+
+class MobiMLConverter(object):
+
+ PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
+ IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
+
+ def __init__(self, filename):
+ self.base_css_rules = 'blockquote { margin: 0em 0em 0em 1.25em }\n'
+ self.base_css_rules += 'p { margin: 0em }\n'
+ self.base_css_rules += '.bold { font-weight: bold }\n'
+ self.base_css_rules += '.italic { font-style: italic }\n'
+ self.base_css_rules += '.mbp_pagebreak { page-break-after: always; margin: 0; display: block }\n'
+ self.tag_css_rules = {}
+ self.tag_css_rule_cnt = 0
+ self.path = []
+ self.filename = filename
+ self.wipml = open(self.filename, 'r').read()
+ self.pos = 0
+ self.opfname = self.filename.rsplit('.',1)[0] + '.opf'
+ self.opos = 0
+ self.meta = ''
+ self.cssname = os.path.join(os.path.dirname(self.filename),'styles.css')
+ self.current_font_size = 3
+ self.font_history = []
+
+ def cleanup_html(self):
+ self.wipml = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.wipml)
+ self.wipml = self.wipml.replace('\r\n', '\n')
+ self.wipml = self.wipml.replace('> <', '>\n<')
+ self.wipml = self.wipml.replace('<mbp: ', '<mbp:')
+ # self.wipml = re.sub(r'<?xml[^>]*>', '', self.wipml)
+ self.wipml = self.wipml.replace('<br></br>','<br/>')
+
+ def replace_page_breaks(self):
+ self.wipml = self.PAGE_BREAK_PAT.sub(
+ '<div class="mbp_pagebreak" />',
+ self.wipml)
+
+ # parse leading text of ml and tag
+ def parseml(self):
+ p = self.pos
+ if p >= len(self.wipml):
+ return None
+ if self.wipml[p] != '<':
+ res = self.wipml.find('<',p)
+ if res == -1 :
+ res = len(self.wipml)
+ self.pos = res
+ return self.wipml[p:res], None
+ # handle comment as a special case to deal with multi-line comments
+ if self.wipml[p:p+4] == '<!--':
+ te = self.wipml.find('-->',p+1)
+ if te != -1:
+ te = te+2
+ else :
+ te = self.wipml.find('>',p+1)
+ ntb = self.wipml.find('<',p+1)
+ if ntb != -1 and ntb < te:
+ self.pos = ntb
+ return self.wipml[p:ntb], None
+ self.pos = te + 1
+ return None, self.wipml[p:te+1]
+
+ # parses string version of tag to identify its name,
+ # its type 'begin', 'end' or 'single',
+ # plus build a hashtable of its attributes
+ # code is written to handle the possiblity of very poor formating
+ def parsetag(self, s):
+ p = 1
+ # get the tag name
+ tname = None
+ ttype = None
+ tattr = {}
+ while s[p:p+1] == ' ' :
+ p += 1
+ if s[p:p+1] == '/':
+ ttype = 'end'
+ p += 1
+ while s[p:p+1] == ' ' :
+ p += 1
+ b = p
+ while s[p:p+1] not in ('>', '/', ' ', '"', "'", "\r", "\n") :
+ p += 1
+ tname=s[b:p].lower()
+ if tname == '!doctype':
+ tname = '!DOCTYPE'
+ # special cases
+ if tname in SPECIAL_HANDLING_TAGS:
+ ttype, backstep = SPECIAL_HANDLING_TAGS[tname]
+ tattr['special'] = s[p:backstep]
+ if ttype is None:
+ # parse any attributes
+ while s.find('=',p) != -1 :
+ while s[p:p+1] == ' ' :
+ p += 1
+ b = p
+ while s[p:p+1] != '=' :
+ p += 1
+ aname = s[b:p].lower()
+ aname = aname.rstrip(' ')
+ p += 1
+ while s[p:p+1] == ' ' :
+ p += 1
+ if s[p:p+1] in ('"', "'") :
+ p = p + 1
+ b = p
+ while s[p:p+1] not in ('"', "'") :
+ p += 1
+ val = s[b:p]
+ p += 1
+ else :
+ b = p
+ while s[p:p+1] not in ('>', '/', ' ') :
+ p += 1
+ val = s[b:p]
+ tattr[aname] = val
+ # label beginning and single tags
+ if ttype is None:
+ ttype = 'begin'
+ if s.find(' /',p) >= 0:
+ ttype = 'single_ext'
+ elif s.find('/',p) >= 0:
+ ttype = 'single'
+ return ttype, tname, tattr
+
+ # main routine to convert from mobi markup language to html
+ def processml(self):
+
+ # are these really needed
+ html_done = False
+ head_done = False
+ body_done = False
+
+ skip = False
+
+ htmlstr = ''
+ self.replace_page_breaks()
+ self.cleanup_html()
+
+ # now parse the cleaned up ml into standard xhtml
+ while True:
+
+ r = self.parseml()
+ if not r:
+ break
+
+ text, tag = r
+
+ if text:
+ if not skip:
+ htmlstr += text
+
+ if tag:
+ ttype, tname, tattr = self.parsetag(tag)
+
+ # If we run into a DTD or xml declarations inside the body ... bail.
+ if tname in SPECIAL_HANDLING_TAGS and tname != 'comment' and body_done:
+ htmlstr += '\n</body></html>'
+ break
+
+ # make sure self-closing tags actually self-close
+ if ttype == 'begin' and tname in SELF_CLOSING_TAGS:
+ ttype = 'single'
+
+ # make sure any end tags of self-closing tags are discarded
+ if ttype == 'end' and tname in SELF_CLOSING_TAGS:
+ continue
+
+ # remove embedded guide and refernces from old mobis
+ if tname in ('guide', 'ncx', 'reference') and ttype in ('begin', 'single', 'single_ext'):
+ tname = 'removeme:{0}'.format(tname)
+ tattr = None
+ if tname in ('guide', 'ncx', 'reference', 'font', 'span') and ttype == 'end':
+ if self.path[-1] == 'removeme:{0}'.format(tname):
+ tname = 'removeme:{0}'.format(tname)
+ tattr = None
+
+ # Get rid of font tags that only have a color attribute.
+ if tname == 'font' and ttype in ('begin', 'single', 'single_ext'):
+ if 'color' in tattr and len(tattr) == 1:
+ tname = 'removeme:{0}'.format(tname)
+ tattr = None
+
+ # Get rid of empty spans in the markup.
+ if tname == 'span' and ttype in ('begin', 'single', 'single_ext') and not len(tattr):
+ tname = 'removeme:{0}'.format(tname)
+
+ # need to handle fonts outside of the normal methods
+ # so fonts tags won't be added to the self.path since we keep track
+ # of font tags separately with self.font_history
+ if tname == 'font' and ttype == 'begin':
+ # check for nested font start tags
+ if len(self.font_history) > 0 :
+ # inject a font end tag
+ taginfo = ('end', 'font', None)
+ htmlstr += self.processtag(taginfo)
+ self.font_history.append((ttype, tname, tattr))
+ # handle the current font start tag
+ taginfo = (ttype, tname, tattr)
+ htmlstr += self.processtag(taginfo)
+ continue
+
+ # check for nested font tags and unnest them
+ if tname == 'font' and ttype == 'end':
+ self.font_history.pop()
+ # handle this font end tag
+ taginfo = ('end', 'font', None)
+ htmlstr += self.processtag(taginfo)
+ # check if we were nested
+ if len(self.font_history) > 0:
+ # inject a copy of the most recent font start tag from history
+ taginfo = self.font_history[-1]
+ htmlstr += self.processtag(taginfo)
+ continue
+
+ # keep track of nesting path
+ if ttype == 'begin':
+ self.path.append(tname)
+ elif ttype == 'end':
+ if tname != self.path[-1]:
+ print('improper nesting: ', self.path, tname, ttype)
+ if tname not in self.path:
+ # handle case of end tag with no beginning by injecting empty begin tag
+ taginfo = ('begin', tname, None)
+ htmlstr += self.processtag(taginfo)
+ print(" - fixed by injecting empty start tag ", tname)
+ self.path.append(tname)
+ elif len(self.path) > 1 and tname == self.path[-2]:
+ # handle case of dangling missing end
+ taginfo = ('end', self.path[-1], None)
+ htmlstr += self.processtag(taginfo)
+ print(" - fixed by injecting end tag ", self.path[-1])
+ self.path.pop()
+ self.path.pop()
+
+ if tname == 'removeme:{0}'.format(tname):
+ if ttype in ('begin', 'single', 'single_ext'):
+ skip = True
+ else:
+ skip = False
+ else:
+ taginfo = (ttype, tname, tattr)
+ htmlstr += self.processtag(taginfo)
+
+ # handle potential issue of multiple html, head, and body sections
+ if tname == 'html' and ttype == 'begin' and not html_done:
+ htmlstr += '\n'
+ html_done = True
+
+ if tname == 'head' and ttype == 'begin' and not head_done:
+ htmlstr += '\n'
+ # also add in metadata and style link tags
+ htmlstr += self.meta
+ htmlstr += '<link href="styles.css" rel="stylesheet" type="text/css" />\n'
+ head_done = True
+
+ if tname == 'body' and ttype == 'begin' and not body_done:
+ htmlstr += '\n'
+ body_done = True
+
+ # handle issue of possibly missing html, head, and body tags
+ # I have not seen this but the original did something like this so ...
+ if not body_done:
+ htmlstr = '<body>\n' + htmlstr + '</body>\n'
+ if not head_done:
+ headstr = '<head>\n'
+ headstr += self.meta
+ headstr += '<link href="styles.css" rel="stylesheet" type="text/css" />\n'
+ headstr += '</head>\n'
+ htmlstr = headstr + htmlstr
+ if not html_done:
+ htmlstr = '<html>\n' + htmlstr + '</html>\n'
+
+ # finally add DOCTYPE info
+ htmlstr = '<?xml version="1.0"?>\n<!DOCTYPE HTML PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n' + htmlstr
+
+ css = self.base_css_rules
+ for cls, rule in self.tag_css_rules.items():
+ css += '.%s { %s }\n' % (cls, rule)
+
+ return (htmlstr, css, self.cssname)
+
+ def ensure_unit(self, raw, unit='px'):
+ if re.search(r'\d+$', raw) is not None:
+ raw += unit
+ return raw
+
+ # flatten possibly modified tag back to string
+ def taginfo_tostring(self, taginfo):
+ (ttype, tname, tattr) = taginfo
+ if ttype is None or tname is None:
+ return ''
+ if ttype == 'end':
+ return '</%s>' % tname
+ if ttype in SPECIAL_HANDLING_TYPES and tattr is not None and 'special' in tattr:
+ info = tattr['special']
+ if ttype == 'comment':
+ return '<%s %s-->' % (tname, info)
+ else:
+ return '<%s %s>' % (tname, info)
+ res = []
+ res.append('<%s' % tname)
+ if tattr is not None:
+ for key in tattr:
+ res.append(' %s="%s"' % (key, tattr[key]))
+ if ttype == 'single':
+ res.append('/>')
+ elif ttype == 'single_ext':
+ res.append(' />')
+ else :
+ res.append('>')
+ return "".join(res)
+
+ # routines to convert from mobi ml tags atributes to xhtml attributes and styles
+ def processtag(self, taginfo):
+ # Converting mobi font sizes to numerics
+ size_map = {
+ 'xx-small': '1',
+ 'x-small': '2',
+ 'small': '3',
+ 'medium': '4',
+ 'large': '5',
+ 'x-large': '6',
+ 'xx-large': '7',
+ }
+
+ size_to_em_map = {
+ '1': '.65em',
+ '2': '.75em',
+ '3': '1em',
+ '4': '1.125em',
+ '5': '1.25em',
+ '6': '1.5em',
+ '7': '2em',
+ }
+
+ # current tag to work on
+ (ttype, tname, tattr) = taginfo
+ if not tattr:
+ tattr = {}
+
+ styles = []
+
+ if tname is None or tname.startswith('removeme'):
+ return ''
+
+ # have not seen an example of this yet so keep it here to be safe
+ # until this is better understood
+ if tname in ('country-region', 'place', 'placetype', 'placename',
+ 'state', 'city', 'street', 'address', 'content'):
+ tname = 'div' if tname == 'content' else 'span'
+ for key in tattr:
+ tattr.pop(key)
+
+ # handle general case of style, height, width, bgcolor in any tag
+ if 'style' in tattr:
+ style = tattr.pop('style').strip()
+ if style:
+ styles.append(style)
+
+ if 'align' in tattr:
+ align = tattr.pop('align').strip()
+ if align:
+ if tname in ('table', 'td', 'tr'):
+ pass
+ else:
+ styles.append('text-align: %s' % align)
+
+ if 'height' in tattr:
+ height = tattr.pop('height').strip()
+ if height and '<' not in height and '>' not in height and re.search(r'\d+', height):
+ if tname in ('table', 'td', 'tr'):
+ pass
+ elif tname == 'img':
+ tattr['height'] = height
+ else:
+ styles.append('margin-top: %s' % self.ensure_unit(height))
+
+ if 'width' in tattr:
+ width = tattr.pop('width').strip()
+ if width and re.search(r'\d+', width):
+ if tname in ('table', 'td', 'tr'):
+ pass
+ elif tname == 'img':
+ tattr['width'] = width
+ else:
+ styles.append('text-indent: %s' % self.ensure_unit(width))
+ if width.startswith('-'):
+ styles.append('margin-left: %s' % self.ensure_unit(width[1:]))
+
+ if 'bgcolor' in tattr:
+ # no proprietary html allowed
+ if tname == 'div':
+ del tattr['bgcolor']
+
+ elif tname == 'font':
+ # Change font tags to span tags
+ tname = 'span'
+ if ttype in ('begin', 'single', 'single_ext'):
+ # move the face attribute to css font-family
+ if 'face' in tattr:
+ face = tattr.pop('face').strip()
+ styles.append('font-family: "%s"' % face)
+
+ # Monitor the constantly changing font sizes, change them to ems and move
+ # them to css. The following will work for 'flat' font tags, but nested font tags
+ # will cause things to go wonky. Need to revert to the parent font tag's size
+ # when a closing tag is encountered.
+ if 'size' in tattr:
+ sz = tattr.pop('size').strip().lower()
+ try:
+ float(sz)
+ except ValueError:
+ if sz in size_map:
+ sz = size_map[sz]
+ else:
+ if sz.startswith('-') or sz.startswith('+'):
+ sz = self.current_font_size + float(sz)
+ if sz > 7:
+ sz = 7
+ elif sz < 1:
+ sz = 1
+ sz = str(int(sz))
+ styles.append('font-size: %s' % size_to_em_map[sz])
+ self.current_font_size = int(sz)
+
+ elif tname == 'img':
+ for attr in ('width', 'height'):
+ if attr in tattr:
+ val = tattr[attr]
+ if val.lower().endswith('em'):
+ try:
+ nval = float(val[:-2])
+ nval *= 16 * (168.451/72) # Assume this was set using the Kindle profile
+ tattr[attr] = "%dpx"%int(nval)
+ except:
+ del tattr[attr]
+ elif val.lower().endswith('%'):
+ del tattr[attr]
+
+ # convert the anchor tags
+ if 'filepos-id' in tattr:
+ tattr['id'] = tattr.pop('filepos-id')
+ if 'name' in tattr and tattr['name'] != tattr['id']:
+ tattr['name'] = tattr['id']
+
+ if 'filepos' in tattr:
+ filepos = tattr.pop('filepos')
+ try:
+ tattr['href'] = "#filepos%d" % int(filepos)
+ except ValueError:
+ pass
+
+ if styles:
+ ncls = None
+ rule = '; '.join(styles)
+ for sel, srule in self.tag_css_rules.items():
+ if srule == rule:
+ ncls = sel
+ break
+ if ncls is None:
+ self.tag_css_rule_cnt += 1
+ ncls = 'rule_%d' % self.tag_css_rule_cnt
+ self.tag_css_rules[ncls] = rule
+ cls = tattr.get('class', '')
+ cls = cls + (' ' if cls else '') + ncls
+ tattr['class'] = cls
+
+ # convert updated tag back to string representation
+ if len(tattr) == 0:
+ tattr = None
+ taginfo = (ttype, tname, tattr)
+ return self.taginfo_tostring(taginfo)
+
+''' main only left in for testing outside of plugin '''
+
+def main(argv=sys.argv):
+ if len(argv) != 2:
+ return 1
+ else:
+ infile = argv[1]
+
+ try:
+ print('Converting Mobi Markup Language to XHTML')
+ mlc = MobiMLConverter(infile)
+ print('Processing ...')
+ htmlstr, css, cssname = mlc.processml()
+ outname = infile.rsplit('.',1)[0] + '_converted.html'
+ open(outname, 'w').write(htmlstr)
+ open(cssname, 'w').write(css)
+ print('Completed')
+ print('XHTML version of book can be found at: ' + outname)
+
+ except ValueError as e:
+ print("Error: %s" % e)
+ return 1
+
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())