#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# this program works in concert with the output from KindleUnpack
'''
Convert from Mobi ML to XHTML
'''
from __future__ import division, absolute_import, print_function
import os
import sys
import re
SPECIAL_HANDLING_TAGS = {
'?xml' : ('xmlheader', -1),
'!--' : ('comment', -3),
'!DOCTYPE' : ('doctype', -1),
}
SPECIAL_HANDLING_TYPES = ['xmlheader', 'doctype', 'comment']
SELF_CLOSING_TAGS = ['br' , 'hr', 'input', 'img', 'image', 'meta', 'spacer', 'link', 'frame', 'base', 'col', 'reference']
class MobiMLConverter(object):
PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
def __init__(self, filename):
self.base_css_rules = 'blockquote { margin: 0em 0em 0em 1.25em }\n'
self.base_css_rules += 'p { margin: 0em }\n'
self.base_css_rules += '.bold { font-weight: bold }\n'
self.base_css_rules += '.italic { font-style: italic }\n'
self.base_css_rules += '.mbp_pagebreak { page-break-after: always; margin: 0; display: block }\n'
self.tag_css_rules = {}
self.tag_css_rule_cnt = 0
self.path = []
self.filename = filename
self.wipml = open(self.filename, 'r').read()
self.pos = 0
self.opfname = self.filename.rsplit('.',1)[0] + '.opf'
self.opos = 0
self.meta = ''
self.cssname = os.path.join(os.path.dirname(self.filename),'styles.css')
self.current_font_size = 3
self.font_history = []
def cleanup_html(self):
self.wipml = re.sub(r'
', '', self.wipml)
self.wipml = self.wipml.replace('\r\n', '\n')
self.wipml = self.wipml.replace('> <', '>\n<')
self.wipml = self.wipml.replace(']*>', '', self.wipml)
self.wipml = self.wipml.replace('
','
')
def replace_page_breaks(self):
self.wipml = self.PAGE_BREAK_PAT.sub(
'',
self.wipml)
# parse leading text of ml and tag
def parseml(self):
p = self.pos
if p >= len(self.wipml):
return None
if self.wipml[p] != '<':
res = self.wipml.find('<',p)
if res == -1 :
res = len(self.wipml)
self.pos = res
return self.wipml[p:res], None
# handle comment as a special case to deal with multi-line comments
if self.wipml[p:p+4] == '',p+1)
if te != -1:
te = te+2
else :
te = self.wipml.find('>',p+1)
ntb = self.wipml.find('<',p+1)
if ntb != -1 and ntb < te:
self.pos = ntb
return self.wipml[p:ntb], None
self.pos = te + 1
return None, self.wipml[p:te+1]
# parses string version of tag to identify its name,
# its type 'begin', 'end' or 'single',
# plus build a hashtable of its attributes
# code is written to handle the possiblity of very poor formating
def parsetag(self, s):
p = 1
# get the tag name
tname = None
ttype = None
tattr = {}
while s[p:p+1] == ' ' :
p += 1
if s[p:p+1] == '/':
ttype = 'end'
p += 1
while s[p:p+1] == ' ' :
p += 1
b = p
while s[p:p+1] not in ('>', '/', ' ', '"', "'", "\r", "\n") :
p += 1
tname=s[b:p].lower()
if tname == '!doctype':
tname = '!DOCTYPE'
# special cases
if tname in SPECIAL_HANDLING_TAGS:
ttype, backstep = SPECIAL_HANDLING_TAGS[tname]
tattr['special'] = s[p:backstep]
if ttype is None:
# parse any attributes
while s.find('=',p) != -1 :
while s[p:p+1] == ' ' :
p += 1
b = p
while s[p:p+1] != '=' :
p += 1
aname = s[b:p].lower()
aname = aname.rstrip(' ')
p += 1
while s[p:p+1] == ' ' :
p += 1
if s[p:p+1] in ('"', "'") :
p = p + 1
b = p
while s[p:p+1] not in ('"', "'") :
p += 1
val = s[b:p]
p += 1
else :
b = p
while s[p:p+1] not in ('>', '/', ' ') :
p += 1
val = s[b:p]
tattr[aname] = val
# label beginning and single tags
if ttype is None:
ttype = 'begin'
if s.find(' /',p) >= 0:
ttype = 'single_ext'
elif s.find('/',p) >= 0:
ttype = 'single'
return ttype, tname, tattr
# main routine to convert from mobi markup language to html
def processml(self):
# are these really needed
html_done = False
head_done = False
body_done = False
skip = False
htmlstr = ''
self.replace_page_breaks()
self.cleanup_html()
# now parse the cleaned up ml into standard xhtml
while True:
r = self.parseml()
if not r:
break
text, tag = r
if text:
if not skip:
htmlstr += text
if tag:
ttype, tname, tattr = self.parsetag(tag)
# If we run into a DTD or xml declarations inside the body ... bail.
if tname in SPECIAL_HANDLING_TAGS and tname != 'comment' and body_done:
htmlstr += '\n