#!/usr/bin/env python # -*- coding: utf-8 -*- # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab from __future__ import unicode_literals, division, absolute_import, print_function import os from .unipath import pathof from .compatibility_utils import unescapeit import re # note: re requites the pattern to be the exact same type as the data to be searched in python3 # but u"" is not allowed for the pattern itself only b"" from xml.sax.saxutils import escape as xmlescape from .mobi_utils import toBase32 from .mobi_index import MobiIndex DEBUG_NCX = False class ncxExtract: def __init__(self, mh, files): self.mh = mh self.sect = self.mh.sect self.files = files self.isNCX = False self.mi = MobiIndex(self.sect) self.ncxidx = self.mh.ncxidx self.indx_data = None def parseNCX(self): indx_data = [] tag_fieldname_map = { 1: ['pos',0], 2: ['len',0], 3: ['noffs',0], 4: ['hlvl',0], 5: ['koffs',0], 6: ['pos_fid',0], 21: ['parent',0], 22: ['child1',0], 23: ['childn',0] } if self.ncxidx != 0xffffffff: outtbl, ctoc_text = self.mi.getIndexData(self.ncxidx, "NCX") if DEBUG_NCX: print(ctoc_text) print(outtbl) num = 0 for [text, tagMap] in outtbl: tmp = { 'name': text.decode('utf-8'), 'pos': -1, 'len': 0, 'noffs': -1, 'text' : "Unknown Text", 'hlvl' : -1, 'kind' : "Unknown Kind", 'pos_fid' : None, 'parent' : -1, 'child1' : -1, 'childn' : -1, 'num' : num } for tag in tag_fieldname_map: [fieldname, i] = tag_fieldname_map[tag] if tag in tagMap: fieldvalue = tagMap[tag][i] if tag == 6: pos_fid = toBase32(fieldvalue,4).decode('utf-8') fieldvalue2 = tagMap[tag][i+1] pos_off = toBase32(fieldvalue2,10).decode('utf-8') fieldvalue = 'kindle:pos:fid:%s:off:%s' % (pos_fid, pos_off) tmp[fieldname] = fieldvalue if tag == 3: toctext = ctoc_text.get(fieldvalue, 'Unknown Text') toctext = toctext.decode(self.mh.codec) tmp['text'] = toctext if tag == 5: kindtext = ctoc_text.get(fieldvalue, 'Unknown Kind') kindtext = kindtext.decode(self.mh.codec) tmp['kind'] = kindtext indx_data.append(tmp) if DEBUG_NCX: print("record number: ", num) print("name: ", tmp['name'],) print("position", tmp['pos']," length: ", tmp['len']) print("text: ", tmp['text']) print("kind: ", tmp['kind']) print("heading level: ", tmp['hlvl']) print("parent:", tmp['parent']) print("first child: ",tmp['child1']," last child: ", tmp['childn']) print("pos_fid is ", tmp['pos_fid']) print("\n\n") num += 1 self.indx_data = indx_data return indx_data def buildNCX(self, htmlfile, title, ident, lang): indx_data = self.indx_data ncx_header = \ ''' %s ''' ncx_footer = \ ''' ''' ncx_entry = \ ''' %s ''' # recursive part def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1): if start>len(indx_data) or end>len(indx_data): print("Warning: missing INDX child entries", start, end, len(indx_data)) return '' if DEBUG_NCX: print("recursINDX lvl %d from %d to %d" % (lvl, start, end)) xml = '' if start <= 0: start = 0 if end <= 0: end = len(indx_data) if lvl > max_lvl: max_lvl = lvl indent = ' ' * (2 + lvl) for i in range(start, end): e = indx_data[i] if not e['hlvl'] == lvl: continue # open entry num += 1 link = '%s#filepos%d' % (htmlfile, e['pos']) tagid = 'np_%d' % num entry = ncx_entry % (tagid, num, xmlescape(unescapeit(e['text'])), link) entry = re.sub(re.compile('^', re.M), indent, entry, 0) xml += entry + '\n' # recurs if e['child1']>=0: xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1, e['child1'], e['childn'] + 1) xml += xmlrec # close entry xml += indent + '\n' return xml, max_lvl, num body, max_lvl, num = recursINDX() header = ncx_header % (lang, ident, max_lvl + 1, title) ncx = header + body + ncx_footer if not len(indx_data) == num: print("Warning: different number of entries in NCX", len(indx_data), num) return ncx def writeNCX(self, metadata): # build the xml self.isNCX = True print("Write ncx") # htmlname = os.path.basename(self.files.outbase) # htmlname += '.html' htmlname = 'book.html' xml = self.buildNCX(htmlname, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language')[0]) # write the ncx file # ncxname = os.path.join(self.files.mobi7dir, self.files.getInputFileBasename() + '.ncx') ncxname = os.path.join(self.files.mobi7dir, 'toc.ncx') with open(pathof(ncxname), 'wb') as f: f.write(xml.encode('utf-8')) def buildK8NCX(self, indx_data, title, ident, lang): ncx_header = \ ''' %s ''' ncx_footer = \ ''' ''' ncx_entry = \ ''' %s ''' # recursive part def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1): if start>len(indx_data) or end>len(indx_data): print("Warning: missing INDX child entries", start, end, len(indx_data)) return '' if DEBUG_NCX: print("recursINDX lvl %d from %d to %d" % (lvl, start, end)) xml = '' if start <= 0: start = 0 if end <= 0: end = len(indx_data) if lvl > max_lvl: max_lvl = lvl indent = ' ' * (2 + lvl) for i in range(start, end): e = indx_data[i] htmlfile = e['filename'] desttag = e['idtag'] if not e['hlvl'] == lvl: continue # open entry num += 1 if desttag == '': link = 'Text/%s' % htmlfile else: link = 'Text/%s#%s' % (htmlfile, desttag) tagid = 'np_%d' % num entry = ncx_entry % (tagid, num, xmlescape(unescapeit(e['text'])), link) entry = re.sub(re.compile('^', re.M), indent, entry, 0) xml += entry + '\n' # recurs if e['child1']>=0: xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1, e['child1'], e['childn'] + 1) xml += xmlrec # close entry xml += indent + '\n' return xml, max_lvl, num body, max_lvl, num = recursINDX() header = ncx_header % (lang, ident, max_lvl + 1, title) ncx = header + body + ncx_footer if not len(indx_data) == num: print("Warning: different number of entries in NCX", len(indx_data), num) return ncx def writeK8NCX(self, ncx_data, metadata): # build the xml self.isNCX = True print("Write K8 ncx") xml = self.buildK8NCX(ncx_data, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language')[0]) bname = 'toc.ncx' ncxname = os.path.join(self.files.k8oebps,bname) with open(pathof(ncxname), 'wb') as f: f.write(xml.encode('utf-8'))