#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

from __future__ import unicode_literals, division, absolute_import, print_function

from .compatibility_utils import PY2, bstr, utf8_str

if PY2:
    range = xrange

import os

import struct
# note:  struct pack, unpack, unpack_from all require bytestring format
# data all the way up to at least python 2.7.5, python 3 okay with bytestring

import re
# note: re requires the pattern to be the exact same type as the data to be searched in python3
# but u"" is not allowed for the pattern itself only b""

from .mobi_index import MobiIndex
from .mobi_utils import fromBase32
from .unipath import pathof

_guide_types = [b'cover',b'title-page',b'toc',b'index',b'glossary',b'acknowledgements',
                b'bibliography',b'colophon',b'copyright-page',b'dedication',
                b'epigraph',b'foreward',b'loi',b'lot',b'notes',b'preface',b'text']


# locate beginning and ending positions of tag with specific aid attribute
def locate_beg_end_of_tag(ml, aid):
    """Find the first tag in ml whose aid attribute equals aid.

    ml  -- bytestring of markup to search
    aid -- attribute value to look for; a bytestring is used as is, anything
           else is formatted as text and encoded to utf-8

    Returns (plt, pgt): the offset of the tag's opening '<' and the offset of
    the first '>' found after it.  Returns (0, 0) when no tag matches.
    """
    if not isinstance(aid, bytes):
        aid = ('%s' % aid).encode('utf-8')
    # Build the pattern directly as bytes: interpolating a bytestring aid into
    # a text pattern with %s yields "b'...'" under python 3 and never matches.
    # The aid is escaped so it is always matched literally.
    pattern = br'''<[^>]*\said\s*=\s*['"]''' + re.escape(aid) + br'''['"][^>]*>'''
    m = re.search(pattern, ml, re.IGNORECASE)
    if m is None:
        return 0, 0
    plt = m.start()
    pgt = ml.find(b'>', plt + 1)
    return plt, pgt


# iterate over all tags in block in reverse order, i.e.
last ta to first tag def reverse_tag_iter(block): end = len(block) while True: pgt = block.rfind(b'>', 0, end) if pgt == -1: break plt = block.rfind(b'<', 0, pgt) if plt == -1: break yield block[plt:pgt+1] end = plt class K8Processor: def __init__(self, mh, sect, files, debug=False): self.sect = sect self.files = files self.mi = MobiIndex(sect) self.mh = mh self.skelidx = mh.skelidx self.fragidx = mh.fragidx self.guideidx = mh.guideidx self.fdst = mh.fdst self.flowmap = {} self.flows = None self.flowinfo = [] self.parts = None self.partinfo = [] self.linked_aids = set() self.fdsttbl= [0,0xffffffff] self.DEBUG = debug # read in and parse the FDST info which is very similar in format to the Palm DB section # parsing except it provides offsets into rawML file and not the Palm DB file # this is needed to split up the final css, svg, etc flow section # that can exist at the end of the rawML file if self.fdst != 0xffffffff: header = self.sect.loadSection(self.fdst) if header[0:4] == b"FDST": num_sections, = struct.unpack_from(b'>L', header, 0x08) self.fdsttbl = struct.unpack_from(bstr('>%dL' % (num_sections*2)), header, 12)[::2] + (mh.rawSize, ) sect.setsectiondescription(self.fdst,"KF8 FDST INDX") if self.DEBUG: print("\nFDST Section Map: %d sections" % num_sections) for j in range(num_sections): print("Section %d: 0x%08X - 0x%08X" % (j, self.fdsttbl[j],self.fdsttbl[j+1])) else: print("\nError: K8 Mobi with Missing FDST info") # read/process skeleton index info to create the skeleton table skeltbl = [] if self.skelidx != 0xffffffff: # for i in range(2): # fname = 'skel%04d.dat' % i # data = self.sect.loadSection(self.skelidx + i) # with open(pathof(fname), 'wb') as f: # f.write(data) outtbl, ctoc_text = self.mi.getIndexData(self.skelidx, "KF8 Skeleton") fileptr = 0 for [text, tagMap] in outtbl: # file number, skeleton name, fragtbl record count, start position, length skeltbl.append([fileptr, text, tagMap[1][0], tagMap[6][0], tagMap[6][1]]) fileptr += 1 self.skeltbl = 
skeltbl if self.DEBUG: print("\nSkel Table: %d entries" % len(self.skeltbl)) print("table: filenum, skeleton name, frag tbl record count, start position, length") for j in range(len(self.skeltbl)): print(self.skeltbl[j]) # read/process the fragment index to create the fragment table fragtbl = [] if self.fragidx != 0xffffffff: # for i in range(3): # fname = 'frag%04d.dat' % i # data = self.sect.loadSection(self.fragidx + i) # with open(pathof(fname), 'wb') as f: # f.write(data) outtbl, ctoc_text = self.mi.getIndexData(self.fragidx, "KF8 Fragment") for [text, tagMap] in outtbl: # insert position, ctoc offset (aidtext), file number, sequence number, start position, length ctocoffset = tagMap[2][0] ctocdata = ctoc_text[ctocoffset] fragtbl.append([int(text), ctocdata, tagMap[3][0], tagMap[4][0], tagMap[6][0], tagMap[6][1]]) self.fragtbl = fragtbl if self.DEBUG: print("\nFragment Table: %d entries" % len(self.fragtbl)) print("table: file position, link id text, file num, sequence number, start position, length") for j in range(len(self.fragtbl)): print(self.fragtbl[j]) # read / process guide index for guide elements of opf guidetbl = [] if self.guideidx != 0xffffffff: # for i in range(3): # fname = 'guide%04d.dat' % i # data = self.sect.loadSection(self.guideidx + i) # with open(pathof(fname), 'wb') as f: # f.write(data) outtbl, ctoc_text = self.mi.getIndexData(self.guideidx, "KF8 Guide elements)") for [text, tagMap] in outtbl: # ref_type, ref_title, frag number ctocoffset = tagMap[1][0] ref_title = ctoc_text[ctocoffset] ref_type = text fileno = None if 3 in tagMap: fileno = tagMap[3][0] if 6 in tagMap: fileno = tagMap[6][0] guidetbl.append([ref_type, ref_title, fileno]) self.guidetbl = guidetbl if self.DEBUG: print("\nGuide Table: %d entries" % len(self.guidetbl)) print("table: ref_type, ref_title, fragtbl entry number") for j in range(len(self.guidetbl)): print(self.guidetbl[j]) def buildParts(self, rawML): # now split the rawML into its flow pieces self.flows = [] for 
j in range(0, len(self.fdsttbl)-1): start = self.fdsttbl[j] end = self.fdsttbl[j+1] self.flows.append(rawML[start:end]) # the first piece represents the xhtml text text = self.flows[0] self.flows[0] = b'' # walk the and fragment tables to build original source xhtml files # *without* destroying any file position information needed for later href processing # and create final list of file separation start: stop points and etc in partinfo if self.DEBUG: print("\nRebuilding flow piece 0: the main body of the ebook") self.parts = [] self.partinfo = [] fragptr = 0 baseptr = 0 cnt = 0 filename = 'part%04d.xhtml' % cnt for [skelnum, skelname, fragcnt, skelpos, skellen] in self.skeltbl: baseptr = skelpos + skellen skeleton = text[skelpos: baseptr] aidtext = "0" for i in range(fragcnt): [insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[fragptr] aidtext = idtext[12:-2] if i == 0: filename = 'part%04d.xhtml' % filenum slice = text[baseptr: baseptr + length] insertpos = insertpos - skelpos head = skeleton[:insertpos] tail = skeleton[insertpos:] actual_inspos = insertpos if (tail.find(b'>') < tail.find(b'<') or head.rfind(b'>') < head.rfind(b'<')): # There is an incomplete tag in either the head or tail. # This can happen for some badly formed KF8 files print('The fragment table for %s has incorrect insert position. Calculating manually.' 
% skelname) bp, ep = locate_beg_end_of_tag(skeleton, aidtext) if bp != ep: actual_inspos = ep + 1 + startpos if insertpos != actual_inspos: print("fixed corrupt fragment table insert position", insertpos+skelpos, actual_inspos+skelpos) insertpos = actual_inspos self.fragtbl[fragptr][0] = actual_inspos + skelpos skeleton = skeleton[0:insertpos] + slice + skeleton[insertpos:] baseptr = baseptr + length fragptr += 1 cnt += 1 self.parts.append(skeleton) self.partinfo.append([skelnum, 'Text', filename, skelpos, baseptr, aidtext]) assembled_text = b''.join(self.parts) if self.DEBUG: outassembled = os.path.join(self.files.k8dir, 'assembled_text.dat') with open(pathof(outassembled),'wb') as f: f.write(assembled_text) # The primary css style sheet is typically stored next followed by any # snippets of code that were previously inlined in the # original xhtml but have been stripped out and placed here. # This can include local CDATA snippets and and svg sections. # The problem is that for most browsers and ereaders, you can not # use to import any svg image that itself # properly uses an tag to import some raster image - it # should work according to the spec but does not for almost all browsers # and ereaders and causes epub validation issues because those raster # images are in manifest but not in xhtml text - since they only # referenced from an svg image # So we need to check the remaining flow pieces to see if they are css # or svg images. if svg images, we must check if they have an # and if so inline them into the xhtml text pieces. 
# there may be other sorts of pieces stored here but until we see one # in the wild to reverse engineer we won't be able to tell self.flowinfo.append([None, None, None, None]) svg_tag_pattern = re.compile(br'''(]*>)''', re.IGNORECASE) image_tag_pattern = re.compile(br'''(]*>)''', re.IGNORECASE) for j in range(1,len(self.flows)): flowpart = self.flows[j] nstr = '%04d' % j m = re.search(svg_tag_pattern, flowpart) if m is not None: # svg ptype = b'svg' start = m.start() m2 = re.search(image_tag_pattern, flowpart) if m2 is not None: pformat = b'inline' pdir = None fname = None # strip off anything before = 0: ptype = b'css' flowpart = b'\n' pformat = b'inline' pdir = None fname = None else: # css - assume as standalone css file ptype = b'css' pformat = b'file' pdir = "Styles" fname = 'style' + nstr + '.css' self.flows[j] = flowpart self.flowinfo.append([ptype, pformat, pdir, fname]) if self.DEBUG: print("\nFlow Map: %d entries" % len(self.flowinfo)) for fi in self.flowinfo: print(fi) print("\n") print("\nXHTML File Part Position Information: %d entries" % len(self.partinfo)) for pi in self.partinfo: print(pi) if False: # self.Debug: # dump all of the locations of the aid tags used in TEXT # find id links only inside of tags # inside any < > pair find all "aid=' and return whatever is inside the quotes # [^>]* means match any amount of chars except for '>' char # [^'"] match any amount of chars except for the quote character # \s* means match any amount of whitespace print("\npositions of all aid= pieces") id_pattern = re.compile(br'''<[^>]*\said\s*=\s*['"]([^'"]*)['"][^>]*>''',re.IGNORECASE) for m in re.finditer(id_pattern, rawML): [filename, partnum, start, end] = self.getFileInfo(m.start()) [seqnum, idtext] = self.getFragTblInfo(m.start()) value = fromBase32(m.group(1)) print(" aid: %s value: %d at: %d -> part: %d, start: %d, end: %d" % (m.group(1), value, m.start(), partnum, start, end)) print(" %s fragtbl entry %d" % (idtext, seqnum)) return # get information 
fragment table entry by pos def getFragTblInfo(self, pos): for j in range(len(self.fragtbl)): [insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[j] if pos >= insertpos and pos < (insertpos + length): # why are these "in: and before: added here return seqnum, b'in: ' + idtext if pos < insertpos: return seqnum, b'before: ' + idtext return None, None # get information about the part (file) that exists at pos in original rawML def getFileInfo(self, pos): for [partnum, pdir, filename, start, end, aidtext] in self.partinfo: if pos >= start and pos < end: return filename, partnum, start, end return None, None, None, None # accessor functions to properly protect the internal structure def getNumberOfParts(self): return len(self.parts) def getPart(self,i): if i >= 0 and i < len(self.parts): return self.parts[i] return None def getPartInfo(self, i): if i >= 0 and i < len(self.partinfo): return self.partinfo[i] return None def getNumberOfFlows(self): return len(self.flows) def getFlow(self,i): # note flows[0] is empty - it was all of the original text if i > 0 and i < len(self.flows): return self.flows[i] return None def getFlowInfo(self,i): # note flowinfo[0] is empty - it was all of the original text if i > 0 and i < len(self.flowinfo): return self.flowinfo[i] return None def getIDTagByPosFid(self, posfid, offset): # first convert kindle:pos:fid and offset info to position in file # (fromBase32 can handle both string types on input) row = fromBase32(posfid) off = fromBase32(offset) [insertpos, idtext, filenum, seqnm, startpos, length] = self.fragtbl[row] pos = insertpos + off fname, pn, skelpos, skelend = self.getFileInfo(pos) if fname is None: # pos does not exist # default to skeleton pos instead print("Link To Position", pos, "does not exist, retargeting to top of target") pos = self.skeltbl[filenum][3] fname, pn, skelpos, skelend = self.getFileInfo(pos) # an existing "id=" or "name=" attribute must exist in original xhtml otherwise it would not have 
worked for linking. # Amazon seems to have added its own additional "aid=" inside tags whose contents seem to represent # some position information encoded into Base32 name. # so find the closest "id=" before position the file by actually searching in that file idtext = self.getIDTag(pos) return fname, idtext def getIDTag(self, pos): # find the first tag with a named anchor (name or id attribute) before pos fname, pn, skelpos, skelend = self.getFileInfo(pos) if pn is None and skelpos is None: print("Error: getIDTag - no file contains ", pos) textblock = self.parts[pn] npos = pos - skelpos # if npos inside a tag then search all text before the its end of tag marker pgt = textblock.find(b'>',npos) plt = textblock.find(b'<',npos) if plt == npos or pgt < plt: npos = pgt + 1 # find id and name attributes only inside of tags # use a reverse tag search since that is faster # inside any < > pair find "id=" and "name=" attributes return it # [^>]* means match any amount of chars except for '>' char # [^'"] match any amount of chars except for the quote character # \s* means match any amount of whitespace textblock = textblock[0:npos] id_pattern = re.compile(br'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE) name_pattern = re.compile(br'''<[^>]*\sname\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE) aid_pattern = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''') for tag in reverse_tag_iter(textblock): # any ids in the body should default to top of file if tag[0:6] == b'= start and pos < end: return [partnum, pdir, filename, start, end, aidtext] return [None, None, None, None, None, None] # fileno is actually a reference into fragtbl (a fragment) def getGuideText(self): guidetext = b'' for [ref_type, ref_title, fileno] in self.guidetbl: if ref_type == b'thumbimagestandard': continue if ref_type not in _guide_types and not ref_type.startswith(b'other.'): if ref_type == b'start': ref_type = b'text' else: ref_type = b'other.' 
+ ref_type [pos, idtext, filenum, seqnm, startpos, length] = self.fragtbl[fileno] [pn, pdir, filename, skelpos, skelend, aidtext] = self.getSkelInfo(pos) idtext = self.getIDTag(pos) linktgt = filename.encode('utf-8') if idtext != b'': linktgt += b'#' + idtext guidetext += b'\n' # opf is encoded utf-8 so must convert any titles properly guidetext = (guidetext.decode(self.mh.codec)).encode("utf-8") return guidetext def getPageIDTag(self, pos): # find the first tag with a named anchor (name or id attribute) before pos # but page map offsets need to little more leeway so if the offset points # into a tag look for the next ending tag "/>" or "',npos) plt = textblock.find(b'<',npos) if plt == npos or pgt < plt: # we are in a tag # so find first ending tag pend1 = textblock.find(b'/>', npos) pend2 = textblock.find(b' pair find "id=" and "name=" attributes return it # [^>]* means match any amount of chars except for '>' char # [^'"] match any amount of chars except for the quote character # \s* means match any amount of whitespace textblock = textblock[0:npos] id_pattern = re.compile(br'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE) name_pattern = re.compile(br'''<[^>]*\sname\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE) for tag in reverse_tag_iter(textblock): # any ids in the body should default to top of file if tag[0:6] == b'