aboutsummaryrefslogblamecommitdiffstats
path: root/epy_extras/KindleUnpack/mobi_k8proc.py
blob: 5b8274e596e55a91a4e3e25c80c436c8beadb867 (plain) (tree)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496















































































































































































































































































































































































































































































































                                                                                                                                         
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

from __future__ import unicode_literals, division, absolute_import, print_function

from .compatibility_utils import PY2, bstr, utf8_str

if PY2:
    range = xrange

import os

import struct
# note:  struct pack, unpack, unpack_from all require bytestring format
# data all the way up to at least python 2.7.5, python 3 okay with bytestring

import re
# note: re requires the pattern to be the exact same type as the data to be searched in python3
# but u"" is not allowed for the pattern itself only b""

from .mobi_index import MobiIndex
from .mobi_utils import fromBase32
from .unipath import pathof

_guide_types = [b'cover',b'title-page',b'toc',b'index',b'glossary',b'acknowledgements',
                b'bibliography',b'colophon',b'copyright-page',b'dedication',
                b'epigraph',b'foreward',b'loi',b'lot',b'notes',b'preface',b'text']

# locate beginning and ending positions of tag with specific aid attribute
def locate_beg_end_of_tag(ml, aid):
    """Find the first tag in ml whose aid attribute equals aid.

    ml  -- bytestring of markup to search
    aid -- aid value to look for, given as a bytestring or a text string

    Returns (start, end): the offset of the tag's '<' and the offset of the
    first '>' after it, or (0, 0) when no tag carries that aid.
    """
    # The pattern must be bytes, like ml.  Formatting a bytes aid into a text
    # template with %s embeds its repr (b'...') under python 3 and can never
    # match, so the pattern is assembled from bytes pieces instead.
    if not isinstance(aid, bytes):
        aid = aid.encode('utf-8')
    # escape the aid so it is always matched literally
    pattern = br'''<[^>]*\said\s*=\s*['"]''' + re.escape(aid) + br'''['"][^>]*>'''
    m = re.search(pattern, ml, re.IGNORECASE)
    if m is None:
        return 0, 0
    plt = m.start()
    pgt = ml.find(b'>', plt+1)
    return plt, pgt


# iterate over all tags in block in reverse order, i.e. last tag to first tag
def reverse_tag_iter(block):
    """Yield each '<...>' span of the bytestring block, last one first.

    Scanning works backwards: find the right-most '>' before the current
    limit, then the right-most '<' before that '>'.  Iteration stops as soon
    as either search fails.
    """
    limit = len(block)
    close_at = block.rfind(b'>', 0, limit)
    while close_at != -1:
        open_at = block.rfind(b'<', 0, close_at)
        if open_at == -1:
            return
        yield block[open_at:close_at+1]
        # continue the search strictly to the left of the tag just produced
        limit = open_at
        close_at = block.rfind(b'>', 0, limit)


class K8Processor:

    def __init__(self, mh, sect, files, debug=False):
        """Read the KF8 FDST, skeleton, fragment and guide tables.

        mh    -- header object; skelidx, fragidx, guideidx, fdst and rawSize are read from it
        sect  -- section reader; loadSection and setsectiondescription are called on it
        files -- kept as self.files
        debug -- when true every table is printed after it has been built
        """
        self.sect = sect
        self.files = files
        self.mi = MobiIndex(sect)
        self.mh = mh
        self.skelidx = mh.skelidx
        self.fragidx = mh.fragidx
        self.guideidx = mh.guideidx
        self.fdst = mh.fdst
        self.flowmap = {}
        self.flows = None
        self.flowinfo = []
        self.parts = None
        self.partinfo = []
        self.linked_aids = set()
        self.fdsttbl= [0,0xffffffff]
        self.DEBUG = debug

        # The FDST section is laid out much like the Palm DB section table but
        # its offsets point into the rawML rather than the Palm DB file.  It is
        # what allows the trailing css, svg, etc flows to be separated from the
        # main text.
        if self.fdst != 0xffffffff:
            header = self.sect.loadSection(self.fdst)
            if header[0:4] != b"FDST":
                print("\nError: K8 Mobi with Missing FDST info")
            else:
                num_sections, = struct.unpack_from(b'>L', header, 0x08)
                pair_format = bstr('>%dL' % (num_sections*2))
                offsets = struct.unpack_from(pair_format, header, 12)
                # keep every other value and close the table with the raw size
                self.fdsttbl = offsets[::2] + (mh.rawSize, )
                sect.setsectiondescription(self.fdst,"KF8 FDST INDX")
                if self.DEBUG:
                    print("\nFDST Section Map:  %d sections" % num_sections)
                    for j in range(num_sections):
                        print("Section %d: 0x%08X - 0x%08X" % (j, self.fdsttbl[j],self.fdsttbl[j+1]))

        # skeleton table rows:
        #   file number, skeleton name, fragtbl record count, start position, length
        skeltbl = []
        if self.skelidx != 0xffffffff:
            outtbl, _ = self.mi.getIndexData(self.skelidx, "KF8 Skeleton")
            for fileptr, (text, tagMap) in enumerate(outtbl):
                skeltbl.append([fileptr, text, tagMap[1][0], tagMap[6][0], tagMap[6][1]])
        self.skeltbl = skeltbl
        if self.DEBUG:
            print("\nSkel Table:  %d entries" % len(self.skeltbl))
            print("table: filenum, skeleton name, frag tbl record count, start position, length")
            for row in self.skeltbl:
                print(row)

        # fragment table rows:
        #   insert position, ctoc text (aidtext), file number, sequence number, start position, length
        fragtbl = []
        if self.fragidx != 0xffffffff:
            outtbl, ctoc_text = self.mi.getIndexData(self.fragidx, "KF8 Fragment")
            for text, tagMap in outtbl:
                ctocdata = ctoc_text[tagMap[2][0]]
                fragtbl.append([int(text), ctocdata, tagMap[3][0], tagMap[4][0], tagMap[6][0], tagMap[6][1]])
        self.fragtbl = fragtbl
        if self.DEBUG:
            print("\nFragment Table: %d entries" % len(self.fragtbl))
            print("table: file position, link id text, file num, sequence number, start position, length")
            for row in self.fragtbl:
                print(row)

        # guide table rows (used for the guide elements of the opf):
        #   ref_type, ref_title, frag number
        guidetbl = []
        if self.guideidx != 0xffffffff:
            outtbl, ctoc_text = self.mi.getIndexData(self.guideidx, "KF8 Guide elements)")
            for ref_type, tagMap in outtbl:
                ref_title = ctoc_text[tagMap[1][0]]
                # tag 6 is checked last so it overrides tag 3 when both exist
                fileno = None
                if 3 in tagMap:
                    fileno = tagMap[3][0]
                if 6 in tagMap:
                    fileno = tagMap[6][0]
                guidetbl.append([ref_type, ref_title, fileno])
        self.guidetbl = guidetbl
        if self.DEBUG:
            print("\nGuide Table: %d entries" % len(self.guidetbl))
            print("table: ref_type, ref_title, fragtbl entry number")
            for row in self.guidetbl:
                print(row)

    def buildParts(self, rawML):
        # now split the rawML into its flow pieces
        self.flows = []
        for j in range(0, len(self.fdsttbl)-1):
            start = self.fdsttbl[j]
            end = self.fdsttbl[j+1]
            self.flows.append(rawML[start:end])

        # the first piece represents the xhtml text
        text = self.flows[0]
        self.flows[0] = b''

        # walk the <skeleton> and fragment tables to build original source xhtml files
        # *without* destroying any file position information needed for later href processing
        # and create final list of file separation start: stop points and etc in partinfo
        if self.DEBUG:
            print("\nRebuilding flow piece 0: the main body of the ebook")
        self.parts = []
        self.partinfo = []
        fragptr = 0
        baseptr = 0
        cnt = 0
        filename = 'part%04d.xhtml' % cnt
        for [skelnum, skelname, fragcnt, skelpos, skellen] in self.skeltbl:
            baseptr = skelpos + skellen
            skeleton = text[skelpos: baseptr]
            aidtext = "0"
            for i in range(fragcnt):
                [insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[fragptr]
                aidtext = idtext[12:-2]
                if i == 0:
                    filename = 'part%04d.xhtml' % filenum
                slice = text[baseptr: baseptr + length]
                insertpos = insertpos - skelpos
                head = skeleton[:insertpos]
                tail = skeleton[insertpos:]
                actual_inspos = insertpos
                if (tail.find(b'>') < tail.find(b'<') or head.rfind(b'>') < head.rfind(b'<')):
                    # There is an incomplete tag in either the head or tail.
                    # This can happen for some badly formed KF8 files
                    print('The fragment table for %s has incorrect insert position. Calculating manually.' % skelname)
                    bp, ep = locate_beg_end_of_tag(skeleton, aidtext)
                    if bp != ep:
                        actual_inspos = ep + 1 + startpos
                if insertpos != actual_inspos:
                    print("fixed corrupt fragment table insert position", insertpos+skelpos, actual_inspos+skelpos)
                    insertpos = actual_inspos
                    self.fragtbl[fragptr][0] = actual_inspos + skelpos
                skeleton = skeleton[0:insertpos] + slice + skeleton[insertpos:]
                baseptr = baseptr + length
                fragptr += 1
            cnt += 1
            self.parts.append(skeleton)
            self.partinfo.append([skelnum, 'Text', filename, skelpos, baseptr, aidtext])

        assembled_text = b''.join(self.parts)
        if self.DEBUG:
            outassembled = os.path.join(self.files.k8dir, 'assembled_text.dat')
            with open(pathof(outassembled),'wb') as f:
                f.write(assembled_text)

        # The primary css style sheet is typically stored next followed by any
        # snippets of code that were previously inlined in the
        # original xhtml but have been stripped out and placed here.
        # This can include local CDATA snippets and and svg sections.

        # The problem is that for most browsers and ereaders, you can not
        # use <img src="imageXXXX.svg" /> to import any svg image that itself
        # properly uses an <image/> tag to import some raster image - it
        # should work according to the spec but does not for almost all browsers
        # and ereaders and causes epub validation issues because those  raster
        # images are in manifest but not in xhtml text - since they only
        # referenced from an svg image

        # So we need to check the remaining flow pieces to see if they are css
        # or svg images.  if svg images, we must check if they have an <image />
        # and if so inline them into the xhtml text pieces.

        # there may be other sorts of pieces stored here but until we see one
        # in the wild to reverse engineer we won't be able to tell
        self.flowinfo.append([None, None, None, None])
        svg_tag_pattern = re.compile(br'''(<svg[^>]*>)''', re.IGNORECASE)
        image_tag_pattern = re.compile(br'''(<image[^>]*>)''', re.IGNORECASE)
        for j in range(1,len(self.flows)):
            flowpart = self.flows[j]
            nstr = '%04d' % j
            m = re.search(svg_tag_pattern, flowpart)
            if m is not None:
                # svg
                ptype = b'svg'
                start = m.start()
                m2 = re.search(image_tag_pattern, flowpart)
                if m2 is not None:
                    pformat = b'inline'
                    pdir = None
                    fname = None
                    # strip off anything before <svg if inlining
                    flowpart = flowpart[start:]
                else:
                    pformat = b'file'
                    pdir = "Images"
                    fname = 'svgimg' + nstr + '.svg'
            else:
                # search for CDATA and if exists inline it
                if flowpart.find(b'[CDATA[') >= 0:
                    ptype = b'css'
                    flowpart = b'<style type="text/css">\n' + flowpart + b'\n</style>\n'
                    pformat = b'inline'
                    pdir = None
                    fname = None
                else:
                    # css - assume as standalone css file
                    ptype = b'css'
                    pformat = b'file'
                    pdir = "Styles"
                    fname = 'style' + nstr + '.css'

            self.flows[j] = flowpart
            self.flowinfo.append([ptype, pformat, pdir, fname])

        if self.DEBUG:
            print("\nFlow Map:  %d entries" % len(self.flowinfo))
            for fi in self.flowinfo:
                print(fi)
            print("\n")

            print("\nXHTML File Part Position Information: %d entries" % len(self.partinfo))
            for pi in self.partinfo:
                print(pi)

        if False:  # self.Debug:
            # dump all of the locations of the aid tags used in TEXT
            # find id links only inside of tags
            #    inside any < > pair find all "aid=' and return whatever is inside the quotes
            #    [^>]* means match any amount of chars except for  '>' char
            #    [^'"] match any amount of chars except for the quote character
            #    \s* means match any amount of whitespace
            print("\npositions of all aid= pieces")
            id_pattern = re.compile(br'''<[^>]*\said\s*=\s*['"]([^'"]*)['"][^>]*>''',re.IGNORECASE)
            for m in re.finditer(id_pattern, rawML):
                [filename, partnum, start, end] = self.getFileInfo(m.start())
                [seqnum, idtext] = self.getFragTblInfo(m.start())
                value = fromBase32(m.group(1))
                print("  aid: %s value: %d at: %d -> part: %d, start: %d, end: %d" % (m.group(1), value, m.start(), partnum, start, end))
                print("       %s  fragtbl entry %d" % (idtext, seqnum))

        return

    # get information fragment table entry by pos
    def getFragTblInfo(self, pos):
        for j in range(len(self.fragtbl)):
            [insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[j]
            if pos >= insertpos and pos < (insertpos + length):
                # why are these "in: and before: added here
                return seqnum, b'in: ' + idtext
            if pos < insertpos:
                return seqnum, b'before: ' + idtext
        return None, None

    # get information about the part (file) that exists at pos in original rawML
    def getFileInfo(self, pos):
        for [partnum, pdir, filename, start, end, aidtext] in self.partinfo:
            if pos >= start and pos < end:
                return filename, partnum, start, end
        return None, None, None, None

    # accessor functions to properly protect the internal structure
    def getNumberOfParts(self):
        return len(self.parts)

    def getPart(self,i):
        if i >= 0 and i < len(self.parts):
            return self.parts[i]
        return None

    def getPartInfo(self, i):
        if i >= 0 and i < len(self.partinfo):
            return self.partinfo[i]
        return None

    def getNumberOfFlows(self):
        return len(self.flows)

    def getFlow(self,i):
        # note flows[0] is empty - it was all of the original text
        if i > 0 and i < len(self.flows):
            return self.flows[i]
        return None

    def getFlowInfo(self,i):
        # note flowinfo[0] is empty - it was all of the original text
        if i > 0 and i < len(self.flowinfo):
            return self.flowinfo[i]
        return None

    def getIDTagByPosFid(self, posfid, offset):
        """Resolve a kindle:pos:fid link to a (filename, id tag) pair.

        posfid -- base32 fragment table row number
        offset -- base32 offset added to that row's insert position

        When no part contains the resulting position, the link is retargeted
        to the start of the skeleton belonging to the fragment's file number.
        """
        # fromBase32 accepts both string types on input
        row_index = fromBase32(posfid)
        delta = fromBase32(offset)
        [insertpos, _idtext, filenum, _seqnm, _startpos, _length] = self.fragtbl[row_index]
        target = insertpos + delta
        fname = self.getFileInfo(target)[0]
        if fname is None:
            # target does not exist, so fall back to the skeleton position
            print("Link To Position", target, "does not exist, retargeting to top of target")
            target = self.skeltbl[filenum][3]
            fname = self.getFileInfo(target)[0]
        # Linking can only have worked if the original xhtml carried an "id="
        # or "name=" attribute.  Amazon additionally inserts its own "aid="
        # attributes whose values look like base32 encoded position info.
        # So search the file itself for the closest anchor before target.
        return fname, self.getIDTag(target)

    def getIDTag(self, pos):
        # find the first tag with a named anchor (name or id attribute) before pos
        fname, pn, skelpos, skelend = self.getFileInfo(pos)
        if pn is None and skelpos is None:
            print("Error: getIDTag - no file contains ", pos)
        textblock = self.parts[pn]
        npos = pos - skelpos
        # if npos inside a tag then search all text before the its end of tag marker
        pgt = textblock.find(b'>',npos)
        plt = textblock.find(b'<',npos)
        if plt == npos or pgt < plt:
            npos = pgt + 1
        # find id and name attributes only inside of tags
        # use a reverse tag search since that is faster
        #    inside any < > pair find "id=" and "name=" attributes return it
        #    [^>]* means match any amount of chars except for  '>' char
        #    [^'"] match any amount of chars except for the quote character
        #    \s* means match any amount of whitespace
        textblock = textblock[0:npos]
        id_pattern = re.compile(br'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE)
        name_pattern = re.compile(br'''<[^>]*\sname\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE)
        aid_pattern = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''')
        for tag in reverse_tag_iter(textblock):
            # any ids in the body should default to top of file
            if tag[0:6] == b'<body ':
                return b''
            if tag[0:6] != b'<meta ':
                m = id_pattern.match(tag) or name_pattern.match(tag)
                if m is not None:
                    return m.group(1)
                m = aid_pattern.match(tag)
                if m is not None:
                    self.linked_aids.add(m.group(1))
                    return b'aid-' + m.group(1)
        return b''

    # do we need to do deep copying
    def setParts(self, parts):
        assert(len(parts) == len(self.parts))
        for i in range(len(parts)):
            self.parts[i] = parts[i]

    # do we need to do deep copying
    def setFlows(self, flows):
        assert(len(flows) == len(self.flows))
        for i in range(len(flows)):
            self.flows[i] = flows[i]

    # get information about the part (file) that exists at pos in original rawML
    def getSkelInfo(self, pos):
        for [partnum, pdir, filename, start, end, aidtext] in self.partinfo:
            if pos >= start and pos < end:
                return [partnum, pdir, filename, start, end, aidtext]
        return [None, None, None, None, None, None]

    # fileno is actually a reference into fragtbl (a fragment)
    def getGuideText(self):
        guidetext = b''
        for [ref_type, ref_title, fileno] in self.guidetbl:
            if ref_type == b'thumbimagestandard':
                continue
            if ref_type not in _guide_types and not ref_type.startswith(b'other.'):
                if ref_type == b'start':
                    ref_type = b'text'
                else:
                    ref_type = b'other.' + ref_type
            [pos, idtext, filenum, seqnm, startpos, length] = self.fragtbl[fileno]
            [pn, pdir, filename, skelpos, skelend, aidtext] = self.getSkelInfo(pos)
            idtext = self.getIDTag(pos)
            linktgt = filename.encode('utf-8')
            if idtext != b'':
                linktgt += b'#' + idtext
            guidetext += b'<reference type="'+ref_type+b'" title="'+ref_title+b'" href="'+utf8_str(pdir)+b'/'+linktgt+b'" />\n'
        # opf is encoded utf-8 so must convert any titles properly
        guidetext = (guidetext.decode(self.mh.codec)).encode("utf-8")
        return guidetext

    def getPageIDTag(self, pos):
        # find the first tag with a named anchor (name or id attribute) before pos
        # but page map offsets need to little more leeway so if the offset points
        # into a tag look for the next ending tag "/>" or "</" and start your search from there.
        fname, pn, skelpos, skelend = self.getFileInfo(pos)
        if pn is None and skelpos is None:
            print("Error: getIDTag - no file contains ", pos)
        textblock = self.parts[pn]
        npos = pos - skelpos
        # if npos inside a tag then search all text before next ending tag
        pgt = textblock.find(b'>',npos)
        plt = textblock.find(b'<',npos)
        if plt == npos or pgt < plt:
            # we are in a tag
            # so find first ending tag
            pend1 = textblock.find(b'/>', npos)
            pend2 = textblock.find(b'</', npos)
            if pend1 != -1 and pend2 != -1:
                pend = min(pend1, pend2)
            else:
                pend = max(pend1, pend2)
            if pend != -1:
                npos = pend
            else:
                npos = pgt + 1
        # find id and name attributes only inside of tags
        # use a reverse tag search since that is faster
        #    inside any < > pair find "id=" and "name=" attributes return it
        #    [^>]* means match any amount of chars except for  '>' char
        #    [^'"] match any amount of chars except for the quote character
        #    \s* means match any amount of whitespace
        textblock = textblock[0:npos]
        id_pattern = re.compile(br'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE)
        name_pattern = re.compile(br'''<[^>]*\sname\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE)
        for tag in reverse_tag_iter(textblock):
            # any ids in the body should default to top of file
            if tag[0:6] == b'<body ':
                return b''
            if tag[0:6] != b'<meta ':
                m = id_pattern.match(tag) or name_pattern.match(tag)
                if m is not None:
                    return m.group(1)
        return b''