aboutsummaryrefslogtreecommitdiffstats
path: root/epy_extras/KindleUnpack/mobi_html.py
blob: eda766c33a15e2140edc5c19257cf2a234cace0f (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

from __future__ import unicode_literals, division, absolute_import, print_function

from .compatibility_utils import PY2, utf8_str

if PY2:
    range = xrange

import re
# note: re requites the pattern to be the exact same type as the data to be searched in python3
# but u"" is not allowed for the pattern itself only b""

from .mobi_utils import fromBase32

class HTMLProcessor:

    def __init__(self, files, metadata, rscnames):
        self.files = files
        self.metadata = metadata
        self.rscnames = rscnames
        # for original style mobis, default to including all image files in the opf manifest
        self.used = {}
        for name in rscnames:
            self.used[name] = 'used'

    def findAnchors(self, rawtext, indx_data, positionMap):
        # process the raw text
        # find anchors...
        print("Find link anchors")
        link_pattern = re.compile(br'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''', re.IGNORECASE)
        # TEST NCX: merge in filepos from indx
        pos_links = [int(m.group(1)) for m in link_pattern.finditer(rawtext)]
        if indx_data:
            pos_indx = [e['pos'] for e in indx_data if e['pos']>0]
            pos_links = list(set(pos_links + pos_indx))

        for position in pos_links:
            if position in positionMap:
                positionMap[position] = positionMap[position] + utf8_str('<a id="filepos%d" />' % position)
            else:
                positionMap[position] = utf8_str('<a id="filepos%d" />' % position)

        # apply dictionary metadata and anchors
        print("Insert data into html")
        pos = 0
        lastPos = len(rawtext)
        dataList = []
        for end in sorted(positionMap.keys()):
            if end == 0 or end > lastPos:
                continue  # something's up - can't put a tag in outside <html>...</html>
            dataList.append(rawtext[pos:end])
            dataList.append(positionMap[end])
            pos = end
        dataList.append(rawtext[pos:])
        srctext = b"".join(dataList)
        rawtext = None
        dataList = None
        self.srctext = srctext
        self.indx_data = indx_data
        return srctext

    def insertHREFS(self):
        srctext = self.srctext
        rscnames = self.rscnames
        metadata = self.metadata

        # put in the hrefs
        print("Insert hrefs into html")
        # There doesn't seem to be a standard, so search as best as we can

        link_pattern = re.compile(br'''<a([^>]*?)filepos=['"]{0,1}0*(\d+)['"]{0,1}([^>]*?)>''', re.IGNORECASE)
        srctext = link_pattern.sub(br'''<a\1href="#filepos\2"\3>''', srctext)

        # remove empty anchors
        print("Remove empty anchors from html")
        srctext = re.sub(br"<a\s*/>",br"", srctext)
        srctext = re.sub(br"<a\s*>\s*</a>",br"", srctext)

        # convert image references
        print("Insert image references into html")
        # split string into image tag pieces and other pieces
        image_pattern = re.compile(br'''(<img.*?>)''', re.IGNORECASE)
        image_index_pattern = re.compile(br'''recindex=['"]{0,1}([0-9]+)['"]{0,1}''', re.IGNORECASE)
        srcpieces = image_pattern.split(srctext)
        srctext = self.srctext = None

        # all odd pieces are image tags (nulls string on even pieces if no space between them in srctext)
        for i in range(1, len(srcpieces), 2):
            tag = srcpieces[i]
            for m in image_index_pattern.finditer(tag):
                imageNumber = int(m.group(1))
                imageName = rscnames[imageNumber-1]
                if imageName is None:
                    print("Error: Referenced image %s was not recognized as a valid image" % imageNumber)
                else:
                    replacement = b'src="Images/' + utf8_str(imageName) + b'"'
                    tag = image_index_pattern.sub(replacement, tag, 1)
            srcpieces[i] = tag
        srctext = b"".join(srcpieces)

        # add in character set meta into the html header if needed
        if 'Codec' in metadata:
            srctext = srctext[0:12]+b'<meta http-equiv="content-type" content="text/html; charset='+utf8_str(metadata.get('Codec')[0])+b'" />'+srctext[12:]
        return srctext, self.used


class XHTMLK8Processor:

    def __init__(self, rscnames, k8proc):
        self.rscnames = rscnames
        self.k8proc = k8proc
        self.used = {}

    def buildXHTML(self):

        # first need to update all links that are internal which
        # are based on positions within the xhtml files **BEFORE**
        # cutting and pasting any pieces into the xhtml text files

        #   kindle:pos:fid:XXXX:off:YYYYYYYYYY  (used for internal link within xhtml)
        #       XXXX is the offset in records into divtbl
        #       YYYYYYYYYYYY is a base32 number you add to the divtbl insertpos to get final position

        # pos:fid pattern
        posfid_pattern = re.compile(br'''(<a.*?href=.*?>)''', re.IGNORECASE)
        posfid_index_pattern = re.compile(br'''['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']''')

        parts = []
        print("Building proper xhtml for each file")
        for i in range(self.k8proc.getNumberOfParts()):
            part = self.k8proc.getPart(i)
            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.getPartInfo(i)

            # internal links
            srcpieces = posfid_pattern.split(part)
            for j in range(1, len(srcpieces),2):
                tag = srcpieces[j]
                if tag.startswith(b'<'):
                    for m in posfid_index_pattern.finditer(tag):
                        posfid = m.group(1)
                        offset = m.group(2)
                        filename, idtag = self.k8proc.getIDTagByPosFid(posfid, offset)
                        if idtag == b'':
                            replacement= b'"' + utf8_str(filename) + b'"'
                        else:
                            replacement = b'"' + utf8_str(filename) + b'#' + idtag + b'"'
                        tag = posfid_index_pattern.sub(replacement, tag, 1)
                    srcpieces[j] = tag
            part = b"".join(srcpieces)
            parts.append(part)

        # we are free to cut and paste as we see fit
        # we can safely remove all of the Kindlegen generated aid tags
        # change aid ids that are in k8proc.linked_aids to xhtml ids
        find_tag_with_aid_pattern = re.compile(br'''(<[^>]*\said\s*=[^>]*>)''', re.IGNORECASE)
        within_tag_aid_position_pattern = re.compile(br'''\said\s*=['"]([^'"]*)['"]''')
        for i in range(len(parts)):
            part = parts[i]
            srcpieces = find_tag_with_aid_pattern.split(part)
            for j in range(len(srcpieces)):
                tag = srcpieces[j]
                if tag.startswith(b'<'):
                    for m in within_tag_aid_position_pattern.finditer(tag):
                        try:
                            aid = m.group(1)
                        except IndexError:
                            aid = None
                        replacement = b''
                        if aid in self.k8proc.linked_aids:
                            replacement = b' id="aid-' + aid + b'"'
                        tag = within_tag_aid_position_pattern.sub(replacement, tag, 1)
                    srcpieces[j] = tag
            part = b"".join(srcpieces)
            parts[i] = part

        # we can safely replace all of the Kindlegen generated data-AmznPageBreak tags
        # with page-break-after style patterns
        find_tag_with_AmznPageBreak_pattern = re.compile(br'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''', re.IGNORECASE)
        within_tag_AmznPageBreak_position_pattern = re.compile(br'''\sdata-AmznPageBreak=['"]([^'"]*)['"]''')
        for i in range(len(parts)):
            part = parts[i]
            srcpieces = find_tag_with_AmznPageBreak_pattern.split(part)
            for j in range(len(srcpieces)):
                tag = srcpieces[j]
                if tag.startswith(b'<'):
                    srcpieces[j] = within_tag_AmznPageBreak_position_pattern.sub(
                        lambda m:b' style="page-break-after:' + m.group(1) + b'"', tag)
            part = b"".join(srcpieces)
            parts[i] = part

        # we have to handle substitutions for the flows  pieces first as they may
        # be inlined into the xhtml text
        #   kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
        #   kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
        #   kindle:embed:XXXX   (used for fonts)

        flows = []
        flows.append(None)
        flowinfo = []
        flowinfo.append([None, None, None, None])

        # regular expression search patterns
        img_pattern = re.compile(br'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
        img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE)

        tag_pattern = re.compile(br'''(<[^>]*>)''')
        flow_pattern = re.compile(br'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)

        url_pattern = re.compile(br'''(url\(.*?\))''', re.IGNORECASE)
        url_img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)\?mime=image/[^\)]*["')]''', re.IGNORECASE)
        font_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)["')]''', re.IGNORECASE)
        url_css_index_pattern = re.compile(br'''kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*''', re.IGNORECASE)
        url_svg_image_pattern = re.compile(br'''kindle:flow:([0-9|A-V]+)\?mime=image/svg\+xml[^\)]*''', re.IGNORECASE)

        for i in range(1, self.k8proc.getNumberOfFlows()):
            [ftype, format, dir, filename] = self.k8proc.getFlowInfo(i)
            flowpart = self.k8proc.getFlow(i)

            # links to raster image files from image tags
            # image_pattern
            srcpieces = img_pattern.split(flowpart)
            for j in range(1, len(srcpieces),2):
                tag = srcpieces[j]
                if tag.startswith(b'<im'):
                    for m in img_index_pattern.finditer(tag):
                        imageNumber = fromBase32(m.group(1))
                        imageName = self.rscnames[imageNumber-1]
                        if imageName is not None:
                            replacement = b'"../Images/' + utf8_str(imageName) + b'"'
                            self.used[imageName] = 'used'
                            tag = img_index_pattern.sub(replacement, tag, 1)
                        else:
                            print("Error: Referenced image %s was not recognized as a valid image in %s" % (imageNumber, tag))
                    srcpieces[j] = tag
            flowpart = b"".join(srcpieces)

            # replacements inside css url():
            srcpieces = url_pattern.split(flowpart)
            for j in range(1, len(srcpieces),2):
                tag = srcpieces[j]

                #  process links to raster image files
                for m in url_img_index_pattern.finditer(tag):
                    imageNumber = fromBase32(m.group(1))
                    imageName = self.rscnames[imageNumber-1]
                    osep = m.group()[0:1]
                    csep = m.group()[-1:]
                    if imageName is not None:
                        replacement = osep +  b'../Images/' + utf8_str(imageName) +  csep
                        self.used[imageName] = 'used'
                        tag = url_img_index_pattern.sub(replacement, tag, 1)
                    else:
                        print("Error: Referenced image %s was not recognized as a valid image in %s" % (imageNumber, tag))

                # process links to fonts
                for m in font_index_pattern.finditer(tag):
                    fontNumber = fromBase32(m.group(1))
                    fontName = self.rscnames[fontNumber-1]
                    osep = m.group()[0:1]
                    csep = m.group()[-1:]
                    if fontName is None:
                        print("Error: Referenced font %s was not recognized as a valid font in %s" % (fontNumber, tag))
                    else:
                        replacement = osep +  b'../Fonts/' + utf8_str(fontName) +  csep
                        tag = font_index_pattern.sub(replacement, tag, 1)
                        self.used[fontName] = 'used'

                # process links to other css pieces
                for m in url_css_index_pattern.finditer(tag):
                    num = fromBase32(m.group(1))
                    [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
                    replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"'
                    tag = url_css_index_pattern.sub(replacement, tag, 1)
                    self.used[fnm] = 'used'

                # process links to svg images
                for m in url_svg_image_pattern.finditer(tag):
                    num = fromBase32(m.group(1))
                    [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
                    replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"'
                    tag = url_svg_image_pattern.sub(replacement, tag, 1)
                    self.used[fnm] = 'used'

                srcpieces[j] = tag
            flowpart = b"".join(srcpieces)

            # store away in our own copy
            flows.append(flowpart)

            # I do not think this case exists and even if it does exist, it needs to be done in a separate
            # pass to prevent inlining a flow piece into another flow piece before the inserted one or the
            # target one has been fully processed

            # but keep it around if it ends up we do need it

            # flow pattern not inside url()
            # srcpieces = tag_pattern.split(flowpart)
            # for j in range(1, len(srcpieces),2):
            #     tag = srcpieces[j]
            #     if tag.startswith(b'<'):
            #         for m in flow_pattern.finditer(tag):
            #             num = fromBase32(m.group(1))
            #             [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
            #             flowtext = self.k8proc.getFlow(num)
            #             if fmt == b'inline':
            #                 tag = flowtext
            #             else:
            #                 replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"'
            #                 tag = flow_pattern.sub(replacement, tag, 1)
            #                 self.used[fnm] = 'used'
            #         srcpieces[j] = tag
            # flowpart = b"".join(srcpieces)

        # now handle the main text xhtml parts

        # Handle the flow items in the XHTML text pieces
        # kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
        tag_pattern = re.compile(br'''(<[^>]*>)''')
        flow_pattern = re.compile(br'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)
        for i in range(len(parts)):
            part = parts[i]
            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
            # flow pattern
            srcpieces = tag_pattern.split(part)
            for j in range(1, len(srcpieces),2):
                tag = srcpieces[j]
                if tag.startswith(b'<'):
                    for m in flow_pattern.finditer(tag):
                        num = fromBase32(m.group(1))
                        if num > 0 and num < len(self.k8proc.flowinfo):
                            [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
                            flowpart = flows[num]
                            if fmt == b'inline':
                                tag = flowpart
                            else:
                                replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"'
                                tag = flow_pattern.sub(replacement, tag, 1)
                                self.used[fnm] = 'used'
                        else:
                            print("warning: ignoring non-existent flow link", tag, " value 0x%x" % num)
                    srcpieces[j] = tag
            part = b''.join(srcpieces)

            # store away modified version
            parts[i] = part

        # Handle any embedded raster images links in style= attributes urls
        style_pattern = re.compile(br'''(<[a-zA-Z0-9]+\s[^>]*style\s*=\s*[^>]*>)''', re.IGNORECASE)
        img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE)

        for i in range(len(parts)):
            part = parts[i]
            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]

            # replace urls in style attributes
            srcpieces = style_pattern.split(part)
            for j in range(1, len(srcpieces),2):
                tag = srcpieces[j]
                if b'kindle:embed' in tag:
                    for m in img_index_pattern.finditer(tag):
                        imageNumber = fromBase32(m.group(1))
                        imageName = self.rscnames[imageNumber-1]
                        osep = m.group()[0:1]
                        csep = m.group()[-1:]
                        if imageName is not None:
                            replacement = osep + b'../Images/'+ utf8_str(imageName) + csep
                            self.used[imageName] = 'used'
                            tag = img_index_pattern.sub(replacement, tag, 1)
                        else:
                            print("Error: Referenced image %s in style url was not recognized in %s" % (imageNumber, tag))
                    srcpieces[j] = tag
            part = b"".join(srcpieces)

            # store away modified version
            parts[i] = part

        # Handle any embedded raster images links in the xhtml text
        # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
        img_pattern = re.compile(br'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
        img_index_pattern = re.compile(br'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''')

        for i in range(len(parts)):
            part = parts[i]
            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]

            # links to raster image files
            # image_pattern
            srcpieces = img_pattern.split(part)
            for j in range(1, len(srcpieces),2):
                tag = srcpieces[j]
                if tag.startswith(b'<im'):
                    for m in img_index_pattern.finditer(tag):
                        imageNumber = fromBase32(m.group(1))
                        imageName = self.rscnames[imageNumber-1]
                        if imageName is not None:
                            replacement = b'"../Images/' + utf8_str(imageName) + b'"'
                            self.used[imageName] = 'used'
                            tag = img_index_pattern.sub(replacement, tag, 1)
                        else:
                            print("Error: Referenced image %s was not recognized as a valid image in %s" % (imageNumber, tag))
                    srcpieces[j] = tag
            part = b"".join(srcpieces)
            # store away modified version
            parts[i] = part

        # finally perform any general cleanups needed to make valid XHTML
        # these include:
        #   in svg tags replace "perserveaspectratio" attributes with "perserveAspectRatio"
        #   in svg tags replace "viewbox" attributes with "viewBox"
        #   in <li> remove value="XX" attributes since these are illegal
        tag_pattern = re.compile(br'''(<[^>]*>)''')
        li_value_pattern = re.compile(br'''\svalue\s*=\s*['"][^'"]*['"]''', re.IGNORECASE)

        for i in range(len(parts)):
            part = parts[i]
            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]

            # tag pattern
            srcpieces = tag_pattern.split(part)
            for j in range(1, len(srcpieces),2):
                tag = srcpieces[j]
                if tag.startswith(b'<svg') or tag.startswith(b'<SVG'):
                    tag = tag.replace(b'preserveaspectratio',b'preserveAspectRatio')
                    tag = tag.replace(b'viewbox',b'viewBox')
                elif tag.startswith(b'<li ') or tag.startswith(b'<LI '):
                    tagpieces = li_value_pattern.split(tag)
                    tag = b"".join(tagpieces)
                srcpieces[j] = tag
            part = b"".join(srcpieces)
            # store away modified version
            parts[i] = part

        self.k8proc.setFlows(flows)
        self.k8proc.setParts(parts)

        return self.used