epy_extras/KindleUnpack/mobi_k8resc.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

from __future__ import unicode_literals, division, absolute_import, print_function

DEBUG_USE_ORDERED_DICTIONARY = False  # OrderedDict is supoorted >= python 2.7.
""" set to True to use OrderedDict for K8RESCProcessor.parsetag.tattr."""

if DEBUG_USE_ORDERED_DICTIONARY:
    from collections import OrderedDict as dict_
else:
    dict_ = dict

from .compatibility_utils import unicode_str

from .mobi_utils import fromBase32

_OPF_PARENT_TAGS = ['xml', 'package', 'metadata', 'dc-metadata',
                    'x-metadata', 'manifest', 'spine', 'tours', 'guide']

class K8RESCProcessor(object):

    def __init__(self, data, debug=False):
        self._debug = debug
        self.resc = None
        self.opos = 0
        self.extrameta = []
        self.cover_name = None
        self.spine_idrefs = {}
        self.spine_order = []
        self.spine_pageattributes = {}
        self.spine_ppd = None
        # need3 indicate the book has fields which require epub3.
        # but the estimation of the source epub version from the fields is difficult.
        self.need3 = False
        self.package_ver = None
        self.extra_metadata = []
        self.refines_metadata = []
        self.extra_attributes = []
        # get header
        start_pos = data.find(b'<')
        self.resc_header = data[:start_pos]
        # get resc data length
        start = self.resc_header.find(b'=') + 1
        end = self.resc_header.find(b'&', start)
        resc_size = 0
        if end > 0:
            resc_size = fromBase32(self.resc_header[start:end])
        resc_rawbytes = len(data) - start_pos
        if resc_rawbytes == resc_size:
            self.resc_length = resc_size
        else:
            # Most RESC has a nul string at its tail but some do not.
            end_pos = data.find(b'\x00', start_pos)
            if end_pos < 0:
                self.resc_length = resc_rawbytes
            else:
                self.resc_length = end_pos - start_pos
        if self.resc_length != resc_size:
            print("Warning: RESC section length({:d}bytes) does not match its size({:d}bytes).".format(self.resc_length, resc_size))
        # now parse RESC after converting it to unicode from utf-8
        try:
            self.resc = unicode_str(data[start_pos:start_pos+self.resc_length])
        except UnicodeDecodeError:
            self.resc = unicode_str(data[start_pos:start_pos+self.resc_length], enc='latin-1')
        self.parseData()

    def prepend_to_spine(self, key, idref, linear, properties):
        self.spine_order = [key] + self.spine_order
        self.spine_idrefs[key] = idref
        attributes = {}
        if linear is not None:
            attributes['linear'] = linear
        if properties is not None:
            attributes['properties'] = properties
        self.spine_pageattributes[key] = attributes

    # RESC tag iterator
    def resc_tag_iter(self):
        tcontent = last_tattr = None
        prefix = ['']
        while True:
            text, tag = self.parseresc()
            if text is None and tag is None:
                break
            if text is not None:
                tcontent = text.rstrip(' \r\n')
            else:  # we have a tag
                ttype, tname, tattr = self.parsetag(tag)
                if ttype == 'begin':
                    tcontent = None
                    prefix.append(tname + '.')
                    if tname in _OPF_PARENT_TAGS:
                        yield ''.join(prefix), tname, tattr, tcontent
                    else:
                        last_tattr = tattr
                else:  # single or end
                    if ttype == 'end':
                        prefix.pop()
                        tattr = last_tattr
                        last_tattr = None
                        if tname in _OPF_PARENT_TAGS:
                            tname += '-end'
                    yield ''.join(prefix), tname, tattr, tcontent
                    tcontent = None

    # now parse the RESC to extract spine and extra metadata info
    def parseData(self):
        for prefix, tname, tattr, tcontent in self.resc_tag_iter():
            if self._debug:
                print("   Parsing RESC: ", prefix, tname, tattr, tcontent)
            if tname == 'package':
                self.package_ver = tattr.get('version', '2.0')
                package_prefix = tattr.get('prefix','')
                if self.package_ver.startswith('3') or package_prefix.startswith('rendition'):
                    self.need3 = True
            if tname == 'spine':
                self.spine_ppd = tattr.get('page-progession-direction', None)
                if self.spine_ppd is not None and self.spine_ppd == 'rtl':
                    self.need3 = True
            if tname == 'itemref':
                skelid = tattr.pop('skelid', None)
                if skelid is None and len(self.spine_order) == 0:
                    # assume it was removed initial coverpage
                    skelid = 'coverpage'
                    tattr['linear'] = 'no'
                self.spine_order.append(skelid)
                idref = tattr.pop('idref', None)
                if idref is not None:
                    idref = 'x_' + idref
                self.spine_idrefs[skelid] = idref
                if 'id' in tattr:
                    del tattr['id']
                # tattr["id"] = 'x_' + tattr["id"]
                if 'properties' in tattr:
                    self.need3 = True
                self.spine_pageattributes[skelid] = tattr
            if tname == 'meta' or tname.startswith('dc:'):
                if 'refines' in tattr or 'property' in tattr:
                    self.need3 = True
                if tattr.get('name','') == 'cover':
                    cover_name = tattr.get('content',None)
                    if cover_name is not None:
                        cover_name = 'x_' + cover_name
                    self.cover_name = cover_name
                else:
                    self.extrameta.append([tname, tattr, tcontent])

    # parse and return either leading text or the next tag
    def parseresc(self):
        p = self.opos
        if p >= len(self.resc):
            return None, None
        if self.resc[p] != '<':
            res = self.resc.find('<',p)
            if res == -1 :
                res = len(self.resc)
            self.opos = res
            return self.resc[p:res], None
        # handle comment as a special case
        if self.resc[p:p+4] == '<!--':
            te = self.resc.find('-->',p+1)
            if te != -1:
                te = te+2
        else:
            te = self.resc.find('>',p+1)
            ntb = self.resc.find('<',p+1)
            if ntb != -1 and ntb < te:
                self.opos = ntb
                return self.resc[p:ntb], None
        self.opos = te + 1
        return None, self.resc[p:te+1]

    # parses tag to identify:  [tname, ttype, tattr]
    #    tname: tag name
    #    ttype: tag type ('begin', 'end' or 'single');
    #    tattr: dictionary of tag atributes
    def parsetag(self, s):
        p = 1
        tname = None
        ttype = None
        tattr = dict_()
        while s[p:p+1] == ' ' :
            p += 1
        if s[p:p+1] == '/':
            ttype = 'end'
            p += 1
            while s[p:p+1] == ' ' :
                p += 1
        b = p
        while s[p:p+1] not in ('>', '/', ' ', '"', "'",'\r','\n') :
            p += 1
        tname=s[b:p].lower()
        # some special cases
        if tname == '?xml':
            tname = 'xml'
        if tname == '!--':
            ttype = 'single'
            comment = s[p:-3].strip()
            tattr['comment'] = comment
        if ttype is None:
            # parse any attributes of begin or single tags
            while s.find('=',p) != -1 :
                while s[p:p+1] == ' ' :
                    p += 1
                b = p
                while s[p:p+1] != '=' :
                    p += 1
                aname = s[b:p].lower()
                aname = aname.rstrip(' ')
                p += 1
                while s[p:p+1] == ' ' :
                    p += 1
                if s[p:p+1] in ('"', "'") :
                    p = p + 1
                    b = p
                    while s[p:p+1] not in ('"', "'"):
                        p += 1
                    val = s[b:p]
                    p += 1
                else :
                    b = p
                    while s[p:p+1] not in ('>', '/', ' ') :
                        p += 1
                    val = s[b:p]
                tattr[aname] = val
        if ttype is None:
            ttype = 'begin'
            if s.find('/',p) >= 0:
                ttype = 'single'
        return ttype, tname, tattr

    def taginfo_toxml(self, taginfo):
        res = []
        tname, tattr, tcontent = taginfo
        res.append('<' + tname)
        if tattr is not None:
            for key in tattr:
                res.append(' ' + key + '="'+tattr[key]+'"')
        if tcontent is not None:
            res.append('>' + tcontent + '</' + tname + '>\n')
        else:
            res.append('/>\n')
        return "".join(res)

    def hasSpine(self):
        return len(self.spine_order) > 0

    def needEPUB3(self):
        return self.need3

    def hasRefines(self):
        for [tname, tattr, tcontent] in self.extrameta:
            if 'refines' in tattr:
                return True
        return False

    def createMetadata(self, epubver):
        for taginfo in self.extrameta:
            tname, tattr, tcontent = taginfo
            if 'refines' in tattr:
                if epubver == 'F' and 'property' in tattr:
                    attr = ' id="%s" opf:%s="%s"\n' % (tattr['refines'], tattr['property'], tcontent)
                    self.extra_attributes.append(attr)
                else:
                    tag = self.taginfo_toxml(taginfo)
                    self.refines_metadata.append(tag)
            else:
                tag = self.taginfo_toxml(taginfo)
                self.extra_metadata.append(tag)