1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
|
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
from __future__ import unicode_literals, division, absolute_import, print_function
# Set DEBUG_USE_ORDERED_DICTIONARY to True to use OrderedDict for
# K8RESCProcessor.parsetag.tattr (keeps attributes in document order, which
# makes debug output easier to compare).
# OrderedDict is supported on python >= 2.7.
DEBUG_USE_ORDERED_DICTIONARY = False

# dict_ is the mapping type instantiated for every parsed tag's attributes.
if DEBUG_USE_ORDERED_DICTIONARY:
    from collections import OrderedDict as dict_
else:
    dict_ = dict
from .compatibility_utils import unicode_str
from .mobi_utils import fromBase32
_OPF_PARENT_TAGS = ['xml', 'package', 'metadata', 'dc-metadata',
'x-metadata', 'manifest', 'spine', 'tours', 'guide']
class K8RESCProcessor(object):
    """Parse a RESC section and collect the spine and metadata it describes.

    The constructor decodes the markup that follows the section header and
    runs parseData(), which fills in:

      package_ver          -- 'version' attribute of <package> ('2.0' when the
                              tag has no version; None when there is no tag)
      cover_name           -- 'x_' + content of <meta name="cover">, or None
      spine_order          -- skelid keys of the <itemref> tags, in order
      spine_idrefs         -- skelid -> 'x_' + idref (None when no idref)
      spine_pageattributes -- skelid -> remaining <itemref> attributes
      spine_ppd            -- page-progression-direction of <spine>, or None
      extrameta            -- [tname, tattr, tcontent] for every other
                              <meta> / <dc:*> element
      need3                -- True once a construct that needs EPUB3 was seen

    createMetadata() later serializes extrameta into extra_metadata,
    refines_metadata and extra_attributes.
    """

    def __init__(self, data, debug=False):
        """Split *data* into header and markup, decode the markup, parse it.

        data  -- bytes of the RESC section: a header followed by markup that
                 starts at the first b'<'.
        debug -- when true, parseData() prints every item it processes.
        """
        self._debug = debug
        self.resc = None
        self.opos = 0   # read position of parseresc() inside self.resc
        self.extrameta = []
        self.cover_name = None
        self.spine_idrefs = {}
        self.spine_order = []
        self.spine_pageattributes = {}
        self.spine_ppd = None

        # need3 indicates the book has fields which require epub3,
        # but estimating the source epub version from the fields is difficult.
        self.need3 = False
        self.package_ver = None
        self.extra_metadata = []
        self.refines_metadata = []
        self.extra_attributes = []

        # get header: everything in front of the first '<'
        start_pos = data.find(b'<')
        if start_pos < 0:
            # No markup at all.  Treat the whole section as header so the
            # payload is empty instead of being sliced with index -1.
            start_pos = len(data)
        self.resc_header = data[:start_pos]

        # get resc data length: the header stores it between the first '='
        # and the following '&', in the encoding fromBase32() decodes.
        start = self.resc_header.find(b'=') + 1
        end = self.resc_header.find(b'&', start)
        resc_size = 0
        if end > 0:
            resc_size = fromBase32(self.resc_header[start:end])
        resc_rawbytes = len(data) - start_pos
        if resc_rawbytes == resc_size:
            self.resc_length = resc_size
        else:
            # Most RESC has a nul string at its tail but some do not.
            end_pos = data.find(b'\x00', start_pos)
            if end_pos < 0:
                self.resc_length = resc_rawbytes
            else:
                self.resc_length = end_pos - start_pos
        if self.resc_length != resc_size:
            print("Warning: RESC section length({:d}bytes) does not match its size({:d}bytes).".format(self.resc_length, resc_size))

        # now parse RESC after converting it to unicode from utf-8
        payload = data[start_pos:start_pos + self.resc_length]
        try:
            self.resc = unicode_str(payload)
        except UnicodeDecodeError:
            # latin-1 can decode any byte sequence, so this cannot fail
            self.resc = unicode_str(payload, enc='latin-1')
        self.parseData()

    def prepend_to_spine(self, key, idref, linear, properties):
        """Insert a spine entry for *key* in front of all existing entries.

        idref is stored as given; linear and properties are recorded as page
        attributes only when they are not None.
        """
        self.spine_order = [key] + self.spine_order
        self.spine_idrefs[key] = idref
        attributes = {}
        if linear is not None:
            attributes['linear'] = linear
        if properties is not None:
            attributes['properties'] = properties
        self.spine_pageattributes[key] = attributes

    def resc_tag_iter(self):
        """Iterate over the RESC markup, yielding one tuple per element.

        Yields (prefix, tname, tattr, tcontent):
          prefix   -- dotted path of the currently open tags (debug aid)
          tname    -- lower-cased tag name; end tags of the names listed in
                      _OPF_PARENT_TAGS get an '-end' suffix
          tattr    -- attribute mapping.  For an end tag this is the mapping
                      of the most recent not yet consumed non-parent begin
                      tag, or None when there is none
          tcontent -- text that directly preceded the tag, right-stripped of
                      spaces and line breaks, or None

        Begin tags are yielded immediately only for _OPF_PARENT_TAGS; any
        other element is yielded once, at its end tag (or as a single tag).
        """
        tcontent = last_tattr = None
        prefix = ['']
        while True:
            text, tag = self.parseresc()
            if text is None and tag is None:
                break
            if text is not None:
                tcontent = text.rstrip(' \r\n')
                continue

            # we have a tag
            ttype, tname, tattr = self.parsetag(tag)
            if ttype == 'begin':
                tcontent = None
                prefix.append(tname + '.')
                if tname in _OPF_PARENT_TAGS:
                    yield ''.join(prefix), tname, tattr, tcontent
                else:
                    last_tattr = tattr
                continue

            # single or end
            if ttype == 'end':
                # never drop the root entry: unbalanced end tags must not
                # make a later pop() fail on an empty list
                if len(prefix) > 1:
                    prefix.pop()
                tattr = last_tattr
                last_tattr = None
                if tname in _OPF_PARENT_TAGS:
                    tname += '-end'
            yield ''.join(prefix), tname, tattr, tcontent
            tcontent = None

    def parseData(self):
        """Parse the RESC to extract spine and extra metadata info."""
        for prefix, tname, tattr, tcontent in self.resc_tag_iter():
            if self._debug:
                print(" Parsing RESC: ", prefix, tname, tattr, tcontent)

            if tattr is None:
                # End tag without a pending begin tag (container end, stray
                # or nested end tag): there are no attributes to inspect.
                continue

            if tname == 'package':
                self.package_ver = tattr.get('version', '2.0')
                package_prefix = tattr.get('prefix', '')
                if self.package_ver.startswith('3') or package_prefix.startswith('rendition'):
                    self.need3 = True

            if tname == 'spine':
                self.spine_ppd = tattr.get('page-progression-direction', None)
                if self.spine_ppd == 'rtl':
                    self.need3 = True

            if tname == 'itemref':
                skelid = tattr.pop('skelid', None)
                if skelid is None and len(self.spine_order) == 0:
                    # assume it was removed initial coverpage
                    skelid = 'coverpage'
                    tattr['linear'] = 'no'
                self.spine_order.append(skelid)
                idref = tattr.pop('idref', None)
                if idref is not None:
                    idref = 'x_' + idref
                self.spine_idrefs[skelid] = idref
                # the original id is dropped, not carried over
                tattr.pop('id', None)
                if 'properties' in tattr:
                    self.need3 = True
                self.spine_pageattributes[skelid] = tattr

            if tname == 'meta' or tname.startswith('dc:'):
                if 'refines' in tattr or 'property' in tattr:
                    self.need3 = True
                if tattr.get('name', '') == 'cover':
                    cover_name = tattr.get('content', None)
                    if cover_name is not None:
                        cover_name = 'x_' + cover_name
                    self.cover_name = cover_name
                else:
                    self.extrameta.append([tname, tattr, tcontent])

    def parseresc(self):
        """Return either the leading text or the next tag at self.opos.

        Returns (text, None) for a run of text, (None, tag) for a tag or
        comment including its delimiters, and (None, None) at the end of the
        data.  self.opos always advances, so repeated calls terminate.
        """
        p = self.opos
        if p >= len(self.resc):
            return None, None

        if self.resc[p] != '<':
            res = self.resc.find('<', p)
            if res == -1:
                res = len(self.resc)
            self.opos = res
            return self.resc[p:res], None

        # handle comment as a special case
        if self.resc[p:p+4] == '<!--':
            te = self.resc.find('-->', p+1)
            if te != -1:
                te = te + 2
        else:
            te = self.resc.find('>', p+1)
            ntb = self.resc.find('<', p+1)
            # another '<' before the closing '>' (or no '>' at all): what we
            # have is not a complete tag, hand it back as text
            if ntb != -1 and (te == -1 or ntb < te):
                self.opos = ntb
                return self.resc[p:ntb], None

        if te == -1:
            # Unterminated tag or comment, e.g. a truncated section.  Consume
            # the remainder as text; leaving opos untouched or resetting it
            # would make the caller loop forever.
            self.opos = len(self.resc)
            return self.resc[p:], None

        self.opos = te + 1
        return None, self.resc[p:te+1]

    def parsetag(self, s):
        """Parse the tag string *s* (starting at its '<').

        Returns (ttype, tname, tattr):
          ttype -- 'begin', 'end' or 'single'
          tname -- lower-cased tag name; '?xml' is reported as 'xml' and a
                   comment as '!--'
          tattr -- mapping of lower-cased attribute names to their raw
                   values; a comment stores its text under 'comment'

        Malformed or truncated input is parsed as far as possible; no loop
        runs past the end of *s*.
        """
        n = len(s)
        p = 1
        tname = None
        ttype = None
        tattr = dict_()

        while s[p:p+1] == ' ':
            p += 1
        if s[p:p+1] == '/':
            ttype = 'end'
            p += 1
            while s[p:p+1] == ' ':
                p += 1

        b = p
        while p < n and s[p] not in ('>', '/', '"', "'") and not s[p].isspace():
            p += 1
        tname = s[b:p].lower()

        # some special cases
        if tname == '?xml':
            tname = 'xml'
        if s[:4] == '<!--':
            # also catches comments with no space after the opening marker
            tname = '!--'
            ttype = 'single'
            tattr['comment'] = s[4:-3].strip()

        if ttype is None:
            # parse any attributes of begin or single tags
            while s.find('=', p) != -1:
                # ''.isspace() is False, so this stops at the end of s
                while s[p:p+1].isspace():
                    p += 1
                b = p
                # terminates: an '=' is known to exist at or after p
                while s[p:p+1] != '=':
                    p += 1
                aname = s[b:p].lower().strip()
                p += 1
                while s[p:p+1].isspace():
                    p += 1

                quote = s[p:p+1]
                if quote in ('"', "'"):
                    # the value ends at the matching quote character only,
                    # so it may contain the other kind of quote
                    b = p + 1
                    e = s.find(quote, b)
                    if e == -1:
                        e = n   # unterminated value: take what is left
                    val = s[b:e]
                    p = e + 1
                else:
                    b = p
                    while p < n and s[p] not in ('>', '/') and not s[p].isspace():
                        p += 1
                    val = s[b:p]
                tattr[aname] = val

        if ttype is None:
            ttype = 'begin'
            if s.find('/', p) >= 0:
                ttype = 'single'
        return ttype, tname, tattr

    def taginfo_toxml(self, taginfo):
        """Serialize a [tname, tattr, tcontent] entry to one line of markup.

        Attribute values and content are written as stored (they come
        unmodified from the parsed markup); only a double quote inside an
        attribute value is replaced by &quot; because values are always
        written inside double quotes.  A None content gives a self-closing
        tag.
        """
        tname, tattr, tcontent = taginfo
        res = ['<' + tname]
        if tattr is not None:
            for key in tattr:
                res.append(' ' + key + '="' + tattr[key].replace('"', '&quot;') + '"')
        if tcontent is not None:
            res.append('>' + tcontent + '</' + tname + '>\n')
        else:
            res.append('/>\n')
        return "".join(res)

    def hasSpine(self):
        """Return True when at least one spine entry is known."""
        return len(self.spine_order) > 0

    def needEPUB3(self):
        """Return the need3 flag set while parsing."""
        return self.need3

    def hasRefines(self):
        """Return True when any collected metadata entry has 'refines'."""
        for tname, tattr, tcontent in self.extrameta:
            if 'refines' in tattr:
                return True
        return False

    def createMetadata(self, epubver):
        """Serialize self.extrameta into the three output lists.

        Entries without 'refines' go to extra_metadata.  Entries with
        'refines' go to refines_metadata, except when epubver == 'F' and the
        entry has a 'property': those are appended to extra_attributes as an
        ' id=... opf:<property>=...' attribute string instead.

        Results are appended, so calling this twice duplicates the entries.
        """
        for taginfo in self.extrameta:
            tname, tattr, tcontent = taginfo
            if 'refines' not in tattr:
                self.extra_metadata.append(self.taginfo_toxml(taginfo))
            elif epubver == 'F' and 'property' in tattr:
                attr = ' id="%s" opf:%s="%s"\n' % (tattr['refines'], tattr['property'], tcontent)
                self.extra_attributes.append(attr)
            else:
                self.refines_metadata.append(self.taginfo_toxml(taginfo))
|