epy_extras/KindleUnpack/mobi_dict.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

from __future__ import unicode_literals, division, absolute_import, print_function

from .compatibility_utils import PY2, PY3, utf8_str, bstr, bchr

# Python 2/3 compatibility shims used by the rest of this module.
if PY2:
    # Rebind range to xrange so the loops below iterate lazily on Python 2.
    range = xrange
    # Typecode handed to array.array; written as a byte string here,
    # presumably because unicode_literals is in effect for this module.
    array_format = b'B'
if PY3:
    # Python 3 has no unichr builtin; chr is bound under that name instead.
    unichr = chr
    array_format = "B"

import array

import struct
# note:  struct pack, unpack, unpack_from all require bytestring format
# data all the way up to at least python 2.7.5, python 3 okay with bytestring

from .mobi_index import getVariableWidthValue, readTagSection, getTagMap
from .mobi_utils import toHex

# When True, the dictionary parsing code below prints extra diagnostic output.
DEBUG_DICT = False

class InflectionData(object):
    '''
    Lookup helper spanning one or more inflection data sections.

    For every section the big-endian 32-bit values at 0x14 and 0x18 are read
    and kept as that section's table start and entry count.  Lookup values are
    numbered consecutively across the sections, in the order they were given.
    '''

    def __init__(self, infldatas):
        '''
        @param infldatas: Sequence of raw inflection data sections (byte strings).
        '''
        self.infldatas = infldatas
        self.starts = []
        self.counts = []
        for section in self.infldatas:
            # start (0x14) and count (0x18) are adjacent, so read them in one go.
            table_start, entry_count = struct.unpack_from(b'>LL', section, 0x14)
            self.starts.append(table_start)
            self.counts.append(entry_count)

    def lookup(self, lookupvalue):
        '''
        Map a global lookup value onto the section that holds it.

        @param lookupvalue: Index counted across all sections.
        @return: Tuple (index within section, section start, section count, section data).
                 When the value lies beyond every section an error is printed and
                 the original value is returned together with the first section.
        '''
        remaining = lookupvalue
        section = 0
        while remaining >= self.counts[section]:
            remaining -= self.counts[section]
            section += 1
            if section == len(self.counts):
                print("Error: Problem with multiple inflections data sections")
                return lookupvalue, self.starts[0], self.counts[0], self.infldatas[0]
        return remaining, self.starts[section], self.counts[section], self.infldatas[section]

    def offsets(self, value):
        '''
        Fetch the 16-bit offset stored for a lookup value and the one following it.

        @param value: Index counted across all sections.
        @return: Tuple (offset, next offset or None for the last entry of a section, section data).
        '''
        index, start, count, data = self.lookup(value)
        # The 16-bit offsets begin 4 bytes after the recorded table start.
        table = start + 4
        offset, = struct.unpack_from(b'>H', data, table + (2 * index))
        nextOffset = None
        if index + 1 < count:
            nextOffset, = struct.unpack_from(b'>H', data, table + (2 * (index + 1)))
        return offset, nextOffset, data


class dictSupport(object):

    def __init__(self, mh, sect):
        '''
        Keep references to the mobi header object and the section loader.

        @param mh: Mobi header object; its header, metaOrthIndex and metaInflIndex attributes are copied.
        @param sect: Object used to load sections (its loadSection method is called later).
        '''
        self.sect = sect
        self.mh = mh
        # Cache the header fields the dictionary code consults.
        self.header = mh.header
        self.metaInflIndex = mh.metaInflIndex
        self.metaOrthIndex = mh.metaOrthIndex

    def parseHeader(self, data):
        '''
        Read the header of an INDX section.

        @param data: Raw section data (byte string).
        @return: Tuple (header dict, ordt1, ordt2).  ordt1 and ordt2 are None unless
                 the header code is 0xfdea or the ORDT entry count is non-zero.
                 If the section does not start with b'INDX' a warning is printed
                 and the single value False is returned instead of a tuple.
        '''
        if data[:4] != b'INDX':
            print("Warning: index section is not INDX")
            return False

        # Thirteen consecutive big-endian 32-bit fields follow the 4-byte magic.
        fieldNames = (
                'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
                'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc'
        )
        fieldCount = len(fieldNames)
        fieldValues = struct.unpack(bstr('>%dL' % fieldCount), data[4:4*(fieldCount+1)])
        header = dict(zip(fieldNames, fieldValues))

        # ORDT description: type, number of entries, two table positions, and otagx.
        otype, oentries, op1, op2, otagx = struct.unpack_from(b'>LLLLL', data, 0xa4)
        header['otype'] = otype
        header['oentries'] = oentries

        if DEBUG_DICT:
            print("otype %d, oentries %d, op1 %d, op2 %d, otagx %d" % (otype, oentries, op1, op2, otagx))

        ordt1 = None
        ordt2 = None
        if oentries > 0 or header['code'] == 0xfdea:
            # Some dictionaries use codepage 65002 (0xFDEA), whose leading text
            # is not stored as plain utf-8/utf-16 strings, so the ORDT tables
            # are kept to translate it later.
            # ORDT1 holds 1-byte entries, ORDT2 holds 2-byte entries; only the
            # second one appears to be needed in practice.
            #
            # otype == 0: text uses 16-bit values as offsets into the ORDT table
            # otype == 1: text uses 8-bit values as offsets into the ORDT table
            assert data[op1:op1+4] == b'ORDT'
            assert data[op2:op2+4] == b'ORDT'
            ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4)
            ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4)

        if DEBUG_DICT:
            print("parsed INDX header:")
            for key, fieldValue in header.items():
                print(key, "%x" % fieldValue)
            print("\n")
        return header, ordt1, ordt2

    def getPositionMap(self):
        '''
        Build the dictionary markup to insert for every orthographic index entry.

        Walks the orthographic index sections (and, when available, the
        inflection index sections) and assembles idx:entry / idx:orth markup
        for each entry.

        @return: Dict mapping a position (the value of tag 0x01, plus the value of
                 tag 0x02 for closing tags) to the markup bytes for that position.
                 Empty when metaOrthIndex is 0xFFFFFFFF.
                 (Presumably the positions are offsets into the book text - confirm with the caller.)
        '''
        sect = self.sect

        positionMap = {}

        metaOrthIndex = self.metaOrthIndex
        metaInflIndex = self.metaInflIndex

        decodeInflection = True
        # 0xFFFFFFFF in either index field means that index is not present.
        if metaOrthIndex != 0xFFFFFFFF:
            print("Info: Document contains orthographic index, handle as dictionary")
            if metaInflIndex == 0xFFFFFFFF:
                decodeInflection = False
            else:
                metaInflIndexData = sect.loadSection(metaInflIndex)

                print("\nParsing metaInflIndexData")
                # NOTE(review): parseHeader returns False for a non-INDX section,
                # in which case this three-way unpacking raises TypeError.
                midxhdr, mhordt1, mhordt2 = self.parseHeader(metaInflIndexData)

                # The inflection data sections directly follow the meta index section ...
                metaIndexCount = midxhdr['count']
                idatas = []
                for j in range(metaIndexCount):
                    idatas.append(sect.loadSection(metaInflIndex + 1 + j))
                dinfl = InflectionData(idatas)

                # ... and the section after those holds the inflection rule names.
                inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount)
                tagSectionStart = midxhdr['len']
                inflectionControlByteCount, inflectionTagTable = readTagSection(tagSectionStart, metaInflIndexData)
                if DEBUG_DICT:
                    print("inflectionTagTable: %s" % inflectionTagTable)
                # Tag 0x07 marks the obsolete rule scheme; inflections are skipped then.
                if self.hasTag(inflectionTagTable, 0x07):
                    print("Error: Dictionary uses obsolete inflection rule scheme which is not yet supported")
                    decodeInflection = False

            data = sect.loadSection(metaOrthIndex)

            print("\nParsing metaOrthIndex")
            idxhdr, hordt1, hordt2 = self.parseHeader(data)

            tagSectionStart = idxhdr['len']
            controlByteCount, tagTable = readTagSection(tagSectionStart, data)
            orthIndexCount = idxhdr['count']
            print("orthIndexCount is", orthIndexCount)
            if DEBUG_DICT:
                print("orthTagTable: %s" % tagTable)
            if hordt2 is not None:
                print("orth entry uses ordt2 lookup table of type ", idxhdr['otype'])
            # Tag 0x02 carries the entry length; without it entries cannot be wrapped.
            hasEntryLength = self.hasTag(tagTable, 0x02)
            if not hasEntryLength:
                print("Info: Index doesn't contain entry length tags")

            print("Read dictionary index data")
            # The orthographic index data sections directly follow the meta index section.
            for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount):
                data = sect.loadSection(i)
                # The per-section ORDT tables returned here are not used below;
                # only hordt2 from the meta index header is consulted.
                hdrinfo, ordt1, ordt2 = self.parseHeader(data)
                idxtPos = hdrinfo['start']
                entryCount = hdrinfo['count']
                # Collect the 16-bit start offset of every entry (stored 4 bytes past idxtPos).
                idxPositions = []
                for j in range(entryCount):
                    pos, = struct.unpack_from(b'>H', data, idxtPos + 4 + (2 * j))
                    idxPositions.append(pos)
                # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
                idxPositions.append(idxtPos)
                for j in range(entryCount):
                    startPos = idxPositions[j]
                    endPos = idxPositions[j+1]
                    # Each entry starts with a one-byte text length followed by the text itself.
                    textLength = ord(data[startPos:startPos+1])
                    text = data[startPos+1:startPos+1+textLength]
                    if hordt2 is not None:
                        # Translate the entry text through the ORDT2 table and re-encode it as utf-8.
                        utext = u""
                        if idxhdr['otype'] == 0:
                            # 16-bit values index the table
                            pattern = b'>H'
                            inc = 2
                        else:
                            # 8-bit values index the table
                            pattern = b'>B'
                            inc = 1
                        pos = 0
                        while pos < textLength:
                            off, = struct.unpack_from(pattern, text, pos)
                            if off < len(hordt2):
                                utext += unichr(hordt2[off])
                            else:
                                # Value lies outside the table: use it directly as the code point.
                                utext += unichr(off)
                            pos += inc
                        text = utext.encode('utf-8')

                    # The tag data for this entry follows its text and runs up to the next entry.
                    tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos)
                    # Entries without tag 0x01 (entry start position) produce no markup.
                    if 0x01 in tagMap:
                        # Tag 0x2a holds the list of inflection groups for this entry.
                        if decodeInflection and 0x2a in tagMap:
                            inflectionGroups = self.getInflectionGroups(text, inflectionControlByteCount, inflectionTagTable,
                                                                        dinfl, inflNameData, tagMap[0x2a])
                        else:
                            inflectionGroups = b''
                        assert len(tagMap[0x01]) == 1
                        entryStartPosition = tagMap[0x01][0]
                        if hasEntryLength:
                            # The idx:entry attribute "scriptable" must be present to create entry length tags.
                            # NOTE(review): text is placed into the attribute value without escaping.
                            ml = b'<idx:entry scriptable="yes"><idx:orth value="' + text + b'">' + inflectionGroups + b'</idx:orth>'
                            # Several entries may share a start position: opening markup is appended.
                            if entryStartPosition in positionMap:
                                positionMap[entryStartPosition] = positionMap[entryStartPosition] + ml
                            else:
                                positionMap[entryStartPosition] = ml
                            assert len(tagMap[0x02]) == 1
                            entryEndPosition = entryStartPosition + tagMap[0x02][0]
                            # The closing tag is placed in front of whatever is already stored at the end position.
                            if entryEndPosition in positionMap:
                                positionMap[entryEndPosition] = b"</idx:entry>" + positionMap[entryEndPosition]
                            else:
                                positionMap[entryEndPosition] = b"</idx:entry>"

                        else:
                            # Without a length the entry is emitted as a self-contained element at its start position.
                            # NOTE(review): unlike the scriptable branch, no </idx:orth> is emitted here - confirm this is intended.
                            indexTags = b'<idx:entry>\n<idx:orth value="' + text + b'">\n' + inflectionGroups + b'</idx:entry>\n'
                            if entryStartPosition in positionMap:
                                positionMap[entryStartPosition] = positionMap[entryStartPosition] + indexTags
                            else:
                                positionMap[entryStartPosition] = indexTags
        return positionMap

    def hasTag(self, tagTable, tag):
        '''
        Report whether a tag table has an entry for the given tag.

        @param tagTable: Iterable of four-element tag entries whose first element is the tag id.
        @param tag: The tag id to look for.
        @return: True if some entry carries the tag id; False otherwise.
        '''
        return any(currentTag == tag for currentTag, _, _, _ in tagTable)

    def getInflectionGroups(self, mainEntry, controlByteCount, tagTable, dinfl, inflectionNames, groupList):
        '''
        Create byte string which contains the inflection groups with inflection rules as mobipocket tags.

        @param mainEntry: The word to inflect (byte string).
        @param controlByteCount: The number of control bytes.
        @param tagTable: The tag table.
        @param dinfl: The InflectionData object used to select the right inflection data section.
        @param inflectionNames: The inflection rule name data.
        @param groupList: The list of inflection groups to process.
        @return: Byte string with inflection groups and rules, or an empty byte string
                 if required tags are not available.
        '''
        result = b""
        for group in groupList:
            offset, nextOffset, data = dinfl.offsets(group)

            # First byte seems to be always 0x00 and must be skipped.
            assert ord(data[offset:offset+1]) == 0x00
            tagMap = getTagMap(controlByteCount, tagTable, data, offset + 1, nextOffset)

            # Make sure that the required tags are available.
            # Both failure paths return bytes: the caller concatenates the result
            # with byte strings, so a text string would raise TypeError on Python 3.
            if 0x05 not in tagMap:
                print("Error: Required tag 0x05 not found in tagMap")
                return b''
            if 0x1a not in tagMap:
                print("Error: Required tag 0x1a not found in tagMap")
                return b''

            result += b'<idx:infl>'

            # Tag 0x05 lists the rule name offsets, tag 0x1a the matching rule indexes.
            for i, nameOffset in enumerate(tagMap[0x05]):

                # Get name of inflection rule.
                consumed, nameLength = getVariableWidthValue(inflectionNames, nameOffset)
                inflectionName = inflectionNames[nameOffset+consumed:nameOffset+consumed+nameLength]

                # Get and apply inflection rule across possibly multiple inflection data sections
                ruleIndex = tagMap[0x1a][i]
                rvalue, start, count, ruleData = dinfl.lookup(ruleIndex)
                ruleOffset, = struct.unpack_from(b'>H', ruleData, start + 4 + (2 * rvalue))
                # The rule is stored as a one-byte length followed by the rule bytes.
                ruleLength = ord(ruleData[ruleOffset:ruleOffset+1])
                inflection = self.applyInflectionRule(mainEntry, ruleData, ruleOffset+1, ruleOffset+1+ruleLength)
                if inflection is not None:
                    result += b'  <idx:iform name="' + inflectionName + b'" value="' + inflection + b'"/>'

            result += b'</idx:infl>'
        return result

    def applyInflectionRule(self, mainEntry, inflectionRuleData, start, end):
        '''
        Apply inflection rule.

        The rule is a byte sequence.  Bytes 0x01-0x04 select an editing mode
        (insert at word start, insert at word end, delete at word end, delete at
        word start), bytes 0x0a-0x13 move the cursor backwards by (byte - 0x0a),
        and every byte above 0x13 is a character to insert or to delete.

        @param mainEntry: The word to inflect (byte string).
        @param inflectionRuleData: The inflection rules.
        @param start: The start position of the inflection rule to use.
        @param end: The end position of the inflection rule to use.
        @return: The byte string with the inflected word or None if an error occurs.
        '''
        mode = -1
        # A bytearray offers the same insert/pop editing as array.array('B') but is
        # converted back with bytes(); array.tostring() was removed in Python 3.9.
        byteArray = bytearray(mainEntry)
        position = len(byteArray)
        for charOffset in range(start, end):
            char = inflectionRuleData[charOffset:charOffset+1]
            abyte = ord(char)
            if abyte >= 0x0a and abyte <= 0x13:
                # Move cursor backwards
                offset = abyte - 0x0a
                if mode not in [0x02, 0x03]:
                    mode = 0x02
                    position = len(byteArray)
                position -= offset
            elif abyte > 0x13:
                if mode == -1:
                    print("Error: Unexpected first byte %i of inflection rule" % abyte)
                    return None
                elif position == -1:
                    print("Error: Unexpected first byte %i of inflection rule" % abyte)
                    return None
                else:
                    if mode == 0x01:
                        # Insert at word start
                        byteArray.insert(position, abyte)
                        position += 1
                    elif mode == 0x02:
                        # Insert at word end
                        byteArray.insert(position, abyte)
                    elif mode == 0x03:
                        # Delete at word end
                        position -= 1
                        deleted = byteArray.pop(position)
                        # The rule names the character it expects to remove.
                        if deleted != abyte:
                            if DEBUG_DICT:
                                print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted)))
                            print("Error: Delete operation of inflection rule failed")
                            return None
                    elif mode == 0x04:
                        # Delete at word start
                        deleted = byteArray.pop(position)
                        # The rule names the character it expects to remove.
                        if deleted != abyte:
                            if DEBUG_DICT:
                                print("0x04: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted)))
                            print("Error: Delete operation of inflection rule failed")
                            return None
                    else:
                        print("Error: Inflection rule mode %x is not implemented" % mode)
                        return None
            elif abyte == 0x01:
                # Insert at word start
                if mode not in [0x01, 0x04]:
                    position = 0
                mode = abyte
            elif abyte == 0x02:
                # Insert at word end
                if mode not in [0x02, 0x03]:
                    position = len(byteArray)
                mode = abyte
            elif abyte == 0x03:
                # Delete at word end
                if mode not in [0x02, 0x03]:
                    position = len(byteArray)
                mode = abyte
            elif abyte == 0x04:
                # Delete at word start
                if mode not in [0x01, 0x04]:
                    position = 0
                mode = abyte
            else:
                print("Error: Inflection rule mode %x is not implemented" % abyte)
                return None
        return utf8_str(bytes(byteArray))