diff options
Diffstat (limited to 'src/epy_reader/tools/KindleUnpack/mobi_dict.py')
-rw-r--r-- | src/epy_reader/tools/KindleUnpack/mobi_dict.py | 377 |
1 files changed, 377 insertions, 0 deletions
diff --git a/src/epy_reader/tools/KindleUnpack/mobi_dict.py b/src/epy_reader/tools/KindleUnpack/mobi_dict.py new file mode 100644 index 0000000..bfc2ea8 --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/mobi_dict.py @@ -0,0 +1,377 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import unicode_literals, division, absolute_import, print_function + +from .compatibility_utils import PY2, PY3, utf8_str, bstr, bchr + +if PY2: + range = xrange + array_format = b'B' +if PY3: + unichr = chr + array_format = "B" + +import array + +import struct +# note: struct pack, unpack, unpack_from all require bytestring format +# data all the way up to at least python 2.7.5, python 3 okay with bytestring + +from .mobi_index import getVariableWidthValue, readTagSection, getTagMap +from .mobi_utils import toHex + +DEBUG_DICT = False + +class InflectionData(object): + + def __init__(self, infldatas): + self.infldatas = infldatas + self.starts = [] + self.counts = [] + for idata in self.infldatas: + start, = struct.unpack_from(b'>L', idata, 0x14) + count, = struct.unpack_from(b'>L', idata, 0x18) + self.starts.append(start) + self.counts.append(count) + + def lookup(self, lookupvalue): + i = 0 + rvalue = lookupvalue + while rvalue >= self.counts[i]: + rvalue = rvalue - self.counts[i] + i += 1 + if i == len(self.counts): + print("Error: Problem with multiple inflections data sections") + return lookupvalue, self.starts[0], self.counts[0], self.infldatas[0] + return rvalue, self.starts[i], self.counts[i], self.infldatas[i] + + def offsets(self, value): + rvalue, start, count, data = self.lookup(value) + offset, = struct.unpack_from(b'>H', data, start + 4 + (2 * rvalue)) + if rvalue + 1 < count: + nextOffset, = struct.unpack_from(b'>H',data, start + 4 + (2 * (rvalue + 1))) + else: + nextOffset = None + return offset, nextOffset, data + + +class dictSupport(object): + + def __init__(self, mh, sect): + self.mh = mh + self.header = mh.header + self.sect = sect + self.metaOrthIndex = mh.metaOrthIndex + self.metaInflIndex = mh.metaInflIndex + + def parseHeader(self, data): + "read INDX header" + if not data[:4] == b'INDX': + print("Warning: index section is not INDX") + return False + words = ( + 'len', 'nul1', 'type', 'gen', 'start', 'count', 'code', + 'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc' + ) + num = len(words) + values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)]) + header = {} + for n in range(num): + header[words[n]] = values[n] + + ordt1 = None + ordt2 = None + + otype, oentries, op1, op2, otagx = struct.unpack_from(b'>LLLLL',data, 0xa4) + header['otype'] = otype + header['oentries'] = oentries + + if DEBUG_DICT: + print("otype %d, oentries %d, op1 %d, op2 %d, otagx %d" % (otype, oentries, op1, op2, otagx)) + + if header['code'] == 0xfdea or oentries > 0: + # some dictionaries seem to be codepage 65002 (0xFDEA) which seems + # to be some sort of strange EBCDIC utf-8 or 16 encoded strings + # So we need to look for them and store them away to process leading text + # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries + # we only ever seem to use the second but ... + # + # if otype = 0, ORDT table uses 16 bit values as offsets into the table + # if otype = 1, ORDT table uses 8 bit values as offsets inot the table + + assert(data[op1:op1+4] == b'ORDT') + assert(data[op2:op2+4] == b'ORDT') + ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4) + ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4) + + if DEBUG_DICT: + print("parsed INDX header:") + for key in header: + print(key, "%x" % header[key],) + print("\n") + return header, ordt1, ordt2 + + def getPositionMap(self): + sect = self.sect + + positionMap = {} + + metaOrthIndex = self.metaOrthIndex + metaInflIndex = self.metaInflIndex + + decodeInflection = True + if metaOrthIndex != 0xFFFFFFFF: + print("Info: Document contains orthographic index, handle as dictionary") + if metaInflIndex == 0xFFFFFFFF: + decodeInflection = False + else: + metaInflIndexData = sect.loadSection(metaInflIndex) + + print("\nParsing metaInflIndexData") + midxhdr, mhordt1, mhordt2 = self.parseHeader(metaInflIndexData) + + metaIndexCount = midxhdr['count'] + idatas = [] + for j in range(metaIndexCount): + idatas.append(sect.loadSection(metaInflIndex + 1 + j)) + dinfl = InflectionData(idatas) + + inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount) + tagSectionStart = midxhdr['len'] + inflectionControlByteCount, inflectionTagTable = readTagSection(tagSectionStart, metaInflIndexData) + if DEBUG_DICT: + print("inflectionTagTable: %s" % inflectionTagTable) + if self.hasTag(inflectionTagTable, 0x07): + print("Error: Dictionary uses obsolete inflection rule scheme which is not yet supported") + decodeInflection = False + + data = sect.loadSection(metaOrthIndex) + + print("\nParsing metaOrthIndex") + idxhdr, hordt1, hordt2 = self.parseHeader(data) + + tagSectionStart = idxhdr['len'] + controlByteCount, tagTable = readTagSection(tagSectionStart, data) + orthIndexCount = idxhdr['count'] + print("orthIndexCount is", orthIndexCount) + if DEBUG_DICT: + print("orthTagTable: %s" % tagTable) + if hordt2 is not None: + print("orth entry uses ordt2 lookup table of type ", idxhdr['otype']) + hasEntryLength = self.hasTag(tagTable, 0x02) + if not hasEntryLength: + print("Info: Index doesn't contain entry length tags") + + print("Read dictionary index data") + for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount): + data = sect.loadSection(i) + hdrinfo, ordt1, ordt2 = self.parseHeader(data) + idxtPos = hdrinfo['start'] + entryCount = hdrinfo['count'] + idxPositions = [] + for j in range(entryCount): + pos, = struct.unpack_from(b'>H', data, idxtPos + 4 + (2 * j)) + idxPositions.append(pos) + # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!) + idxPositions.append(idxtPos) + for j in range(entryCount): + startPos = idxPositions[j] + endPos = idxPositions[j+1] + textLength = ord(data[startPos:startPos+1]) + text = data[startPos+1:startPos+1+textLength] + if hordt2 is not None: + utext = u"" + if idxhdr['otype'] == 0: + pattern = b'>H' + inc = 2 + else: + pattern = b'>B' + inc = 1 + pos = 0 + while pos < textLength: + off, = struct.unpack_from(pattern, text, pos) + if off < len(hordt2): + utext += unichr(hordt2[off]) + else: + utext += unichr(off) + pos += inc + text = utext.encode('utf-8') + + tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos) + if 0x01 in tagMap: + if decodeInflection and 0x2a in tagMap: + inflectionGroups = self.getInflectionGroups(text, inflectionControlByteCount, inflectionTagTable, + dinfl, inflNameData, tagMap[0x2a]) + else: + inflectionGroups = b'' + assert len(tagMap[0x01]) == 1 + entryStartPosition = tagMap[0x01][0] + if hasEntryLength: + # The idx:entry attribute "scriptable" must be present to create entry length tags. + ml = b'<idx:entry scriptable="yes"><idx:orth value="' + text + b'">' + inflectionGroups + b'</idx:orth>' + if entryStartPosition in positionMap: + positionMap[entryStartPosition] = positionMap[entryStartPosition] + ml + else: + positionMap[entryStartPosition] = ml + assert len(tagMap[0x02]) == 1 + entryEndPosition = entryStartPosition + tagMap[0x02][0] + if entryEndPosition in positionMap: + positionMap[entryEndPosition] = b"</idx:entry>" + positionMap[entryEndPosition] + else: + positionMap[entryEndPosition] = b"</idx:entry>" + + else: + indexTags = b'<idx:entry>\n<idx:orth value="' + text + b'">\n' + inflectionGroups + b'</idx:entry>\n' + if entryStartPosition in positionMap: + positionMap[entryStartPosition] = positionMap[entryStartPosition] + indexTags + else: + positionMap[entryStartPosition] = indexTags + return positionMap + + def hasTag(self, tagTable, tag): + ''' + Test if tag table contains given tag. + + @param tagTable: The tag table. + @param tag: The tag to search. + @return: True if tag table contains given tag; False otherwise. + ''' + for currentTag, _, _, _ in tagTable: + if currentTag == tag: + return True + return False + + def getInflectionGroups(self, mainEntry, controlByteCount, tagTable, dinfl, inflectionNames, groupList): + ''' + Create string which contains the inflection groups with inflection rules as mobipocket tags. + + @param mainEntry: The word to inflect. + @param controlByteCount: The number of control bytes. + @param tagTable: The tag table. + @param data: The Inflection data object to properly select the right inflection data section to use + @param inflectionNames: The inflection rule name data. + @param groupList: The list of inflection groups to process. + @return: String with inflection groups and rules or empty string if required tags are not available. + ''' + result = b"" + for value in groupList: + offset, nextOffset, data = dinfl.offsets(value) + + # First byte seems to be always 0x00 and must be skipped. + assert ord(data[offset:offset+1]) == 0x00 + tagMap = getTagMap(controlByteCount, tagTable, data, offset + 1, nextOffset) + + # Make sure that the required tags are available. + if 0x05 not in tagMap: + print("Error: Required tag 0x05 not found in tagMap") + return "" + if 0x1a not in tagMap: + print("Error: Required tag 0x1a not found in tagMap") + return b'' + + result += b'<idx:infl>' + + for i in range(len(tagMap[0x05])): + + # Get name of inflection rule. + value = tagMap[0x05][i] + consumed, textLength = getVariableWidthValue(inflectionNames, value) + inflectionName = inflectionNames[value+consumed:value+consumed+textLength] + + # Get and apply inflection rule across possibly multiple inflection data sections + value = tagMap[0x1a][i] + rvalue, start, count, data = dinfl.lookup(value) + offset, = struct.unpack_from(b'>H', data, start + 4 + (2 * rvalue)) + textLength = ord(data[offset:offset+1]) + inflection = self.applyInflectionRule(mainEntry, data, offset+1, offset+1+textLength) + if inflection is not None: + result += b' <idx:iform name="' + inflectionName + b'" value="' + inflection + b'"/>' + + result += b'</idx:infl>' + return result + + def applyInflectionRule(self, mainEntry, inflectionRuleData, start, end): + ''' + Apply inflection rule. + + @param mainEntry: The word to inflect. + @param inflectionRuleData: The inflection rules. + @param start: The start position of the inflection rule to use. + @param end: The end position of the inflection rule to use. + @return: The string with the inflected word or None if an error occurs. + ''' + mode = -1 + byteArray = array.array(array_format, mainEntry) + position = len(byteArray) + for charOffset in range(start, end): + char = inflectionRuleData[charOffset:charOffset+1] + abyte = ord(char) + if abyte >= 0x0a and abyte <= 0x13: + # Move cursor backwards + offset = abyte - 0x0a + if mode not in [0x02, 0x03]: + mode = 0x02 + position = len(byteArray) + position -= offset + elif abyte > 0x13: + if mode == -1: + print("Error: Unexpected first byte %i of inflection rule" % abyte) + return None + elif position == -1: + print("Error: Unexpected first byte %i of inflection rule" % abyte) + return None + else: + if mode == 0x01: + # Insert at word start + byteArray.insert(position, abyte) + position += 1 + elif mode == 0x02: + # Insert at word end + byteArray.insert(position, abyte) + elif mode == 0x03: + # Delete at word end + position -= 1 + deleted = byteArray.pop(position) + if bchr(deleted) != char: + if DEBUG_DICT: + print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted))) + print("Error: Delete operation of inflection rule failed") + return None + elif mode == 0x04: + # Delete at word start + deleted = byteArray.pop(position) + if bchr(deleted) != char: + if DEBUG_DICT: + print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted))) + print("Error: Delete operation of inflection rule failed") + return None + else: + print("Error: Inflection rule mode %x is not implemented" % mode) + return None + elif abyte == 0x01: + # Insert at word start + if mode not in [0x01, 0x04]: + position = 0 + mode = abyte + elif abyte == 0x02: + # Insert at word end + if mode not in [0x02, 0x03]: + position = len(byteArray) + mode = abyte + elif abyte == 0x03: + # Delete at word end + if mode not in [0x02, 0x03]: + position = len(byteArray) + mode = abyte + elif abyte == 0x04: + # Delete at word start + if mode not in [0x01, 0x04]: + position = 0 + # Delete at word start + mode = abyte + else: + print("Error: Inflection rule mode %x is not implemented" % abyte) + return None + return utf8_str(byteArray.tostring()) |