#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
from __future__ import unicode_literals, division, absolute_import, print_function
from .compatibility_utils import PY2, PY3, utf8_str, bstr, bchr
# Interpreter-specific aliases used by the rest of this module.
if PY2:
    # Rebind range to xrange and use a bytestring typecode for array.array.
    range, array_format = xrange, b'B'

if PY3:
    # Python 3 has no unichr builtin; chr plays that role.
    unichr, array_format = chr, "B"
import array
import struct
# Note: struct.pack, struct.unpack and struct.unpack_from all require the
# format argument to be a bytestring up to at least Python 2.7.5; Python 3
# also accepts a bytestring format.
from .mobi_index import getVariableWidthValue, readTagSection, getTagMap
from .mobi_utils import toHex
# When True, the index parsing code in this module prints extra diagnostic
# output (parsed headers, tag tables, failed inflection rule details).
DEBUG_DICT = False
class InflectionData(object):
    '''
    Addressing helper for a sequence of inflection index data sections.

    For every section the 32-bit big-endian values stored at byte offsets
    0x14 and 0x18 are read and remembered as that section's table start and
    entry count.  A global entry number can then be mapped to the section
    that holds it by walking the per-section counts.
    '''

    def __init__(self, infldatas):
        '''
        @param infldatas: Sequence of raw (bytes) inflection data sections.
        '''
        self.infldatas = infldatas
        self.starts = []
        self.counts = []
        for section in self.infldatas:
            # Two consecutive big-endian longs: start at 0x14, count at 0x18.
            table_start, entry_count = struct.unpack_from(b'>LL', section, 0x14)
            self.starts.append(table_start)
            self.counts.append(entry_count)

    def lookup(self, lookupvalue):
        '''
        Map a global entry number to the section containing it.
        @param lookupvalue: The entry number counted across all sections.
        @return: Tuple (index within section, section start, section count,
                 section data).  If the value lies beyond the last section an
                 error is printed and the first section is returned together
                 with the unmodified lookupvalue.
        '''
        section_no = 0
        remaining = lookupvalue
        while remaining >= self.counts[section_no]:
            remaining -= self.counts[section_no]
            section_no += 1
            if section_no == len(self.counts):
                print("Error: Problem with multiple inflections data sections")
                return lookupvalue, self.starts[0], self.counts[0], self.infldatas[0]
        return (remaining, self.starts[section_no],
                self.counts[section_no], self.infldatas[section_no])

    def offsets(self, value):
        '''
        Fetch the 16-bit offset stored for an entry and that of its successor.
        @param value: The entry number counted across all sections.
        @return: Tuple (offset, nextOffset, data); nextOffset is None when the
                 entry is the last one of its section.
        '''
        rvalue, start, count, data = self.lookup(value)
        # The table of 2-byte offsets begins 4 bytes after the section start.
        table = start + 4
        offset, = struct.unpack_from(b'>H', data, table + (2 * rvalue))
        nextOffset = None
        following = rvalue + 1
        if following < count:
            nextOffset, = struct.unpack_from(b'>H', data, table + (2 * following))
        return offset, nextOffset, data
class dictSupport(object):
    '''
    Decode the orthographic and inflection indexes of a dictionary document
    and turn them into idx:* markup keyed by text position.
    '''

    def __init__(self, mh, sect):
        '''
        @param mh: Object exposing header, metaOrthIndex and metaInflIndex attributes.
        @param sect: Object exposing loadSection(index), used to fetch raw section data.
        '''
        self.mh = mh
        self.header = mh.header
        self.sect = sect
        self.metaOrthIndex = mh.metaOrthIndex
        self.metaInflIndex = mh.metaInflIndex

    def parseHeader(self, data):
        '''
        Read an INDX header.
        @param data: The raw bytes of an index section.
        @return: False if data does not start with b'INDX'; otherwise the
                 tuple (header, ordt1, ordt2) where header is a dict of the
                 header fields and ordt1/ordt2 are tuples of ORDT table
                 entries, or None when the section carries no ORDT tables.
        '''
        if not data[:4] == b'INDX':
            print("Warning: index section is not INDX")
            return False
        words = (
            'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
            'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc'
        )
        num = len(words)
        # The named fields are consecutive big-endian longs right after the magic.
        values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)])
        header = dict(zip(words, values))

        ordt1 = None
        ordt2 = None
        otype, oentries, op1, op2, otagx = struct.unpack_from(b'>LLLLL', data, 0xa4)
        header['otype'] = otype
        header['oentries'] = oentries
        if DEBUG_DICT:
            print("otype %d, oentries %d, op1 %d, op2 %d, otagx %d" % (otype, oentries, op1, op2, otagx))

        if header['code'] == 0xfdea or oentries > 0:
            # some dictionaries seem to be codepage 65002 (0xFDEA) which seems
            # to be some sort of strange EBCDIC utf-8 or 16 encoded strings
            # So we need to look for them and store them away to process leading text
            # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries
            # we only ever seem to use the second but ...
            #
            # if otype = 0, ORDT table uses 16 bit values as offsets into the table
            # if otype = 1, ORDT table uses 8 bit values as offsets into the table
            assert(data[op1:op1+4] == b'ORDT')
            assert(data[op2:op2+4] == b'ORDT')
            ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4)
            ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4)

        if DEBUG_DICT:
            print("parsed INDX header:")
            for key in header:
                print(key, "%x" % header[key],)
            print("\n")
        return header, ordt1, ordt2

    def _decodeOrdtText(self, text, textLength, otype, ordt2):
        '''
        Translate index entry text through an ORDT2 lookup table.
        @param text: The raw entry text.
        @param textLength: Number of bytes of text to process.
        @param otype: 0 when the text holds 16-bit table offsets, otherwise 8-bit offsets.
        @param ordt2: The ORDT2 table (sequence of code points).
        @return: The translated text encoded as utf-8.
        '''
        if otype == 0:
            pattern, inc = b'>H', 2
        else:
            pattern, inc = b'>B', 1
        chars = []
        for pos in range(0, textLength, inc):
            off, = struct.unpack_from(pattern, text, pos)
            # Values outside the table are taken as code points themselves.
            chars.append(unichr(ordt2[off]) if off < len(ordt2) else unichr(off))
        return u"".join(chars).encode('utf-8')

    def getPositionMap(self):
        '''
        Build the markup to insert into the text for every dictionary entry.

        Note that the result of parseHeader is unpacked directly, so a section
        that is not an INDX section (parseHeader returns False) raises
        TypeError here.
        @return: Dict mapping a text position to the bytes of idx:* markup
                 belonging at that position; empty when the document has no
                 orthographic index.
        '''
        sect = self.sect
        positionMap = {}
        metaOrthIndex = self.metaOrthIndex
        metaInflIndex = self.metaInflIndex

        if metaOrthIndex == 0xFFFFFFFF:
            # No orthographic index: nothing to do.
            return positionMap

        print("Info: Document contains orthographic index, handle as dictionary")
        decodeInflection = True
        if metaInflIndex == 0xFFFFFFFF:
            decodeInflection = False
        else:
            metaInflIndexData = sect.loadSection(metaInflIndex)
            print("\nParsing metaInflIndexData")
            midxhdr, mhordt1, mhordt2 = self.parseHeader(metaInflIndexData)
            metaIndexCount = midxhdr['count']
            # The inflection data sections directly follow the meta section,
            # and the rule name section follows those.
            idatas = [sect.loadSection(metaInflIndex + 1 + j) for j in range(metaIndexCount)]
            dinfl = InflectionData(idatas)
            inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount)
            tagSectionStart = midxhdr['len']
            inflectionControlByteCount, inflectionTagTable = readTagSection(tagSectionStart, metaInflIndexData)
            if DEBUG_DICT:
                print("inflectionTagTable: %s" % inflectionTagTable)
            if self.hasTag(inflectionTagTable, 0x07):
                print("Error: Dictionary uses obsolete inflection rule scheme which is not yet supported")
                decodeInflection = False

        data = sect.loadSection(metaOrthIndex)
        print("\nParsing metaOrthIndex")
        idxhdr, hordt1, hordt2 = self.parseHeader(data)
        tagSectionStart = idxhdr['len']
        controlByteCount, tagTable = readTagSection(tagSectionStart, data)
        orthIndexCount = idxhdr['count']
        print("orthIndexCount is", orthIndexCount)
        if DEBUG_DICT:
            print("orthTagTable: %s" % tagTable)
        if hordt2 is not None:
            print("orth entry uses ordt2 lookup table of type ", idxhdr['otype'])
        hasEntryLength = self.hasTag(tagTable, 0x02)
        if not hasEntryLength:
            print("Info: Index doesn't contain entry length tags")

        print("Read dictionary index data")
        for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount):
            data = sect.loadSection(i)
            hdrinfo, ordt1, ordt2 = self.parseHeader(data)
            idxtPos = hdrinfo['start']
            entryCount = hdrinfo['count']
            idxPositions = [
                struct.unpack_from(b'>H', data, idxtPos + 4 + (2 * j))[0]
                for j in range(entryCount)
            ]
            # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
            idxPositions.append(idxtPos)

            for j in range(entryCount):
                startPos = idxPositions[j]
                endPos = idxPositions[j+1]
                # Each entry starts with a one byte text length followed by the text.
                textLength = ord(data[startPos:startPos+1])
                text = data[startPos+1:startPos+1+textLength]
                if hordt2 is not None:
                    text = self._decodeOrdtText(text, textLength, idxhdr['otype'], hordt2)

                tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos)
                if 0x01 not in tagMap:
                    # Without an entry start position there is nothing to emit.
                    continue

                if decodeInflection and 0x2a in tagMap:
                    inflectionGroups = self.getInflectionGroups(text, inflectionControlByteCount, inflectionTagTable,
                                                                dinfl, inflNameData, tagMap[0x2a])
                else:
                    inflectionGroups = b''
                assert len(tagMap[0x01]) == 1
                entryStartPosition = tagMap[0x01][0]

                if hasEntryLength:
                    # The idx:entry attribute "scriptable" must be present to create entry length tags.
                    ml = b'<idx:entry scriptable="yes"><idx:orth value="' + text + b'">' + inflectionGroups + b'</idx:orth>'
                    positionMap[entryStartPosition] = positionMap.get(entryStartPosition, b'') + ml
                    assert len(tagMap[0x02]) == 1
                    entryEndPosition = entryStartPosition + tagMap[0x02][0]
                    # The closing tag goes in front of anything already at that position.
                    positionMap[entryEndPosition] = b"</idx:entry>" + positionMap.get(entryEndPosition, b'')
                else:
                    # NOTE(review): unlike the branch above, no closing </idx:orth>
                    # is emitted here - confirm that this is intentional.
                    indexTags = b'<idx:entry>\n<idx:orth value="' + text + b'">\n' + inflectionGroups + b'</idx:entry>\n'
                    positionMap[entryStartPosition] = positionMap.get(entryStartPosition, b'') + indexTags
        return positionMap

    def hasTag(self, tagTable, tag):
        '''
        Test if tag table contains given tag.
        @param tagTable: The tag table (sequence of 4-tuples whose first item is the tag).
        @param tag: The tag to search.
        @return: True if tag table contains given tag; False otherwise.
        '''
        return any(currentTag == tag for currentTag, _, _, _ in tagTable)

    def getInflectionGroups(self, mainEntry, controlByteCount, tagTable, dinfl, inflectionNames, groupList):
        '''
        Create string which contains the inflection groups with inflection rules as mobipocket tags.
        @param mainEntry: The word to inflect.
        @param controlByteCount: The number of control bytes.
        @param tagTable: The tag table.
        @param dinfl: The InflectionData object used to select the right inflection data section.
        @param inflectionNames: The inflection rule name data.
        @param groupList: The list of inflection groups to process.
        @return: Bytes with inflection groups and rules or empty bytes if required tags are not available.
        '''
        parts = []
        for group in groupList:
            offset, nextOffset, data = dinfl.offsets(group)
            # First byte seems to be always 0x00 and must be skipped.
            assert ord(data[offset:offset+1]) == 0x00
            tagMap = getTagMap(controlByteCount, tagTable, data, offset + 1, nextOffset)

            # Make sure that the required tags are available.  The result is
            # concatenated with bytes by the caller, so it must be bytes here.
            if 0x05 not in tagMap:
                print("Error: Required tag 0x05 not found in tagMap")
                return b''
            if 0x1a not in tagMap:
                print("Error: Required tag 0x1a not found in tagMap")
                return b''

            parts.append(b'<idx:infl>')
            for i, nameOffset in enumerate(tagMap[0x05]):
                # Get name of inflection rule.
                consumed, textLength = getVariableWidthValue(inflectionNames, nameOffset)
                nameStart = nameOffset + consumed
                inflectionName = inflectionNames[nameStart:nameStart+textLength]

                # Get and apply inflection rule across possibly multiple inflection data sections
                rvalue, start, count, data = dinfl.lookup(tagMap[0x1a][i])
                offset, = struct.unpack_from(b'>H', data, start + 4 + (2 * rvalue))
                textLength = ord(data[offset:offset+1])
                inflection = self.applyInflectionRule(mainEntry, data, offset+1, offset+1+textLength)
                if inflection is not None:
                    parts.append(b' <idx:iform name="' + inflectionName + b'" value="' + inflection + b'"/>')
            parts.append(b'</idx:infl>')
        return b''.join(parts)

    def applyInflectionRule(self, mainEntry, inflectionRuleData, start, end):
        '''
        Apply inflection rule.
        @param mainEntry: The word to inflect.
        @param inflectionRuleData: The inflection rules.
        @param start: The start position of the inflection rule to use.
        @param end: The end position of the inflection rule to use.
        @return: The string with the inflected word or None if an error occurs.
        '''
        mode = -1
        byteArray = array.array(array_format, mainEntry)
        position = len(byteArray)
        for charOffset in range(start, end):
            char = inflectionRuleData[charOffset:charOffset+1]
            abyte = ord(char)
            if 0x0a <= abyte <= 0x13:
                # Move cursor backwards
                if mode not in (0x02, 0x03):
                    mode = 0x02
                    position = len(byteArray)
                position -= abyte - 0x0a
            elif abyte > 0x13:
                # A literal byte: insert or delete it according to the current mode.
                if mode == -1 or position == -1:
                    print("Error: Unexpected first byte %i of inflection rule" % abyte)
                    return None
                if mode == 0x01:
                    # Insert at word start
                    byteArray.insert(position, abyte)
                    position += 1
                elif mode == 0x02:
                    # Insert at word end
                    byteArray.insert(position, abyte)
                elif mode in (0x03, 0x04):
                    if mode == 0x03:
                        # Delete at word end: step back onto the byte to remove.
                        position -= 1
                    # For mode 0x04 (delete at word start) the cursor stays put.
                    deleted = byteArray.pop(position)
                    if bchr(deleted) != char:
                        if DEBUG_DICT:
                            print("0x%02x: %s %s %s %s" % (mode, mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted)))
                        print("Error: Delete operation of inflection rule failed")
                        return None
                else:
                    print("Error: Inflection rule mode %x is not implemented" % mode)
                    return None
            elif abyte in (0x01, 0x04):
                # Insert (0x01) or delete (0x04) at word start
                if mode not in (0x01, 0x04):
                    position = 0
                mode = abyte
            elif abyte in (0x02, 0x03):
                # Insert (0x02) or delete (0x03) at word end
                if mode not in (0x02, 0x03):
                    position = len(byteArray)
                mode = abyte
            else:
                print("Error: Inflection rule mode %x is not implemented" % abyte)
                return None

        # array.tostring() was removed in Python 3.9; tobytes() replaces it
        # but is not available on Python 2.
        if hasattr(byteArray, 'tobytes'):
            raw = byteArray.tobytes()
        else:
            raw = byteArray.tostring()
        return utf8_str(raw)