#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

from __future__ import unicode_literals, division, absolute_import, print_function

DEBUG_USE_ORDERED_DICTIONARY = False  # OrderedDict is supported in Python >= 2.7.
""" set to True to use OrderedDict for MobiHeader.metadata."""

if DEBUG_USE_ORDERED_DICTIONARY:
    from collections import OrderedDict as dict_
else:
    dict_ = dict

from .compatibility_utils import PY2, unicode_str, hexlify, bord

if PY2:
    range = xrange

import struct
import uuid

# import the mobiunpack support libraries
from .mobi_utils import getLanguage
from .mobi_uncompress import HuffcdicReader, PalmdocReader, UncompressedReader


class unpackException(Exception):
    pass


def sortedHeaderKeys(mheader):
    hdrkeys = sorted(list(mheader.keys()), key=lambda akey: mheader[akey][0])
    return hdrkeys
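
# ---------------------------------------------------------------------------
# Hedged illustration (not part of the original module): sortedHeaderKeys()
# orders field names by offset, i.e. the first element of each
# (offset, struct_format, length) tuple used in the header tables below.
# The tiny table here is made up purely for illustration.
def _example_sortedHeaderKeys():
    table = {
        'text_length': (0x04, b'>L', 4),
        'compression_type': (0x00, b'>H', 2),
        'text_records': (0x08, b'>H', 2),
    }
    # returns ['compression_type', 'text_length', 'text_records']
    return sortedHeaderKeys(table)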

# HD Containers have their own headers and their own EXTH
# this is just guesswork so far, making the big assumption that
# metavalue key numbers remain the same in the CONT EXTH
# Note: The layout of the CONT Header is still unknown
# so just deal with their EXTH sections for now

def dump_contexth(cpage, extheader):
    # determine text encoding
    codec = 'windows-1252'
    codec_map = {
        1252: 'windows-1252',
        65001: 'utf-8',
    }
    if cpage in codec_map:
        codec = codec_map[cpage]
    if extheader == b'':
        return
    id_map_strings = {
        1: 'Drm Server Id',
        2: 'Drm Commerce Id',
        3: 'Drm Ebookbase Book Id',
        4: 'Drm Ebookbase Dep Id',
        100: 'Creator',
        101: 'Publisher',
        102: 'Imprint',
        103: 'Description',
        104: 'ISBN',
        105: 'Subject',
        106: 'Published',
        107: 'Review',
        108: 'Contributor',
        109: 'Rights',
        110: 'SubjectCode',
        111: 'Type',
        112: 'Source',
        113: 'ASIN',
        # 114 : 'versionNumber',
        117: 'Adult',
        118: 'Retail-Price',
        119: 'Retail-Currency',
        120: 'TSC',
        122: 'fixed-layout',
        123: 'book-type',
        124: 'orientation-lock',
        126: 'original-resolution',
        127: 'zero-gutter',
        128: 'zero-margin',
        129: 'MetadataResourceURI',
        132: 'RegionMagnification',
        150: 'LendingEnabled',
        200: 'DictShortName',
        501: 'cdeType',
        502: 'last_update_time',
        503: 'Updated_Title',
        504: 'CDEContentKey',
        505: 'AmazonContentReference',
        506: 'Title-Language',
        507: 'Title-Display-Direction',
        508: 'Title-Pronunciation',
        509: 'Title-Collation',
        510: 'Secondary-Title',
        511: 'Secondary-Title-Language',
        512: 'Secondary-Title-Direction',
        513: 'Secondary-Title-Pronunciation',
        514: 'Secondary-Title-Collation',
        515: 'Author-Language',
        516: 'Author-Display-Direction',
        517: 'Author-Pronunciation',
        518: 'Author-Collation',
        519: 'Author-Type',
        520: 'Publisher-Language',
        521: 'Publisher-Display-Direction',
        522: 'Publisher-Pronunciation',
        523: 'Publisher-Collation',
        524: 'Content-Language-Tag',
        525: 'primary-writing-mode',
        526: 'NCX-Ingested-By-Software',
        527: 'page-progression-direction',
        528: 'override-kindle-fonts',
        529: 'Compression-Upgraded',
        530: 'Soft-Hyphens-In-Content',
        531: 'Dictionary_In_Langague',
        532: 'Dictionary_Out_Language',
        533: 'Font_Converted',
        534: 'Amazon_Creator_Info',
        535: 'Creator-Build-Tag',
        536: 'HD-Media-Containers-Info',  # CONT_Header is 0, Ends with CONTAINER_BOUNDARY (or Asset_Type?)
        538: 'Resource-Container-Fidelity',
        539: 'HD-Container-Mimetype',
        540: 'Sample-For_Special-Purpose',
        541: 'Kindletool-Operation-Information',
        542: 'Container_Id',
        543: 'Asset-Type',  # FONT_CONTAINER, BW_CONTAINER, HD_CONTAINER
        544: 'Unknown_544',
    }
    id_map_values = {
        114: 'versionNumber',
        115: 'sample',
        116: 'StartOffset',
        121: 'Mobi8-Boundary-Section',
        125: 'Embedded-Record-Count',
        130: 'Offline-Sample',
        131: 'Metadata-Record-Offset',
        201: 'CoverOffset',
        202: 'ThumbOffset',
        203: 'HasFakeCover',
        204: 'Creator-Software',
        205: 'Creator-Major-Version',
        206: 'Creator-Minor-Version',
        207: 'Creator-Build-Number',
        401: 'Clipping-Limit',
        402: 'Publisher-Limit',
        404: 'Text-to-Speech-Disabled',
        406: 'Rental-Expiration-Time',
    }
    id_map_hexstrings = {
        208: 'Watermark_(hex)',
        209: 'Tamper-Proof-Keys_(hex)',
        300: 'Font-Signature_(hex)',
        403: 'Unknown_(403)_(hex)',
        405: 'Ownership-Type_(hex)',
        407: 'Unknown_(407)_(hex)',
        420: 'Multimedia-Content-Reference_(hex)',
        450: 'Locations_Match_(hex)',
        451: 'Full-Story-Length_(hex)',
        452: 'Sample-Start_Location_(hex)',
        453: 'Sample-End-Location_(hex)',
    }
    _length, num_items = struct.unpack(b'>LL', extheader[4:12])
    extheader = extheader[12:]
    pos = 0
    for _ in range(num_items):
        id, size = struct.unpack(b'>LL', extheader[pos:pos+8])
        content = extheader[pos + 8: pos + size]
        if id in id_map_strings:
            name = id_map_strings[id]
            print('\n Key: "%s"\n Value: "%s"' % (name, content.decode(codec, errors='replace')))
        elif id in id_map_values:
            name = id_map_values[id]
            if size == 9:
                value, = struct.unpack(b'B', content)
                print('\n Key: "%s"\n Value: 0x%01x' % (name, value))
            elif size == 10:
                value, = struct.unpack(b'>H', content)
                print('\n Key: "%s"\n Value: 0x%02x' % (name, value))
            elif size == 12:
                value, = struct.unpack(b'>L', content)
                print('\n Key: "%s"\n Value: 0x%04x' % (name, value))
            else:
                print("\nError: Value for %s has unexpected size of %s" % (name, size))
        elif id in id_map_hexstrings:
            name = id_map_hexstrings[id]
            print('\n Key: "%s"\n Value: 0x%s' % (name, hexlify(content)))
        else:
            print("\nWarning: Unknown metadata with id %s found" % id)
            name = str(id) + ' (hex)'
            print(' Key: "%s"\n Value: 0x%s' % (name, hexlify(content)))
        pos += size
    return
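
# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the original tool): build a minimal EXTH
# blob by hand and feed it to dump_contexth() to show the layout the parser
# above expects -- a 4-byte magic, a 4-byte total length, a 4-byte record
# count, then records of (id, size, payload) where size includes the 8-byte
# record header.  The ids and payloads below are made up for illustration.
def _example_dump_contexth():
    records = b''
    for exth_id, payload in ((503, b'Example Title'), (543, b'HD_CONTAINER')):
        records += struct.pack(b'>LL', exth_id, len(payload) + 8) + payload
    exth = b'EXTH' + struct.pack(b'>LL', len(records) + 12, 2) + records
    dump_contexth(65001, exth)  # 65001 selects the utf-8 codec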

class MobiHeader:

    # all values are packed in big endian format
    palmdoc_header = {
        'compression_type': (0x00, b'>H', 2),
        'fill0': (0x02, b'>H', 2),
        'text_length': (0x04, b'>L', 4),
        'text_records': (0x08, b'>H', 2),
        'max_section_size': (0x0a, b'>H', 2),
        'read_pos': (0x0c, b'>L', 4),
    }

    mobi6_header = {
        'compression_type': (0x00, b'>H', 2),
        'fill0': (0x02, b'>H', 2),
        'text_length': (0x04, b'>L', 4),
        'text_records': (0x08, b'>H', 2),
        'max_section_size': (0x0a, b'>H', 2),
        'crypto_type': (0x0c, b'>H', 2),
        'fill1': (0x0e, b'>H', 2),
        'magic': (0x10, b'4s', 4),
        'header_length (from MOBI)': (0x14, b'>L', 4),
        'type': (0x18, b'>L', 4),
        'codepage': (0x1c, b'>L', 4),
        'unique_id': (0x20, b'>L', 4),
        'version': (0x24, b'>L', 4),
        'metaorthindex': (0x28, b'>L', 4),
        'metainflindex': (0x2c, b'>L', 4),
        'index_names': (0x30, b'>L', 4),
        'index_keys': (0x34, b'>L', 4),
        'extra_index0': (0x38, b'>L', 4),
        'extra_index1': (0x3c, b'>L', 4),
        'extra_index2': (0x40, b'>L', 4),
        'extra_index3': (0x44, b'>L', 4),
        'extra_index4': (0x48, b'>L', 4),
        'extra_index5': (0x4c, b'>L', 4),
        'first_nontext': (0x50, b'>L', 4),
        'title_offset': (0x54, b'>L', 4),
        'title_length': (0x58, b'>L', 4),
        'language_code': (0x5c, b'>L', 4),
        'dict_in_lang': (0x60, b'>L', 4),
        'dict_out_lang': (0x64, b'>L', 4),
        'min_version': (0x68, b'>L', 4),
        'first_resc_offset': (0x6c, b'>L', 4),
        'huff_offset': (0x70, b'>L', 4),
        'huff_num': (0x74, b'>L', 4),
        'huff_tbl_offset': (0x78, b'>L', 4),
        'huff_tbl_len': (0x7c, b'>L', 4),
        'exth_flags': (0x80, b'>L', 4),
        'fill3_a': (0x84, b'>L', 4),
        'fill3_b': (0x88, b'>L', 4),
        'fill3_c': (0x8c, b'>L', 4),
        'fill3_d': (0x90, b'>L', 4),
        'fill3_e': (0x94, b'>L', 4),
        'fill3_f': (0x98, b'>L', 4),
        'fill3_g': (0x9c, b'>L', 4),
        'fill3_h': (0xa0, b'>L', 4),
        'unknown0': (0xa4, b'>L', 4),
        'drm_offset': (0xa8, b'>L', 4),
        'drm_count': (0xac, b'>L', 4),
        'drm_size': (0xb0, b'>L', 4),
        'drm_flags': (0xb4, b'>L', 4),
        'fill4_a': (0xb8, b'>L', 4),
        'fill4_b': (0xbc, b'>L', 4),
        'first_content': (0xc0, b'>H', 2),
        'last_content': (0xc2, b'>H', 2),
        'unknown0': (0xc4, b'>L', 4),
        'fcis_offset': (0xc8, b'>L', 4),
        'fcis_count': (0xcc, b'>L', 4),
        'flis_offset': (0xd0, b'>L', 4),
        'flis_count': (0xd4, b'>L', 4),
        'unknown1': (0xd8, b'>L', 4),
        'unknown2': (0xdc, b'>L', 4),
        'srcs_offset': (0xe0, b'>L', 4),
        'srcs_count': (0xe4, b'>L', 4),
        'unknown3': (0xe8, b'>L', 4),
        'unknown4': (0xec, b'>L', 4),
        'fill5': (0xf0, b'>H', 2),
        'traildata_flags': (0xf2, b'>H', 2),
        'ncx_index': (0xf4, b'>L', 4),
        'unknown5': (0xf8, b'>L', 4),
        'unknown6': (0xfc, b'>L', 4),
        'datp_offset': (0x100, b'>L', 4),
        'unknown7': (0x104, b'>L', 4),
        'Unknown ': (0x108, b'>L', 4),
        'Unknown ': (0x10C, b'>L', 4),
        'Unknown ': (0x110, b'>L', 4),
        'Unknown ': (0x114, b'>L', 4),
        'Unknown ': (0x118, b'>L', 4),
        'Unknown ': (0x11C, b'>L', 4),
        'Unknown ': (0x120, b'>L', 4),
        'Unknown ': (0x124, b'>L', 4),
        'Unknown ': (0x128, b'>L', 4),
        'Unknown ': (0x12C, b'>L', 4),
        'Unknown ': (0x130, b'>L', 4),
        'Unknown ': (0x134, b'>L', 4),
        'Unknown ': (0x138, b'>L', 4),
        'Unknown ': (0x11C, b'>L', 4),
    }

    mobi8_header = {
        'compression_type': (0x00, b'>H', 2),
        'fill0': (0x02, b'>H', 2),
        'text_length': (0x04, b'>L', 4),
        'text_records': (0x08, b'>H', 2),
        'max_section_size': (0x0a, b'>H', 2),
        'crypto_type': (0x0c, b'>H', 2),
        'fill1': (0x0e, b'>H', 2),
        'magic': (0x10, b'4s', 4),
        'header_length (from MOBI)': (0x14, b'>L', 4),
        'type': (0x18, b'>L', 4),
        'codepage': (0x1c, b'>L', 4),
        'unique_id': (0x20, b'>L', 4),
        'version': (0x24, b'>L', 4),
        'metaorthindex': (0x28, b'>L', 4),
        'metainflindex': (0x2c, b'>L', 4),
        'index_names': (0x30, b'>L', 4),
        'index_keys': (0x34, b'>L', 4),
        'extra_index0': (0x38, b'>L', 4),
        'extra_index1': (0x3c, b'>L', 4),
        'extra_index2': (0x40, b'>L', 4),
        'extra_index3': (0x44, b'>L', 4),
        'extra_index4': (0x48, b'>L', 4),
        'extra_index5': (0x4c, b'>L', 4),
        'first_nontext': (0x50, b'>L', 4),
        'title_offset': (0x54, b'>L', 4),
        'title_length': (0x58, b'>L', 4),
        'language_code': (0x5c, b'>L', 4),
        'dict_in_lang': (0x60, b'>L', 4),
        'dict_out_lang': (0x64, b'>L', 4),
        'min_version': (0x68, b'>L', 4),
        'first_resc_offset': (0x6c, b'>L', 4),
        'huff_offset': (0x70, b'>L', 4),
        'huff_num': (0x74, b'>L', 4),
        'huff_tbl_offset': (0x78, b'>L', 4),
        'huff_tbl_len': (0x7c, b'>L', 4),
        'exth_flags': (0x80, b'>L', 4),
        'fill3_a': (0x84, b'>L', 4),
        'fill3_b': (0x88, b'>L', 4),
        'fill3_c': (0x8c, b'>L', 4),
        'fill3_d': (0x90, b'>L', 4),
        'fill3_e': (0x94, b'>L', 4),
        'fill3_f': (0x98, b'>L', 4),
        'fill3_g': (0x9c, b'>L', 4),
        'fill3_h': (0xa0, b'>L', 4),
        'unknown0': (0xa4, b'>L', 4),
        'drm_offset': (0xa8, b'>L', 4),
        'drm_count': (0xac, b'>L', 4),
        'drm_size': (0xb0, b'>L', 4),
        'drm_flags': (0xb4, b'>L', 4),
        'fill4_a': (0xb8, b'>L', 4),
        'fill4_b': (0xbc, b'>L', 4),
        'fdst_offset': (0xc0, b'>L', 4),
        'fdst_flow_count': (0xc4, b'>L', 4),
        'fcis_offset': (0xc8, b'>L', 4),
        'fcis_count': (0xcc, b'>L', 4),
        'flis_offset': (0xd0, b'>L', 4),
        'flis_count': (0xd4, b'>L', 4),
        'unknown1': (0xd8, b'>L', 4),
        'unknown2': (0xdc, b'>L', 4),
        'srcs_offset': (0xe0, b'>L', 4),
        'srcs_count': (0xe4, b'>L', 4),
        'unknown3': (0xe8, b'>L', 4),
        'unknown4': (0xec, b'>L', 4),
        'fill5': (0xf0, b'>H', 2),
        'traildata_flags': (0xf2, b'>H', 2),
        'ncx_index': (0xf4, b'>L', 4),
        'fragment_index': (0xf8, b'>L', 4),
        'skeleton_index': (0xfc, b'>L', 4),
        'datp_offset': (0x100, b'>L', 4),
        'guide_index': (0x104, b'>L', 4),
        'Unknown ': (0x108, b'>L', 4),
        'Unknown ': (0x10C, b'>L', 4),
        'Unknown ': (0x110, b'>L', 4),
        'Unknown ': (0x114, b'>L', 4),
        'Unknown ': (0x118, b'>L', 4),
        'Unknown ': (0x11C, b'>L', 4),
        'Unknown ': (0x120, b'>L', 4),
        'Unknown ': (0x124, b'>L', 4),
        'Unknown ': (0x128, b'>L', 4),
        'Unknown ': (0x12C, b'>L', 4),
        'Unknown ': (0x130, b'>L', 4),
        'Unknown ': (0x134, b'>L', 4),
        'Unknown ': (0x138, b'>L', 4),
        'Unknown ': (0x11C, b'>L', 4),
    }

    palmdoc_header_sorted_keys = sortedHeaderKeys(palmdoc_header)
    mobi6_header_sorted_keys = sortedHeaderKeys(mobi6_header)
    mobi8_header_sorted_keys = sortedHeaderKeys(mobi8_header)

    id_map_strings = {
        1: 'Drm Server Id',
        2: 'Drm Commerce Id',
        3: 'Drm Ebookbase Book Id',
        4: 'Drm Ebookbase Dep Id',
        100: 'Creator',
        101: 'Publisher',
        102: 'Imprint',
        103: 'Description',
        104: 'ISBN',
        105: 'Subject',
        106: 'Published',
        107: 'Review',
        108: 'Contributor',
        109: 'Rights',
        110: 'SubjectCode',
        111: 'Type',
        112: 'Source',
        113: 'ASIN',
        # 114 : 'versionNumber',
        117: 'Adult',
        118: 'Retail-Price',
        119: 'Retail-Currency',
        120: 'TSC',
        122: 'fixed-layout',
        123: 'book-type',
        124: 'orientation-lock',
        126: 'original-resolution',
        127: 'zero-gutter',
        128: 'zero-margin',
        129: 'MetadataResourceURI',
        132: 'RegionMagnification',
        150: 'LendingEnabled',
        200: 'DictShortName',
        501: 'cdeType',
        502: 'last_update_time',
        503: 'Updated_Title',
        504: 'CDEContentKey',
        505: 'AmazonContentReference',
        506: 'Title-Language',
        507: 'Title-Display-Direction',
        508: 'Title-Pronunciation',
        509: 'Title-Collation',
        510: 'Secondary-Title',
        511: 'Secondary-Title-Language',
        512: 'Secondary-Title-Direction',
        513: 'Secondary-Title-Pronunciation',
        514: 'Secondary-Title-Collation',
        515: 'Author-Language',
        516: 'Author-Display-Direction',
        517: 'Author-Pronunciation',
        518: 'Author-Collation',
        519: 'Author-Type',
        520: 'Publisher-Language',
        521: 'Publisher-Display-Direction',
        522: 'Publisher-Pronunciation',
        523: 'Publisher-Collation',
        524: 'Content-Language-Tag',
        525: 'primary-writing-mode',
        526: 'NCX-Ingested-By-Software',
        527: 'page-progression-direction',
        528: 'override-kindle-fonts',
        529: 'Compression-Upgraded',
        530: 'Soft-Hyphens-In-Content',
        531: 'Dictionary_In_Langague',
        532: 'Dictionary_Out_Language',
        533: 'Font_Converted',
        534: 'Amazon_Creator_Info',
        535: 'Creator-Build-Tag',
        536: 'HD-Media-Containers-Info',  # CONT_Header is 0, Ends with CONTAINER_BOUNDARY (or Asset_Type?)
        538: 'Resource-Container-Fidelity',
        539: 'HD-Container-Mimetype',
        540: 'Sample-For_Special-Purpose',
        541: 'Kindletool-Operation-Information',
        542: 'Container_Id',
        543: 'Asset-Type',  # FONT_CONTAINER, BW_CONTAINER, HD_CONTAINER
        544: 'Unknown_544',
    }
    id_map_values = {
        114: 'versionNumber',
        115: 'sample',
        116: 'StartOffset',
        121: 'Mobi8-Boundary-Section',
        125: 'Embedded-Record-Count',
        130: 'Offline-Sample',
        131: 'Metadata-Record-Offset',
        201: 'CoverOffset',
        202: 'ThumbOffset',
        203: 'HasFakeCover',
        204: 'Creator-Software',
        205: 'Creator-Major-Version',
        206: 'Creator-Minor-Version',
        207: 'Creator-Build-Number',
        401: 'Clipping-Limit',
        402: 'Publisher-Limit',
        404: 'Text-to-Speech-Disabled',
        406: 'Rental-Expiration-Time',
    }
    id_map_hexstrings = {
        208: 'Watermark_(hex)',
        209: 'Tamper-Proof-Keys_(hex)',
        300: 'Font-Signature_(hex)',
        403: 'Unknown_(403)_(hex)',
        405: 'Ownership-Type_(hex)',
        407: 'Unknown_(407)_(hex)',
        420: 'Multimedia-Content-Reference_(hex)',
        450: 'Locations_Match_(hex)',
        451: 'Full-Story-Length_(hex)',
        452: 'Sample-Start_Location_(hex)',
        453: 'Sample-End-Location_(hex)',
    }

    def __init__(self, sect, sectNumber):
        self.sect = sect
        self.start = sectNumber
        self.header = self.sect.loadSection(self.start)
        if len(self.header) > 20 and self.header[16:20] == b'MOBI':
            self.sect.setsectiondescription(0, "Mobipocket Header")
            self.palm = False
        elif self.sect.ident == b'TEXtREAd':
            self.sect.setsectiondescription(0, "PalmDOC Header")
            self.palm = True
        else:
            raise unpackException('Unknown File Format')

        self.records, = struct.unpack_from(b'>H', self.header, 0x8)

        # set defaults in case this is a PalmDOC
        self.title = self.sect.palmname.decode('latin-1', errors='replace')
        self.length = len(self.header) - 16
        self.type = 3
        self.codepage = 1252
        self.codec = 'windows-1252'
        self.unique_id = 0
        self.version = 0
        self.hasExth = False
        self.exth = b''
        self.exth_offset = self.length + 16
        self.exth_length = 0
        self.crypto_type = 0
        self.firstnontext = self.start + self.records + 1
        self.firstresource = self.start + self.records + 1
        self.ncxidx = 0xffffffff
        self.metaOrthIndex = 0xffffffff
        self.metaInflIndex = 0xffffffff
        self.skelidx = 0xffffffff
        self.fragidx = 0xffffffff
        self.guideidx = 0xffffffff
        self.fdst = 0xffffffff
        self.mlstart = self.sect.loadSection(self.start+1)[:4]
        self.rawSize = 0
        self.metadata = dict_()

        # set up for decompression/unpacking
        self.compression, = struct.unpack_from(b'>H', self.header, 0x0)
        if self.compression == 0x4448:
            reader = HuffcdicReader()
            huffoff, huffnum = struct.unpack_from(b'>LL', self.header, 0x70)
            huffoff = huffoff + self.start
            self.sect.setsectiondescription(huffoff, "Huffman Compression Seed")
            reader.loadHuff(self.sect.loadSection(huffoff))
            for i in range(1, huffnum):
                self.sect.setsectiondescription(huffoff+i, "Huffman CDIC Compression Seed %d" % i)
                reader.loadCdic(self.sect.loadSection(huffoff+i))
            self.unpack = reader.unpack
        elif self.compression == 2:
            self.unpack = PalmdocReader().unpack
        elif self.compression == 1:
            self.unpack = UncompressedReader().unpack
        else:
            raise unpackException('invalid compression type: 0x%4x' % self.compression)

        if self.palm:
            return

        self.length, self.type, self.codepage, self.unique_id, self.version = struct.unpack(b'>LLLLL', self.header[20:40])
        codec_map = {
            1252: 'windows-1252',
            65001: 'utf-8',
        }
        if self.codepage in codec_map:
            self.codec = codec_map[self.codepage]

        # title
        toff, tlen = struct.unpack(b'>II', self.header[0x54:0x5c])
        tend = toff + tlen
        self.title = self.header[toff:tend].decode(self.codec, errors='replace')
        exth_flag, = struct.unpack(b'>L', self.header[0x80:0x84])
        self.hasExth = exth_flag & 0x40
        self.exth_offset = self.length + 16
        self.exth_length = 0
        if self.hasExth:
            self.exth_length, = struct.unpack_from(b'>L', self.header, self.exth_offset+4)
            self.exth_length = ((self.exth_length + 3)>>2)<<2  # round to next 4 byte boundary
            self.exth = self.header[self.exth_offset:self.exth_offset+self.exth_length]

        # parse the exth / metadata
        self.parseMetaData()

        # self.mlstart = self.sect.loadSection(self.start+1)
        # self.mlstart = self.mlstart[0:4]
        self.crypto_type, = struct.unpack_from(b'>H', self.header, 0xC)

        # Start sector for additional files such as images, fonts, resources, etc
        # Can be missing so fall back to default set previously
        ofst, = struct.unpack_from(b'>L', self.header, 0x6C)
        if ofst != 0xffffffff:
            self.firstresource = ofst + self.start
        ofst, = struct.unpack_from(b'>L', self.header, 0x50)
        if ofst != 0xffffffff:
            self.firstnontext = ofst + self.start

        if self.isPrintReplica():
            return

        if self.version < 8:
            # Dictionary metaOrthIndex
            self.metaOrthIndex, = struct.unpack_from(b'>L', self.header, 0x28)
            if self.metaOrthIndex != 0xffffffff:
                self.metaOrthIndex += self.start

            # Dictionary metaInflIndex
            self.metaInflIndex, = struct.unpack_from(b'>L', self.header, 0x2C)
            if self.metaInflIndex != 0xffffffff:
                self.metaInflIndex += self.start

        # handle older headers without any ncx index info, and later
        # specifically the 0xe4-length headers
        if self.length + 16 < 0xf8:
            return

        # NCX Index
        self.ncxidx, = struct.unpack(b'>L', self.header[0xf4:0xf8])
        if self.ncxidx != 0xffffffff:
            self.ncxidx += self.start

        # K8 specific Indexes
        if self.start != 0 or self.version == 8:
            # Index into file skeletons in RawML
            self.skelidx, = struct.unpack_from(b'>L', self.header, 0xfc)
            if self.skelidx != 0xffffffff:
                self.skelidx += self.start
            # Index into fragment sections in RawML
            self.fragidx, = struct.unpack_from(b'>L', self.header, 0xf8)
            if self.fragidx != 0xffffffff:
                self.fragidx += self.start

            # Index into Other files
            self.guideidx, = struct.unpack_from(b'>L', self.header, 0x104)
            if self.guideidx != 0xffffffff:
                self.guideidx += self.start

            # dictionaries do not seem to use the same approach in K8's
            # so disable them
            self.metaOrthIndex = 0xffffffff
            self.metaInflIndex = 0xffffffff

            # need to use the FDST record to find out how to properly unpack
            # the rawML into pieces
            # it is simply a table of start and end locations for each flow piece
            self.fdst, = struct.unpack_from(b'>L', self.header, 0xc0)
            self.fdstcnt, = struct.unpack_from(b'>L', self.header, 0xc4)
            # if cnt is 1 or less, fdst section number can be garbage
            if self.fdstcnt <= 1:
                self.fdst = 0xffffffff
            if self.fdst != 0xffffffff:
                self.fdst += self.start
            # setting of fdst section description properly handled in mobi_kf8proc

    def dump_exth(self):
        # determine text encoding
        codec = self.codec
        if (not self.hasExth) or (self.exth_length) == 0 or (self.exth == b''):
            return
        num_items, = struct.unpack(b'>L', self.exth[8:12])
        pos = 12
        print("Key Size Description Value")
        for _ in range(num_items):
            id, size = struct.unpack(b'>LL', self.exth[pos:pos+8])
            contentsize = size - 8
            content = self.exth[pos + 8: pos + size]
            if id in MobiHeader.id_map_strings:
                exth_name = MobiHeader.id_map_strings[id]
                print('{0: >3d} {1: >4d} {2: <30s} {3:s}'.format(id, contentsize, exth_name, content.decode(codec, errors='replace')))
            elif id in MobiHeader.id_map_values:
                exth_name = MobiHeader.id_map_values[id]
                if size == 9:
                    value, = struct.unpack(b'B', content)
                    print('{0:3d} byte {1:<30s} {2:d}'.format(id, exth_name, value))
                elif size == 10:
                    value, = struct.unpack(b'>H', content)
                    print('{0:3d} word {1:<30s} 0x{2:0>4X} ({2:d})'.format(id, exth_name, value))
                elif size == 12:
                    value, = struct.unpack(b'>L', content)
                    print('{0:3d} long {1:<30s} 0x{2:0>8X} ({2:d})'.format(id, exth_name, value))
                else:
                    print('{0: >3d} {1: >4d} {2: <30s} (0x{3:s})'.format(id, contentsize, "Bad size for "+exth_name, hexlify(content)))
            elif id in MobiHeader.id_map_hexstrings:
                exth_name = MobiHeader.id_map_hexstrings[id]
                print('{0:3d} {1:4d} {2:<30s} 0x{3:s}'.format(id, contentsize, exth_name, hexlify(content)))
            else:
                exth_name = "Unknown EXTH ID {0:d}".format(id)
                print("{0: >3d} {1: >4d} {2: <30s} 0x{3:s}".format(id, contentsize, exth_name, hexlify(content)))
            pos += size
        return

    def dumpheader(self):
        # first 16 bytes are not part of the official mobiheader
        # but we will treat it as such
        # so section 0 is 16 (decimal) + self.length in total == at least 0x108 bytes for Mobi 8 headers
        print("Dumping section %d, Mobipocket Header version: %d, total length %d" % (self.start, self.version, self.length+16))
        self.hdr = {}
        # set it up for the proper header version
        if self.version == 0:
            self.mobi_header = MobiHeader.palmdoc_header
            self.mobi_header_sorted_keys = MobiHeader.palmdoc_header_sorted_keys
        elif self.version < 8:
            self.mobi_header = MobiHeader.mobi6_header
            self.mobi_header_sorted_keys = MobiHeader.mobi6_header_sorted_keys
        else:
            self.mobi_header = MobiHeader.mobi8_header
            self.mobi_header_sorted_keys = MobiHeader.mobi8_header_sorted_keys

        # parse the header information
        for key in self.mobi_header_sorted_keys:
            (pos, format, tot_len) = self.mobi_header[key]
            if pos < (self.length + 16):
                val, = struct.unpack_from(format, self.header, pos)
                self.hdr[key] = val

        if 'title_offset' in self.hdr:
            title_offset = self.hdr['title_offset']
            title_length = self.hdr['title_length']
        else:
            title_offset = 0
            title_length = 0

        if title_offset == 0:
            title_offset = len(self.header)
            title_length = 0
            self.title = self.sect.palmname.decode('latin-1', errors='replace')
        else:
            self.title = self.header[title_offset:title_offset+title_length].decode(self.codec, errors='replace')
            # title record always padded with two nul bytes and then padded with nuls to next 4 byte boundary
            title_length = ((title_length+2+3)>>2)<<2

        self.extra1 = self.header[self.exth_offset+self.exth_length:title_offset]
        self.extra2 = self.header[title_offset+title_length:]

        print("Mobipocket header from section %d" % self.start)
        print(" Offset Value Hex Dec Description")
        for key in self.mobi_header_sorted_keys:
            (pos, format, tot_len) = self.mobi_header[key]
            if pos < (self.length + 16):
                if key != 'magic':
                    fmt_string = "0x{0:0>3X} ({0:3d}){1: >" + str(9-2*tot_len) + "s}0x{2:0>" + str(2*tot_len) + "X} {2:10d} {3:s}"
                else:
                    self.hdr[key] = unicode_str(self.hdr[key])
                    fmt_string = "0x{0:0>3X} ({0:3d}){2:>11s} {3:s}"
                print(fmt_string.format(pos, " ", self.hdr[key], key))
        print("")

        if self.exth_length > 0:
            print("EXTH metadata, offset %d, padded length %d" % (self.exth_offset, self.exth_length))
            self.dump_exth()
            print("")

        if len(self.extra1) > 0:
            print("Extra data between EXTH and Title, length %d" % len(self.extra1))
            print(hexlify(self.extra1))
            print("")

        if title_length > 0:
            print("Title in header at offset %d, padded length %d: '%s'" % (title_offset, title_length, self.title))
            print("")

        if len(self.extra2) > 0:
            print("Extra data between Title and end of header, length %d" % len(self.extra2))
            print(hexlify(self.extra2))
            print("")

    def isPrintReplica(self):
        return self.mlstart[0:4] == b"%MOP"

    def isK8(self):
        return self.start != 0 or self.version == 8

    def isEncrypted(self):
        return self.crypto_type != 0

    def hasNCX(self):
        return self.ncxidx != 0xffffffff

    def isDictionary(self):
        return self.metaOrthIndex != 0xffffffff

    def getncxIndex(self):
        return self.ncxidx

    def decompress(self, data):
        return self.unpack(data)

    def Language(self):
        langcode = struct.unpack(b'!L', self.header[0x5c:0x60])[0]
        langid = langcode & 0xFF
        sublangid = (langcode >> 8) & 0xFF
        return getLanguage(langid, sublangid)

    def DictInLanguage(self):
        if self.isDictionary():
            langcode = struct.unpack(b'!L', self.header[0x60:0x64])[0]
            langid = langcode & 0xFF
            sublangid = (langcode >> 10) & 0xFF
            if langid != 0:
                return getLanguage(langid, sublangid)
        return False

    def DictOutLanguage(self):
        if self.isDictionary():
            langcode = struct.unpack(b'!L', self.header[0x64:0x68])[0]
            langid = langcode & 0xFF
            sublangid = (langcode >> 10) & 0xFF
            if langid != 0:
                return getLanguage(langid, sublangid)
        return False

    def getRawML(self):
        def getSizeOfTrailingDataEntry(data):
            num = 0
            for v in data[-4:]:
                if bord(v) & 0x80:
                    num = 0
                num = (num << 7) | (bord(v) & 0x7f)
            return num

        def trimTrailingDataEntries(data):
            for _ in range(trailers):
                num = getSizeOfTrailingDataEntry(data)
                data = data[:-num]
            if multibyte:
                num = (ord(data[-1:]) & 3) + 1
                data = data[:-num]
            return data

        multibyte = 0
        trailers = 0
        if self.sect.ident == b'BOOKMOBI':
            mobi_length, = struct.unpack_from(b'>L', self.header, 0x14)
            mobi_version, = struct.unpack_from(b'>L', self.header, 0x68)
            if (mobi_length >= 0xE4) and (mobi_version >= 5):
                flags, = struct.unpack_from(b'>H', self.header, 0xF2)
                multibyte = flags & 1
                while flags > 1:
                    if flags & 2:
                        trailers += 1
                    flags = flags >> 1

        # get raw mobi markup language
        print("Unpacking raw markup language")
        dataList = []
        # offset = 0
        for i in range(1, self.records+1):
            data = trimTrailingDataEntries(self.sect.loadSection(self.start + i))
            dataList.append(self.unpack(data))
            if self.isK8():
                self.sect.setsectiondescription(self.start + i, "KF8 Text Section {0:d}".format(i))
            elif self.version == 0:
                self.sect.setsectiondescription(self.start + i, "PalmDOC Text Section {0:d}".format(i))
            else:
                self.sect.setsectiondescription(self.start + i, "Mobipocket Text Section {0:d}".format(i))
        rawML = b''.join(dataList)
        self.rawSize = len(rawML)
        return rawML

    # all metadata is stored in a dictionary keyed by name, and each entry holds a *list* of values
    # a list is used to allow for multiple creators, multiple contributors, etc
    def parseMetaData(self):
        def addValue(name, value):
            if name not in self.metadata:
                self.metadata[name] = [value]
            else:
                self.metadata[name].append(value)

        codec = self.codec
        if self.hasExth:
            extheader = self.exth
            _length, num_items = struct.unpack(b'>LL', extheader[4:12])
            extheader = extheader[12:]
            pos = 0
            for _ in range(num_items):
                id, size = struct.unpack(b'>LL', extheader[pos:pos+8])
                content = extheader[pos + 8: pos + size]
                if id in MobiHeader.id_map_strings:
                    name = MobiHeader.id_map_strings[id]
                    addValue(name, content.decode(codec, errors='replace'))
                elif id in MobiHeader.id_map_values:
                    name = MobiHeader.id_map_values[id]
                    if size == 9:
                        value, = struct.unpack(b'B', content)
                        addValue(name, unicode_str(str(value)))
                    elif size == 10:
                        value, = struct.unpack(b'>H', content)
                        addValue(name, unicode_str(str(value)))
                    elif size == 12:
                        value, = struct.unpack(b'>L', content)
                        # handle special case of missing CoverOffset or missing ThumbOffset
                        if id == 201 or id == 202:
                            if value != 0xffffffff:
                                addValue(name, unicode_str(str(value)))
                        else:
                            addValue(name, unicode_str(str(value)))
                    else:
                        print("Warning: Bad key, size, value combination detected in EXTH ", id, size, hexlify(content))
                        addValue(name, hexlify(content))
                elif id in MobiHeader.id_map_hexstrings:
                    name = MobiHeader.id_map_hexstrings[id]
                    addValue(name, hexlify(content))
                else:
                    name = unicode_str(str(id)) + ' (hex)'
                    addValue(name, hexlify(content))
                pos += size

        # add the basics to the metadata, each as a list element
        self.metadata['Language'] = [self.Language()]
        self.metadata['Title'] = [unicode_str(self.title, self.codec)]
        self.metadata['Codec'] = [self.codec]
        self.metadata['UniqueID'] = [unicode_str(str(self.unique_id))]
        # if no ASIN, create one using a uuid
        if 'ASIN' not in self.metadata:
            self.metadata['ASIN'] = [unicode_str(str(uuid.uuid4()))]
        # if no cdeType, set it to "EBOK"
        if 'cdeType' not in self.metadata:
            self.metadata['cdeType'] = ['EBOK']

    def getMetaData(self):
        return self.metadata

    def describeHeader(self, DUMP):
        print("Mobi Version:", self.version)
        print("Codec:", self.codec)
        print("Title:", self.title)
        if 'Updated_Title' in self.metadata:
            print("EXTH Title:", self.metadata['Updated_Title'][0])
        if self.compression == 0x4448:
            print("Huffdic compression")
        elif self.compression == 2:
            print("Palmdoc compression")
        elif self.compression == 1:
            print("No compression")
        if DUMP:
            self.dumpheader()
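
# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the original module): roughly how the
# KindleUnpack driver code exercises MobiHeader.  It assumes the companion
# Sectionizer class in mobi_sectioner provides the interface used above
# (loadSection, setsectiondescription, ident, palmname) and accepts a file
# path; check that module before relying on this helper.
def _example_describe_first_header(infile):
    from .mobi_sectioner import Sectionizer   # assumed constructor: Sectionizer(path)
    sect = Sectionizer(infile)
    mh = MobiHeader(sect, 0)       # the MOBI/PalmDOC header lives in PalmDB section 0
    mh.describeHeader(DUMP=True)   # print the parsed header fields and EXTH metadata
    rawml = mh.getRawML()          # decompress and join the text records
    return mh.getMetaData(), rawml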