aboutsummaryrefslogtreecommitdiffstats
path: root/src/epy_reader/tools/KindleUnpack/kindleunpack.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/epy_reader/tools/KindleUnpack/kindleunpack.py')
-rw-r--r--src/epy_reader/tools/KindleUnpack/kindleunpack.py1029
1 files changed, 1029 insertions, 0 deletions
diff --git a/src/epy_reader/tools/KindleUnpack/kindleunpack.py b/src/epy_reader/tools/KindleUnpack/kindleunpack.py
new file mode 100644
index 0000000..317941a
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/kindleunpack.py
@@ -0,0 +1,1029 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+import os
+
+__path__ = ["lib", os.path.dirname(os.path.realpath(__file__)), "kindleunpack"]
+
+import sys
+import codecs
+import traceback
+
+from .compatibility_utils import PY2, binary_type, utf8_str, unicode_str
+from .compatibility_utils import unicode_argv, add_cp65001_codec
+from .compatibility_utils import hexlify
+
+add_cp65001_codec()
+
+from .unipath import pathof
+
+if PY2:
+ range = xrange
+ # since will be printing unicode under python 2 need to protect
+ # against sys.stdout.encoding being None stupidly forcing forcing ascii encoding
+ if sys.stdout.encoding is None:
+ sys.stdout = codecs.getwriter("utf-8")(sys.stdout)
+ else:
+ encoding = sys.stdout.encoding
+ sys.stdout = codecs.getwriter(encoding)(sys.stdout)
+
+# Changelog
+# 0.11 - Version by adamselene
+# 0.11pd - Tweaked version by pdurrant
+# 0.12 - extracts pictures too, and all into a folder.
+# 0.13 - added back in optional output dir for those who don't want it based on infile
+# 0.14 - auto flush stdout and wrapped in main, added proper return codes
+# 0.15 - added support for metadata
+# 0.16 - metadata now starting to be output as an opf file (PD)
+# 0.17 - Also created tweaked text as source for Mobipocket Creator
+# 0.18 - removed raw mobi file completely but kept _meta.html file for ease of conversion
+# 0.19 - added in metadata for ASIN, Updated Title and Rights to the opf
+# 0.20 - remove _meta.html since no longer needed
+# 0.21 - Fixed some typos in the opf output, and also updated handling
+# of test for trailing data/multibyte characters
+# 0.22 - Fixed problem with > 9 images
+# 0.23 - Now output Start guide item
+# 0.24 - Set firstaddl value for 'TEXtREAd'
+# 0.25 - Now added character set metadata to html file for utf-8 files.
+# 0.26 - Dictionary support added. Image handling speed improved.
+# For huge files create temp files to speed up decoding.
+# Language decoding fixed. Metadata is now converted to utf-8 when written to opf file.
+# 0.27 - Add idx:entry attribute "scriptable" if dictionary contains entry length tags.
+# Don't save non-image sections as images. Extract and save source zip file
+# included by kindlegen as kindlegensrc.zip.
+# 0.28 - Added back correct image file name extensions, created FastConcat class to simplify and clean up
+# 0.29 - Metadata handling reworked, multiple entries of the same type are now supported.
+# Several missing types added.
+# FastConcat class has been removed as in-memory handling with lists is faster, even for huge files.
+# 0.30 - Add support for outputting **all** metadata values - encode content with hex if of unknown type
+# 0.31 - Now supports Print Replica ebooks, outputting PDF and mysterious data sections
+# 0.32 - Now supports NCX file extraction/building.
+# Overhauled the structure of mobiunpack to be more class oriented.
+# 0.33 - Split Classes ito separate files and added prelim support for KF8 format eBooks
+# 0.34 - Improved KF8 support, guide support, bug fixes
+# 0.35 - Added splitting combo mobi7/mobi8 into standalone mobi7 and mobi8 files
+# Also handle mobi8-only file properly
+# 0.36 - very minor changes to support KF8 mobis with no flow items, no ncx, etc
+# 0.37 - separate output, add command line switches to control, interface to Mobi_Unpack.pyw
+# 0.38 - improve split function by resetting flags properly, fix bug in Thumbnail Images
+# 0.39 - improve split function so that ToC info is not lost for standalone mobi8s
+# 0.40 - make mobi7 split match official versions, add support for graphic novel metadata,
+# improve debug for KF8
+# 0.41 - fix when StartOffset set to 0xffffffff, fix to work with older mobi versions,
+# fix other minor metadata issues
+# 0.42 - add new class interface to allow it to integrate more easily with internal calibre routines
+# 0.43 - bug fixes for new class interface
+# 0.44 - more bug fixes and fix for potnetial bug caused by not properly closing created zip archive
+# 0.45 - sync to version in the new Mobi_Unpack plugin
+# 0.46 - fixes for: obfuscated fonts, improper toc links and ncx, add support for opentype fonts
+# 0.47 - minor opf improvements
+# 0.48 - ncx link fixes
+# 0.49 - use azw3 when splitting mobis
+# 0.50 - unknown change
+# 0.51 - fix for converting filepos links to hrefs, Added GPL3 notice, made KF8 extension just '.azw3'
+# 0.52 - fix for cover metadata (no support for Mobipocket Creator)
+# 0.53 - fix for proper identification of embedded fonts, added new metadata items
+# 0.54 - Added error-handling so wonky embedded fonts don't bomb the whole unpack process,
+# entity escape KF8 metadata to ensure valid OPF.
+# 0.55 Strip extra StartOffset EXTH from the mobi8 header when splitting, keeping only the relevant one
+# For mobi8 files, don't generate duplicate guide entries from the metadata if we could extract one
+# from the OTH table.
+# 0.56 - Added further entity escaping of OPF text.
+# Allow unicode string file paths to be passed as arguments to the unpackBook method without blowing up later
+# when the attempt to "re"-unicode a portion of that filename occurs in the process_all_mobi_headers method.
+# 0.57 - Fixed eror when splitting Preview files downloaded from KDP website
+# 0.58 - Output original kindlegen build log ('CMET' record) if included in the package.
+# 0.58 - Include and extend functionality of DumpMobiHeader, replacing DEBUG with DUMP
+# 0.59 - Much added DUMP functionality, including full dumping and descriptions of sections
+# 0.60 - Bug fixes in opf, div tables, bad links, page breaks, section descriptions
+# - plus a number of other bug fixed that were found by Sergey Dubinets
+# - fixs for file/paths that require full unicode to work properly
+# - replace subprocess with multiprocessing to remove need for unbuffered stdout
+# 0.61 - renamed to be KindleUnpack and more unicode/utf-8 path bug fixes and other minor fixes
+# 0.62 - fix for multiprocessing on Windows, split fixes, opf improvements
+# 0.63 - Modified to process right to left page progression books properly.
+# - Added some id_map_strings and RESC section processing; metadata and
+# - spine in the RESC are integrated partly to content.opf.
+# 0.63a- Separated K8 RESC processor to an individual file. Bug fixes. Added cover page creation.
+# 0.64 - minor bug fixes to more properly handle unicode command lines, and support for more jpeg types
+# 0.64a- Modifed to handle something irregular mobi and azw3 files.
+# 0.64b- Modifed to create k8resc.spine for no RECS files.
+# 0.65 - Bug fixes to shorten title and remove epub3 "properties" to make the output epub2 compliant
+# 0.65a- Bug fixes to extract RESC section correctly, to prevent item id confliction
+# - and to process multiline comments in RESC.
+# 0.66 - Bug fix to deal with missing first resource information sometimes generated by calibre
+# 0.66a- Fixed minor bugs, which probably do not affect the output anything
+# 0.67 - Fixed Mobi Split functionality bug with azw3 images not being properly copied
+# 0.68 - preliminary support for handling PAGE sections to create page-map.xml
+# 0.69 - preliminary support for CONT and CRES for HD Images
+# 0.70 - preliminary support for decoding apnx files when used with azw3 ebooks
+# 0.71 - extensive refactoring of kindleunpack.py to make it more manageable
+# 0.72 - many bug fixes from tkeo: fix pageProcessing, fix print replica, fix resc usage, fix font mangling, etc.
+# 0.72a- fix for still broken PrintReplica support
+# 0.72b- preview for primary epub3 support. A parameter epubver(default='2') is added to process_all_mobi_headers(), unpackBook().
+# 0.72c- preview for apnx page support
+# 0.72d- more bugs fixed in preview features, much improved GUI with ability to dynaically grow the Log Window with preference support
+# 0.72e- more bug fixes, Tk GUI adds support for epub version and HDImage use
+# 0.72f- more bug fixes, implement use hd images if present
+# 0.72g- minor bug fixes and cleanups from tkeo
+# 0.72h- updated mobi_header and mobi_k8proc to use the correct fragment and guide terms in place of div and other
+# to better match the terms that both Calibre and Amazon use internally to their own software
+# 0.72x- very experimental conversion to use new mobi_k8resc.py and some of its associated changes
+# 0.72y- more changes to simplify and integrate in epub3 support in a simpler manner
+# 0.72z- remove redundancy in mobi_opf.py and bug fixes for mobi_k8resc.py
+# 0.73 faster mobi split, numerous bug fixes in mobi_k8proc, mobi_header, mobi_opf, mobi_k8resc, etc
+# 0.74 added refines metadata, fixed language code in ncx and title in nav, added support for opf: from refines
+# 0.75 much improved dictioanry support including support for multiple inflection sections, minor mobi_opf fixes
+# 0.76 pre-release version only fix name related issues in opf by not using original file name in mobi7
+# 0.77 bug fix for unpacking HDImages with included Fonts
+# 0.80 converted to work with both python 2.7 and Python 3.3 and later
+# 0.81 various fixes
+# 0.82 Handle calibre-generated mobis that can have skeletons with no fragments
+# 0.83 Fix header item 114 being mistakenly treated as a string instead of a value
+
+DUMP = False
+""" Set to True to dump all possible information. """
+
+WRITE_RAW_DATA = False
+""" Set to True to create additional files with raw data for debugging/reverse engineering. """
+
+SPLIT_COMBO_MOBIS = False
+""" Set to True to split combination mobis into mobi7 and mobi8 pieces. """
+
+CREATE_COVER_PAGE = True # XXX experimental
+""" Create and insert a cover xhtml page. """
+
+EOF_RECORD = b'\xe9\x8e' + b'\r\n'
+""" The EOF record content. """
+
+TERMINATION_INDICATOR1 = b'\x00'
+TERMINATION_INDICATOR2 = b'\x00\x00'
+TERMINATION_INDICATOR3 = b'\x00\x00\x00'
+
+KINDLEGENSRC_FILENAME = "kindlegensrc.zip"
+""" The name for the kindlegen source archive. """
+
+KINDLEGENLOG_FILENAME = "kindlegenbuild.log"
+""" The name for the kindlegen build log. """
+
+K8_BOUNDARY = b'BOUNDARY'
+""" The section data that divides K8 mobi ebooks. """
+
+import os
+import struct
+import re
+import zlib
+import getopt
+
+class unpackException(Exception):
+ pass
+
+
+# import the kindleunpack support libraries
+from .unpack_structure import fileNames
+from .mobi_sectioner import Sectionizer, describe
+from .mobi_header import MobiHeader, dump_contexth
+from .mobi_utils import toBase32
+from .mobi_opf import OPFProcessor
+from .mobi_html import HTMLProcessor, XHTMLK8Processor
+from .mobi_ncx import ncxExtract
+from .mobi_k8proc import K8Processor
+from .mobi_split import mobi_split
+from .mobi_k8resc import K8RESCProcessor
+from .mobi_nav import NAVProcessor
+from .mobi_cover import CoverProcessor, get_image_type
+from .mobi_pagemap import PageMapProcessor
+from .mobi_dict import dictSupport
+
+
+def processSRCS(i, files, rscnames, sect, data):
+ # extract the source zip archive and save it.
+ print("File contains kindlegen source archive, extracting as %s" % KINDLEGENSRC_FILENAME)
+ srcname = os.path.join(files.outdir, KINDLEGENSRC_FILENAME)
+ with open(pathof(srcname), 'wb') as f:
+ f.write(data[16:])
+ rscnames.append(None)
+ sect.setsectiondescription(i,"Zipped Source Files")
+ return rscnames
+
+
+def processPAGE(i, files, rscnames, sect, data, mh, pagemapproc):
+ # process any page map information and create an apnx file
+ pagemapproc = PageMapProcessor(mh, data)
+ rscnames.append(None)
+ sect.setsectiondescription(i,"PageMap")
+ apnx_meta = {}
+ acr = sect.palmname.decode('latin-1').rstrip('\x00')
+ apnx_meta['acr'] = acr
+ apnx_meta['cdeType'] = mh.metadata['cdeType'][0]
+ apnx_meta['contentGuid'] = hex(int(mh.metadata['UniqueID'][0]))[2:]
+ apnx_meta['asin'] = mh.metadata['ASIN'][0]
+ apnx_meta['pageMap'] = pagemapproc.getPageMap()
+ if mh.version == 8:
+ apnx_meta['format'] = 'MOBI_8'
+ else:
+ apnx_meta['format'] = 'MOBI_7'
+ apnx_data = pagemapproc.generateAPNX(apnx_meta)
+ if mh.isK8():
+ outname = os.path.join(files.outdir, 'mobi8-'+files.getInputFileBasename() + '.apnx')
+ else:
+ outname = os.path.join(files.outdir, 'mobi7-'+files.getInputFileBasename() + '.apnx')
+ with open(pathof(outname), 'wb') as f:
+ f.write(apnx_data)
+ return rscnames, pagemapproc
+
+
+def processCMET(i, files, rscnames, sect, data):
+ # extract the build log
+ print("File contains kindlegen build log, extracting as %s" % KINDLEGENLOG_FILENAME)
+ srcname = os.path.join(files.outdir, KINDLEGENLOG_FILENAME)
+ with open(pathof(srcname), 'wb') as f:
+ f.write(data[10:])
+ rscnames.append(None)
+ sect.setsectiondescription(i,"Kindlegen log")
+ return rscnames
+
+
+# fonts only exist in KF8 ebooks
+# Format: bytes 0 - 3: 'FONT'
+# bytes 4 - 7: uncompressed size
+# bytes 8 - 11: flags
+# flag bit 0x0001 - zlib compression
+# flag bit 0x0002 - obfuscated with xor string
+# bytes 12 - 15: offset to start of compressed font data
+# bytes 16 - 19: length of xor string stored before the start of the comnpress font data
+# bytes 20 - 23: start of xor string
+def processFONT(i, files, rscnames, sect, data, obfuscate_data, beg, rsc_ptr):
+ fontname = "font%05d" % i
+ ext = '.dat'
+ font_error = False
+ font_data = data
+ try:
+ usize, fflags, dstart, xor_len, xor_start = struct.unpack_from(b'>LLLLL',data,4)
+ except:
+ print("Failed to extract font: {0:s} from section {1:d}".format(fontname,i))
+ font_error = True
+ ext = '.failed'
+ pass
+ if not font_error:
+ print("Extracting font:", fontname)
+ font_data = data[dstart:]
+ extent = len(font_data)
+ extent = min(extent, 1040)
+ if fflags & 0x0002:
+ # obfuscated so need to de-obfuscate the first 1040 bytes
+ key = bytearray(data[xor_start: xor_start+ xor_len])
+ buf = bytearray(font_data)
+ for n in range(extent):
+ buf[n] ^= key[n%xor_len]
+ font_data = bytes(buf)
+ if fflags & 0x0001:
+ # ZLIB compressed data
+ font_data = zlib.decompress(font_data)
+ hdr = font_data[0:4]
+ if hdr == b'\0\1\0\0' or hdr == b'true' or hdr == b'ttcf':
+ ext = '.ttf'
+ elif hdr == b'OTTO':
+ ext = '.otf'
+ else:
+ print("Warning: unknown font header %s" % hexlify(hdr))
+ if (ext == '.ttf' or ext == '.otf') and (fflags & 0x0002):
+ obfuscate_data.append(fontname + ext)
+ fontname += ext
+ outfnt = os.path.join(files.imgdir, fontname)
+ with open(pathof(outfnt), 'wb') as f:
+ f.write(font_data)
+ rscnames.append(fontname)
+ sect.setsectiondescription(i,"Font {0:s}".format(fontname))
+ if rsc_ptr == -1:
+ rsc_ptr = i - beg
+ return rscnames, obfuscate_data, rsc_ptr
+
+
+def processCRES(i, files, rscnames, sect, data, beg, rsc_ptr, use_hd):
+ # extract an HDImage
+ global DUMP
+ data = data[12:]
+ imgtype = get_image_type(None, data)
+
+ if imgtype is None:
+ print("Warning: CRES Section %s does not contain a recognised resource" % i)
+ rscnames.append(None)
+ sect.setsectiondescription(i,"Mysterious CRES data, first four bytes %s" % describe(data[0:4]))
+ if DUMP:
+ fname = "unknown%05d.dat" % i
+ outname= os.path.join(files.outdir, fname)
+ with open(pathof(outname), 'wb') as f:
+ f.write(data)
+ sect.setsectiondescription(i,"Mysterious CRES data, first four bytes %s extracting as %s" % (describe(data[0:4]), fname))
+ rsc_ptr += 1
+ return rscnames, rsc_ptr
+
+ if use_hd:
+ # overwrite corresponding lower res image with hd version
+ imgname = rscnames[rsc_ptr]
+ imgdest = files.imgdir
+ else:
+ imgname = "HDimage%05d.%s" % (i, imgtype)
+ imgdest = files.hdimgdir
+ print("Extracting HD image: {0:s} from section {1:d}".format(imgname,i))
+ outimg = os.path.join(imgdest, imgname)
+ with open(pathof(outimg), 'wb') as f:
+ f.write(data)
+ rscnames.append(None)
+ sect.setsectiondescription(i,"Optional HD Image {0:s}".format(imgname))
+ rsc_ptr += 1
+ return rscnames, rsc_ptr
+
+
+def processCONT(i, files, rscnames, sect, data):
+ global DUMP
+ # process a container header, most of this is unknown
+ # right now only extract its EXTH
+ dt = data[0:12]
+ if dt == b"CONTBOUNDARY":
+ rscnames.append(None)
+ sect.setsectiondescription(i,"CONTAINER BOUNDARY")
+ else:
+ sect.setsectiondescription(i,"CONT Header")
+ rscnames.append(None)
+ if DUMP:
+ cpage, = struct.unpack_from(b'>L', data, 12)
+ contexth = data[48:]
+ print("\n\nContainer EXTH Dump")
+ dump_contexth(cpage, contexth)
+ fname = "CONT_Header%05d.dat" % i
+ outname= os.path.join(files.outdir, fname)
+ with open(pathof(outname), 'wb') as f:
+ f.write(data)
+ return rscnames
+
+
+def processkind(i, files, rscnames, sect, data):
+ global DUMP
+ dt = data[0:12]
+ if dt == b"kindle:embed":
+ if DUMP:
+ print("\n\nHD Image Container Description String")
+ print(data)
+ sect.setsectiondescription(i,"HD Image Container Description String")
+ rscnames.append(None)
+ return rscnames
+
+
+# spine information from the original content.opf
+def processRESC(i, files, rscnames, sect, data, k8resc):
+ global DUMP
+ if DUMP:
+ rescname = "RESC%05d.dat" % i
+ print("Extracting Resource: ", rescname)
+ outrsc = os.path.join(files.outdir, rescname)
+ with open(pathof(outrsc), 'wb') as f:
+ f.write(data)
+ if True: # try:
+ # parse the spine and metadata from RESC
+ k8resc = K8RESCProcessor(data[16:], DUMP)
+ else: # except:
+ print("Warning: cannot extract information from RESC.")
+ k8resc = None
+ rscnames.append(None)
+ sect.setsectiondescription(i,"K8 RESC section")
+ return rscnames, k8resc
+
+
+def processImage(i, files, rscnames, sect, data, beg, rsc_ptr, cover_offset, thumb_offset):
+ global DUMP
+ # Extract an Image
+ imgtype = get_image_type(None, data)
+ if imgtype is None:
+ print("Warning: Section %s does not contain a recognised resource" % i)
+ rscnames.append(None)
+ sect.setsectiondescription(i,"Mysterious Section, first four bytes %s" % describe(data[0:4]))
+ if DUMP:
+ fname = "unknown%05d.dat" % i
+ outname= os.path.join(files.outdir, fname)
+ with open(pathof(outname), 'wb') as f:
+ f.write(data)
+ sect.setsectiondescription(i,"Mysterious Section, first four bytes %s extracting as %s" % (describe(data[0:4]), fname))
+ return rscnames, rsc_ptr
+
+ imgname = "image%05d.%s" % (i, imgtype)
+ if cover_offset is not None and i == beg + cover_offset:
+ imgname = "cover%05d.%s" % (i, imgtype)
+ if thumb_offset is not None and i == beg + thumb_offset:
+ imgname = "thumb%05d.%s" % (i, imgtype)
+ print("Extracting image: {0:s} from section {1:d}".format(imgname,i))
+ outimg = os.path.join(files.imgdir, imgname)
+ with open(pathof(outimg), 'wb') as f:
+ f.write(data)
+ rscnames.append(imgname)
+ sect.setsectiondescription(i,"Image {0:s}".format(imgname))
+ if rsc_ptr == -1:
+ rsc_ptr = i - beg
+ return rscnames, rsc_ptr
+
+
+def processPrintReplica(metadata, files, rscnames, mh):
+ global DUMP
+ global WRITE_RAW_DATA
+ rawML = mh.getRawML()
+ if DUMP or WRITE_RAW_DATA:
+ outraw = os.path.join(files.outdir,files.getInputFileBasename() + '.rawpr')
+ with open(pathof(outraw),'wb') as f:
+ f.write(rawML)
+
+ fileinfo = []
+ print("Print Replica ebook detected")
+ try:
+ numTables, = struct.unpack_from(b'>L', rawML, 0x04)
+ tableIndexOffset = 8 + 4*numTables
+ # for each table, read in count of sections, assume first section is a PDF
+ # and output other sections as binary files
+ for i in range(numTables):
+ sectionCount, = struct.unpack_from(b'>L', rawML, 0x08 + 4*i)
+ for j in range(sectionCount):
+ sectionOffset, sectionLength, = struct.unpack_from(b'>LL', rawML, tableIndexOffset)
+ tableIndexOffset += 8
+ if j == 0:
+ entryName = os.path.join(files.outdir, files.getInputFileBasename() + ('.%03d.pdf' % (i+1)))
+ else:
+ entryName = os.path.join(files.outdir, files.getInputFileBasename() + ('.%03d.%03d.data' % ((i+1),j)))
+ with open(pathof(entryName), 'wb') as f:
+ f.write(rawML[sectionOffset:(sectionOffset+sectionLength)])
+ except Exception as e:
+ print('Error processing Print Replica: ' + str(e))
+
+ fileinfo.append([None,'', files.getInputFileBasename() + '.pdf'])
+ usedmap = {}
+ for name in rscnames:
+ if name is not None:
+ usedmap[name] = 'used'
+ opf = OPFProcessor(files, metadata, fileinfo, rscnames, False, mh, usedmap)
+ opf.writeOPF()
+
+
+def processMobi8(mh, metadata, sect, files, rscnames, pagemapproc, k8resc, obfuscate_data, apnxfile=None, epubver='2'):
+ global DUMP
+ global WRITE_RAW_DATA
+
+ # extract raw markup langauge
+ rawML = mh.getRawML()
+ if DUMP or WRITE_RAW_DATA:
+ outraw = os.path.join(files.k8dir,files.getInputFileBasename() + '.rawml')
+ with open(pathof(outraw),'wb') as f:
+ f.write(rawML)
+
+ # KF8 require other indexes which contain parsing information and the FDST info
+ # to process the rawml back into the xhtml files, css files, svg image files, etc
+ k8proc = K8Processor(mh, sect, files, DUMP)
+ k8proc.buildParts(rawML)
+
+ # collect information for the guide first
+ guidetext = unicode_str(k8proc.getGuideText())
+
+ # if the guide was empty, add in any guide info from metadata, such as StartOffset
+ if not guidetext and 'StartOffset' in metadata:
+ # Apparently, KG 2.5 carries over the StartOffset from the mobi7 part...
+ # Taking that into account, we only care about the *last* StartOffset, which
+ # should always be the correct one in these cases (the one actually pointing
+ # to the right place in the mobi8 part).
+ starts = metadata['StartOffset']
+ last_start = starts[-1]
+ last_start = int(last_start)
+ if last_start == 0xffffffff:
+ last_start = 0
+ seq, idtext = k8proc.getFragTblInfo(last_start)
+ filename, idtext = k8proc.getIDTagByPosFid(toBase32(seq), b'0000000000')
+ linktgt = filename
+ idtext = unicode_str(idtext, mh.codec)
+ if idtext != '':
+ linktgt += '#' + idtext
+ guidetext += '<reference type="text" href="Text/%s" />\n' % linktgt
+
+ # if apnxfile is passed in use it for page map information
+ if apnxfile is not None and pagemapproc is None:
+ with open(apnxfile, 'rb') as f:
+ apnxdata = b"00000000" + f.read()
+ pagemapproc = PageMapProcessor(mh, apnxdata)
+
+ # generate the page map
+ pagemapxml = ''
+ if pagemapproc is not None:
+ pagemapxml = pagemapproc.generateKF8PageMapXML(k8proc)
+ outpm = os.path.join(files.k8oebps,'page-map.xml')
+ with open(pathof(outpm),'wb') as f:
+ f.write(pagemapxml.encode('utf-8'))
+ if DUMP:
+ print(pagemapproc.getNames())
+ print(pagemapproc.getOffsets())
+ print("\n\nPage Map")
+ print(pagemapxml)
+
+ # process the toc ncx
+ # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
+ print("Processing ncx / toc")
+ ncx = ncxExtract(mh, files)
+ ncx_data = ncx.parseNCX()
+ # extend the ncx data with filenames and proper internal idtags
+ for i in range(len(ncx_data)):
+ ncxmap = ncx_data[i]
+ [junk1, junk2, junk3, fid, junk4, off] = ncxmap['pos_fid'].split(':')
+ filename, idtag = k8proc.getIDTagByPosFid(fid, off)
+ ncxmap['filename'] = filename
+ ncxmap['idtag'] = unicode_str(idtag)
+ ncx_data[i] = ncxmap
+
+ # convert the rawML to a set of xhtml files
+ print("Building an epub-like structure")
+ htmlproc = XHTMLK8Processor(rscnames, k8proc)
+ usedmap = htmlproc.buildXHTML()
+
+ # write out the xhtml svg, and css files
+ # fileinfo = [skelid|coverpage, dir, name]
+ fileinfo = []
+ # first create a cover page if none exists
+ if CREATE_COVER_PAGE:
+ cover = CoverProcessor(files, metadata, rscnames)
+ cover_img = utf8_str(cover.getImageName())
+ need_to_create_cover_page = False
+ if cover_img is not None:
+ if k8resc is None or not k8resc.hasSpine():
+ part = k8proc.getPart(0)
+ if part.find(cover_img) == -1:
+ need_to_create_cover_page = True
+ else:
+ if "coverpage" not in k8resc.spine_idrefs:
+ part = k8proc.getPart(int(k8resc.spine_order[0]))
+ if part.find(cover_img) == -1:
+ k8resc.prepend_to_spine("coverpage", "inserted", "no", None)
+ if k8resc.spine_order[0] == "coverpage":
+ need_to_create_cover_page = True
+ if need_to_create_cover_page:
+ filename = cover.getXHTMLName()
+ fileinfo.append(["coverpage", 'Text', filename])
+ guidetext += cover.guide_toxml()
+ cover.writeXHTML()
+
+ n = k8proc.getNumberOfParts()
+ for i in range(n):
+ part = k8proc.getPart(i)
+ [skelnum, dir, filename, beg, end, aidtext] = k8proc.getPartInfo(i)
+ fileinfo.append([str(skelnum), dir, filename])
+ fname = os.path.join(files.k8oebps,dir,filename)
+ with open(pathof(fname),'wb') as f:
+ f.write(part)
+ n = k8proc.getNumberOfFlows()
+ for i in range(1, n):
+ [ptype, pformat, pdir, filename] = k8proc.getFlowInfo(i)
+ flowpart = k8proc.getFlow(i)
+ if pformat == b'file':
+ fileinfo.append([None, pdir, filename])
+ fname = os.path.join(files.k8oebps,pdir,filename)
+ with open(pathof(fname),'wb') as f:
+ f.write(flowpart)
+
+ # create the opf
+ opf = OPFProcessor(files, metadata.copy(), fileinfo, rscnames, True, mh, usedmap,
+ pagemapxml=pagemapxml, guidetext=guidetext, k8resc=k8resc, epubver=epubver)
+ uuid = opf.writeOPF(bool(obfuscate_data))
+
+ if opf.hasNCX():
+ # Create a toc.ncx.
+ ncx.writeK8NCX(ncx_data, metadata)
+ if opf.hasNAV():
+ # Create a navigation document.
+ nav = NAVProcessor(files)
+ nav.writeNAV(ncx_data, guidetext, metadata)
+
+ # make an epub-like structure of it all
+ print("Creating an epub-like file")
+ files.makeEPUB(usedmap, obfuscate_data, uuid)
+
+
+def processMobi7(mh, metadata, sect, files, rscnames):
+ global DUMP
+ global WRITE_RAW_DATA
+ # An original Mobi
+ rawML = mh.getRawML()
+ if DUMP or WRITE_RAW_DATA:
+ outraw = os.path.join(files.mobi7dir,files.getInputFileBasename() + '.rawml')
+ with open(pathof(outraw),'wb') as f:
+ f.write(rawML)
+
+ # process the toc ncx
+ # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
+ ncx = ncxExtract(mh, files)
+ ncx_data = ncx.parseNCX()
+ ncx.writeNCX(metadata)
+
+ positionMap = {}
+
+ # if Dictionary build up the positionMap
+ if mh.isDictionary():
+ if mh.DictInLanguage():
+ metadata['DictInLanguage'] = [mh.DictInLanguage()]
+ if mh.DictOutLanguage():
+ metadata['DictOutLanguage'] = [mh.DictOutLanguage()]
+ positionMap = dictSupport(mh, sect).getPositionMap()
+
+ # convert the rawml back to Mobi ml
+ proc = HTMLProcessor(files, metadata, rscnames)
+ srctext = proc.findAnchors(rawML, ncx_data, positionMap)
+ srctext, usedmap = proc.insertHREFS()
+
+ # write the proper mobi html
+ fileinfo=[]
+ # fname = files.getInputFileBasename() + '.html'
+ fname = 'book.html'
+ fileinfo.append([None,'', fname])
+ outhtml = os.path.join(files.mobi7dir, fname)
+ with open(pathof(outhtml), 'wb') as f:
+ f.write(srctext)
+
+ # extract guidetext from srctext
+ guidetext =b''
+ # no pagemap support for older mobis
+ # pagemapxml = None
+ guidematch = re.search(br'''<guide>(.*)</guide>''',srctext,re.IGNORECASE+re.DOTALL)
+ if guidematch:
+ guidetext = guidematch.group(1)
+ # sometimes old mobi guide from srctext horribly written so need to clean up
+ guidetext = guidetext.replace(b"\r", b"")
+ guidetext = guidetext.replace(b'<REFERENCE', b'<reference')
+ guidetext = guidetext.replace(b' HREF=', b' href=')
+ guidetext = guidetext.replace(b' TITLE=', b' title=')
+ guidetext = guidetext.replace(b' TYPE=', b' type=')
+ # reference must be a self-closing tag
+ # and any href must be replaced with filepos information
+ ref_tag_pattern = re.compile(br'''(<reference [^>]*>)''', re.IGNORECASE)
+ guidepieces = ref_tag_pattern.split(guidetext)
+ for i in range(1,len(guidepieces), 2):
+ reftag = guidepieces[i]
+ # remove any href there now to replace with filepos
+ reftag = re.sub(br'''href\s*=[^'"]*['"][^'"]*['"]''',b'', reftag)
+ # make sure the reference tag ends properly
+ if not reftag.endswith(b"/>"):
+ reftag = reftag[0:-1] + b"/>"
+ guidepieces[i] = reftag
+ guidetext = b''.join(guidepieces)
+ replacetext = br'''href="'''+utf8_str(fileinfo[0][2])+ br'''#filepos\1"'''
+ guidetext = re.sub(br'''filepos=['"]{0,1}0*(\d+)['"]{0,1}''', replacetext, guidetext)
+ guidetext += b'\n'
+
+ if 'StartOffset' in metadata:
+ for value in metadata['StartOffset']:
+ if int(value) == 0xffffffff:
+ value = '0'
+ starting_offset = value
+ # get guide items from metadata
+ metaguidetext = b'<reference type="text" href="'+utf8_str(fileinfo[0][2])+b'#filepos'+utf8_str(starting_offset)+b'" />\n'
+ guidetext += metaguidetext
+
+ if isinstance(guidetext, binary_type):
+ guidetext = guidetext.decode(mh.codec)
+
+ # create an OPF
+ opf = OPFProcessor(files, metadata, fileinfo, rscnames, ncx.isNCX, mh, usedmap, guidetext=guidetext)
+ opf.writeOPF()
+
+
+def processUnknownSections(mh, sect, files, K8Boundary):
+ global DUMP
+ global TERMINATION_INDICATOR1
+ global TERMINATION_INDICATOR2
+ global TERMINATION_INDICATOR3
+ if DUMP:
+ print("Unpacking any remaining unknown records")
+ beg = mh.start
+ end = sect.num_sections
+ if beg < K8Boundary:
+ # then we're processing the first part of a combination file
+ end = K8Boundary
+ for i in range(beg, end):
+ if sect.sectiondescriptions[i] == "":
+ data = sect.loadSection(i)
+ type = data[0:4]
+ if type == TERMINATION_INDICATOR3:
+ description = "Termination Marker 3 Nulls"
+ elif type == TERMINATION_INDICATOR2:
+ description = "Termination Marker 2 Nulls"
+ elif type == TERMINATION_INDICATOR1:
+ description = "Termination Marker 1 Null"
+ elif type == "INDX":
+ fname = "Unknown%05d_INDX.dat" % i
+ description = "Unknown INDX section"
+ if DUMP:
+ outname= os.path.join(files.outdir, fname)
+ with open(pathof(outname), 'wb') as f:
+ f.write(data)
+ print("Extracting %s: %s from section %d" % (description, fname, i))
+ description = description + ", extracting as %s" % fname
+ else:
+ fname = "unknown%05d.dat" % i
+ description = "Mysterious Section, first four bytes %s" % describe(data[0:4])
+ if DUMP:
+ outname= os.path.join(files.outdir, fname)
+ with open(pathof(outname), 'wb') as f:
+ f.write(data)
+ print("Extracting %s: %s from section %d" % (description, fname, i))
+ description = description + ", extracting as %s" % fname
+ sect.setsectiondescription(i, description)
+
+
+def process_all_mobi_headers(files, apnxfile, sect, mhlst, K8Boundary, k8only=False, epubver='2', use_hd=False):
+ global DUMP
+ global WRITE_RAW_DATA
+ rscnames = []
+ rsc_ptr = -1
+ k8resc = None
+ obfuscate_data = []
+ for mh in mhlst:
+ pagemapproc = None
+ if mh.isK8():
+ sect.setsectiondescription(mh.start,"KF8 Header")
+ mhname = os.path.join(files.outdir,"header_K8.dat")
+ print("Processing K8 section of book...")
+ elif mh.isPrintReplica():
+ sect.setsectiondescription(mh.start,"Print Replica Header")
+ mhname = os.path.join(files.outdir,"header_PR.dat")
+ print("Processing PrintReplica section of book...")
+ else:
+ if mh.version == 0:
+ sect.setsectiondescription(mh.start, "PalmDoc Header".format(mh.version))
+ else:
+ sect.setsectiondescription(mh.start,"Mobipocket {0:d} Header".format(mh.version))
+ mhname = os.path.join(files.outdir,"header.dat")
+ print("Processing Mobipocket {0:d} section of book...".format(mh.version))
+
+ if DUMP:
+ # write out raw mobi header data
+ with open(pathof(mhname), 'wb') as f:
+ f.write(mh.header)
+
+ # process each mobi header
+ metadata = mh.getMetaData()
+ mh.describeHeader(DUMP)
+ if mh.isEncrypted():
+ raise unpackException('Book is encrypted')
+
+ pagemapproc = None
+
+ # first handle all of the different resource sections: images, resources, fonts, and etc
+ # build up a list of image names to use to postprocess the ebook
+
+ print("Unpacking images, resources, fonts, etc")
+ beg = mh.firstresource
+ end = sect.num_sections
+ if beg < K8Boundary:
+ # processing first part of a combination file
+ end = K8Boundary
+
+ # Not sure the try/except is necessary, but just in case
+ try:
+ thumb_offset = int(metadata.get('ThumbOffset', ['-1'])[0])
+ except:
+ thumb_offset = None
+
+ cover_offset = int(metadata.get('CoverOffset', ['-1'])[0])
+ if not CREATE_COVER_PAGE:
+ cover_offset = None
+
+ for i in range(beg, end):
+ data = sect.loadSection(i)
+ type = data[0:4]
+
+ # handle the basics first
+ if type in [b"FLIS", b"FCIS", b"FDST", b"DATP"]:
+ if DUMP:
+ fname = unicode_str(type) + "%05d" % i
+ if mh.isK8():
+ fname += "_K8"
+ fname += '.dat'
+ outname= os.path.join(files.outdir, fname)
+ with open(pathof(outname), 'wb') as f:
+ f.write(data)
+ print("Dumping section {0:d} type {1:s} to file {2:s} ".format(i,unicode_str(type),outname))
+ sect.setsectiondescription(i,"Type {0:s}".format(unicode_str(type)))
+ rscnames.append(None)
+ elif type == b"SRCS":
+ rscnames = processSRCS(i, files, rscnames, sect, data)
+ elif type == b"PAGE":
+ rscnames, pagemapproc = processPAGE(i, files, rscnames, sect, data, mh, pagemapproc)
+ elif type == b"CMET":
+ rscnames = processCMET(i, files, rscnames, sect, data)
+ elif type == b"FONT":
+ rscnames, obfuscate_data, rsc_ptr = processFONT(i, files, rscnames, sect, data, obfuscate_data, beg, rsc_ptr)
+ elif type == b"CRES":
+ rscnames, rsc_ptr = processCRES(i, files, rscnames, sect, data, beg, rsc_ptr, use_hd)
+ elif type == b"CONT":
+ rscnames = processCONT(i, files, rscnames, sect, data)
+ elif type == b"kind":
+ rscnames = processkind(i, files, rscnames, sect, data)
+ elif type == b'\xa0\xa0\xa0\xa0':
+ sect.setsectiondescription(i,"Empty_HD_Image/Resource_Placeholder")
+ rscnames.append(None)
+ rsc_ptr += 1
+ elif type == b"RESC":
+ rscnames, k8resc = processRESC(i, files, rscnames, sect, data, k8resc)
+ elif data == EOF_RECORD:
+ sect.setsectiondescription(i,"End Of File")
+ rscnames.append(None)
+ elif data[0:8] == b"BOUNDARY":
+ sect.setsectiondescription(i,"BOUNDARY Marker")
+ rscnames.append(None)
+ else:
+ # if reached here should be an image ow treat as unknown
+ rscnames, rsc_ptr = processImage(i, files, rscnames, sect, data, beg, rsc_ptr, cover_offset, thumb_offset)
+ # done unpacking resources
+
+ # Print Replica
+ if mh.isPrintReplica() and not k8only:
+ processPrintReplica(metadata, files, rscnames, mh)
+ continue
+
+ # KF8 (Mobi 8)
+ if mh.isK8():
+ processMobi8(mh, metadata, sect, files, rscnames, pagemapproc, k8resc, obfuscate_data, apnxfile, epubver)
+
+ # Old Mobi (Mobi 7)
+ elif not k8only:
+ processMobi7(mh, metadata, sect, files, rscnames)
+
+ # process any remaining unknown sections of the palm file
+ processUnknownSections(mh, sect, files, K8Boundary)
+
+ return
+
+
+def unpackBook(infile, outdir, apnxfile=None, epubver='2', use_hd=False, dodump=False, dowriteraw=False, dosplitcombos=False):
+ global DUMP
+ global WRITE_RAW_DATA
+ global SPLIT_COMBO_MOBIS
+ if DUMP or dodump:
+ DUMP = True
+ if WRITE_RAW_DATA or dowriteraw:
+ WRITE_RAW_DATA = True
+ if SPLIT_COMBO_MOBIS or dosplitcombos:
+ SPLIT_COMBO_MOBIS = True
+
+ infile = unicode_str(infile)
+ outdir = unicode_str(outdir)
+ if apnxfile is not None:
+ apnxfile = unicode_str(apnxfile)
+
+ files = fileNames(infile, outdir)
+
+ # process the PalmDoc database header and verify it is a mobi
+ sect = Sectionizer(infile)
+ if sect.ident != b'BOOKMOBI' and sect.ident != b'TEXtREAd':
+ raise unpackException('Invalid file format')
+ if DUMP:
+ sect.dumppalmheader()
+ else:
+ print("Palm DB type: %s, %d sections." % (sect.ident.decode('utf-8'),sect.num_sections))
+
+ # scan sections to see if this is a compound mobi file (K8 format)
+ # and build a list of all mobi headers to process.
+ mhlst = []
+ mh = MobiHeader(sect,0)
+ # if this is a mobi8-only file hasK8 here will be true
+ mhlst.append(mh)
+ K8Boundary = -1
+
+ if mh.isK8():
+ print("Unpacking a KF8 book...")
+ hasK8 = True
+ else:
+ # This is either a Mobipocket 7 or earlier, or a combi M7/KF8
+ # Find out which
+ hasK8 = False
+ for i in range(len(sect.sectionoffsets)-1):
+ before, after = sect.sectionoffsets[i:i+2]
+ if (after - before) == 8:
+ data = sect.loadSection(i)
+ if data == K8_BOUNDARY:
+ sect.setsectiondescription(i,"Mobi/KF8 Boundary Section")
+ mh = MobiHeader(sect,i+1)
+ hasK8 = True
+ mhlst.append(mh)
+ K8Boundary = i
+ break
+ if hasK8:
+ print("Unpacking a Combination M{0:d}/KF8 book...".format(mh.version))
+ if SPLIT_COMBO_MOBIS:
+ # if this is a combination mobi7-mobi8 file split them up
+ mobisplit = mobi_split(infile)
+ if mobisplit.combo:
+ outmobi7 = os.path.join(files.outdir, 'mobi7-'+files.getInputFileBasename() + '.mobi')
+ outmobi8 = os.path.join(files.outdir, 'mobi8-'+files.getInputFileBasename() + '.azw3')
+ with open(pathof(outmobi7), 'wb') as f:
+ f.write(mobisplit.getResult7())
+ with open(pathof(outmobi8), 'wb') as f:
+ f.write(mobisplit.getResult8())
+ else:
+ print("Unpacking a Mobipocket {0:d} book...".format(mh.version))
+
+ if hasK8:
+ files.makeK8Struct()
+
+ process_all_mobi_headers(files, apnxfile, sect, mhlst, K8Boundary, False, epubver, use_hd)
+
+ if DUMP:
+ sect.dumpsectionsinfo()
+ return
+
+
+def usage(progname):
+ print("")
+ print("Description:")
+ print(" Unpacks an unencrypted Kindle/MobiPocket ebook to html and images")
+ print(" or an unencrypted Kindle/Print Replica ebook to PDF and images")
+ print(" into the specified output folder.")
+ print("Usage:")
+ print(" %s -r -s -p apnxfile -d -h --epub_version= infile [outdir]" % progname)
+ print("Options:")
+ print(" -h print this help message")
+ print(" -i use HD Images, if present, to overwrite reduced resolution images")
+ print(" -s split combination mobis into mobi7 and mobi8 ebooks")
+ print(" -p APNXFILE path to an .apnx file associated with the azw3 input (optional)")
+ print(" --epub_version= specify epub version to unpack to: 2, 3, A (for automatic) or ")
+ print(" F (force to fit to epub2 definitions), default is 2")
+ print(" -d dump headers and other info to output and extra files")
+ print(" -r write raw data to the output folder")
+
+
+def main(argv=unicode_argv()):
+ global DUMP
+ global WRITE_RAW_DATA
+ global SPLIT_COMBO_MOBIS
+
+ print("KindleUnpack v0.83")
+ print(" Based on initial mobipocket version Copyright © 2009 Charles M. Hannum <root@ihack.net>")
+ print(" Extensive Extensions and Improvements Copyright © 2009-2020 ")
+ print(" by: P. Durrant, K. Hendricks, S. Siebert, fandrieu, DiapDealer, nickredding, tkeo.")
+ print(" This program is free software: you can redistribute it and/or modify")
+ print(" it under the terms of the GNU General Public License as published by")
+ print(" the Free Software Foundation, version 3.")
+
+ progname = os.path.basename(argv[0])
+ try:
+ opts, args = getopt.getopt(argv[1:], "dhirsp:", ['epub_version='])
+ except getopt.GetoptError as err:
+ print(str(err))
+ usage(progname)
+ sys.exit(2)
+
+ if len(args)<1:
+ usage(progname)
+ sys.exit(2)
+
+ apnxfile = None
+ epubver = '2'
+ use_hd = False
+
+ for o, a in opts:
+ if o == "-h":
+ usage(progname)
+ sys.exit(0)
+ if o == "-i":
+ use_hd = True
+ if o == "-d":
+ DUMP = True
+ if o == "-r":
+ WRITE_RAW_DATA = True
+ if o == "-s":
+ SPLIT_COMBO_MOBIS = True
+ if o == "-p":
+ apnxfile = a
+ if o == "--epub_version":
+ epubver = a
+
+ if len(args) > 1:
+ infile, outdir = args
+ else:
+ infile = args[0]
+ outdir = os.path.splitext(infile)[0]
+
+ infileext = os.path.splitext(infile)[1].upper()
+ if infileext not in ['.MOBI', '.PRC', '.AZW', '.AZW3', '.AZW4']:
+ print("Error: first parameter must be a Kindle/Mobipocket ebook or a Kindle/Print Replica ebook.")
+ return 1
+
+ try:
+ print('Unpacking Book...')
+ unpackBook(infile, outdir, apnxfile, epubver, use_hd)
+ print('Completed')
+
+ except ValueError as e:
+ print("Error: %s" % e)
+ print(traceback.format_exc())
+ return 1
+
+ return 0
+
+
+if __name__ == '__main__':
+ sys.exit(main())