aboutsummaryrefslogtreecommitdiffstats
path: root/src/epy_reader/tools/KindleUnpack/mobi_split.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/epy_reader/tools/KindleUnpack/mobi_split.py')
-rwxr-xr-xsrc/epy_reader/tools/KindleUnpack/mobi_split.py438
1 files changed, 438 insertions, 0 deletions
diff --git a/src/epy_reader/tools/KindleUnpack/mobi_split.py b/src/epy_reader/tools/KindleUnpack/mobi_split.py
new file mode 100755
index 0000000..3535029
--- /dev/null
+++ b/src/epy_reader/tools/KindleUnpack/mobi_split.py
@@ -0,0 +1,438 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+import struct
+# note: struct pack, unpack, unpack_from all require bytestring format
+# data all the way up to at least python 2.7.5, python 3 okay with bytestring
+
+from .unipath import pathof
+
+
+# important pdb header offsets
+unique_id_seed = 68
+number_of_pdb_records = 76
+
+# important palmdoc header offsets
+book_length = 4
+book_record_count = 8
+first_pdb_record = 78
+
+# important rec0 offsets
+length_of_book = 4
+mobi_header_base = 16
+mobi_header_length = 20
+mobi_type = 24
+mobi_version = 36
+first_non_text = 80
+title_offset = 84
+first_resc_record = 108
+first_content_index = 192
+last_content_index = 194
+kf8_fdst_index = 192 # for KF8 mobi headers
+fcis_index = 200
+flis_index = 208
+srcs_index = 224
+srcs_count = 228
+primary_index = 244
+datp_index = 256
+huffoff = 112
+hufftbloff = 120
+
+def getint(datain,ofs,sz=b'L'):
+ i, = struct.unpack_from(b'>'+sz,datain,ofs)
+ return i
+
+def writeint(datain,ofs,n,len=b'L'):
+ if len==b'L':
+ return datain[:ofs]+struct.pack(b'>L',n)+datain[ofs+4:]
+ else:
+ return datain[:ofs]+struct.pack(b'>H',n)+datain[ofs+2:]
+
+def getsecaddr(datain,secno):
+ nsec = getint(datain,number_of_pdb_records,b'H')
+ assert secno>=0 & secno<nsec,'secno %d out of range (nsec=%d)'%(secno,nsec)
+ secstart = getint(datain,first_pdb_record+secno*8)
+ if secno == nsec-1:
+ secend = len(datain)
+ else:
+ secend = getint(datain,first_pdb_record+(secno+1)*8)
+ return secstart,secend
+
+def readsection(datain,secno):
+ secstart, secend = getsecaddr(datain,secno)
+ return datain[secstart:secend]
+
+def writesection(datain,secno,secdata): # overwrite, accounting for different length
+ # dataout = deletesectionrange(datain,secno, secno)
+ # return insertsection(dataout, secno, secdata)
+ datalst = []
+ nsec = getint(datain,number_of_pdb_records,b'H')
+ zerosecstart,zerosecend = getsecaddr(datain,0)
+ secstart,secend = getsecaddr(datain,secno)
+ dif = len(secdata) - (secend - secstart)
+ datalst.append(datain[:unique_id_seed])
+ datalst.append(struct.pack(b'>L',2*nsec+1))
+ datalst.append(datain[unique_id_seed+4:number_of_pdb_records])
+ datalst.append(struct.pack(b'>H',nsec))
+ newstart = zerosecstart
+ for i in range(0,secno):
+ ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
+ datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
+ datalst.append(struct.pack(b'>L', secstart) + struct.pack(b'>L', (2*secno)))
+ for i in range(secno+1,nsec):
+ ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
+ ofs = ofs + dif
+ datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
+ lpad = newstart - (first_pdb_record + 8*nsec)
+ if lpad > 0:
+ datalst.append(b'\0' * lpad)
+ datalst.append(datain[zerosecstart:secstart])
+ datalst.append(secdata)
+ datalst.append(datain[secend:])
+ dataout = b''.join(datalst)
+ return dataout
+
+def nullsection(datain,secno): # make it zero-length without deleting it
+ datalst = []
+ nsec = getint(datain,number_of_pdb_records,b'H')
+ secstart, secend = getsecaddr(datain,secno)
+ zerosecstart, zerosecend = getsecaddr(datain, 0)
+ dif = secend-secstart
+ datalst.append(datain[:first_pdb_record])
+ for i in range(0,secno+1):
+ ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
+ datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
+ for i in range(secno+1, nsec):
+ ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
+ ofs = ofs - dif
+ datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
+ lpad = zerosecstart - (first_pdb_record + 8*nsec)
+ if lpad > 0:
+ datalst.append(b'\0' * lpad)
+ datalst.append(datain[zerosecstart: secstart])
+ datalst.append(datain[secend:])
+ dataout = b''.join(datalst)
+ return dataout
+
+def deletesectionrange(datain,firstsec,lastsec): # delete a range of sections
+ datalst = []
+ firstsecstart,firstsecend = getsecaddr(datain,firstsec)
+ lastsecstart,lastsecend = getsecaddr(datain,lastsec)
+ zerosecstart, zerosecend = getsecaddr(datain, 0)
+ dif = lastsecend - firstsecstart + 8*(lastsec-firstsec+1)
+ nsec = getint(datain,number_of_pdb_records,b'H')
+ datalst.append(datain[:unique_id_seed])
+ datalst.append(struct.pack(b'>L',2*(nsec-(lastsec-firstsec+1))+1))
+ datalst.append(datain[unique_id_seed+4:number_of_pdb_records])
+ datalst.append(struct.pack(b'>H',nsec-(lastsec-firstsec+1)))
+ newstart = zerosecstart - 8*(lastsec-firstsec+1)
+ for i in range(0,firstsec):
+ ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
+ ofs = ofs-8*(lastsec-firstsec+1)
+ datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
+ for i in range(lastsec+1,nsec):
+ ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
+ ofs = ofs - dif
+ flgval = 2*(i-(lastsec-firstsec+1))
+ datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
+ lpad = newstart - (first_pdb_record + 8*(nsec - (lastsec - firstsec + 1)))
+ if lpad > 0:
+ datalst.append(b'\0' * lpad)
+ datalst.append(datain[zerosecstart:firstsecstart])
+ datalst.append(datain[lastsecend:])
+ dataout = b''.join(datalst)
+ return dataout
+
+def insertsection(datain,secno,secdata): # insert a new section
+ datalst = []
+ nsec = getint(datain,number_of_pdb_records,b'H')
+ # print("inserting secno" , secno, "into" ,nsec, "sections")
+ secstart,secend = getsecaddr(datain,secno)
+ zerosecstart,zerosecend = getsecaddr(datain,0)
+ dif = len(secdata)
+ datalst.append(datain[:unique_id_seed])
+ datalst.append(struct.pack(b'>L',2*(nsec+1)+1))
+ datalst.append(datain[unique_id_seed+4:number_of_pdb_records])
+ datalst.append(struct.pack(b'>H',nsec+1))
+ newstart = zerosecstart + 8
+ for i in range(0,secno):
+ ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
+ ofs += 8
+ datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
+ datalst.append(struct.pack(b'>L', secstart + 8) + struct.pack(b'>L', (2*secno)))
+ for i in range(secno,nsec):
+ ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
+ ofs = ofs + dif + 8
+ flgval = 2*(i+1)
+ datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
+ lpad = newstart - (first_pdb_record + 8*(nsec + 1))
+ if lpad > 0:
+ datalst.append(b'\0' * lpad)
+ datalst.append(datain[zerosecstart:secstart])
+ datalst.append(secdata)
+ datalst.append(datain[secstart:])
+ dataout = b''.join(datalst)
+ return dataout
+
+
+def insertsectionrange(sectionsource,firstsec,lastsec,sectiontarget,targetsec): # insert a range of sections
+ # print("inserting secno" , firstsec, "to", lastsec, "into" ,targetsec, "sections")
+ # dataout = sectiontarget
+ # for idx in range(lastsec,firstsec-1,-1):
+ # dataout = insertsection(dataout,targetsec,readsection(sectionsource,idx))
+ # return dataout
+ datalst = []
+ nsec = getint(sectiontarget,number_of_pdb_records,b'H')
+ zerosecstart, zerosecend = getsecaddr(sectiontarget,0)
+ insstart, nul = getsecaddr(sectiontarget,targetsec)
+ nins = lastsec - firstsec + 1
+ srcstart, nul = getsecaddr(sectionsource,firstsec)
+ nul, srcend = getsecaddr(sectionsource,lastsec)
+ newstart = zerosecstart + 8*nins
+
+ datalst.append(sectiontarget[:unique_id_seed])
+ datalst.append(struct.pack(b'>L',2*(nsec+nins)+1))
+ datalst.append(sectiontarget[unique_id_seed+4:number_of_pdb_records])
+ datalst.append(struct.pack(b'>H',nsec+nins))
+ for i in range(0,targetsec):
+ ofs, flgval = struct.unpack_from(b'>2L',sectiontarget,first_pdb_record+i*8)
+ ofsnew = ofs + 8*nins
+ flgvalnew = flgval
+ datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L', flgvalnew))
+ # print(ofsnew, flgvalnew, ofs, flgval)
+ srcstart0, nul = getsecaddr(sectionsource,firstsec)
+ for i in range(nins):
+ isrcstart, nul = getsecaddr(sectionsource,firstsec+i)
+ ofsnew = insstart + (isrcstart-srcstart0) + 8*nins
+ flgvalnew = 2*(targetsec+i)
+ datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L', flgvalnew))
+ # print(ofsnew, flgvalnew)
+ dif = srcend - srcstart
+ for i in range(targetsec,nsec):
+ ofs, flgval = struct.unpack_from(b'>2L',sectiontarget,first_pdb_record+i*8)
+ ofsnew = ofs + dif + 8*nins
+ flgvalnew = 2*(i+nins)
+ datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L',flgvalnew))
+ # print(ofsnew, flgvalnew, ofs, flgval)
+ lpad = newstart - (first_pdb_record + 8*(nsec + nins))
+ if lpad > 0:
+ datalst.append(b'\0' * lpad)
+ datalst.append(sectiontarget[zerosecstart:insstart])
+ datalst.append(sectionsource[srcstart:srcend])
+ datalst.append(sectiontarget[insstart:])
+ dataout = b''.join(datalst)
+ return dataout
+
+def get_exth_params(rec0):
+ ebase = mobi_header_base + getint(rec0,mobi_header_length)
+ elen = getint(rec0,ebase+4)
+ enum = getint(rec0,ebase+8)
+ return ebase,elen,enum
+
+def add_exth(rec0,exth_num,exth_bytes):
+ ebase,elen,enum = get_exth_params(rec0)
+ newrecsize = 8+len(exth_bytes)
+ newrec0 = rec0[0:ebase+4]+struct.pack(b'>L',elen+newrecsize)+struct.pack(b'>L',enum+1)+\
+ struct.pack(b'>L',exth_num)+struct.pack(b'>L',newrecsize)+exth_bytes+rec0[ebase+12:]
+ newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)+newrecsize)
+ return newrec0
+
+def read_exth(rec0,exth_num):
+ exth_values = []
+ ebase,elen,enum = get_exth_params(rec0)
+ ebase = ebase+12
+ while enum>0:
+ exth_id = getint(rec0,ebase)
+ if exth_id == exth_num:
+ # We might have multiple exths, so build a list.
+ exth_values.append(rec0[ebase+8:ebase+getint(rec0,ebase+4)])
+ enum = enum-1
+ ebase = ebase+getint(rec0,ebase+4)
+ return exth_values
+
+def write_exth(rec0,exth_num,exth_bytes):
+ ebase,elen,enum = get_exth_params(rec0)
+ ebase_idx = ebase+12
+ enum_idx = enum
+ while enum_idx>0:
+ exth_id = getint(rec0,ebase_idx)
+ if exth_id == exth_num:
+ dif = len(exth_bytes)+8-getint(rec0,ebase_idx+4)
+ newrec0 = rec0
+ if dif != 0:
+ newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)+dif)
+ return newrec0[:ebase+4]+struct.pack(b'>L',elen+len(exth_bytes)+8-getint(rec0,ebase_idx+4))+\
+ struct.pack(b'>L',enum)+rec0[ebase+12:ebase_idx+4]+\
+ struct.pack(b'>L',len(exth_bytes)+8)+exth_bytes+\
+ rec0[ebase_idx+getint(rec0,ebase_idx+4):]
+ enum_idx = enum_idx-1
+ ebase_idx = ebase_idx+getint(rec0,ebase_idx+4)
+ return rec0
+
+def del_exth(rec0,exth_num):
+ ebase,elen,enum = get_exth_params(rec0)
+ ebase_idx = ebase+12
+ enum_idx = 0
+ while enum_idx < enum:
+ exth_id = getint(rec0,ebase_idx)
+ exth_size = getint(rec0,ebase_idx+4)
+ if exth_id == exth_num:
+ newrec0 = rec0
+ newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)-exth_size)
+ newrec0 = newrec0[:ebase_idx]+newrec0[ebase_idx+exth_size:]
+ newrec0 = newrec0[0:ebase+4]+struct.pack(b'>L',elen-exth_size)+struct.pack(b'>L',enum-1)+newrec0[ebase+12:]
+ return newrec0
+ enum_idx += 1
+ ebase_idx = ebase_idx+exth_size
+ return rec0
+
+
+class mobi_split:
+
+ def __init__(self, infile):
+ datain = b''
+ with open(pathof(infile), 'rb') as f:
+ datain = f.read()
+ datain_rec0 = readsection(datain,0)
+ ver = getint(datain_rec0,mobi_version)
+ self.combo = (ver!=8)
+ if not self.combo:
+ return
+ exth121 = read_exth(datain_rec0,121)
+ if len(exth121) == 0:
+ self.combo = False
+ return
+ else:
+ # only pay attention to first exth121
+ # (there should only be one)
+ datain_kf8, = struct.unpack_from(b'>L',exth121[0],0)
+ if datain_kf8 == 0xffffffff:
+ self.combo = False
+ return
+ datain_kfrec0 =readsection(datain,datain_kf8)
+
+ # create the standalone mobi7
+ num_sec = getint(datain,number_of_pdb_records,b'H')
+ # remove BOUNDARY up to but not including ELF record
+ self.result_file7 = deletesectionrange(datain,datain_kf8-1,num_sec-2)
+ # check if there are SRCS records and delete them
+ srcs = getint(datain_rec0,srcs_index)
+ num_srcs = getint(datain_rec0,srcs_count)
+ if srcs != 0xffffffff and num_srcs > 0:
+ self.result_file7 = deletesectionrange(self.result_file7,srcs,srcs+num_srcs-1)
+ datain_rec0 = writeint(datain_rec0,srcs_index,0xffffffff)
+ datain_rec0 = writeint(datain_rec0,srcs_count,0)
+ # reset the EXTH 121 KF8 Boundary meta data to 0xffffffff
+ datain_rec0 = write_exth(datain_rec0,121, struct.pack(b'>L', 0xffffffff))
+ # datain_rec0 = del_exth(datain_rec0,121)
+ # datain_rec0 = del_exth(datain_rec0,534)
+ # don't remove the EXTH 125 KF8 Count of Resources, seems to be present in mobi6 files as well
+ # set the EXTH 129 KF8 Masthead / Cover Image string to the null string
+ datain_rec0 = write_exth(datain_rec0,129, b'')
+ # don't remove the EXTH 131 KF8 Unidentified Count, seems to be present in mobi6 files as well
+
+ # need to reset flags stored in 0x80-0x83
+ # old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050
+ # Bit Flags
+ # 0x1000 = Bit 12 indicates if embedded fonts are used or not
+ # 0x0800 = means this Header points to *shared* images/resource/fonts ??
+ # 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8?
+ # 0x0040 = exth exists
+ # 0x0010 = Not sure but this is always set so far
+ fval, = struct.unpack_from(b'>L',datain_rec0, 0x80)
+ # need to remove flag 0x0800 for KindlePreviewer 2.8 and unset Bit 12 for embedded fonts
+ fval = fval & 0x07FF
+ datain_rec0 = datain_rec0[:0x80] + struct.pack(b'>L',fval) + datain_rec0[0x84:]
+
+ self.result_file7 = writesection(self.result_file7,0,datain_rec0)
+
+ # no need to replace kf8 style fcis with mobi 7 one
+ # fcis_secnum, = struct.unpack_from(b'>L',datain_rec0, 0xc8)
+ # if fcis_secnum != 0xffffffff:
+ # fcis_info = readsection(datain, fcis_secnum)
+ # text_len, = struct.unpack_from(b'>L', fcis_info, 0x14)
+ # new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
+ # new_fcis += struct.pack(b'>L',text_len)
+ # new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
+ # self.result_file7 = writesection(self.result_file7, fcis_secnum, new_fcis)
+
+ firstimage = getint(datain_rec0,first_resc_record)
+ lastimage = getint(datain_rec0,last_content_index,b'H')
+ # print("Old First Image, last Image", firstimage,lastimage)
+ if lastimage == 0xffff:
+ # find the lowest of the next sections and copy up to that.
+ ofs_list = [(fcis_index,b'L'),(flis_index,b'L'),(datp_index,b'L'),(hufftbloff, b'L')]
+ for ofs,sz in ofs_list:
+ n = getint(datain_rec0,ofs,sz)
+ # print("n",n)
+ if n > 0 and n < lastimage:
+ lastimage = n-1
+ print("First Image, last Image", firstimage,lastimage)
+
+ # Try to null out FONT and RES, but leave the (empty) PDB record so image refs remain valid
+ for i in range(firstimage,lastimage):
+ imgsec = readsection(self.result_file7,i)
+ if imgsec[0:4] in [b'RESC',b'FONT']:
+ self.result_file7 = nullsection(self.result_file7,i)
+
+ # mobi7 finished
+
+ # create standalone mobi8
+ self.result_file8 = deletesectionrange(datain,0,datain_kf8-1)
+ target = getint(datain_kfrec0,first_resc_record)
+ self.result_file8 = insertsectionrange(datain,firstimage,lastimage,self.result_file8,target)
+ datain_kfrec0 =readsection(self.result_file8,0)
+
+ # Only keep the correct EXTH 116 StartOffset, KG 2.5 carries over the one from the mobi7 part, which then points at garbage in the mobi8 part, and confuses FW 3.4
+ kf8starts = read_exth(datain_kfrec0,116)
+ # If we have multiple StartOffset, keep only the last one
+ kf8start_count = len(kf8starts)
+ while kf8start_count > 1:
+ kf8start_count -= 1
+ datain_kfrec0 = del_exth(datain_kfrec0,116)
+
+ # update the EXTH 125 KF8 Count of Images/Fonts/Resources
+ datain_kfrec0 = write_exth(datain_kfrec0,125,struct.pack(b'>L',lastimage-firstimage+1))
+
+ # need to reset flags stored in 0x80-0x83
+ # old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050
+ # standalone mobi8 with exth: 0x0050
+ # Bit Flags
+ # 0x1000 = Bit 12 indicates if embedded fonts are used or not
+ # 0x0800 = means this Header points to *shared* images/resource/fonts ??
+ # 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8?
+ # 0x0040 = exth exists
+ # 0x0010 = Not sure but this is always set so far
+ fval, = struct.unpack_from('>L',datain_kfrec0, 0x80)
+ fval = fval & 0x1FFF
+ fval |= 0x0800
+ datain_kfrec0 = datain_kfrec0[:0x80] + struct.pack(b'>L',fval) + datain_kfrec0[0x84:]
+
+ # properly update other index pointers that have been shifted by the insertion of images
+ ofs_list = [(kf8_fdst_index,b'L'),(fcis_index,b'L'),(flis_index,b'L'),(datp_index,b'L'),(hufftbloff, b'L')]
+ for ofs,sz in ofs_list:
+ n = getint(datain_kfrec0,ofs,sz)
+ if n != 0xffffffff:
+ datain_kfrec0 = writeint(datain_kfrec0,ofs,n+lastimage-firstimage+1,sz)
+ self.result_file8 = writesection(self.result_file8,0,datain_kfrec0)
+
+ # no need to replace kf8 style fcis with mobi 7 one
+ # fcis_secnum, = struct.unpack_from(b'>L',datain_kfrec0, 0xc8)
+ # if fcis_secnum != 0xffffffff:
+ # fcis_info = readsection(self.result_file8, fcis_secnum)
+ # text_len, = struct.unpack_from(b'>L', fcis_info, 0x14)
+ # new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
+ # new_fcis += struct.pack(b'>L',text_len)
+ # new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
+ # self.result_file8 = writesection(self.result_file8, fcis_secnum, new_fcis)
+
+ # mobi8 finished
+
+ def getResult8(self):
+ return self.result_file8
+
+ def getResult7(self):
+ return self.result_file7