diff options
author | Benawi Adha <benawiadha@gmail.com> | 2022-10-02 21:22:38 +0700 |
---|---|---|
committer | Benawi Adha <benawiadha@gmail.com> | 2022-10-02 21:22:38 +0700 |
commit | 258c30d2e088cd4ab091a53794da3f93af79915d (patch) | |
tree | f49340bf565deb20c730358af74a01bcc231de53 /src/epy_reader/tools/KindleUnpack/compatibility_utils.py | |
parent | d43533f01d9d5baf5f78b71f832641382bd5962a (diff) | |
download | epy-258c30d2e088cd4ab091a53794da3f93af79915d.tar.gz |
Major refactor: break down epy.py script
into package project structure for easier
development
Squashed commit of the following:
commit 01309b961a4ab32394bff0d90949b57435dfda47
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sun Oct 2 21:15:04 2022 +0700
Fix missing objects
commit aab2e773c30b255c81b1250b3b20967d5da40338
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sun Oct 2 21:09:31 2022 +0700
Update README.md
commit d4e98926bcd9b00ce0410ad71249d24e6315abc5
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sun Oct 2 21:07:28 2022 +0700
Add keywords in pyproject.toml
commit 432055af8245560a3ff2e046aef0b4e87da44930
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sun Oct 2 21:04:34 2022 +0700
Bump version and deprecate setup.py
commit 51dd15aab8f8ff5996f822f8378e813f0b9fb80d
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sun Oct 2 20:56:38 2022 +0700
Formatting
commit 81fb35e3b6fa0e27d79ef1da77202ed81eb99500
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sun Oct 2 20:55:08 2022 +0700
Fix speakers module
commit 3b852e7c59b38d5a28520038e35f50a95270d2f1
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 20:52:46 2022 +0700
Fix circular import
commit 061e8a2649dabacd28a9e2f972559475316c654c
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 20:39:27 2022 +0700
Run formatting
commit abc2d0ab156992c63dc04745d14a69679a60accb
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 20:39:00 2022 +0700
Update isort and black config in pyproject
commit 5dc2e41bab5b997bd719bdc1561eb51ba0c17a83
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 20:31:00 2022 +0700
Add app Config
commit ed485a2ea8281585bf86dc5772f0c6dd9c803cc4
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 20:23:02 2022 +0700
Update debugpy script
commit 68b0553dd4d63eb4b847132c68ea4018587fa8ec
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 20:14:11 2022 +0700
Connect reader to main script
commit 63c3dd176f18a784a4ed2e88aa72b13d1c2b0990
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 20:11:17 2022 +0700
Implement reader
commit ce5eec8fb4e1db3870a16a07541365cd777d6c4c
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 19:29:49 2022 +0700
Fix script in pyproject.toml
commit 941e8e49f1593731fb582d92084206772b3f0442
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 19:28:39 2022 +0700
Rename modules
commit 5a3e7f766aee774c09b3b5336f3a2968e9cb1d0c
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 19:28:20 2022 +0700
Rename tool method
commit 3c0503ff475cb7eff8b12d3be0bda7a38efe1072
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 19:27:03 2022 +0700
Add ebooks lib
commit b5f71c3296a7d6f36454f6e1cbe84e15a45092ee
Author: Benawi Adha <benawiadha@gmail.com>
Date: Sat Oct 1 17:25:11 2022 +0700
Initial reorganization
Diffstat (limited to 'src/epy_reader/tools/KindleUnpack/compatibility_utils.py')
-rwxr-xr-x | src/epy_reader/tools/KindleUnpack/compatibility_utils.py | 278 |
1 files changed, 278 insertions, 0 deletions
diff --git a/src/epy_reader/tools/KindleUnpack/compatibility_utils.py b/src/epy_reader/tools/KindleUnpack/compatibility_utils.py new file mode 100755 index 0000000..c46c0bb --- /dev/null +++ b/src/epy_reader/tools/KindleUnpack/compatibility_utils.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +# Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this list of +# conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, this list +# of conditions and the following disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
from __future__ import unicode_literals, division, absolute_import, print_function

# Helpers that let the same source run on both Python 2 and Python 3:
# version flags, text/binary type aliases, single-byte accessors,
# list-returning iteration helpers, and a few string/URL utilities.

import binascii
import codecs
import sys

PY2 = sys.version_info[0] == 2
PY3 = sys.version_info[0] == 3

iswindows = sys.platform.startswith('win')

# unquote lives in urllib.parse on Python 3 and in urllib on Python 2.
try:
    from urllib.parse import unquote
except ImportError:
    from urllib import unquote

# _h is whatever object provides an unescape() callable on this interpreter:
# an HTMLParser instance on Python 2 and on Python 3 before 3.4, and the
# html module itself from 3.4 on.
if PY2:
    from HTMLParser import HTMLParser
    _h = HTMLParser()
elif sys.version_info < (3, 4):
    # Compare the whole version tuple rather than only the minor number so
    # that the test stays correct for any major version.
    import html.parser
    _h = html.parser.HTMLParser()
else:
    import html as _h

if PY3:
    text_type = str
    binary_type = bytes
    # If you will be printing arbitrary binary data to stdout on Python 3:
    # sys.stdin = sys.stdin.detach()
    # sys.stdout = sys.stdout.detach()
    # sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
else:
    range = xrange
    text_type = unicode
    binary_type = str
    # If you will be printing unicode under Python 2 you need to protect
    # against sys.stdout.encoding being None, which forces ascii encoding
    # of unicode:
    # sys.stdout = codecs.getwriter("utf-8")(sys.stdout)
    # Alternatively set this environment variable **before** launching
    # python: export PYTHONIOENCODING=UTF-8

# NOTE: Python 3 returns an int, not a length-1 bytestring, when a single
# element of a bytestring is accessed.

# To illustrate: this works for unicode in Python 3 and for all Python 2.X
# for both bytestrings and unicode
# >>> o = '123456789'
# >>> o[-3]
# '7'
# >>> type(o[-3])
# <class 'str'>
# >>> type(o)
# <class 'str'>

# This is what Python 3 does, and only for bytestrings
# >>> o = b'123456789'
# >>> o[-3]
# 55
# >>> type(o[-3])
# <class 'int'>
# >>> type(o)
# <class 'bytes'>

# The same happens when indexing a bytestring and/or iterating over a
# bytestring: you get an int, not the byte itself.

# The only way to access a single byte as a byte in a bytestring and get
# the byte in both Python 2 and Python 3 is to use a slice.

# So in order to write code that works on both Python 2 and Python 3:
# if you index or access a single byte and want its ord() then use the
# bord() function.  If instead you want it as a single character byte use
# the bchar() function.  Both are defined below.

if PY3:
    # Also note: if you decode a bytestring using 'latin-1' (or any other
    # full range 0-255 encoding) in place of ascii you will get a one-to-one
    # mapping between byte values and code points in the 0 - 255 range.

    def bchr(s):
        """Return the length-1 bytestring whose byte value is the int s."""
        return bytes([s])

    def bstr(s):
        """Convert s to a bytestring.

        Text is encoded as latin-1; anything else is handed to bytes()
        unchanged (so an int n yields n zero bytes, as bytes(n) does).
        """
        if isinstance(s, str):
            return bytes(s, 'latin-1')
        return bytes(s)

    def bord(s):
        """Return the ordinal of an indexed byte (already an int here)."""
        return s

    def bchar(s):
        """Return an indexed byte (an int here) as a length-1 bytestring."""
        return bytes([s])

else:
    def bchr(s):
        """Return the length-1 bytestring whose byte value is the int s."""
        return chr(s)

    def bstr(s):
        """Convert s to a bytestring via str()."""
        return str(s)

    def bord(s):
        """Return the ordinal of an indexed byte (a length-1 str here)."""
        return ord(s)

    def bchar(s):
        """Return an indexed byte unchanged (already a length-1 str here)."""
        return s

if PY3:
    # list-producing versions of the major Python iterating functions
    def lrange(*args, **kwargs):
        """Return range(...) materialized as a list."""
        return list(range(*args, **kwargs))

    def lzip(*args, **kwargs):
        """Return zip(...) materialized as a list."""
        return list(zip(*args, **kwargs))

    def lmap(*args, **kwargs):
        """Return map(...) materialized as a list."""
        return list(map(*args, **kwargs))

    def lfilter(*args, **kwargs):
        """Return filter(...) materialized as a list."""
        return list(filter(*args, **kwargs))
else:
    import __builtin__
    # The Python 2 builtins already produce lists.  Take them from
    # __builtin__ because the module-level name range is rebound to xrange.
    lrange = __builtin__.range
    lzip = __builtin__.zip
    lmap = __builtin__.map
    lfilter = __builtin__.filter


# In Python 3 you can no longer use .encode('hex') on a bytestring;
# instead use the following on both platforms.
def hexlify(bdata):
    """Return the hexadecimal representation of bytestring bdata as text."""
    return binascii.hexlify(bdata).decode('ascii')

# If you: import struct
# Note: struct pack, unpack, unpack_from all *require* bytestring format
# data all the way up to at least Python 2.7.5, Python 3 is okay with either

# If you: import re
# note: Python 3 "re" requires the pattern to be the exact same type as the
# data to be searched ... but u"" is not allowed for the pattern itself only b""
# Python 2.X allows the pattern to be any type and converts it to match the
# data and returns the same type as the data


def utf8_str(p, enc='utf-8'):
    """Return p as a utf-8 encoded bytestring.

    None is passed through.  Text is encoded as utf-8.  A bytestring is
    assumed to be encoded in enc; it is transcoded to utf-8 when enc is not
    'utf-8' and returned unchanged otherwise.
    """
    if p is None:
        return None
    if isinstance(p, text_type):
        return p.encode('utf-8')
    if enc != 'utf-8':
        return p.decode(enc).encode('utf-8')
    return p


def unicode_str(p, enc='utf-8'):
    """Return p as text.

    None and text are passed through; anything else is decoded using enc.
    """
    if p is None:
        return None
    if isinstance(p, text_type):
        return p
    return p.decode(enc)


ASCII_CHARS = set(chr(x) for x in range(128))
URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '#' '_.-/~')
# ASCII characters that must be percent-encoded; non-ASCII characters are
# never in this set, which is what makes the result an IRI rather than a URI.
IRI_UNSAFE = ASCII_CHARS - URL_SAFE


def quoteurl(href):
    """Return href as a quoted IRI (not a URI).

    A bytestring is first decoded as utf-8.  Every character in IRI_UNSAFE
    is replaced by '%' followed by its two-digit lowercase hex ordinal; all
    other characters, including non-ASCII ones, are kept as they are.
    """
    if isinstance(href, binary_type):
        href = href.decode('utf-8')
    return ''.join(
        "%%%02x" % ord(char) if char in IRI_UNSAFE else char
        for char in href)


def unquoteurl(href):
    """Return href with percent-escapes decoded by unquote().

    A bytestring is first decoded as utf-8.
    """
    if isinstance(href, binary_type):
        href = href.decode('utf-8')
    return unquote(href)


def unescapeit(sval):
    """Return sval with HTML character references unescaped."""
    return _h.unescape(sval)


# Python 2.X command-line parsing under Windows does not deliver unicode.
# Use the following code to emulate full unicode command-line parsing on
# Python 2, i.e. to get sys.argv arguments properly decoded as unicode.

def unicode_argv():
    """Return the command-line arguments as a list of text strings.

    On Python 3 this is sys.argv itself.  On Python 2 under Windows the
    arguments are re-read through the Win32 API; elsewhere on Python 2 each
    non-text argument is decoded with sys.stdin.encoding, falling back to
    the filesystem encoding and then to 'utf-8'.

    Returns None only if the Windows API reports no arguments, which is not
    expected to happen.
    """
    if PY3:
        return sys.argv
    if iswindows:
        # Versions 2.x of Python don't support Unicode in sys.argv on
        # Windows, with the underlying Windows API instead replacing
        # multi-byte characters with '?'.  So use
        # shell32.CommandLineToArgvW to get sys.argv as a list of Unicode
        # strings.
        from ctypes import POINTER, byref, cdll, c_int, windll
        from ctypes.wintypes import LPCWSTR, LPWSTR

        GetCommandLineW = cdll.kernel32.GetCommandLineW
        GetCommandLineW.argtypes = []
        GetCommandLineW.restype = LPCWSTR

        CommandLineToArgvW = windll.shell32.CommandLineToArgvW
        CommandLineToArgvW.argtypes = [LPCWSTR, POINTER(c_int)]
        CommandLineToArgvW.restype = POINTER(LPWSTR)

        cmd = GetCommandLineW()
        argc = c_int(0)
        argv = CommandLineToArgvW(cmd, byref(argc))
        if argc.value > 0:
            # Remove Python executable and commands if present: keep only
            # the trailing len(sys.argv) entries.
            start = argc.value - len(sys.argv)
            return [argv[i] for i in range(start, argc.value)]
        # this should never happen
        return None

    argvencoding = sys.stdin.encoding
    if argvencoding is None:
        argvencoding = sys.getfilesystemencoding()
    if argvencoding is None:
        argvencoding = 'utf-8'
    return [
        arg if isinstance(arg, text_type) else arg.decode(argvencoding)
        for arg in sys.argv
    ]


# Python 2.X does not recognize CP65001 as UTF-8
def add_cp65001_codec():
    """Register 'cp65001' as an alias of utf-8 on Python 2 if it is unknown.

    Does nothing on Python 3 or when the codec can already be looked up.
    Always returns None.
    """
    if PY2:
        try:
            codecs.lookup('cp65001')
        except LookupError:
            def _cp65001_search(name):
                # Codec search functions return None for names they do not
                # handle so that other registered searchers get a chance.
                if name == 'cp65001':
                    return codecs.lookup('utf-8')
                return None

            codecs.register(_cp65001_search)
    return