aboutsummaryrefslogtreecommitdiffstats
path: root/epy_extras/KindleUnpack/compatibility_utils.py
diff options
context:
space:
mode:
authorbenadha <benawiadha@gmail.com>2022-01-08 10:30:31 +0700
committerbenadha <benawiadha@gmail.com>2022-01-08 10:30:31 +0700
commitc0bfec3da1c55ae11a991a89938261c4e93ce795 (patch)
treeead3835a551887f11426e45326166cdbd31c9146 /epy_extras/KindleUnpack/compatibility_utils.py
parent7e4a230a8336844cb344301870b87e7bea1691a6 (diff)
downloadepy-c0bfec3da1c55ae11a991a89938261c4e93ce795.tar.gz
Initial prepackaging kindleunpack with epy
Diffstat (limited to 'epy_extras/KindleUnpack/compatibility_utils.py')
-rwxr-xr-xepy_extras/KindleUnpack/compatibility_utils.py278
1 files changed, 278 insertions, 0 deletions
diff --git a/epy_extras/KindleUnpack/compatibility_utils.py b/epy_extras/KindleUnpack/compatibility_utils.py
new file mode 100755
index 0000000..c46c0bb
--- /dev/null
+++ b/epy_extras/KindleUnpack/compatibility_utils.py
@@ -0,0 +1,278 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+# Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without modification,
+# are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this list of
+# conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice, this list
+# of conditions and the following disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
+# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+import sys
+import codecs
+
+PY2 = sys.version_info[0] == 2
+PY3 = sys.version_info[0] == 3
+
+iswindows = sys.platform.startswith('win')
+
+try:
+ from urllib.parse import unquote
+except ImportError:
+ from urllib import unquote
+
+if PY2:
+ from HTMLParser import HTMLParser
+ _h = HTMLParser()
+elif sys.version_info[1] < 4:
+ import html.parser
+ _h = html.parser.HTMLParser()
+else:
+ import html as _h
+
+if PY3:
+ text_type = str
+ binary_type = bytes
+ # if will be printing arbitraty binary data to stdout on python 3
+ # sys.stdin = sys.stdin.detach()
+ # sys.stdout = sys.stdout.detach()
+ # sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
+else:
+ range = xrange
+ text_type = unicode
+ binary_type = str
+ # if will be printing unicode under python 2 need to protect
+ # against sys.stdout.encoding being None stupidly forcing forcing ascii encoding of unicode
+ # sys.stdout = codecs.getwriter("utf-8")(sys.stdout)
+ # alternatively set environment variable as follows **before** launching python: export PYTHONIOENCODING=UTF-8
+
+# NOTE: Python 3 is completely broken when accessing single bytes in bytes strings
+# (and they amazingly claim by design and no bug!)
+
+# To illustrate: this works for unicode in Python 3 and for all Python 2.X for both bytestrings and unicode
+# >>> o = '123456789'
+# >>> o[-3]
+# '7'
+# >>> type(o[-3])
+# <class 'str'>
+# >>> type(o)
+# <class 'str'>
+
+# Unfortunately, this is what Python 3 does for no sane reason and only for bytestrings
+# >>> o = b'123456789'
+# >>> o[-3]
+# 55
+# >>> type(o[-3])
+# <class 'int'>
+# >>> type(o)
+# <class 'bytes'>
+
+# This mind boggling behaviour also happens when indexing a bytestring and/or
+# iteratoring over a bytestring. In other words it will return an int but not
+# the byte itself!!!!!!!
+
+# The only way to access a single byte as a byte in bytestring and get the byte in both
+# Python 2 and Python 3 is to use a slice
+
+# This problem is so common there are horrible hacks floating around the net to **try**
+# to work around it, so that code that works on both Python 2 and Python 3 is possible.
+
+# So in order to write code that works on both Python 2 and Python 3
+# if you index or access a single byte and want its ord() then use the bord() function.
+# If instead you want it as a single character byte use the bchar() function
+# both of which are defined below.
+
+if PY3:
+ # Also Note: if decode a bytestring using 'latin-1' (or any other full range 0-255 encoding)
+ # in place of ascii you will get a byte value to half-word or integer value
+ # one-to-one mapping (in the 0 - 255 range)
+
+ def bchr(s):
+ return bytes([s])
+
+ def bstr(s):
+ if isinstance(s, str):
+ return bytes(s, 'latin-1')
+ else:
+ return bytes(s)
+
+ def bord(s):
+ return s
+
+ def bchar(s):
+ return bytes([s])
+
+else:
+ def bchr(s):
+ return chr(s)
+
+ def bstr(s):
+ return str(s)
+
+ def bord(s):
+ return ord(s)
+
+ def bchar(s):
+ return s
+
+if PY3:
+ # list-producing versions of the major Python iterating functions
+ def lrange(*args, **kwargs):
+ return list(range(*args, **kwargs))
+
+ def lzip(*args, **kwargs):
+ return list(zip(*args, **kwargs))
+
+ def lmap(*args, **kwargs):
+ return list(map(*args, **kwargs))
+
+ def lfilter(*args, **kwargs):
+ return list(filter(*args, **kwargs))
+else:
+ import __builtin__
+ # Python 2-builtin ranges produce lists
+ lrange = __builtin__.range
+ lzip = __builtin__.zip
+ lmap = __builtin__.map
+ lfilter = __builtin__.filter
+
+# In Python 3 you can no longer use .encode('hex') on a bytestring
+# instead use the following on both platforms
+import binascii
+def hexlify(bdata):
+ return (binascii.hexlify(bdata)).decode('ascii')
+
+# If you: import struct
+# Note: struct pack, unpack, unpack_from all *require* bytestring format
+# data all the way up to at least Python 2.7.5, Python 3 is okay with either
+
+# If you: import re
+# note: Python 3 "re" requires the pattern to be the exact same type as the data to be
+# searched ... but u"" is not allowed for the pattern itself only b""
+# Python 2.X allows the pattern to be any type and converts it to match the data
+# and returns the same type as the data
+
+# convert string to be utf-8 encoded
+def utf8_str(p, enc='utf-8'):
+ if p is None:
+ return None
+ if isinstance(p, text_type):
+ return p.encode('utf-8')
+ if enc != 'utf-8':
+ return p.decode(enc).encode('utf-8')
+ return p
+
+# convert string to be unicode encoded
+def unicode_str(p, enc='utf-8'):
+ if p is None:
+ return None
+ if isinstance(p, text_type):
+ return p
+ return p.decode(enc)
+
+ASCII_CHARS = set(chr(x) for x in range(128))
+URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+ 'abcdefghijklmnopqrstuvwxyz'
+ '0123456789' '#' '_.-/~')
+IRI_UNSAFE = ASCII_CHARS - URL_SAFE
+
+# returns a quoted IRI (not a URI)
+def quoteurl(href):
+ if isinstance(href,binary_type):
+ href = href.decode('utf-8')
+ result = []
+ for char in href:
+ if char in IRI_UNSAFE:
+ char = "%%%02x" % ord(char)
+ result.append(char)
+ return ''.join(result)
+
+# unquotes url/iri
+def unquoteurl(href):
+ if isinstance(href,binary_type):
+ href = href.decode('utf-8')
+ href = unquote(href)
+ return href
+
+# unescape html
+def unescapeit(sval):
+ return _h.unescape(sval)
+
+# Python 2.X commandline parsing under Windows has been horribly broken for years!
+# Use the following code to emulate full unicode commandline parsing on Python 2
+# ie. To get sys.argv arguments and properly encode them as unicode
+
+def unicode_argv():
+ global iswindows
+ global PY3
+ if PY3:
+ return sys.argv
+ if iswindows:
+ # Versions 2.x of Python don't support Unicode in sys.argv on
+ # Windows, with the underlying Windows API instead replacing multi-byte
+ # characters with '?'. So use shell32.GetCommandLineArgvW to get sys.argv
+ # as a list of Unicode strings
+ from ctypes import POINTER, byref, cdll, c_int, windll
+ from ctypes.wintypes import LPCWSTR, LPWSTR
+
+ GetCommandLineW = cdll.kernel32.GetCommandLineW
+ GetCommandLineW.argtypes = []
+ GetCommandLineW.restype = LPCWSTR
+
+ CommandLineToArgvW = windll.shell32.CommandLineToArgvW
+ CommandLineToArgvW.argtypes = [LPCWSTR, POINTER(c_int)]
+ CommandLineToArgvW.restype = POINTER(LPWSTR)
+
+ cmd = GetCommandLineW()
+ argc = c_int(0)
+ argv = CommandLineToArgvW(cmd, byref(argc))
+ if argc.value > 0:
+ # Remove Python executable and commands if present
+ start = argc.value - len(sys.argv)
+ return [argv[i] for i in
+ range(start, argc.value)]
+ # this should never happen
+ return None
+ else:
+ argv = []
+ argvencoding = sys.stdin.encoding
+ if argvencoding is None:
+ argvencoding = sys.getfilesystemencoding()
+ if argvencoding is None:
+ argvencoding = 'utf-8'
+ for arg in sys.argv:
+ if isinstance(arg, text_type):
+ argv.append(arg)
+ else:
+ argv.append(arg.decode(argvencoding))
+ return argv
+
+
+# Python 2.X is broken in that it does not recognize CP65001 as UTF-8
+def add_cp65001_codec():
+ if PY2:
+ try:
+ codecs.lookup('cp65001')
+ except LookupError:
+ codecs.register(
+ lambda name: name == 'cp65001' and codecs.lookup('utf-8') or None)
+ return