From c4a7abd99f96f37ef0b18b1fd8e3660470a98829 Mon Sep 17 00:00:00 2001 From: Matěj Cepl Date: Thu, 18 Jun 2020 10:33:02 +0200 Subject: Make epubgrep concurrent for speed improvement. Surprisingly ProcessPoolExecutor is vastly faster and less memory demanding than ThreadPoolExecutor. --- epubgrep.py | 54 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 17 deletions(-) (limited to 'epubgrep.py') diff --git a/epubgrep.py b/epubgrep.py index 0451c17..cf70bcf 100755 --- a/epubgrep.py +++ b/epubgrep.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 import argparse +import concurrent.futures import logging import os.path import re @@ -17,7 +18,8 @@ log = logging.getLogger('epubgrep') def get_chapter_title(mdata: List[Dict[str, Any]], fname: str) -> Optional[Tuple[str, int]]: if mdata is not None: - found_list = [(x['title'], x['index']) for x in mdata if x['src'] == fname] + found_list = [(x['title'], x['index']) + for x in mdata if x['src'] == fname] if len(found_list) > 0: chap_title = found_list[0][0].strip(' \t.0123456789') return chap_title, found_list[0][1] @@ -29,12 +31,14 @@ def grep_book(filename: str, pattern: str, flags: int, counting: bool=False, col assert os.path.isfile(filename), "{} is not EPub file.".format(filename) sought_RE = re.compile('(' + pattern + ')', flags) count = 0 + out_title = '' + out = '' mline = flags & re.M == re.M try: metadata = epub_meta.get_epub_metadata(filename) - except epub_meta.EPubException as ex: + except epub_meta.EPubException: log.exception('Failed to open {}'.format(filename)) book = zipfile.ZipFile(filename) printed_booktitle = False @@ -46,31 +50,35 @@ def grep_book(filename: str, pattern: str, flags: int, counting: bool=False, col res = sought_RE.search(decoded_str) if res: if not printed_booktitle: - print('{}'.format(filename)) + out += '{}\n'.format(filename) printed_booktitle = True if counting: count += 1 else: - chap_info = get_chapter_title(metadata.toc, zif.filename) - print("{}. {}:\n".format(chap_info[1], chap_info[0])) - print('{}\n'.format(res.group(0))) + chap_info = get_chapter_title(metadata.toc, + zif.filename) + out += "{}. {}:\n\n".format(chap_info[1], chap_info[0]) + out += '{}\n\n'.format(res.group(0)) else: printed_title = False for line in inf: decoded_line = line.decode(errors='replace').strip() res = sought_RE.search(decoded_line) if res: - if not printed_booktitle: - print('{}'.format(filename)) - printed_booktitle = True + if not out_title: + out_title = '{}'.format(filename) if counting: count += 1 else: + if not printed_booktitle: + out += out_title + '\n' + printed_booktitle = True if not printed_title: chap_info = get_chapter_title(metadata.toc, zif.filename) if chap_info is not None: - print("{}. {}:\n".format(chap_info[1], chap_info[0])) + out += "{}. {}:\n\n".format(chap_info[1], + chap_info[0]) printed_title = True # https://stackoverflow.com/a/33206814 # print("\033[31;1;4mHello\033[0m") @@ -79,12 +87,14 @@ def grep_book(filename: str, pattern: str, flags: int, counting: bool=False, col found_line = decoded_line.replace( res.group(1), "\033[31;1m" + res.group(1) + "\033[31;0m") - print('{}'.format(found_line)) + out += '{}\n'.format(found_line) else: - print(decoded_line) + out += decoded_line + '\n' if count > 0: - print('Found: {}'.format(count)) + out += '{:02d}:{}'.format(count, out_title) + + return out if __name__ == "__main__": @@ -113,9 +123,19 @@ if __name__ == "__main__": if args.multi_line: search_flags |= re.M | re.S - for filename in args.files: - book_fname = os.path.realpath(filename) + with concurrent.futures.ProcessPoolExecutor() as executor: + fut_to_fname = {executor.submit(grep_book, + os.path.realpath(filename), + args.pattern, search_flags, + args.count, args.color): + filename for filename in args.files} + for future in concurrent.futures.as_completed(fut_to_fname): + fname = fut_to_fname[future] try: - grep_book(book_fname, args.pattern, search_flags, args.count, args.color) - except BrokenPipeError: + data = future.result() + if data: + data = data.rstrip() + if len(data) > 0: + print(data) + except (BrokenPipeError, KeyboardInterrupt): sys.exit() -- cgit