Make epubgrep concurrent for speed improvement. (tag: 0.4.0)

Surprisingly, ProcessPoolExecutor is vastly faster and less memory-demanding than ThreadPoolExecutor.
author: Matěj Cepl <mcepl@cepl.eu> 2020-06-18 10:33:02 +0200
committer: Matěj Cepl <mcepl@cepl.eu> 2020-06-18 10:34:56 +0200
commit: c4a7abd99f96f37ef0b18b1fd8e3660470a98829 (patch)
tree: 53dcb0e00cb7cf3c144948b25eb65f7fe79ac5ef /epubgrep.py
parent: b5a7cf08178b587942c63eb119f66a5411503d1a (diff)
download: epubgrep-c4a7abd99f96f37ef0b18b1fd8e3660470a98829.tar.gz
1 file changed, 37 insertions, 17 deletions
diff --git a/epubgrep.py b/epubgrep.py
index 0451c17..cf70bcf 100755
--- a/epubgrep.py
+++ b/epubgrep.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 import argparse
+import concurrent.futures
 import logging
 import os.path
 import re
@@ -17,7 +18,8 @@ log = logging.getLogger('epubgrep')
 
 def get_chapter_title(mdata: List[Dict[str, Any]], fname: str) -> Optional[Tuple[str, int]]:
     if mdata is not None:
-        found_list = [(x['title'], x['index']) for x in mdata if x['src'] == fname]
+        found_list = [(x['title'], x['index'])
+                      for x in mdata if x['src'] == fname]
         if len(found_list) > 0:
             chap_title = found_list[0][0].strip(' \t.0123456789')
             return chap_title, found_list[0][1]
@@ -29,12 +31,14 @@ def grep_book(filename: str, pattern: str, flags: int, counting: bool=False, col
     assert os.path.isfile(filename), "{} is not EPub file.".format(filename)
     sought_RE = re.compile('(' + pattern + ')', flags)
     count = 0
+    out_title = ''
+    out = ''
 
     mline = flags & re.M == re.M
 
     try:
         metadata = epub_meta.get_epub_metadata(filename)
-    except epub_meta.EPubException as ex:
+    except epub_meta.EPubException:
         log.exception('Failed to open {}'.format(filename))
     book = zipfile.ZipFile(filename)
     printed_booktitle = False
@@ -46,31 +50,35 @@ def grep_book(filename: str, pattern: str, flags: int, counting: bool=False, col
                 res = sought_RE.search(decoded_str)
                 if res:
                     if not printed_booktitle:
-                        print('{}'.format(filename))
+                        out += '{}\n'.format(filename)
                         printed_booktitle = True
                     if counting:
                         count += 1
                     else:
-                        chap_info = get_chapter_title(metadata.toc, zif.filename)
-                        print("{}. {}:\n".format(chap_info[1], chap_info[0]))
-                        print('{}\n'.format(res.group(0)))
+                        chap_info = get_chapter_title(metadata.toc,
+                                                      zif.filename)
+                        out += "{}. {}:\n\n".format(chap_info[1], chap_info[0])
+                        out += '{}\n\n'.format(res.group(0))
             else:
                 printed_title = False
                 for line in inf:
                     decoded_line = line.decode(errors='replace').strip()
                     res = sought_RE.search(decoded_line)
                     if res:
-                        if not printed_booktitle:
-                            print('{}'.format(filename))
-                            printed_booktitle = True
+                        if not out_title:
+                            out_title = '{}'.format(filename)
                         if counting:
                             count += 1
                         else:
+                            if not printed_booktitle:
+                                out += out_title + '\n'
+                                printed_booktitle = True
                             if not printed_title:
                                 chap_info = get_chapter_title(metadata.toc,
                                                               zif.filename)
                                 if chap_info is not None:
-                                    print("{}. {}:\n".format(chap_info[1], chap_info[0]))
+                                    out += "{}. {}:\n\n".format(chap_info[1],
+                                                                chap_info[0])
                                     printed_title = True
                         # https://stackoverflow.com/a/33206814
                         # print("\033[31;1;4mHello\033[0m")
@@ -79,12 +87,14 @@ def grep_book(filename: str, pattern: str, flags: int, counting: bool=False, col
                                 found_line = decoded_line.replace(
                                     res.group(1),
                                     "\033[31;1m" + res.group(1) + "\033[31;0m")
-                                print('{}'.format(found_line))
+                                out += '{}\n'.format(found_line)
                             else:
-                                print(decoded_line)
+                                out += decoded_line + '\n'
 
     if count > 0:
-        print('Found: {}'.format(count))
+        out += '{:02d}:{}'.format(count, out_title)
+
+    return out
 
 
 if __name__ == "__main__":
@@ -113,9 +123,19 @@ if __name__ == "__main__":
     if args.multi_line:
         search_flags |= re.M | re.S
 
-    for filename in args.files:
-        book_fname = os.path.realpath(filename)
+    with concurrent.futures.ProcessPoolExecutor() as executor:
+        fut_to_fname = {executor.submit(grep_book,
+                                        os.path.realpath(filename),
+                                        args.pattern, search_flags,
+                                        args.count, args.color):
+                        filename for filename in args.files}
+    for future in concurrent.futures.as_completed(fut_to_fname):
+        fname = fut_to_fname[future]
         try:
-            grep_book(book_fname, args.pattern, search_flags, args.count, args.color)
-        except BrokenPipeError:
+            data = future.result()
+            if data:
+                data = data.rstrip()
+            if len(data) > 0:
+                print(data)
+        except (BrokenPipeError, KeyboardInterrupt):
             sys.exit()
author	Matěj Cepl <mcepl@cepl.eu>	2020-06-18 10:33:02 +0200
committer	Matěj Cepl <mcepl@cepl.eu>	2020-06-18 10:34:56 +0200
commit	c4a7abd99f96f37ef0b18b1fd8e3660470a98829 (patch)
tree	53dcb0e00cb7cf3c144948b25eb65f7fe79ac5ef /epubgrep.py
parent	b5a7cf08178b587942c63eb119f66a5411503d1a (diff)
download	epubgrep-c4a7abd99f96f37ef0b18b1fd8e3660470a98829.tar.gz