aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Jake Hunsaker <jhunsake@redhat.com> 2021-07-16 12:50:06 -0400
committer: Jake Hunsaker <jhunsake@redhat.com> 2021-08-04 09:00:27 -0400
commit: 611b17788ee289bebfe3b13404fae73efd112f70 (patch)
tree: c835998174af28aec1052ee9145d1810a30ac990
parent: 4e5bebffca9936bcdf4d38aad9989970a15dd72b (diff)
download: sos-611b17788ee289bebfe3b13404fae73efd112f70.tar.gz
[cleaner] Use a nested ProcessPoolExecutor for extraction
This commit inserts a nested ProcessPoolExecutor into the extraction workflow for archives that are being obfuscated by `sos clean`. Previously, the extraction was handled inside the same thread as the rest of the obfuscation routines for each archive. However, it has been found that when very large archives are manipulated concurrently, performance can take a massive hit during the extraction process. This is due to GIL limitations. In this context, 'very large archives' means archives containing many tens of thousands of files - e.g. 50K+. Because TarFile uses a 10K-internal buffer, we end up spending a lot of time processing each file via the interpreter. By shunting each extraction off into a new process space, we can avoid the GIL issues altogether. Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
-rw-r--r-- sos/cleaner/obfuscation_archive.py 24
1 files changed, 19 insertions, 5 deletions
diff --git a/sos/cleaner/obfuscation_archive.py b/sos/cleaner/obfuscation_archive.py
index e357450b..ea0b7012 100644
--- a/sos/cleaner/obfuscation_archive.py
+++ b/sos/cleaner/obfuscation_archive.py
@@ -15,6 +15,19 @@ import stat
import tarfile
import re
+from concurrent.futures import ProcessPoolExecutor
+
+
+# python older than 3.8 will hit a pickling error when we go to spawn a new
+# process for extraction if this method is a part of the SoSObfuscationArchive
+# class. So, the simplest solution is to remove it from the class.
+def extract_archive(archive_path, tmpdir):
+ archive = tarfile.open(archive_path)
+ path = os.path.join(tmpdir, 'cleaner')
+ archive.extractall(path)
+ archive.close()
+ return os.path.join(path, archive.name.split('/')[-1].split('.tar')[0])
+
class SoSObfuscationArchive():
"""A representation of an extracted archive or an sos archive build
@@ -193,11 +206,12 @@ class SoSObfuscationArchive():
"""Extract an archive into our tmpdir so that we may inspect it or
iterate through its contents for obfuscation
"""
- archive = tarfile.open(self.archive_path)
- path = os.path.join(self.tmpdir, 'cleaner')
- archive.extractall(path)
- archive.close()
- return os.path.join(path, archive.name.split('/')[-1].split('.tar')[0])
+
+ with ProcessPoolExecutor(1) as _pool:
+ _path_future = _pool.submit(extract_archive,
+ self.archive_path, self.tmpdir)
+ path = _path_future.result()
+ return path
def get_file_list(self):
"""Return a list of all files within the archive"""