diff options
author | Jake Hunsaker <jhunsake@redhat.com> | 2021-07-16 12:50:06 -0400 |
---|---|---|
committer | Jake Hunsaker <jhunsake@redhat.com> | 2021-08-04 09:00:27 -0400 |
commit | 611b17788ee289bebfe3b13404fae73efd112f70 (patch) | |
tree | c835998174af28aec1052ee9145d1810a30ac990 | |
parent | 4e5bebffca9936bcdf4d38aad9989970a15dd72b (diff) | |
download | sos-611b17788ee289bebfe3b13404fae73efd112f70.tar.gz |
[cleaner] Use a nested ProcessPoolExecutor for extraction
This commit inserts a nested ProcessPoolExecutor into the extraction
workflow for archives that are being obfuscated by `sos clean`.
Previously, the extraction was handled inside the same thread as the
rest of the obfuscation routines for each archive. However, it has been
found that when very large archives are manipulated concurrently,
performance can take a massive hit during the extraction process. This
is due to GIL limitations.
In this context, 'very large archives' means many tens of thousands of
files - e.g. 50K+. Because TarFile uses a 10K internal buffer, we end up
spending a lot of time processing each file via the interpreter.
By shunting each extraction off into a new process space, we can avoid
the GIL issues altogether.
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
-rw-r--r-- | sos/cleaner/obfuscation_archive.py | 24 |
1 files changed, 19 insertions, 5 deletions
diff --git a/sos/cleaner/obfuscation_archive.py b/sos/cleaner/obfuscation_archive.py
index e357450b..ea0b7012 100644
--- a/sos/cleaner/obfuscation_archive.py
+++ b/sos/cleaner/obfuscation_archive.py
@@ -15,6 +15,19 @@ import stat
 import tarfile
 import re
 
+from concurrent.futures import ProcessPoolExecutor
+
+
+# python older than 3.8 will hit a pickling error when we go to spawn a new
+# process for extraction if this method is a part of the SoSObfuscationArchive
+# class. So, the simplest solution is to remove it from the class.
+def extract_archive(archive_path, tmpdir):
+    archive = tarfile.open(archive_path)
+    path = os.path.join(tmpdir, 'cleaner')
+    archive.extractall(path)
+    archive.close()
+    return os.path.join(path, archive.name.split('/')[-1].split('.tar')[0])
+
 
 class SoSObfuscationArchive():
     """A representation of an extracted archive or an sos archive build
@@ -193,11 +206,12 @@ class SoSObfuscationArchive():
         """Extract an archive into our tmpdir so that we may inspect it or
         iterate through its contents for obfuscation
         """
-        archive = tarfile.open(self.archive_path)
-        path = os.path.join(self.tmpdir, 'cleaner')
-        archive.extractall(path)
-        archive.close()
-        return os.path.join(path, archive.name.split('/')[-1].split('.tar')[0])
+
+        with ProcessPoolExecutor(1) as _pool:
+            _path_future = _pool.submit(extract_archive,
+                                        self.archive_path, self.tmpdir)
+            path = _path_future.result()
+            return path
 
     def get_file_list(self):
         """Return a list of all files within the archive"""