diff options
author | Jake Hunsaker <jhunsake@redhat.com> | 2022-01-13 13:52:34 -0500 |
---|---|---|
committer | Jake Hunsaker <jhunsake@redhat.com> | 2022-01-17 12:24:06 -0500 |
commit | ed618678fd3d07e68e1a430eb7d225a9701332e0 (patch) | |
tree | ca347bf38aa8a5f84b4cc89fbc0b026b2bec5b14 | |
parent | f270220fddb70ef71a8da0376333b2454d7c4983 (diff) | |
download | sos-ed618678fd3d07e68e1a430eb7d225a9701332e0.tar.gz |
[clean,parsers] Build regex lists for static items only once
For parsers such as the username and keyword parsers, we don't discover
new items through parsing archives - these parsers use static lists
determined before we begin the actual obfuscation process.
As such, we can build a list of regexes for these static items once, and
then reference those regexes during execution, rather than rebuilding
the regex for each of these items for every obfuscation.
For use cases where hundreds of items, e.g. hundreds of usernames, are
being obfuscated, this results in a significant performance increase.
Individual per-file gains are minor - fractions of a second - however
these gains build up over the course of the hundreds to thousands of
files a typical archive can be expected to contain.
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
-rw-r--r-- | sos/cleaner/__init__.py | 9 | ||||
-rw-r--r-- | sos/cleaner/parsers/__init__.py | 10 | ||||
-rw-r--r-- | sos/cleaner/parsers/keyword_parser.py | 15 | ||||
-rw-r--r-- | sos/cleaner/parsers/username_parser.py | 14 | ||||
-rw-r--r-- | tests/unittests/cleaner_tests.py | 1 |
5 files changed, 38 insertions, 11 deletions
diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py index 5686e213..b76bef64 100644 --- a/sos/cleaner/__init__.py +++ b/sos/cleaner/__init__.py @@ -294,6 +294,7 @@ third party. # we have at least one valid target to obfuscate self.completed_reports = [] self.preload_all_archives_into_maps() + self.generate_parser_item_regexes() self.obfuscate_report_paths() if not self.completed_reports: @@ -498,6 +499,14 @@ third party. shutil.move(archive.final_archive_path, dest) archive.final_archive_path = dest_name + def generate_parser_item_regexes(self): + """For the parsers that use prebuilt lists of items, generate those + regexes now since all the parsers should be preloaded by the archive(s) + as well as being handed cmdline options and mapping file configuration. + """ + for parser in self.parsers: + parser.generate_item_regexes() + def preload_all_archives_into_maps(self): """Before doing the actual obfuscation, if we have multiple archives to obfuscate then we need to preload each of them into the mappings diff --git a/sos/cleaner/parsers/__init__.py b/sos/cleaner/parsers/__init__.py index e62fd938..6def863a 100644 --- a/sos/cleaner/parsers/__init__.py +++ b/sos/cleaner/parsers/__init__.py @@ -46,9 +46,19 @@ class SoSCleanerParser(): map_file_key = 'unset' def __init__(self, config={}): + self.regexes = {} if self.map_file_key in config: self.mapping.conf_update(config[self.map_file_key]) + def generate_item_regexes(self): + """Generate regexes for items the parser will be searching for + repeatedly without needing to generate them for every file and/or line + we process + + Not used by all parsers. + """ + pass + def parse_line(self, line): """This will be called for every line in every file we process, so that every parser has a chance to scrub everything. 
diff --git a/sos/cleaner/parsers/keyword_parser.py b/sos/cleaner/parsers/keyword_parser.py index 694c6073..362a1929 100644 --- a/sos/cleaner/parsers/keyword_parser.py +++ b/sos/cleaner/parsers/keyword_parser.py @@ -9,6 +9,7 @@ # See the LICENSE file in the source distribution for further information. import os +import re from sos.cleaner.parsers import SoSCleanerParser from sos.cleaner.mappings.keyword_map import SoSKeywordMap @@ -33,16 +34,20 @@ class SoSKeywordParser(SoSCleanerParser): # pre-generate an obfuscation mapping for each keyword # this is necessary for cases where filenames are being # obfuscated before or instead of file content - self.mapping.get(keyword) + self.mapping.get(keyword.lower()) self.user_keywords.append(keyword) if keyword_file and os.path.exists(keyword_file): with open(keyword_file, 'r') as kwf: self.user_keywords.extend(kwf.read().splitlines()) + def generate_item_regexes(self): + for kw in self.user_keywords: + self.regexes[kw] = re.compile(kw, re.I) + def parse_line(self, line): count = 0 - for keyword in sorted(self.user_keywords, reverse=True): - if keyword in line: - line = line.replace(keyword, self.mapping.get(keyword)) - count += 1 + for kwrd, reg in sorted(self.regexes.items(), key=len, reverse=True): + if reg.search(line): + line, _count = reg.subn(self.mapping.get(kwrd.lower()), line) + count += _count return line, count diff --git a/sos/cleaner/parsers/username_parser.py b/sos/cleaner/parsers/username_parser.py index 3208a655..49640f7f 100644 --- a/sos/cleaner/parsers/username_parser.py +++ b/sos/cleaner/parsers/username_parser.py @@ -61,12 +61,14 @@ class SoSUsernameParser(SoSCleanerParser): for each in users: self.mapping.get(each) + def generate_item_regexes(self): + for user in self.mapping.dataset: + self.regexes[user] = re.compile(user, re.I) + def parse_line(self, line): count = 0 - for username in sorted(self.mapping.dataset.keys(), reverse=True): - _reg = re.compile(username, re.I) - if _reg.search(line): - line, 
count = _reg.subn( - self.mapping.get(username.lower()), line - ) + for user, reg in sorted(self.regexes.items(), key=len, reverse=True): + if reg.search(line): + line, _count = reg.subn(self.mapping.get(user.lower()), line) + count += _count return line, count diff --git a/tests/unittests/cleaner_tests.py b/tests/unittests/cleaner_tests.py index cb20772f..b59eade9 100644 --- a/tests/unittests/cleaner_tests.py +++ b/tests/unittests/cleaner_tests.py @@ -105,6 +105,7 @@ class CleanerParserTests(unittest.TestCase): self.host_parser = SoSHostnameParser(config={}, opt_domains='foobar.com') self.kw_parser = SoSKeywordParser(config={}, keywords=['foobar']) self.kw_parser_none = SoSKeywordParser(config={}) + self.kw_parser.generate_item_regexes() def test_ip_parser_valid_ipv4_line(self): line = 'foobar foo 10.0.0.1/24 barfoo bar' |