aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJake Hunsaker <jhunsake@redhat.com>2022-01-13 13:52:34 -0500
committerJake Hunsaker <jhunsake@redhat.com>2022-01-17 12:24:06 -0500
commited618678fd3d07e68e1a430eb7d225a9701332e0 (patch)
treeca347bf38aa8a5f84b4cc89fbc0b026b2bec5b14
parentf270220fddb70ef71a8da0376333b2454d7c4983 (diff)
downloadsos-ed618678fd3d07e68e1a430eb7d225a9701332e0.tar.gz
[clean,parsers] Build regex lists for static items only once
For parsers such as the username and keyword parsers, we don't discover new items through parsing archives - these parsers use static lists determined before we begin the actual obfuscation process. As such, we can build a list of regexes for these static items once, and then reference those regexes during execution, rather than rebuilding the regex for each of these items for every obfuscation. For use cases where hundreds of items (e.g. hundreds of usernames) are being obfuscated, this results in a significant performance increase. Individual per-file gains are minor - fractions of a second - however these gains build up over the course of the hundreds to thousands of files a typical archive can be expected to contain. Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
-rw-r--r--sos/cleaner/__init__.py9
-rw-r--r--sos/cleaner/parsers/__init__.py10
-rw-r--r--sos/cleaner/parsers/keyword_parser.py15
-rw-r--r--sos/cleaner/parsers/username_parser.py14
-rw-r--r--tests/unittests/cleaner_tests.py1
5 files changed, 38 insertions, 11 deletions
diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py
index 5686e213..b76bef64 100644
--- a/sos/cleaner/__init__.py
+++ b/sos/cleaner/__init__.py
@@ -294,6 +294,7 @@ third party.
# we have at least one valid target to obfuscate
self.completed_reports = []
self.preload_all_archives_into_maps()
+ self.generate_parser_item_regexes()
self.obfuscate_report_paths()
if not self.completed_reports:
@@ -498,6 +499,14 @@ third party.
shutil.move(archive.final_archive_path, dest)
archive.final_archive_path = dest_name
+ def generate_parser_item_regexes(self):
+ """For the parsers that use prebuilt lists of items, generate those
+ regexes now since all the parsers should be preloaded by the archive(s)
+ as well as being handed cmdline options and mapping file configuration.
+ """
+ for parser in self.parsers:
+ parser.generate_item_regexes()
+
def preload_all_archives_into_maps(self):
"""Before doing the actual obfuscation, if we have multiple archives
to obfuscate then we need to preload each of them into the mappings
diff --git a/sos/cleaner/parsers/__init__.py b/sos/cleaner/parsers/__init__.py
index e62fd938..6def863a 100644
--- a/sos/cleaner/parsers/__init__.py
+++ b/sos/cleaner/parsers/__init__.py
@@ -46,9 +46,19 @@ class SoSCleanerParser():
map_file_key = 'unset'
def __init__(self, config={}):
+ self.regexes = {}
if self.map_file_key in config:
self.mapping.conf_update(config[self.map_file_key])
+ def generate_item_regexes(self):
+ """Generate regexes for items the parser will be searching for
+ repeatedly without needing to generate them for every file and/or line
+ we process
+
+ Not used by all parsers.
+ """
+ pass
+
def parse_line(self, line):
"""This will be called for every line in every file we process, so that
every parser has a chance to scrub everything.
diff --git a/sos/cleaner/parsers/keyword_parser.py b/sos/cleaner/parsers/keyword_parser.py
index 694c6073..362a1929 100644
--- a/sos/cleaner/parsers/keyword_parser.py
+++ b/sos/cleaner/parsers/keyword_parser.py
@@ -9,6 +9,7 @@
# See the LICENSE file in the source distribution for further information.
import os
+import re
from sos.cleaner.parsers import SoSCleanerParser
from sos.cleaner.mappings.keyword_map import SoSKeywordMap
@@ -33,16 +34,20 @@ class SoSKeywordParser(SoSCleanerParser):
# pre-generate an obfuscation mapping for each keyword
# this is necessary for cases where filenames are being
# obfuscated before or instead of file content
- self.mapping.get(keyword)
+ self.mapping.get(keyword.lower())
self.user_keywords.append(keyword)
if keyword_file and os.path.exists(keyword_file):
with open(keyword_file, 'r') as kwf:
self.user_keywords.extend(kwf.read().splitlines())
+ def generate_item_regexes(self):
+ for kw in self.user_keywords:
+ self.regexes[kw] = re.compile(kw, re.I)
+
def parse_line(self, line):
count = 0
- for keyword in sorted(self.user_keywords, reverse=True):
- if keyword in line:
- line = line.replace(keyword, self.mapping.get(keyword))
- count += 1
+ for kwrd, reg in sorted(self.regexes.items(), key=len, reverse=True):
+ if reg.search(line):
+ line, _count = reg.subn(self.mapping.get(kwrd.lower()), line)
+ count += _count
return line, count
diff --git a/sos/cleaner/parsers/username_parser.py b/sos/cleaner/parsers/username_parser.py
index 3208a655..49640f7f 100644
--- a/sos/cleaner/parsers/username_parser.py
+++ b/sos/cleaner/parsers/username_parser.py
@@ -61,12 +61,14 @@ class SoSUsernameParser(SoSCleanerParser):
for each in users:
self.mapping.get(each)
+ def generate_item_regexes(self):
+ for user in self.mapping.dataset:
+ self.regexes[user] = re.compile(user, re.I)
+
def parse_line(self, line):
count = 0
- for username in sorted(self.mapping.dataset.keys(), reverse=True):
- _reg = re.compile(username, re.I)
- if _reg.search(line):
- line, count = _reg.subn(
- self.mapping.get(username.lower()), line
- )
+ for user, reg in sorted(self.regexes.items(), key=len, reverse=True):
+ if reg.search(line):
+ line, _count = reg.subn(self.mapping.get(user.lower()), line)
+ count += _count
return line, count
diff --git a/tests/unittests/cleaner_tests.py b/tests/unittests/cleaner_tests.py
index cb20772f..b59eade9 100644
--- a/tests/unittests/cleaner_tests.py
+++ b/tests/unittests/cleaner_tests.py
@@ -105,6 +105,7 @@ class CleanerParserTests(unittest.TestCase):
self.host_parser = SoSHostnameParser(config={}, opt_domains='foobar.com')
self.kw_parser = SoSKeywordParser(config={}, keywords=['foobar'])
self.kw_parser_none = SoSKeywordParser(config={})
+ self.kw_parser.generate_item_regexes()
def test_ip_parser_valid_ipv4_line(self):
line = 'foobar foo 10.0.0.1/24 barfoo bar'