author | Jake Hunsaker <jhunsake@redhat.com> | 2022-02-09 17:32:17 -0500
---|---|---
committer | Jake Hunsaker <jhunsake@redhat.com> | 2022-02-28 10:24:04 -0500
commit | 045609e5705b1643fe2e800206b38a4ca4d30b02 (patch) |
tree | fd93d96255e52fd40bc633ae8a09acf9de635faf |
parent | 2ae16e0245e1b01b8547e507abb69c11871a8467 (diff) |
download | sos-045609e5705b1643fe2e800206b38a4ca4d30b02.tar.gz |
[cleaner] Use compiled regex lists for parsers by default
This commit follows the initial change made to the username and keyword
parsers in #2823 and applies it to all parsers by default.
When a new match is found, a `re.Pattern` object will now be compiled
and saved in the parser's map, and that compiled pattern will be used for
all subsequent obfuscations rather than rebuilding the regexes for every
line we iterate over with that parser.
This is now built into the base `SoSMap` and leveraged by the parsers,
rather than being implemented separately in each parser, and it is enabled
by default for all existing and new parsers. Local testing shows a decent
improvement in hostname parser performance. Note that this functionality
is explicitly disabled for the mac and ip parsers.
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
-rw-r--r-- | sos/cleaner/mappings/__init__.py | 55
-rw-r--r-- | sos/cleaner/mappings/hostname_map.py | 18
-rw-r--r-- | sos/cleaner/mappings/ip_map.py | 1
-rw-r--r-- | sos/cleaner/mappings/mac_map.py | 1
-rw-r--r-- | sos/cleaner/parsers/__init__.py | 69
-rw-r--r-- | sos/cleaner/parsers/hostname_parser.py | 35
-rw-r--r-- | sos/cleaner/parsers/ip_parser.py | 1
-rw-r--r-- | sos/cleaner/parsers/keyword_parser.py | 14
-rw-r--r-- | sos/cleaner/parsers/mac_parser.py | 6
-rw-r--r-- | sos/cleaner/parsers/username_parser.py | 15
10 files changed, 129 insertions, 86 deletions
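
To make the mechanism concrete, here is a minimal, self-contained sketch of the approach described above: compile one pattern per known item, keep the `(item, pattern)` tuples in a list sorted longest-first, and reuse that list for every line. The class and function names here are hypothetical and are not part of the sos API; the actual implementation is in the diff below.

```python
import re


class TinyMap:
    """Minimal stand-in for the SoSMap bookkeeping described above.

    Illustrative only; see the diff below for the real sos code.
    """

    def __init__(self):
        self.dataset = {}           # plaintext item -> obfuscated value
        self._regexes_made = set()  # items we already compiled a pattern for
        self.compiled_regexes = []  # (item, re.Pattern) tuples, longest item first

    def add(self, item, obfuscated):
        self.dataset[item] = obfuscated
        if item not in self._regexes_made:
            self._regexes_made.add(item)
            # compile once per item; re.escape() is used here for simplicity,
            # whereas the sos hostname map builds its own pattern (see below)
            self.compiled_regexes.append((item, re.compile(re.escape(item), re.I)))
            # keep longer items first so an FQDN is replaced before any of
            # its shorter substrings
            self.compiled_regexes.sort(key=lambda pair: len(pair[0]), reverse=True)


def obfuscate_line(line, mapping):
    # reuse the pre-compiled patterns for every line instead of building
    # them again on each call
    for item, pattern in mapping.compiled_regexes:
        if pattern.search(line):
            line = pattern.sub(mapping.dataset[item], line)
    return line


m = TinyMap()
m.add('example.com', 'obfuscateddomain0.com')
m.add('host01.example.com', 'host0.obfuscateddomain0.com')
print(obfuscate_line('ssh root@host01.example.com refused connection', m))
# -> ssh root@host0.obfuscateddomain0.com refused connection
```

Sorting longest-first mirrors the `compiled_regexes.sort()` call in `add_regex_item()` and keeps a bare short name from clobbering part of an already-matched FQDN.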
diff --git a/sos/cleaner/mappings/__init__.py b/sos/cleaner/mappings/__init__.py
index 48171a05..92aedf4a 100644
--- a/sos/cleaner/mappings/__init__.py
+++ b/sos/cleaner/mappings/__init__.py
@@ -24,22 +24,24 @@ class SoSMap():
     ignore_matches = []
     # used for filename obfuscations in parser.parse_string_for_keys()
     skip_keys = []
+    compile_regexes = True

     def __init__(self):
         self.dataset = {}
+        self._regexes_made = set()
+        self.compiled_regexes = []
         self.lock = Lock()

     def ignore_item(self, item):
         """Some items need to be completely ignored, for example link-local or
         loopback addresses should not be obfuscated
         """
+        if not item or item in self.skip_keys or item in self.dataset.values():
+            return True
         for skip in self.ignore_matches:
             if re.match(skip, item):
                 return True

-    def item_in_dataset_values(self, item):
-        return item in self.dataset.values()
-
     def add(self, item):
         """Add a particular item to the map, generating an obfuscated pair
         for it.
@@ -48,12 +50,52 @@ class SoSMap():

         :param item: The plaintext object to obfuscate
         """
+        if self.ignore_item(item):
+            return item
         with self.lock:
-            if not item:
-                return item
             self.dataset[item] = self.sanitize_item(item)
+            if self.compile_regexes:
+                self.add_regex_item(item)
             return self.dataset[item]

+    def add_regex_item(self, item):
+        """Add an item to the regexes dict and then re-sort the list that the
+        parsers will use during parse_line()
+
+        :param item: The unobfuscated item to generate a regex for
+        :type item: ``str``
+        """
+        if self.ignore_item(item):
+            return
+        if item not in self._regexes_made:
+            # save the item in a set to avoid clobbering existing regexes,
+            # as searching this set is significantly faster than searching
+            # through the actual compiled_regexes list, especially for very
+            # large collections of entries
+            self._regexes_made.add(item)
+            # add the item, Pattern tuple directly to the compiled_regexes list
+            # and then sort the existing list, rather than rebuild the list
+            # from scratch every time we add something like we would do if we
+            # tracked/saved the item and the Pattern() object in a dict or in
+            # the set above
+            self.compiled_regexes.append((item, self.get_regex_result(item)))
+            self.compiled_regexes.sort(key=lambda x: len(x[0]), reverse=True)
+
+    def get_regex_result(self, item):
+        """Generate the object/value that is used by the parser when iterating
+        over pre-generated regexes during parse_line(). For most parsers this
+        will simply be a ``re.Pattern()`` object, but for more complex parsers
+        this can be overridden to provide a different object, e.g. a tuple,
+        for that parer's specific iteration needs.
+
+        :param item: The unobfuscated string to generate the regex for
+        :type item: ``str``
+
+        :returns: A compiled regex pattern for the item
+        :rtype: ``re.Pattern``
+        """
+        return re.compile(item, re.I)
+
     def sanitize_item(self, item):
         """Perform the obfuscation relevant to the item being added to the map.

@@ -69,8 +111,7 @@ class SoSMap():
         """Retrieve an item's obfuscated counterpart from the map.
         If the item does not yet exist in the map, add it by generating one
        on the fly
         """
-        if (not item or self.ignore_item(item) or
-                self.item_in_dataset_values(item)):
+        if self.ignore_item(item):
             return item
         if item not in self.dataset:
             return self.add(item)
diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py
index 4b66defb..be68511a 100644
--- a/sos/cleaner/mappings/hostname_map.py
+++ b/sos/cleaner/mappings/hostname_map.py
@@ -84,6 +84,15 @@ class SoSHostnameMap(SoSMap):
         for domain in domains:
             self.sanitize_domain(domain.split('.'))

+    def get_regex_result(self, item):
+        """Override the base get_regex_result() to provide a regex that, if
+        this is an FQDN or a straight domain, will include an underscore
+        formatted regex as well.
+        """
+        if '.' in item:
+            item = item.replace('.', '(\\.|_)')
+        return re.compile(item, re.I)
+
     def set_initial_counts(self):
         """Set the initial counter for host and domain obfuscation numbers
         based on what is already present in the mapping.
@@ -135,6 +144,8 @@ class SoSHostnameMap(SoSMap):
         while item.endswith(('.', '_')):
             suffix += item[-1]
             item = item[0:-1]
+        if item in self.dataset:
+            return self.dataset[item]
         if not self.domain_name_in_loaded_domains(item.lower()):
             return item
         if item.endswith(self.strip_exts):
@@ -211,14 +222,15 @@ class SoSHostnameMap(SoSMap):
         """Obfuscate the short name of the host with an incremented counter
         based on the total number of obfuscated host names
         """
-        if not hostname:
+        if not hostname or hostname in self.skip_keys:
             return hostname
-        if hostname not in self.hosts:
+        if hostname not in self.dataset:
             ob_host = "host%s" % self.host_count
             self.hosts[hostname] = ob_host
             self.host_count += 1
             self.dataset[hostname] = ob_host
-        return self.hosts[hostname]
+            self.add_regex_item(hostname)
+        return self.dataset[hostname]

     def sanitize_domain(self, domain):
         """Obfuscate the domainname, broken out into subdomains. Top-level
diff --git a/sos/cleaner/mappings/ip_map.py b/sos/cleaner/mappings/ip_map.py
index e6dffd60..55a841a5 100644
--- a/sos/cleaner/mappings/ip_map.py
+++ b/sos/cleaner/mappings/ip_map.py
@@ -44,6 +44,7 @@ class SoSIPMap(SoSMap):
     _networks = {}
     network_first_octet = 100
     skip_network_octets = ['127', '169', '172', '192']
+    compile_regexes = False

     def ip_in_dataset(self, ipaddr):
         """There are multiple ways in which an ip address could be handed to us
diff --git a/sos/cleaner/mappings/mac_map.py b/sos/cleaner/mappings/mac_map.py
index 4b9ea7ef..334a6681 100644
--- a/sos/cleaner/mappings/mac_map.py
+++ b/sos/cleaner/mappings/mac_map.py
@@ -48,6 +48,7 @@ class SoSMacMap(SoSMap):
     mac_template = '53:4f:53:%s:%s:%s'
     mac6_template = '53:4f:53:ff:fe:%s:%s:%s'
     mac6_quad_template = '534f:53ff:fe%s:%s%s'
+    compile_regexes = False

     def add(self, item):
         item = item.replace('-', ':').lower().strip('=.,').strip()
diff --git a/sos/cleaner/parsers/__init__.py b/sos/cleaner/parsers/__init__.py
index 6def863a..eebe1087 100644
--- a/sos/cleaner/parsers/__init__.py
+++ b/sos/cleaner/parsers/__init__.py
@@ -44,9 +44,9 @@ class SoSCleanerParser():
     skip_line_patterns = []
     skip_files = []
     map_file_key = 'unset'
+    compile_regexes = True

     def __init__(self, config={}):
-        self.regexes = {}
         if self.map_file_key in config:
             self.mapping.conf_update(config[self.map_file_key])

@@ -57,12 +57,52 @@ class SoSCleanerParser():

         Not used by all parsers.
         """
-        pass
+        if not self.compile_regexes:
+            return
+        for obitem in self.mapping.dataset:
+            self.mapping.add_regex_item(obitem)

     def parse_line(self, line):
         """This will be called for every line in every file we process, so
         that every parser has a chance to scrub everything.

+        This will first try to identify needed obfuscations for items we have
+        already encountered (if the parser uses compiled regexes that is) and
+        make those substitutions early on. After which, we will then parse the
+        line again looking for new matches.
+        """
+        count = 0
+        for skip_pattern in self.skip_line_patterns:
+            if re.match(skip_pattern, line, re.I):
+                return line, count
+        if self.compile_regexes:
+            line, _rcount = self._parse_line_with_compiled_regexes(line)
+            count += _rcount
+        line, _count = self._parse_line(line)
+        count += _count
+        return line, count
+
+    def _parse_line_with_compiled_regexes(self, line):
+        """Check the provided line against known items we have encountered
+        before and have pre-generated regex Pattern() objects for.
+
+        :param line: The line to parse for possible matches for obfuscation
+        :type line: ``str``
+
+        :returns: The obfuscated line and the number of changes made
+        :rtype: ``str``, ``int``
+        """
+        count = 0
+        for item, reg in self.mapping.compiled_regexes:
+            if reg.search(line):
+                line, _count = reg.subn(self.mapping.get(item.lower()), line)
+                count += _count
+        return line, count
+
+    def _parse_line(self, line):
+        """Check the provided line against the parser regex patterns to try
+        and discover _new_ items to obfuscate
+
         :param line: The line to parse for possible matches for obfuscation
         :type line: ``str``

@@ -70,16 +110,15 @@ class SoSCleanerParser():
         :rtype: ``tuple``, ``(str, int))``
         """
         count = 0
-        for skip_pattern in self.skip_line_patterns:
-            if re.match(skip_pattern, line, re.I):
-                return line, count
         for pattern in self.regex_patterns:
             matches = [m[0] for m in re.findall(pattern, line, re.I)]
             if matches:
-                matches.sort(reverse=True, key=lambda x: len(x))
+                matches.sort(reverse=True, key=len)
                 count += len(matches)
                 for match in matches:
                     match = match.strip()
+                    if match in self.mapping.dataset.values():
+                        continue
                     new_match = self.mapping.get(match)
                     if new_match != match:
                         line = line.replace(match, new_match)
@@ -99,13 +138,17 @@ class SoSCleanerParser():
         :returns: The obfuscated line
         :rtype: ``str``
         """
-        for pair in sorted(self.mapping.dataset.items(), reverse=True,
-                           key=lambda x: len(x[0])):
-            key, val = pair
-            if key in self.mapping.skip_keys:
-                continue
-            if key in string_data:
-                string_data = string_data.replace(key, val)
+        if self.compile_regexes:
+            for item, reg in self.mapping.compiled_regexes:
+                if reg.search(string_data):
+                    string_data = reg.sub(self.mapping.get(item), string_data)
+        else:
+            for k, ob in sorted(self.mapping.dataset.items(), reverse=True,
+                                key=lambda x: len(x[0])):
+                if k in self.mapping.skip_keys:
+                    continue
+                if k in string_data:
+                    string_data = string_data.replace(k, ob)
         return string_data

     def get_map_contents(self):
diff --git a/sos/cleaner/parsers/hostname_parser.py b/sos/cleaner/parsers/hostname_parser.py
index 7fd0e698..debdf182 100644
--- a/sos/cleaner/parsers/hostname_parser.py
+++ b/sos/cleaner/parsers/hostname_parser.py
@@ -8,8 +8,6 @@
 #
 # See the LICENSE file in the source distribution for further information.

-import re
-
 from sos.cleaner.parsers import SoSCleanerParser
 from sos.cleaner.mappings.hostname_map import SoSHostnameMap

@@ -79,37 +77,6 @@ class SoSHostnameParser(SoSCleanerParser):
                     # only generate a mapping for fqdns but still record the
                     # short name here for later obfuscation with parse_line()
                     self.short_names.append(host)
+                    self.mapping.add_regex_item(host)
                 else:
                     self.mapping.add(host)
-
-    def parse_line(self, line):
-        """Override the default parse_line() method to also check for the
-        shortname of the host derived from the hostname.
-        """
-
-        def _check_line(ln, count, search, repl=None):
-            """Perform a second manual check for substrings that may have been
-            missed by regex matching
-            """
-            if search in self.mapping.skip_keys:
-                return ln, count
-            _reg = re.compile(search, re.I)
-            if _reg.search(ln):
-                return _reg.subn(self.mapping.get(repl or search), ln)
-            return ln, count
-
-        count = 0
-        line, count = super(SoSHostnameParser, self).parse_line(line)
-        # make an additional pass checking for '_' formatted substrings that
-        # the regex patterns won't catch
-        hosts = [h for h in self.mapping.dataset.keys() if '.' in h]
-        for host in sorted(hosts, reverse=True, key=lambda x: len(x)):
-            fqdn = host
-            for c in '.-':
-                fqdn = fqdn.replace(c, '_')
-            line, count = _check_line(line, count, fqdn, host)
-
-        for short_name in sorted(self.short_names, reverse=True):
-            line, count = _check_line(line, count, short_name)
-
-        return line, count
diff --git a/sos/cleaner/parsers/ip_parser.py b/sos/cleaner/parsers/ip_parser.py
index b007368c..ece9cd73 100644
--- a/sos/cleaner/parsers/ip_parser.py
+++ b/sos/cleaner/parsers/ip_parser.py
@@ -42,6 +42,7 @@ class SoSIPParser(SoSCleanerParser):
     ]

     map_file_key = 'ip_map'
+    compile_regexes = False

     def __init__(self, config):
         self.mapping = SoSIPMap()
diff --git a/sos/cleaner/parsers/keyword_parser.py b/sos/cleaner/parsers/keyword_parser.py
index 362a1929..9a0f65ba 100644
--- a/sos/cleaner/parsers/keyword_parser.py
+++ b/sos/cleaner/parsers/keyword_parser.py
@@ -9,7 +9,6 @@
 # See the LICENSE file in the source distribution for further information.

 import os
-import re

 from sos.cleaner.parsers import SoSCleanerParser
 from sos.cleaner.mappings.keyword_map import SoSKeywordMap
@@ -40,14 +39,5 @@ class SoSKeywordParser(SoSCleanerParser):
             with open(keyword_file, 'r') as kwf:
                 self.user_keywords.extend(kwf.read().splitlines())

-    def generate_item_regexes(self):
-        for kw in self.user_keywords:
-            self.regexes[kw] = re.compile(kw, re.I)
-
-    def parse_line(self, line):
-        count = 0
-        for kwrd, reg in sorted(self.regexes.items(), key=len, reverse=True):
-            if reg.search(line):
-                line, _count = reg.subn(self.mapping.get(kwrd.lower()), line)
-                count += _count
-        return line, count
+    def _parse_line(self, line):
+        return line, 0
diff --git a/sos/cleaner/parsers/mac_parser.py b/sos/cleaner/parsers/mac_parser.py
index b7ed2bae..4cbcbdce 100644
--- a/sos/cleaner/parsers/mac_parser.py
+++ b/sos/cleaner/parsers/mac_parser.py
@@ -41,6 +41,7 @@ class SoSMacParser(SoSCleanerParser):
         'sos_commands/kernel/modinfo.*'
     ]
     map_file_key = 'mac_map'
+    compile_regexes = False

     def __init__(self, config):
         self.mapping = SoSMacMap()
@@ -57,11 +58,8 @@ class SoSMacParser(SoSCleanerParser):
         # just to be safe, call strip() to remove any padding
         return match.strip()

-    def parse_line(self, line):
+    def _parse_line(self, line):
         count = 0
-        for skip_pattern in self.skip_line_patterns:
-            if re.match(skip_pattern, line, re.I):
-                return line, count
         for pattern in self.regex_patterns:
             matches = [m[0] for m in re.findall(pattern, line, re.I)]
             if matches:
diff --git a/sos/cleaner/parsers/username_parser.py b/sos/cleaner/parsers/username_parser.py
index 2853c860..6d9a9ff0 100644
--- a/sos/cleaner/parsers/username_parser.py
+++ b/sos/cleaner/parsers/username_parser.py
@@ -8,8 +8,6 @@
 #
 # See the LICENSE file in the source distribution for further information.

-import re
-
 from sos.cleaner.parsers import SoSCleanerParser
 from sos.cleaner.mappings.username_map import SoSUsernameMap

@@ -61,14 +59,5 @@ class SoSUsernameParser(SoSCleanerParser):
         for each in users:
             self.mapping.get(each)

-    def generate_item_regexes(self):
-        for user in self.mapping.dataset:
-            self.regexes[user] = re.compile(user, re.I)
-
-    def parse_line(self, line):
-        count = 0
-        for user, reg in sorted(self.regexes.items(), key=len, reverse=True):
-            if reg.search(line):
-                line, _count = reg.subn(self.mapping.get(user.lower()), line)
-                count += _count
-        return line, count
+    def _parse_line(self, line):
+        return line, 0
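
The hostname map's new `get_regex_result()` override appears to be what allows the underscore-handling `parse_line()` override in `hostname_parser.py` to be dropped: a single compiled pattern now matches both the dot- and underscore-separated forms of a domain. A small standalone sketch of that idea, using hypothetical names and an example obfuscated value rather than the sos API:

```python
import re


def domain_pattern(item):
    # mirrors the pattern built in SoSHostnameMap.get_regex_result(): for an
    # FQDN or domain, let each '.' also match '_' so strings like
    # 'host01_example_com' (e.g. in unit names) are caught by the same pattern
    if '.' in item:
        item = item.replace('.', '(\\.|_)')
    return re.compile(item, re.I)


pat = domain_pattern('host01.example.com')
for line in ('login from host01.example.com',
             'starting host01_example_com_backup.service'):
    print(pat.sub('host0.obfuscateddomain0.com', line))
# -> login from host0.obfuscateddomain0.com
# -> starting host0.obfuscateddomain0.com_backup.service
```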