author | Jake Hunsaker <jhunsake@redhat.com> | 2022-02-09 17:32:17 -0500
---|---|---
committer | Jake Hunsaker <jhunsake@redhat.com> | 2022-02-28 10:24:04 -0500
commit | 045609e5705b1643fe2e800206b38a4ca4d30b02 (patch) |
tree | fd93d96255e52fd40bc633ae8a09acf9de635faf |
parent | 2ae16e0245e1b01b8547e507abb69c11871a8467 (diff) |
download | sos-045609e5705b1643fe2e800206b38a4ca4d30b02.tar.gz |
[cleaner] Use compiled regex lists for parsers by default
This commit follows the initial change made to the username and keyword
parsers in #2823 and applies it to all parsers by default.
When a new match is found, a `re.Pattern` object will now be compiled
and saved in the parser's map, and that compiled pattern will be used for
all subsequent obfuscations rather than rebuilding the regexes for every
line we iterate over with that parser.
This is now built into the base `SoSMap` and leveraged by the parsers,
rather than being implemented separately in each parser, and it is enabled
by default for all existing and new parsers. Local testing shows a decent
improvement in hostname parser performance. Note that this functionality
is explicitly disabled for the mac and ip parsers.
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
-rw-r--r-- | sos/cleaner/mappings/__init__.py | 55
-rw-r--r-- | sos/cleaner/mappings/hostname_map.py | 18
-rw-r--r-- | sos/cleaner/mappings/ip_map.py | 1
-rw-r--r-- | sos/cleaner/mappings/mac_map.py | 1
-rw-r--r-- | sos/cleaner/parsers/__init__.py | 69
-rw-r--r-- | sos/cleaner/parsers/hostname_parser.py | 35
-rw-r--r-- | sos/cleaner/parsers/ip_parser.py | 1
-rw-r--r-- | sos/cleaner/parsers/keyword_parser.py | 14
-rw-r--r-- | sos/cleaner/parsers/mac_parser.py | 6
-rw-r--r-- | sos/cleaner/parsers/username_parser.py | 15
10 files changed, 129 insertions, 86 deletions
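
To make the mechanism concrete, here is a minimal, self-contained sketch of the approach described above: compile one pattern per known item, keep the `(item, pattern)` tuples in a list sorted longest-first, and reuse that list for every line. The class and function names here are hypothetical and are not part of the sos API; the actual implementation is in the diff below.

```python
import re


class TinyMap:
    """Minimal stand-in for the SoSMap bookkeeping described above.

    Illustrative only; see the diff below for the real sos code.
    """

    def __init__(self):
        self.dataset = {}           # plaintext item -> obfuscated value
        self._regexes_made = set()  # items we already compiled a pattern for
        self.compiled_regexes = []  # (item, re.Pattern) tuples, longest item first

    def add(self, item, obfuscated):
        self.dataset[item] = obfuscated
        if item not in self._regexes_made:
            self._regexes_made.add(item)
            # compile once per item; re.escape() is used here for simplicity,
            # whereas the sos hostname map builds its own pattern (see below)
            self.compiled_regexes.append((item, re.compile(re.escape(item), re.I)))
            # keep longer items first so an FQDN is replaced before any of
            # its shorter substrings
            self.compiled_regexes.sort(key=lambda pair: len(pair[0]), reverse=True)


def obfuscate_line(line, mapping):
    # reuse the pre-compiled patterns for every line instead of building
    # them again on each call
    for item, pattern in mapping.compiled_regexes:
        if pattern.search(line):
            line = pattern.sub(mapping.dataset[item], line)
    return line


m = TinyMap()
m.add('example.com', 'obfuscateddomain0.com')
m.add('host01.example.com', 'host0.obfuscateddomain0.com')
print(obfuscate_line('ssh root@host01.example.com refused connection', m))
# -> ssh root@host0.obfuscateddomain0.com refused connection
```

Sorting longest-first mirrors the `compiled_regexes.sort()` call in `add_regex_item()` and keeps a bare short name from clobbering part of an already-matched FQDN.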
diff --git a/sos/cleaner/mappings/__init__.py b/sos/cleaner/mappings/__init__.py
index 48171a05..92aedf4a 100644
--- a/sos/cleaner/mappings/__init__.py
+++ b/sos/cleaner/mappings/__init__.py
@@ -24,22 +24,24 @@ class SoSMap():
     ignore_matches = []
     # used for filename obfuscations in parser.parse_string_for_keys()
     skip_keys = []
+    compile_regexes = True

     def __init__(self):
         self.dataset = {}
+        self._regexes_made = set()
+        self.compiled_regexes = []
         self.lock = Lock()

     def ignore_item(self, item):
         """Some items need to be completely ignored, for example link-local or
         loopback addresses should not be obfuscated
         """
+        if not item or item in self.skip_keys or item in self.dataset.values():
+            return True
         for skip in self.ignore_matches:
             if re.match(skip, item):
                 return True

-    def item_in_dataset_values(self, item):
-        return item in self.dataset.values()
-
     def add(self, item):
         """Add a particular item to the map, generating an obfuscated pair
         for it.
@@ -48,12 +50,52 @@ class SoSMap():

         :param item: The plaintext object to obfuscate
         """
+        if self.ignore_item(item):
+            return item
         with self.lock:
-            if not item:
-                return item
             self.dataset[item] = self.sanitize_item(item)
+            if self.compile_regexes:
+                self.add_regex_item(item)
             return self.dataset[item]

+    def add_regex_item(self, item):
+        """Add an item to the regexes dict and then re-sort the list that the
+        parsers will use during parse_line()
+
+        :param item: The unobfuscated item to generate a regex for
+        :type item: ``str``
+        """
+        if self.ignore_item(item):
+            return
+        if item not in self._regexes_made:
+            # save the item in a set to avoid clobbering existing regexes,
+            # as searching this set is significantly faster than searching
+            # through the actual compiled_regexes list, especially for very
+            # large collections of entries
+            self._regexes_made.add(item)
+            # add the item, Pattern tuple directly to the compiled_regexes list
+            # and then sort the existing list, rather than rebuild the list
+            # from scratch every time we add something like we would do if we
+            # tracked/saved the item and the Pattern() object in a dict or in
+            # the set above
+            self.compiled_regexes.append((item, self.get_regex_result(item)))
+            self.compiled_regexes.sort(key=lambda x: len(x[0]), reverse=True)
+
+    def get_regex_result(self, item):
+        """Generate the object/value that is used by the parser when iterating
+        over pre-generated regexes during parse_line(). For most parsers this
+        will simply be a ``re.Pattern()`` object, but for more complex parsers
+        this can be overridden to provide a different object, e.g. a tuple,
+        for that parer's specific iteration needs.
+
+        :param item: The unobfuscated string to generate the regex for
+        :type item: ``str``
+
+        :returns: A compiled regex pattern for the item
+        :rtype: ``re.Pattern``
+        """
+        return re.compile(item, re.I)
+
     def sanitize_item(self, item):
         """Perform the obfuscation relevant to the item being added to the map.

@@ -69,8 +111,7 @@ class SoSMap():
         """Retrieve an item's obfuscated counterpart from the map.
         If the item does not yet exist in the map, add it by generating one
        on the fly
         """
-        if (not item or self.ignore_item(item) or
-                self.item_in_dataset_values(item)):
+        if self.ignore_item(item):
             return item
         if item not in self.dataset:
             return self.add(item)
diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py
index 4b66defb..be68511a 100644
--- a/sos/cleaner/mappings/hostname_map.py
+++ b/sos/cleaner/mappings/hostname_map.py
@@ -84,6 +84,15 @@ class SoSHostnameMap(SoSMap):
         for domain in domains:
             self.sanitize_domain(domain.split('.'))

+    def get_regex_result(self, item):
+        """Override the base get_regex_result() to provide a regex that, if
+        this is an FQDN or a straight domain, will include an underscore
+        formatted regex as well.
+        """
+        if '.' in item:
+            item = item.replace('.', '(\\.|_)')
+        return re.compile(item, re.I)
+
     def set_initial_counts(self):
         """Set the initial counter for host and domain obfuscation numbers
         based on what is already present in the mapping.
@@ -135,6 +144,8 @@ class SoSHostnameMap(SoSMap):
         while item.endswith(('.', '_')):
             suffix += item[-1]
             item = item[0:-1]
+        if item in self.dataset:
+            return self.dataset[item]
         if not self.domain_name_in_loaded_domains(item.lower()):
             return item
         if item.endswith(self.strip_exts):
@@ -211,14 +222,15 @@ class SoSHostnameMap(SoSMap):
         """Obfuscate the short name of the host with an incremented counter
         based on the total number of obfuscated host names
         """
-        if not hostname:
+        if not hostname or hostname in self.skip_keys:
             return hostname
-        if hostname not in self.hosts:
+        if hostname not in self.dataset:
             ob_host = "host%s" % self.host_count
             self.hosts[hostname] = ob_host
             self.host_count += 1
             self.dataset[hostname] = ob_host
-        return self.hosts[hostname]
+            self.add_regex_item(hostname)
+        return self.dataset[hostname]

     def sanitize_domain(self, domain):
         """Obfuscate the domainname, broken out into subdomains. Top-level
diff --git a/sos/cleaner/mappings/ip_map.py b/sos/cleaner/mappings/ip_map.py
index e6dffd60..55a841a5 100644
--- a/sos/cleaner/mappings/ip_map.py
+++ b/sos/cleaner/mappings/ip_map.py
@@ -44,6 +44,7 @@ class SoSIPMap(SoSMap):
     _networks = {}
     network_first_octet = 100
     skip_network_octets = ['127', '169', '172', '192']
+    compile_regexes = False

     def ip_in_dataset(self, ipaddr):
         """There are multiple ways in which an ip address could be handed to us
diff --git a/sos/cleaner/mappings/mac_map.py b/sos/cleaner/mappings/mac_map.py
index 4b9ea7ef..334a6681 100644
--- a/sos/cleaner/mappings/mac_map.py
+++ b/sos/cleaner/mappings/mac_map.py
@@ -48,6 +48,7 @@ class SoSMacMap(SoSMap):
     mac_template = '53:4f:53:%s:%s:%s'
     mac6_template = '53:4f:53:ff:fe:%s:%s:%s'
     mac6_quad_template = '534f:53ff:fe%s:%s%s'
+    compile_regexes = False

     def add(self, item):
         item = item.replace('-', ':').lower().strip('=.,').strip()
diff --git a/sos/cleaner/parsers/__init__.py b/sos/cleaner/parsers/__init__.py
index 6def863a..eebe1087 100644
--- a/sos/cleaner/parsers/__init__.py
+++ b/sos/cleaner/parsers/__init__.py
@@ -44,9 +44,9 @@ class SoSCleanerParser():
     skip_line_patterns = []
     skip_files = []
     map_file_key = 'unset'
+    compile_regexes = True

     def __init__(self, config={}):
-        self.regexes = {}
         if self.map_file_key in config:
             self.mapping.conf_update(config[self.map_file_key])

@@ -57,12 +57,52 @@ class SoSCleanerParser():

         Not used by all parsers.
         """
-        pass
+        if not self.compile_regexes:
+            return
+        for obitem in self.mapping.dataset:
+            self.mapping.add_regex_item(obitem)

     def parse_line(self, line):
         """This will be called for every line in every file we process, so
         that every parser has a chance to scrub everything.

+        This will first try to identify needed obfuscations for items we have
+        already encountered (if the parser uses compiled regexes that is) and
+        make those substitutions early on. After which, we will then parse the
+        line again looking for new matches.
+        """
+        count = 0
+        for skip_pattern in self.skip_line_patterns:
+            if re.match(skip_pattern, line, re.I):
+                return line, count
+        if self.compile_regexes:
+            line, _rcount = self._parse_line_with_compiled_regexes(line)
+            count += _rcount
+        line, _count = self._parse_line(line)
+        count += _count
+        return line, count
+
+    def _parse_line_with_compiled_regexes(self, line):
+        """Check the provided line against known items we have encountered
+        before and have pre-generated regex Pattern() objects for.
+
+        :param line: The line to parse for possible matches for obfuscation
+        :type line: ``str``
+
+        :returns: The obfuscated line and the number of changes made
+        :rtype: ``str``, ``int``
+        """
+        count = 0
+        for item, reg in self.mapping.compiled_regexes:
+            if reg.search(line):
+                line, _count = reg.subn(self.mapping.get(item.lower()), line)
+                count += _count
+        return line, count
+
+    def _parse_line(self, line):
+        """Check the provided line against the parser regex patterns to try
+        and discover _new_ items to obfuscate
+
         :param line: The line to parse for possible matches for obfuscation
         :type line: ``str``

@@ -70,16 +110,15 @@ class SoSCleanerParser():
         :rtype: ``tuple``, ``(str, int))``
         """
         count = 0
-        for skip_pattern in self.skip_line_patterns:
-            if re.match(skip_pattern, line, re.I):
-                return line, count
         for pattern in self.regex_patterns:
             matches = [m[0] for m in re.findall(pattern, line, re.I)]
             if matches:
-                matches.sort(reverse=True, key=lambda x: len(x))
+                matches.sort(reverse=True, key=len)
                 count += len(matches)
                 for match in matches:
                     match = match.strip()
+                    if match in self.mapping.dataset.values():
+                        continue
                     new_match = self.mapping.get(match)
                     if new_match != match:
                         line = line.replace(match, new_match)
@@ -99,13 +138,17 @@ class SoSCleanerParser():
         :returns: The obfuscated line
         :rtype: ``str``
         """
-        for pair in sorted(self.mapping.dataset.items(), reverse=True,
-                           key=lambda x: len(x[0])):
-            key, val = pair
-            if key in self.mapping.skip_keys:
-                continue
-            if key in string_data:
-                string_data = string_data.replace(key, val)
+        if self.compile_regexes:
+            for item, reg in self.mapping.compiled_regexes:
+                if reg.search(string_data):
+                    string_data = reg.sub(self.mapping.get(item), string_data)
+        else:
+            for k, ob in sorted(self.mapping.dataset.items(), reverse=True,
+                                key=lambda x: len(x[0])):
+                if k in self.mapping.skip_keys:
+                    continue
+                if k in string_data:
+                    string_data = string_data.replace(k, ob)
         return string_data

     def get_map_contents(self):
diff --git a/sos/cleaner/parsers/hostname_parser.py b/sos/cleaner/parsers/hostname_parser.py
index 7fd0e698..debdf182 100644
--- a/sos/cleaner/parsers/hostname_parser.py
+++ b/sos/cleaner/parsers/hostname_parser.py
@@ -8,8 +8,6 @@
 #
 # See the LICENSE file in the source distribution for further information.

-import re
-
 from sos.cleaner.parsers import SoSCleanerParser
 from sos.cleaner.mappings.hostname_map import SoSHostnameMap

@@ -79,37 +77,6 @@ class SoSHostnameParser(SoSCleanerParser):
                     # only generate a mapping for fqdns but still record the
                     # short name here for later obfuscation with parse_line()
                     self.short_names.append(host)
+                    self.mapping.add_regex_item(host)
                 else:
                     self.mapping.add(host)
-
-    def parse_line(self, line):
-        """Override the default parse_line() method to also check for the
-        shortname of the host derived from the hostname.
-        """
-
-        def _check_line(ln, count, search, repl=None):
-            """Perform a second manual check for substrings that may have been
-            missed by regex matching
-            """
-            if search in self.mapping.skip_keys:
-                return ln, count
-            _reg = re.compile(search, re.I)
-            if _reg.search(ln):
-                return _reg.subn(self.mapping.get(repl or search), ln)
-            return ln, count
-
-        count = 0
-        line, count = super(SoSHostnameParser, self).parse_line(line)
-        # make an additional pass checking for '_' formatted substrings that
-        # the regex patterns won't catch
-        hosts = [h for h in self.mapping.dataset.keys() if '.' in h]
-        for host in sorted(hosts, reverse=True, key=lambda x: len(x)):
-            fqdn = host
-            for c in '.-':
-                fqdn = fqdn.replace(c, '_')
-            line, count = _check_line(line, count, fqdn, host)
-
-        for short_name in sorted(self.short_names, reverse=True):
-            line, count = _check_line(line, count, short_name)
-
-        return line, count
diff --git a/sos/cleaner/parsers/ip_parser.py b/sos/cleaner/parsers/ip_parser.py
index b007368c..ece9cd73 100644
--- a/sos/cleaner/parsers/ip_parser.py
+++ b/sos/cleaner/parsers/ip_parser.py
@@ -42,6 +42,7 @@ class SoSIPParser(SoSCleanerParser):
     ]

     map_file_key = 'ip_map'
+    compile_regexes = False

     def __init__(self, config):
         self.mapping = SoSIPMap()
diff --git a/sos/cleaner/parsers/keyword_parser.py b/sos/cleaner/parsers/keyword_parser.py
index 362a1929..9a0f65ba 100644
--- a/sos/cleaner/parsers/keyword_parser.py
+++ b/sos/cleaner/parsers/keyword_parser.py
@@ -9,7 +9,6 @@
 # See the LICENSE file in the source distribution for further information.

 import os
-import re

 from sos.cleaner.parsers import SoSCleanerParser
 from sos.cleaner.mappings.keyword_map import SoSKeywordMap
@@ -40,14 +39,5 @@ class SoSKeywordParser(SoSCleanerParser):
             with open(keyword_file, 'r') as kwf:
                 self.user_keywords.extend(kwf.read().splitlines())

-    def generate_item_regexes(self):
-        for kw in self.user_keywords:
-            self.regexes[kw] = re.compile(kw, re.I)
-
-    def parse_line(self, line):
-        count = 0
-        for kwrd, reg in sorted(self.regexes.items(), key=len, reverse=True):
-            if reg.search(line):
-                line, _count = reg.subn(self.mapping.get(kwrd.lower()), line)
-                count += _count
-        return line, count
+    def _parse_line(self, line):
+        return line, 0
diff --git a/sos/cleaner/parsers/mac_parser.py b/sos/cleaner/parsers/mac_parser.py
index b7ed2bae..4cbcbdce 100644
--- a/sos/cleaner/parsers/mac_parser.py
+++ b/sos/cleaner/parsers/mac_parser.py
@@ -41,6 +41,7 @@ class SoSMacParser(SoSCleanerParser):
         'sos_commands/kernel/modinfo.*'
     ]
     map_file_key = 'mac_map'
+    compile_regexes = False

     def __init__(self, config):
         self.mapping = SoSMacMap()
@@ -57,11 +58,8 @@ class SoSMacParser(SoSCleanerParser):
         # just to be safe, call strip() to remove any padding
         return match.strip()

-    def parse_line(self, line):
+    def _parse_line(self, line):
         count = 0
-        for skip_pattern in self.skip_line_patterns:
-            if re.match(skip_pattern, line, re.I):
-                return line, count
         for pattern in self.regex_patterns:
             matches = [m[0] for m in re.findall(pattern, line, re.I)]
             if matches:
diff --git a/sos/cleaner/parsers/username_parser.py b/sos/cleaner/parsers/username_parser.py
index 2853c860..6d9a9ff0 100644
--- a/sos/cleaner/parsers/username_parser.py
+++ b/sos/cleaner/parsers/username_parser.py
@@ -8,8 +8,6 @@
 #
 # See the LICENSE file in the source distribution for further information.

-import re
-
 from sos.cleaner.parsers import SoSCleanerParser
 from sos.cleaner.mappings.username_map import SoSUsernameMap

@@ -61,14 +59,5 @@ class SoSUsernameParser(SoSCleanerParser):
         for each in users:
             self.mapping.get(each)

-    def generate_item_regexes(self):
-        for user in self.mapping.dataset:
-            self.regexes[user] = re.compile(user, re.I)
-
-    def parse_line(self, line):
-        count = 0
-        for user, reg in sorted(self.regexes.items(), key=len, reverse=True):
-            if reg.search(line):
-                line, _count = reg.subn(self.mapping.get(user.lower()), line)
-                count += _count
-        return line, count
+    def _parse_line(self, line):
+        return line, 0
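
The hostname map's new `get_regex_result()` override appears to be what allows the underscore-handling `parse_line()` override in `hostname_parser.py` to be dropped: a single compiled pattern now matches both the dot- and underscore-separated forms of a domain. A small standalone sketch of that idea, using hypothetical names and an example obfuscated value rather than the sos API:

```python
import re


def domain_pattern(item):
    # mirrors the pattern built in SoSHostnameMap.get_regex_result(): for an
    # FQDN or domain, let each '.' also match '_' so strings like
    # 'host01_example_com' (e.g. in unit names) are caught by the same pattern
    if '.' in item:
        item = item.replace('.', '(\\.|_)')
    return re.compile(item, re.I)


pat = domain_pattern('host01.example.com')
for line in ('login from host01.example.com',
             'starting host01_example_com_backup.service'):
    print(pat.sub('host0.obfuscateddomain0.com', line))
# -> login from host0.obfuscateddomain0.com
# -> starting host0.obfuscateddomain0.com_backup.service
```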