aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJake Hunsaker <jhunsake@redhat.com>2022-02-09 17:32:17 -0500
committerJake Hunsaker <jhunsake@redhat.com>2022-02-28 10:24:04 -0500
commit045609e5705b1643fe2e800206b38a4ca4d30b02 (patch)
treefd93d96255e52fd40bc633ae8a09acf9de635faf
parent2ae16e0245e1b01b8547e507abb69c11871a8467 (diff)
downloadsos-045609e5705b1643fe2e800206b38a4ca4d30b02.tar.gz
[cleaner] Use compiled regex lists for parsers by default
This commit follows the initial change made to the username and keyword parsers in #2823 and applies it to all parsers by default. When a new match is found, a new `regex.Pattern()` object will now be compiled and saved in the parser's map, and this object will be used for ongoing obfuscations from that point forward, rather than rebuilding regexes for every line we iterate over with that parser. This is now built into the base `SoSMap` and leveraged by parsers, rather than being handled directly by parsers. Further, this will be enabled by default for all existing and new parsers. This shows decent improvements to hostname parser performance in local testing. Note that this functionality is explicitly disabled for the mac and ip parsers. Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
-rw-r--r--sos/cleaner/mappings/__init__.py55
-rw-r--r--sos/cleaner/mappings/hostname_map.py18
-rw-r--r--sos/cleaner/mappings/ip_map.py1
-rw-r--r--sos/cleaner/mappings/mac_map.py1
-rw-r--r--sos/cleaner/parsers/__init__.py69
-rw-r--r--sos/cleaner/parsers/hostname_parser.py35
-rw-r--r--sos/cleaner/parsers/ip_parser.py1
-rw-r--r--sos/cleaner/parsers/keyword_parser.py14
-rw-r--r--sos/cleaner/parsers/mac_parser.py6
-rw-r--r--sos/cleaner/parsers/username_parser.py15
10 files changed, 129 insertions, 86 deletions
diff --git a/sos/cleaner/mappings/__init__.py b/sos/cleaner/mappings/__init__.py
index 48171a05..92aedf4a 100644
--- a/sos/cleaner/mappings/__init__.py
+++ b/sos/cleaner/mappings/__init__.py
@@ -24,22 +24,24 @@ class SoSMap():
ignore_matches = []
# used for filename obfuscations in parser.parse_string_for_keys()
skip_keys = []
+ compile_regexes = True
def __init__(self):
self.dataset = {}
+ self._regexes_made = set()
+ self.compiled_regexes = []
self.lock = Lock()
def ignore_item(self, item):
"""Some items need to be completely ignored, for example link-local or
loopback addresses should not be obfuscated
"""
+ if not item or item in self.skip_keys or item in self.dataset.values():
+ return True
for skip in self.ignore_matches:
if re.match(skip, item):
return True
- def item_in_dataset_values(self, item):
- return item in self.dataset.values()
-
def add(self, item):
"""Add a particular item to the map, generating an obfuscated pair
for it.
@@ -48,12 +50,52 @@ class SoSMap():
:param item: The plaintext object to obfuscate
"""
+ if self.ignore_item(item):
+ return item
with self.lock:
- if not item:
- return item
self.dataset[item] = self.sanitize_item(item)
+ if self.compile_regexes:
+ self.add_regex_item(item)
return self.dataset[item]
+ def add_regex_item(self, item):
+ """Add an item to the regexes dict and then re-sort the list that the
+ parsers will use during parse_line()
+
+ :param item: The unobfuscated item to generate a regex for
+ :type item: ``str``
+ """
+ if self.ignore_item(item):
+ return
+ if item not in self._regexes_made:
+ # save the item in a set to avoid clobbering existing regexes,
+ # as searching this set is significantly faster than searching
+ # through the actual compiled_regexes list, especially for very
+ # large collections of entries
+ self._regexes_made.add(item)
+ # add the item, Pattern tuple directly to the compiled_regexes list
+ # and then sort the existing list, rather than rebuild the list
+ # from scratch every time we add something like we would do if we
+ # tracked/saved the item and the Pattern() object in a dict or in
+ # the set above
+ self.compiled_regexes.append((item, self.get_regex_result(item)))
+ self.compiled_regexes.sort(key=lambda x: len(x[0]), reverse=True)
+
+ def get_regex_result(self, item):
+ """Generate the object/value that is used by the parser when iterating
+ over pre-generated regexes during parse_line(). For most parsers this
+ will simply be a ``re.Pattern()`` object, but for more complex parsers
+ this can be overridden to provide a different object, e.g. a tuple,
+    for that parser's specific iteration needs.
+
+ :param item: The unobfuscated string to generate the regex for
+ :type item: ``str``
+
+ :returns: A compiled regex pattern for the item
+ :rtype: ``re.Pattern``
+ """
+ return re.compile(item, re.I)
+
def sanitize_item(self, item):
"""Perform the obfuscation relevant to the item being added to the map.
@@ -69,8 +111,7 @@ class SoSMap():
"""Retrieve an item's obfuscated counterpart from the map. If the item
does not yet exist in the map, add it by generating one on the fly
"""
- if (not item or self.ignore_item(item) or
- self.item_in_dataset_values(item)):
+ if self.ignore_item(item):
return item
if item not in self.dataset:
return self.add(item)
diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py
index 4b66defb..be68511a 100644
--- a/sos/cleaner/mappings/hostname_map.py
+++ b/sos/cleaner/mappings/hostname_map.py
@@ -84,6 +84,15 @@ class SoSHostnameMap(SoSMap):
for domain in domains:
self.sanitize_domain(domain.split('.'))
+ def get_regex_result(self, item):
+ """Override the base get_regex_result() to provide a regex that, if
+ this is an FQDN or a straight domain, will include an underscore
+ formatted regex as well.
+ """
+ if '.' in item:
+ item = item.replace('.', '(\\.|_)')
+ return re.compile(item, re.I)
+
def set_initial_counts(self):
"""Set the initial counter for host and domain obfuscation numbers
based on what is already present in the mapping.
@@ -135,6 +144,8 @@ class SoSHostnameMap(SoSMap):
while item.endswith(('.', '_')):
suffix += item[-1]
item = item[0:-1]
+ if item in self.dataset:
+ return self.dataset[item]
if not self.domain_name_in_loaded_domains(item.lower()):
return item
if item.endswith(self.strip_exts):
@@ -211,14 +222,15 @@ class SoSHostnameMap(SoSMap):
"""Obfuscate the short name of the host with an incremented counter
based on the total number of obfuscated host names
"""
- if not hostname:
+ if not hostname or hostname in self.skip_keys:
return hostname
- if hostname not in self.hosts:
+ if hostname not in self.dataset:
ob_host = "host%s" % self.host_count
self.hosts[hostname] = ob_host
self.host_count += 1
self.dataset[hostname] = ob_host
- return self.hosts[hostname]
+ self.add_regex_item(hostname)
+ return self.dataset[hostname]
def sanitize_domain(self, domain):
"""Obfuscate the domainname, broken out into subdomains. Top-level
diff --git a/sos/cleaner/mappings/ip_map.py b/sos/cleaner/mappings/ip_map.py
index e6dffd60..55a841a5 100644
--- a/sos/cleaner/mappings/ip_map.py
+++ b/sos/cleaner/mappings/ip_map.py
@@ -44,6 +44,7 @@ class SoSIPMap(SoSMap):
_networks = {}
network_first_octet = 100
skip_network_octets = ['127', '169', '172', '192']
+ compile_regexes = False
def ip_in_dataset(self, ipaddr):
"""There are multiple ways in which an ip address could be handed to us
diff --git a/sos/cleaner/mappings/mac_map.py b/sos/cleaner/mappings/mac_map.py
index 4b9ea7ef..334a6681 100644
--- a/sos/cleaner/mappings/mac_map.py
+++ b/sos/cleaner/mappings/mac_map.py
@@ -48,6 +48,7 @@ class SoSMacMap(SoSMap):
mac_template = '53:4f:53:%s:%s:%s'
mac6_template = '53:4f:53:ff:fe:%s:%s:%s'
mac6_quad_template = '534f:53ff:fe%s:%s%s'
+ compile_regexes = False
def add(self, item):
item = item.replace('-', ':').lower().strip('=.,').strip()
diff --git a/sos/cleaner/parsers/__init__.py b/sos/cleaner/parsers/__init__.py
index 6def863a..eebe1087 100644
--- a/sos/cleaner/parsers/__init__.py
+++ b/sos/cleaner/parsers/__init__.py
@@ -44,9 +44,9 @@ class SoSCleanerParser():
skip_line_patterns = []
skip_files = []
map_file_key = 'unset'
+ compile_regexes = True
def __init__(self, config={}):
- self.regexes = {}
if self.map_file_key in config:
self.mapping.conf_update(config[self.map_file_key])
@@ -57,12 +57,52 @@ class SoSCleanerParser():
Not used by all parsers.
"""
- pass
+ if not self.compile_regexes:
+ return
+ for obitem in self.mapping.dataset:
+ self.mapping.add_regex_item(obitem)
def parse_line(self, line):
"""This will be called for every line in every file we process, so that
every parser has a chance to scrub everything.
+ This will first try to identify needed obfuscations for items we have
+ already encountered (if the parser uses compiled regexes that is) and
+ make those substitutions early on. After which, we will then parse the
+ line again looking for new matches.
+ """
+ count = 0
+ for skip_pattern in self.skip_line_patterns:
+ if re.match(skip_pattern, line, re.I):
+ return line, count
+ if self.compile_regexes:
+ line, _rcount = self._parse_line_with_compiled_regexes(line)
+ count += _rcount
+ line, _count = self._parse_line(line)
+ count += _count
+ return line, count
+
+ def _parse_line_with_compiled_regexes(self, line):
+ """Check the provided line against known items we have encountered
+ before and have pre-generated regex Pattern() objects for.
+
+ :param line: The line to parse for possible matches for obfuscation
+ :type line: ``str``
+
+ :returns: The obfuscated line and the number of changes made
+ :rtype: ``str``, ``int``
+ """
+ count = 0
+ for item, reg in self.mapping.compiled_regexes:
+ if reg.search(line):
+ line, _count = reg.subn(self.mapping.get(item.lower()), line)
+ count += _count
+ return line, count
+
+ def _parse_line(self, line):
+ """Check the provided line against the parser regex patterns to try
+ and discover _new_ items to obfuscate
+
:param line: The line to parse for possible matches for obfuscation
:type line: ``str``
@@ -70,16 +110,15 @@ class SoSCleanerParser():
:rtype: ``tuple``, ``(str, int))``
"""
count = 0
- for skip_pattern in self.skip_line_patterns:
- if re.match(skip_pattern, line, re.I):
- return line, count
for pattern in self.regex_patterns:
matches = [m[0] for m in re.findall(pattern, line, re.I)]
if matches:
- matches.sort(reverse=True, key=lambda x: len(x))
+ matches.sort(reverse=True, key=len)
count += len(matches)
for match in matches:
match = match.strip()
+ if match in self.mapping.dataset.values():
+ continue
new_match = self.mapping.get(match)
if new_match != match:
line = line.replace(match, new_match)
@@ -99,13 +138,17 @@ class SoSCleanerParser():
:returns: The obfuscated line
:rtype: ``str``
"""
- for pair in sorted(self.mapping.dataset.items(), reverse=True,
- key=lambda x: len(x[0])):
- key, val = pair
- if key in self.mapping.skip_keys:
- continue
- if key in string_data:
- string_data = string_data.replace(key, val)
+ if self.compile_regexes:
+ for item, reg in self.mapping.compiled_regexes:
+ if reg.search(string_data):
+ string_data = reg.sub(self.mapping.get(item), string_data)
+ else:
+ for k, ob in sorted(self.mapping.dataset.items(), reverse=True,
+ key=lambda x: len(x[0])):
+ if k in self.mapping.skip_keys:
+ continue
+ if k in string_data:
+ string_data = string_data.replace(k, ob)
return string_data
def get_map_contents(self):
diff --git a/sos/cleaner/parsers/hostname_parser.py b/sos/cleaner/parsers/hostname_parser.py
index 7fd0e698..debdf182 100644
--- a/sos/cleaner/parsers/hostname_parser.py
+++ b/sos/cleaner/parsers/hostname_parser.py
@@ -8,8 +8,6 @@
#
# See the LICENSE file in the source distribution for further information.
-import re
-
from sos.cleaner.parsers import SoSCleanerParser
from sos.cleaner.mappings.hostname_map import SoSHostnameMap
@@ -79,37 +77,6 @@ class SoSHostnameParser(SoSCleanerParser):
# only generate a mapping for fqdns but still record the
# short name here for later obfuscation with parse_line()
self.short_names.append(host)
+ self.mapping.add_regex_item(host)
else:
self.mapping.add(host)
-
- def parse_line(self, line):
- """Override the default parse_line() method to also check for the
- shortname of the host derived from the hostname.
- """
-
- def _check_line(ln, count, search, repl=None):
- """Perform a second manual check for substrings that may have been
- missed by regex matching
- """
- if search in self.mapping.skip_keys:
- return ln, count
- _reg = re.compile(search, re.I)
- if _reg.search(ln):
- return _reg.subn(self.mapping.get(repl or search), ln)
- return ln, count
-
- count = 0
- line, count = super(SoSHostnameParser, self).parse_line(line)
- # make an additional pass checking for '_' formatted substrings that
- # the regex patterns won't catch
- hosts = [h for h in self.mapping.dataset.keys() if '.' in h]
- for host in sorted(hosts, reverse=True, key=lambda x: len(x)):
- fqdn = host
- for c in '.-':
- fqdn = fqdn.replace(c, '_')
- line, count = _check_line(line, count, fqdn, host)
-
- for short_name in sorted(self.short_names, reverse=True):
- line, count = _check_line(line, count, short_name)
-
- return line, count
diff --git a/sos/cleaner/parsers/ip_parser.py b/sos/cleaner/parsers/ip_parser.py
index b007368c..ece9cd73 100644
--- a/sos/cleaner/parsers/ip_parser.py
+++ b/sos/cleaner/parsers/ip_parser.py
@@ -42,6 +42,7 @@ class SoSIPParser(SoSCleanerParser):
]
map_file_key = 'ip_map'
+ compile_regexes = False
def __init__(self, config):
self.mapping = SoSIPMap()
diff --git a/sos/cleaner/parsers/keyword_parser.py b/sos/cleaner/parsers/keyword_parser.py
index 362a1929..9a0f65ba 100644
--- a/sos/cleaner/parsers/keyword_parser.py
+++ b/sos/cleaner/parsers/keyword_parser.py
@@ -9,7 +9,6 @@
# See the LICENSE file in the source distribution for further information.
import os
-import re
from sos.cleaner.parsers import SoSCleanerParser
from sos.cleaner.mappings.keyword_map import SoSKeywordMap
@@ -40,14 +39,5 @@ class SoSKeywordParser(SoSCleanerParser):
with open(keyword_file, 'r') as kwf:
self.user_keywords.extend(kwf.read().splitlines())
- def generate_item_regexes(self):
- for kw in self.user_keywords:
- self.regexes[kw] = re.compile(kw, re.I)
-
- def parse_line(self, line):
- count = 0
- for kwrd, reg in sorted(self.regexes.items(), key=len, reverse=True):
- if reg.search(line):
- line, _count = reg.subn(self.mapping.get(kwrd.lower()), line)
- count += _count
- return line, count
+ def _parse_line(self, line):
+ return line, 0
diff --git a/sos/cleaner/parsers/mac_parser.py b/sos/cleaner/parsers/mac_parser.py
index b7ed2bae..4cbcbdce 100644
--- a/sos/cleaner/parsers/mac_parser.py
+++ b/sos/cleaner/parsers/mac_parser.py
@@ -41,6 +41,7 @@ class SoSMacParser(SoSCleanerParser):
'sos_commands/kernel/modinfo.*'
]
map_file_key = 'mac_map'
+ compile_regexes = False
def __init__(self, config):
self.mapping = SoSMacMap()
@@ -57,11 +58,8 @@ class SoSMacParser(SoSCleanerParser):
# just to be safe, call strip() to remove any padding
return match.strip()
- def parse_line(self, line):
+ def _parse_line(self, line):
count = 0
- for skip_pattern in self.skip_line_patterns:
- if re.match(skip_pattern, line, re.I):
- return line, count
for pattern in self.regex_patterns:
matches = [m[0] for m in re.findall(pattern, line, re.I)]
if matches:
diff --git a/sos/cleaner/parsers/username_parser.py b/sos/cleaner/parsers/username_parser.py
index 2853c860..6d9a9ff0 100644
--- a/sos/cleaner/parsers/username_parser.py
+++ b/sos/cleaner/parsers/username_parser.py
@@ -8,8 +8,6 @@
#
# See the LICENSE file in the source distribution for further information.
-import re
-
from sos.cleaner.parsers import SoSCleanerParser
from sos.cleaner.mappings.username_map import SoSUsernameMap
@@ -61,14 +59,5 @@ class SoSUsernameParser(SoSCleanerParser):
for each in users:
self.mapping.get(each)
- def generate_item_regexes(self):
- for user in self.mapping.dataset:
- self.regexes[user] = re.compile(user, re.I)
-
- def parse_line(self, line):
- count = 0
- for user, reg in sorted(self.regexes.items(), key=len, reverse=True):
- if reg.search(line):
- line, _count = reg.subn(self.mapping.get(user.lower()), line)
- count += _count
- return line, count
+ def _parse_line(self, line):
+ return line, 0