diff options
-rw-r--r-- | sos/cleaner/mappings/hostname_map.py | 59 | ||||
-rw-r--r-- | sos/cleaner/parsers/__init__.py | 3 | ||||
-rw-r--r-- | sos/cleaner/parsers/hostname_parser.py | 30 |
3 files changed, 81 insertions, 11 deletions
diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py index c9a44d8d..d4b2c88e 100644 --- a/sos/cleaner/mappings/hostname_map.py +++ b/sos/cleaner/mappings/hostname_map.py @@ -104,7 +104,7 @@ class SoSHostnameMap(SoSMap): host = domain.split('.') if len(host) == 1: # don't block on host's shortname - return True + return host[0] in self.hosts.keys() else: domain = host[0:-1] for known_domain in self._domains: @@ -113,12 +113,59 @@ class SoSHostnameMap(SoSMap): return False def get(self, item): - if item.startswith(('.', '_')): - item = item.lstrip('._') - item = item.strip() + prefix = '' + suffix = '' + final = None + # The regex pattern match may include a leading and/or trailing '_' + # character due to the need to use word boundary matching, so we need + # to strip these from the string during processing, but still keep them + # in the returned string to not mangle the string replacement in the + # context of the file or filename + while item.startswith(('.', '_')): + prefix += item[0] + item = item[1:] + while item.endswith(('.', '_')): + suffix += item[-1] + item = item[0:-1] if not self.domain_name_in_loaded_domains(item.lower()): return item - return super(SoSHostnameMap, self).get(item) + if item.endswith(('.yaml', '.yml', '.crt', '.key', '.pem')): + ext = '.' + item.split('.')[-1] + item = item.replace(ext, '') + suffix += ext + if item not in self.dataset.keys(): + # try to account for use of '-' in names that include hostnames + # and don't create new mappings for each of these + for _existing in sorted(self.dataset.keys(), reverse=True, + key=lambda x: len(x)): + _host_substr = False + _test = item.split(_existing) + _h = _existing.split('.') + # avoid considering a full FQDN match as a new match off of + # the hostname of an existing match + if _h[0] and _h[0] in self.hosts.keys(): + _host_substr = True + if len(_test) == 1 or not _test[0]: + # does not match existing obfuscation + continue + elif _test[0].endswith('.') and not _host_substr: + # new hostname in known domain + final = super(SoSHostnameMap, self).get(item) + break + elif item.split(_test[0]): + # string that includes existing FQDN obfuscation substring + # so, only obfuscate the FQDN part + try: + itm = item.split(_test[0])[1] + final = _test[0] + super(SoSHostnameMap, self).get(itm) + break + except Exception: + # fallback to still obfuscating the entire item + pass + + if not final: + final = super(SoSHostnameMap, self).get(item) + return prefix + final + suffix def sanitize_item(self, item): host = item.split('.') @@ -146,6 +193,8 @@ class SoSHostnameMap(SoSMap): """Obfuscate the short name of the host with an incremented counter based on the total number of obfuscated host names """ + if not hostname: + return hostname if hostname not in self.hosts: ob_host = "host%s" % self.host_count self.hosts[hostname] = ob_host diff --git a/sos/cleaner/parsers/__init__.py b/sos/cleaner/parsers/__init__.py index 84874475..57d2020a 100644 --- a/sos/cleaner/parsers/__init__.py +++ b/sos/cleaner/parsers/__init__.py @@ -87,7 +87,8 @@ class SoSCleanerParser(): for match in matches: match = match.strip() new_match = self.mapping.get(match) - line = line.replace(match, new_match) + if new_match != match: + line = line.replace(match, new_match) return line, count def parse_string_for_keys(self, string_data): diff --git a/sos/cleaner/parsers/hostname_parser.py b/sos/cleaner/parsers/hostname_parser.py index 9982024b..3de6bb08 100644 --- a/sos/cleaner/parsers/hostname_parser.py +++ b/sos/cleaner/parsers/hostname_parser.py @@ -18,7 +18,7 @@ class SoSHostnameParser(SoSCleanerParser): map_file_key = 'hostname_map' prep_map_file = 'sos_commands/host/hostname' regex_patterns = [ - r'(((\b|_)[a-zA-Z0-9-\.]{1,200}\.[a-zA-Z]{1,63}\b))' + r'(((\b|_)[a-zA-Z0-9-\.]{1,200}\.[a-zA-Z]{1,63}(\b|_)))' ] def __init__(self, conf_file=None, opt_domains=None): @@ -66,10 +66,30 @@ class SoSHostnameParser(SoSCleanerParser): """Override the default parse_line() method to also check for the shortname of the host derived from the hostname. """ + + def _check_line(ln, count, search, repl=None): + """Perform a second manual check for substrings that may have been + missed by regex matching + """ + if search in self.mapping.skip_keys: + return ln, count + if search in ln: + count += ln.count(search) + ln = ln.replace(search, self.mapping.get(repl or search)) + return ln, count + count = 0 line, count = super(SoSHostnameParser, self).parse_line(line) - for short_name in self.short_names: - if short_name in line: - count += 1 - line = line.replace(short_name, self.mapping.get(short_name)) + # make an additional pass checking for '_' formatted substrings that + # the regex patterns won't catch + hosts = [h for h in self.mapping.dataset.keys() if '.' in h] + for host in sorted(hosts, reverse=True, key=lambda x: len(x)): + fqdn = host + for c in '.-': + fqdn = fqdn.replace(c, '_') + line, count = _check_line(line, count, fqdn, host) + + for short_name in sorted(self.short_names, reverse=True): + line, count = _check_line(line, count, short_name) + return line, count |