author    Jake Hunsaker <jhunsake@redhat.com>  2020-04-09 11:19:16 -0400
committer Jake Hunsaker <jhunsake@redhat.com>  2020-04-22 10:01:00 -0400
commit    c3782e303b87fb1754825fa78a84b9ac52ebbcb3 (patch)
tree      5ae7d3bb0070eaeb8639fd0420447206e3cca5b5
parent    b1d1f30132a8719d2bdff3c1c25a0b183b094c82 (diff)
[collector] Initial import of sos-collector into sos
This commit represents the first import of the sos-collector project
into sos natively for 4.0. It is not expected to be functional as of
this commit.
A minimal integration has been done at this point - namely tying in
SoSCollector() as a SoSComponent() subclass and hooking up the parser
functions.
SoSCollector will load a policy in the same way as 'sos report', and
should use the same logging methods, which will be done in a future
commit in this series.
As a divergence from the standalone sos-collector project, this
integration aims to hook in host detection with the existing Policy
model provided by sos. Additionally, we should be able to drop the
Configuration dict-subclass approach by the time this series is ready
for merge.
Related: #1988
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
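
[Editor's note] For orientation, the cluster profile model imported by this
series can be sketched as follows. This is an illustrative example only (the
'mycluster' name and 'mycluster-server' package are invented), but it mirrors
the real profiles (jbon, kubernetes, ovirt, pacemaker, satellite) added in the
diff below: a profile subclasses Cluster, names the package that identifies
the cluster type, and enumerates nodes.

    from sos.collector.clusters import Cluster

    class mycluster(Cluster):
        '''Illustrative profile, not part of this commit'''

        cluster_name = 'My Example Cluster'
        # presence of this package marks the master as this cluster type
        packages = ('mycluster-server',)
        # extra sos plugins to enable on each node
        sos_plugins = ['networking']

        def get_nodes(self):
            # exec_master_cmd() returns a dict with 'status' and 'stdout',
            # as used by the kubernetes and ovirt profiles in this commit
            res = self.exec_master_cmd('mycluster list-nodes')
            if res['status'] == 0:
                return res['stdout'].splitlines()
            return []

The base class's check_enabled() already returns True when any package in
'packages' is installed on the master, so a minimal profile only needs
get_nodes().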
-rw-r--r--  sos/__init__.py                        |   4
-rw-r--r--  sos/collector/__init__.py              | 877
-rw-r--r--  sos/collector/clusters/__init__.py     | 223
-rw-r--r--  sos/collector/clusters/jbon.py         |  30
-rw-r--r--  sos/collector/clusters/kubernetes.py   |  54
-rw-r--r--  sos/collector/clusters/ovirt.py        | 181
-rw-r--r--  sos/collector/clusters/pacemaker.py    |  57
-rw-r--r--  sos/collector/clusters/satellite.py    |  39
-rw-r--r--  sos/collector/configuration.py         | 238
-rw-r--r--  sos/collector/exceptions.py            | 108
-rw-r--r--  sos/collector/hosts/__init__.py        | 125
-rw-r--r--  sos/collector/hosts/debian.py          |  31
-rw-r--r--  sos/collector/hosts/redhat.py          |  83
-rw-r--r--  sos/collector/sosnode.py               | 819
14 files changed, 2868 insertions(+), 1 deletion(-)
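
[Editor's note] One behaviour worth calling out before the full diff: entries
given via --nodes are treated as shell-style patterns and translated to
regexes when filtering the node list a cluster reports. A standalone sketch of
that matching logic, assuming the same fnmatch/re approach used by
compare_node_to_regex() in sos/collector/__init__.py below:

    import fnmatch
    import re

    def matches_node(node, patterns):
        '''Return True if node matches any shell-style pattern'''
        for pat in patterns:
            try:
                # fnmatch.translate() turns 'node*.example.com' into a
                # full-string regex, which re.match() then tests
                if re.match(fnmatch.translate(pat), node):
                    return True
            except re.error:
                continue
        return False

    matches_node('node1.example.com', ['node*.example.com'])  # True
    matches_node('db1.example.com', ['node*.example.com'])    # False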
diff --git a/sos/__init__.py b/sos/__init__.py index 5d333f7a..c0ac67a3 100644 --- a/sos/__init__.py +++ b/sos/__init__.py @@ -51,8 +51,10 @@ class SoS(): # of shorthand names to accept in place of the full subcommand # if no aliases are desired, pass an empty list import sos.report + import sos.collector self._components = { - 'report': (sos.report.SoSReport, ['rep']) + 'report': (sos.report.SoSReport, ['rep']), + 'collect': (sos.collector.SoSCollector, ['collector']) } # build the top-level parser _com_string = '' diff --git a/sos/collector/__init__.py b/sos/collector/__init__.py new file mode 100644 index 00000000..5999c894 --- /dev/null +++ b/sos/collector/__init__.py @@ -0,0 +1,877 @@ +# Copyright Red Hat 2020, Jake Hunsaker <jhunsake@redhat.com> + +# This file is part of the sos project: https://github.com/sosreport/sos +# +# This copyrighted material is made available to anyone wishing to use, +# modify, copy, or redistribute it subject to the terms and conditions of +# version 2 of the GNU General Public License. +# +# See the LICENSE file in the source distribution for further information. + +import fnmatch +import inspect +import json +import logging +import os +import random +import re +import string +import tarfile +import tempfile +import shutil +import subprocess +import sys + +from datetime import datetime +from concurrent.futures import ThreadPoolExecutor +from getpass import getpass +from pipes import quote +from textwrap import fill +from sos.collector.configuration import Configuration +from sos.collector.sosnode import SosNode +from sos.collector.exceptions import ControlPersistUnsupportedException +from sos.component import SoSComponent +from sos import __version__ + +COLLECTOR_LIB_DIR = '/var/lib/sos-collector' + + +class SoSCollector(SoSComponent): + """Collect an sos report from multiple nodes simultaneously + """ + + arg_defaults = { + 'alloptions': False, + 'all_logs': False, + 'become_root': False, + 'batch': False, + 'case_id': False, + 'cluster_type': None, + 'cluster_options': [], + 'chroot': 'auto', + 'enable_plugins': [], + 'group': None, + 'save_group': '', + 'image': '', + 'ssh_key': '', + 'insecure_sudo': False, + 'plugin_options': [], + 'list_options': False, + 'label': '', + 'log_size': 0, + 'skip_plugins': [], + 'nodes': [], + 'no_pkg_check': False, + 'no_local': False, + 'master': '', + 'only_plugins': [], + 'ssh_port': 22, + 'password': False, + 'password_per_node': False, + 'preset': '', + 'sos_opt_line': '', + 'ssh_user': 'root', + 'timeout': 600, + 'verify': False, + 'compression': 'auto' + } + + def __init__(self, parser, parsed_args, cmdline_args): + super(SoSCollector, self).__init__(parser, parsed_args, cmdline_args) + os.umask(0o77) + self.client_list = [] + self.node_list = [] + self.master = False + self.retrieved = 0 + self.need_local_sudo = False + self.config = Configuration(parsed_args) + if not self.config['list_options']: + try: + self._check_for_control_persist() + self.log_debug('Executing %s' % ' '.join(s for s in sys.argv)) + self.log_debug("Found cluster profiles: %s" + % self.clusters.keys()) + self.log_debug("Found supported host types: %s" + % self.config['host_types'].keys()) + self._parse_options() + self.prep() + except KeyboardInterrupt: + self._exit('Exiting on user cancel', 130) + except Exception: + raise + + @classmethod + def add_parser_options(cls, parser): + parser.add_argument('-a', '--alloptions', action='store_true', + help='Enable all sos options') + parser.add_argument('--all-logs', action='store_true', + 
help='Collect logs regardless of size') + parser.add_argument('-b', '--become', action='store_true', + dest='become_root', + help='Become root on the remote nodes') + parser.add_argument('--batch', action='store_true', + help='Do not prompt interactively (except passwords)') + parser.add_argument('--case-id', help='Specify case number') + parser.add_argument('--cluster-type', + help='Specify a type of cluster profile') + parser.add_argument('-c', '--cluster-option', dest='cluster_options', + action='append', + help=('Specify a cluster options used by a profile' + ' and takes the form of cluster.option=value' + ) + ) + parser.add_argument('--chroot', default='', + choices=['auto', 'always', 'never'], + help="chroot executed commands to SYSROOT") + parser.add_argument('-e', '--enable-plugins', action="append", + help='Enable specific plugins for sosreport') + parser.add_argument('--group', default=None, + help='Use a predefined group JSON file') + parser.add_argument('--save-group', default='', + help='Save the resulting node list to a group') + parser.add_argument('--image', + help=('Specify the container image to use for ' + 'containerized hosts. Defaults to the ' + 'rhel7/support-tools image')) + parser.add_argument('-i', '--ssh-key', help='Specify an ssh key to use') + parser.add_argument('--insecure-sudo', action='store_true', + help='Use when passwordless sudo is configured') + parser.add_argument('-k', '--plugin-options', action="append", + help='Plugin option as plugname.option=value') + parser.add_argument('-l', '--list-options', action="store_true", + help='List options available for profiles') + parser.add_argument('--label', help='Assign a label to the archives') + parser.add_argument('--log-size', default=0, type=int, + help='Limit the size of individual logs (in MiB)') + parser.add_argument('-n', '--skip-plugins', action="append", + help='Skip these plugins') + parser.add_argument('--nodes', action="append", + help='Provide a comma delimited list of nodes, or a ' + 'regex to match against') + parser.add_argument('--no-pkg-check', action='store_true', + help=('Do not run package checks. Use this ' + 'with --cluster-type if there are rpm ' + 'or apt issues on node' + ) + ) + parser.add_argument('--no-local', action='store_true', + help='Do not collect a sosreport from localhost') + parser.add_argument('--master', help='Specify a remote master node') + parser.add_argument('-o', '--only-plugins', action="append", + help='Run these plugins only') + parser.add_argument('-p', '--ssh-port', type=int, + help='Specify SSH port for all nodes') + parser.add_argument('--password', action='store_true', default=False, + help='Prompt for user password for nodes') + parser.add_argument('--password-per-node', action='store_true', + default=False, + help='Prompt for password separately for each node') + parser.add_argument('--preset', default='', required=False, + help='Specify a sos preset to use') + parser.add_argument('--sos-cmd', dest='sos_opt_line', + help=("Manually specify the commandline options for " + "sosreport on remote nodes") + ) + parser.add_argument('--ssh-user', + help='Specify an SSH user. Default root') + parser.add_argument('--timeout', type=int, required=False, + help='Timeout for sosreport on each node. Default 300.' 
+ ) + parser.add_argument('--verify', action="store_true", + help="perform data verification during collection") + parser.add_argument('-z', '--compression-type', dest="compression", + choices=['auto', 'gzip', 'bzip2', 'xz'], + help="compression technology to use") + + def _check_for_control_persist(self): + '''Checks to see if the local system supported SSH ControlPersist. + + ControlPersist allows OpenSSH to keep a single open connection to a + remote host rather than building a new session each time. This is the + same feature that Ansible uses in place of paramiko, which we have a + need to drop in sos-collector. + + This check relies on feedback from the ssh binary. The command being + run should always generate stderr output, but depending on what that + output reads we can determine if ControlPersist is supported or not. + + For our purposes, a host that does not support ControlPersist is not + able to run sos-collector. + + Returns + True if ControlPersist is supported, else raise Exception. + ''' + ssh_cmd = ['ssh', '-o', 'ControlPersist'] + cmd = subprocess.Popen(ssh_cmd, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + out, err = cmd.communicate() + err = err.decode('utf-8') + if 'Bad configuration option' in err or 'Usage:' in err: + raise ControlPersistUnsupportedException + return True + + def _exit(self, msg, error=1): + '''Used to safely terminate if sos-collector encounters an error''' + self.log_error(msg) + try: + self.close_all_connections() + except Exception: + pass + sys.exit(error) + + def _parse_options(self): + '''If there are cluster options set on the CLI, override the defaults + ''' + if self.config['cluster_options']: + for opt in self.config['cluster_options']: + match = False + for clust in self.clusters: + for option in self.clusters[clust].options: + if opt.name == option.name: + match = True + break + if not match: + self._exit('Unknown cluster option provided: %s.%s' + % (opt.cluster, opt.name)) + + def _validate_option(self, default, cli): + '''Checks to make sure that the option given on the CLI is valid. + Valid in this sense means that the type of value given matches what a + cluster profile expects (str for str, bool for bool, etc). + + For bool options, this will also convert the string equivalent to an + actual boolean value + ''' + if not default.opt_type == bool: + if not default.opt_type == cli.opt_type: + msg = "Invalid option type for %s. Expected %s got %s" + self._exit(msg % (cli.name, default.opt_type, cli.opt_type)) + return cli.value + else: + val = cli.value.lower() + if val not in ['true', 'on', 'false', 'off']: + msg = ("Invalid value for %s. 
Accepted values are: 'true', " + "'false', 'on', 'off'") + self._exit(msg % cli.name) + else: + if val in ['true', 'on']: + return True + else: + return False + + def log_info(self, msg): + '''Log info messages to both console and log file''' + self.logger.info(msg) + self.console.info(msg) + + def log_warn(self, msg): + '''Log warn messages to both console and log file''' + self.logger.warn(msg) + self.console.warn('WARNING: %s' % msg) + + def log_error(self, msg): + '''Log error messages to both console and log file''' + self.logger.error(msg) + self.console.error(msg) + + def log_debug(self, msg): + '''Log debug message to both console and log file''' + caller = inspect.stack()[1][3] + msg = '[sos_collector:%s] %s' % (caller, msg) + self.logger.debug(msg) + if self.config['verbose']: + self.console.debug(msg) + + def create_tmp_dir(self, location='/var/tmp'): + '''Creates a temp directory to transfer sosreports to''' + tmpdir = tempfile.mkdtemp(prefix='sos-collector-', dir=location) + self.config['tmp_dir'] = tmpdir + self.config['tmp_dir_created'] = True + + def list_options(self): + '''Display options for available clusters''' + + sys.stdout.write('\nThe following clusters are supported by this ' + 'installation\n') + sys.stdout.write('Use the short name with --cluster-type or cluster ' + 'options (-c)\n\n') + for cluster in sorted(self.clusters): + sys.stdout.write(" {:<15} {:30}\n".format( + cluster, + self.clusters[cluster].cluster_name)) + + _opts = {} + for _cluster in self.clusters: + for opt in self.clusters[_cluster].options: + if opt.name not in _opts.keys(): + _opts[opt.name] = opt + else: + for clust in opt.cluster: + if clust not in _opts[opt.name].cluster: + _opts[opt.name].cluster.append(clust) + + sys.stdout.write('\nThe following cluster options are available:\n\n') + sys.stdout.write(' {:25} {:15} {:<10} {:10} {:<}\n'.format( + 'Cluster', + 'Option Name', + 'Type', + 'Default', + 'Description' + )) + + for _opt in sorted(_opts, key=lambda x: _opts[x].cluster): + opt = _opts[_opt] + optln = ' {:25} {:15} {:<10} {:<10} {:<10}\n'.format( + ', '.join(c for c in sorted(opt.cluster)), + opt.name, + opt.opt_type.__name__, + str(opt.value), + opt.description) + sys.stdout.write(optln) + sys.stdout.write('\nOptions take the form of cluster.name=value' + '\nE.G. "ovirt.no-database=True" or ' + '"pacemaker.offline=False"\n') + + def delete_tmp_dir(self): + '''Removes the temp directory and all collected sosreports''' + shutil.rmtree(self.config['tmp_dir']) + + def _get_archive_name(self): + '''Generates a name for the tarball archive''' + nstr = 'sos-collector' + if self.config['label']: + nstr += '-%s' % self.config['label'] + if self.config['case_id']: + nstr += '-%s' % self.config['case_id'] + dt = datetime.strftime(datetime.now(), '%Y-%m-%d') + + try: + string.lowercase = string.ascii_lowercase + except NameError: + pass + + rand = ''.join(random.choice(string.lowercase) for x in range(5)) + return '%s-%s-%s' % (nstr, dt, rand) + + def _get_archive_path(self): + '''Returns the path, including filename, of the tarball we build + that contains the collected sosreports + ''' + self.arc_name = self._get_archive_name() + compr = 'gz' + return self.config['out_dir'] + self.arc_name + '.tar.' + compr + + def _fmt_msg(self, msg): + width = 80 + _fmt = '' + for line in msg.splitlines(): + _fmt = _fmt + fill(line, width, replace_whitespace=False) + '\n' + return _fmt + + def _load_group_config(self): + ''' + Attempts to load the host group specified on the command line. 
+ Host groups are defined via JSON files, typically saved under + /var/lib/sos-collector/, although users can specify a full filepath + on the commandline to point to one existing anywhere on the system + + Host groups define a list of nodes and/or regexes and optionally the + master and cluster-type options. + ''' + if os.path.exists(self.config['group']): + fname = self.config['group'] + elif os.path.exists( + os.path.join(COLLECTOR_LIB_DIR, self.config['group']) + ): + fname = os.path.join(COLLECTOR_LIB_DIR, self.config['group']) + else: + raise OSError('Group not found') + + self.log_debug("Loading host group %s" % fname) + + with open(fname, 'r') as hf: + _group = json.load(hf) + for key in ['master', 'cluster_type']: + if _group[key]: + self.log_debug("Setting option '%s' to '%s' per host group" + % (key, _group[key])) + self.config[key] = _group[key] + if _group['nodes']: + self.log_debug("Adding %s to node list" % _group['nodes']) + self.config['nodes'].extend(_group['nodes']) + + def write_host_group(self): + ''' + Saves the results of this run of sos-collector to a host group file + on the system so it can be used later on. + + The host group will save the options master, cluster_type, and nodes + as determined by sos-collector prior to execution of sosreports. + ''' + cfg = { + 'name': self.config['save_group'], + 'master': self.config['master'], + 'cluster_type': self.config['cluster_type'], + 'nodes': [n for n in self.node_list] + } + if not os.path.isdir(COLLECTOR_LIB_DIR): + raise OSError("%s no such directory" % COLLECTOR_LIB_DIR) + fname = COLLECTOR_LIB_DIR + '/' + cfg['name'] + with open(fname, 'w') as hf: + json.dump(cfg, hf) + os.chmod(fname, 0o644) + return fname + + def prep(self): + '''Based on configuration, performs setup for collection''' + disclaimer = ("""\ +This utility is used to collect sosreports from multiple \ +nodes simultaneously. It uses OpenSSH's ControlPersist feature \ +to connect to nodes and run commands remotely. If your system \ +installation of OpenSSH is older than 5.6, please upgrade. + +An archive of sosreport tarballs collected from the nodes will be \ +generated in %s and may be provided to an appropriate support representative. + +The generated archive may contain data considered sensitive \ +and its content should be reviewed by the originating \ +organization before being passed to any third party. + +No configuration changes will be made to the system running \ +this utility or remote systems that it connects to. 
+""") + self.console.info("\nsos-collector (version %s)\n" % __version__) + intro_msg = self._fmt_msg(disclaimer % self.config['tmp_dir']) + self.console.info(intro_msg) + prompt = "\nPress ENTER to continue, or CTRL-C to quit\n" + if not self.config['batch']: + input(prompt) + + if (not self.config['password'] and not + self.config['password_per_node']): + self.log_debug('password not specified, assuming SSH keys') + msg = ('sos-collector ASSUMES that SSH keys are installed on all ' + 'nodes unless the --password option is provided.\n') + self.console.info(self._fmt_msg(msg)) + + if self.config['password'] or self.config['password_per_node']: + self.log_debug('password specified, not using SSH keys') + msg = ('Provide the SSH password for user %s: ' + % self.config['ssh_user']) + self.config['password'] = getpass(prompt=msg) + + if self.config['need_sudo'] and not self.config['insecure_sudo']: + if not self.config['password']: + self.log_debug('non-root user specified, will request ' + 'sudo password') + msg = ('A non-root user has been provided. Provide sudo ' + 'password for %s on remote nodes: ' + % self.config['ssh_user']) + self.config['sudo_pw'] = getpass(prompt=msg) + else: + if not self.config['insecure_sudo']: + self.config['sudo_pw'] = self.config['password'] + + if self.config['become_root']: + if not self.config['ssh_user'] == 'root': + self.log_debug('non-root user asking to become root remotely') + msg = ('User %s will attempt to become root. ' + 'Provide root password: ' % self.config['ssh_user']) + self.config['root_password'] = getpass(prompt=msg) + self.config['need_sudo'] = False + else: + self.log_info('Option to become root but ssh user is root.' + ' Ignoring request to change user on node') + self.config['become_root'] = False + + if self.config['group']: + try: + self._load_group_config() + except Exception as err: + self.log_error("Could not load specified group %s: %s" + % (self.config['group'], err)) + + if self.config['master']: + self.connect_to_master() + self.config['no_local'] = True + else: + try: + self.master = SosNode('localhost', self.config) + except Exception as err: + self.log_debug("Unable to determine local installation: %s" % + err) + self._exit('Unable to determine local installation. Use the ' + '--no-local option if localhost should not be ' + 'included.\nAborting...\n', 1) + + if self.config['cluster_type']: + if self.config['cluster_type'] == 'none': + self.config['cluster'] = self.clusters['jbon'] + else: + self.config['cluster'] = self.clusters[ + self.config['cluster_type'] + ] + self.config['cluster'].master = self.master + else: + self.determine_cluster() + if self.config['cluster'] is None and not self.config['nodes']: + msg = ('Cluster type could not be determined and no nodes provided' + '\nAborting...') + self._exit(msg, 1) + if self.config['cluster']: + self.config['cluster'].setup() + self.config['cluster'].modify_sos_cmd() + self.get_nodes() + if self.config['save_group']: + gname = self.config['save_group'] + try: + fname = self.write_host_group() + self.log_info("Wrote group '%s' to %s" % (gname, fname)) + except Exception as err: + self.log_error("Could not save group %s: %s" % (gname, err)) + self.intro() + self.configure_sos_cmd() + + def intro(self): + '''Prints initial messages and collects user and case if not + provided already. 
+ ''' + self.console.info('') + + if not self.node_list and not self.master.connected: + self._exit('No nodes were detected, or nodes do not have sos ' + 'installed.\nAborting...') + + self.console.info('The following is a list of nodes to collect from:') + if self.master.connected: + self.console.info('\t%-*s' % (self.config['hostlen'], + self.config['master'])) + + for node in sorted(self.node_list): + self.console.info("\t%-*s" % (self.config['hostlen'], node)) + + self.console.info('') + + if not self.config['case_id'] and not self.config['batch']: + msg = 'Please enter the case id you are collecting reports for: ' + self.config['case_id'] = input(msg) + + def configure_sos_cmd(self): + '''Configures the sosreport command that is run on the nodes''' + if self.config['sos_opt_line']: + filt = ['&', '|', '>', '<', ';'] + if any(f in self.config['sos_opt_line'] for f in filt): + self.log_warn('Possible shell script found in provided sos ' + 'command. Ignoring --sos-cmd option entirely.') + self.config['sos_opt_line'] = None + else: + self.config['sos_cmd'] = '%s %s' % ( + self.config['sos_cmd'], quote(self.config['sos_opt_line'])) + self.log_debug("User specified manual sosreport command. " + "Command set to %s" % self.config['sos_cmd']) + return True + if self.config['case_id']: + self.config['sos_cmd'] += ' --case-id=%s' % ( + quote(self.config['case_id'])) + if self.config['alloptions']: + self.config['sos_cmd'] += ' --alloptions' + if self.config['all_logs']: + self.config['sos_cmd'] += ' --all-logs' + if self.config['verify']: + self.config['sos_cmd'] += ' --verify' + if self.config['log_size']: + self.config['sos_cmd'] += (' --log-size=%s' + % quote(self.config['log_size'])) + if self.config['sysroot']: + self.config['sos_cmd'] += ' -s %s' % quote(self.config['sysroot']) + if self.config['chroot']: + self.config['sos_cmd'] += ' -c %s' % quote(self.config['chroot']) + if self.config['compression']: + self.config['sos_cmd'] += ' -z %s' % ( + quote(self.config['compression'])) + self.log_debug('Initial sos cmd set to %s' % self.config['sos_cmd']) + + def connect_to_master(self): + '''If run with --master, we will run cluster checks again that + instead of the localhost. + ''' + try: + self.master = SosNode(self.config['master'], self.config) + except Exception as e: + self.log_debug('Failed to connect to master: %s' % e) + self._exit('Could not connect to master node. Aborting...', 1) + + def determine_cluster(self): + '''This sets the cluster type and loads that cluster's cluster. + + If no cluster type is matched and no list of nodes is provided by + the user, then we abort. + + If a list of nodes is given, this is not run, however the cluster + can still be run if the user sets a --cluster-type manually + ''' + checks = list(self.clusters.values()) + for cluster in self.clusters.values(): + checks.remove(cluster) + cluster.master = self.master + if cluster.check_enabled(): + cname = cluster.__class__.__name__ + self.log_debug("Installation matches %s, checking for layered " + "profiles" % cname) + for remaining in checks: + if issubclass(remaining.__class__, cluster.__class__): + rname = remaining.__class__.__name__ + self.log_debug("Layered profile %s found. 
" + "Checking installation" + % rname) + remaining.master = self.master + if remaining.check_enabled(): + self.log_debug("Installation matches both layered " + "profile %s and base profile %s, " + "setting cluster type to layered " + "profile" % (rname, cname)) + cluster = remaining + break + + self.config['cluster'] = cluster + self.config['cluster_type'] = cluster.name() + self.log_info( + 'Cluster type set to %s' % self.config['cluster_type']) + break + + def get_nodes_from_cluster(self): + '''Collects the list of nodes from the determined cluster cluster''' + if self.config['cluster_type']: + nodes = self.config['cluster']._get_nodes() + self.log_debug('Node list: %s' % nodes) + return nodes + + def reduce_node_list(self): + '''Reduce duplicate entries of the localhost and/or master node + if applicable''' + if (self.config['hostname'] in self.node_list and + self.config['no_local']): + self.node_list.remove(self.config['hostname']) + for i in self.config['ip_addrs']: + if i in self.node_list: + self.node_list.remove(i) + # remove the master node from the list, since we already have + # an open session to it. + if self.config['master']: + for n in self.node_list: + if n == self.master.hostname or n == self.config['master']: + self.node_list.remove(n) + self.node_list = list(set(n for n in self.node_list if n)) + self.log_debug('Node list reduced to %s' % self.node_list) + + def compare_node_to_regex(self, node): + '''Compares a discovered node name to a provided list of nodes from + the user. If there is not a match, the node is removed from the list''' + for regex in self.config['nodes']: + try: + regex = fnmatch.translate(regex) + if re.match(regex, node): + return True + except re.error as err: + msg = 'Error comparing %s to provided node regex %s: %s' + self.log_debug(msg % (node, regex, err)) + return False + + def get_nodes(self): + ''' Sets the list of nodes to collect sosreports from ''' + if not self.config['master'] and not self.config['cluster']: + msg = ('Could not determine a cluster type and no list of ' + 'nodes or master node was provided.\nAborting...' 
+ ) + self._exit(msg) + + try: + nodes = self.get_nodes_from_cluster() + if self.config['nodes']: + for node in nodes: + if self.compare_node_to_regex(node): + self.node_list.append(node) + else: + self.node_list = nodes + except Exception as e: + self.log_debug("Error parsing node list: %s" % e) + self.log_debug('Setting node list to --nodes option') + self.node_list = self.config['nodes'] + for node in self.node_list: + if any(i in node for i in ('*', '\\', '?', '(', ')', '/')): + self.node_list.remove(node) + + # force add any non-regex node strings from nodes option + if self.config['nodes']: + for node in self.config['nodes']: + if any(i in node for i in '*\\?()/[]'): + continue + if node not in self.node_list: + self.log_debug("Force adding %s to node list" % node) + self.node_list.append(node) + + if not self.config['master']: + host = self.config['hostname'].split('.')[0] + # trust the local hostname before the node report from cluster + for node in self.node_list: + if host == node.split('.')[0]: + self.node_list.remove(node) + self.node_list.append(self.config['hostname']) + self.reduce_node_list() + try: + self.config['hostlen'] = len(max(self.node_list, key=len)) + except (TypeError, ValueError): + self.config['hostlen'] = len(self.config['master']) + + def _connect_to_node(self, node): + '''Try to connect to the node, and if we can add to the client list to + run sosreport on + + Positional arguments + node - a tuple specifying (address, password). If no password, set + to None + ''' + try: + client = SosNode(node[0], self.config, password=node[1]) + if client.connected: + self.client_list.append(client) + else: + client.close_ssh_session() + except Exception: + pass + + def collect(self): + ''' For each node, start a collection thread and then tar all + collected sosreports ''' + if self.master.connected: + self.client_list.append(self.master) + + self.console.info("\nConnecting to nodes...") + filters = [self.master.address, self.master.hostname] + nodes = [(n, None) for n in self.node_list if n not in filters] + + if self.config['password_per_node']: + _nodes = [] + for node in nodes: + msg = ("Please enter the password for %s@%s: " + % (self.config['ssh_user'], node[0])) + node_pwd = getpass(msg) + _nodes.append((node[0], node_pwd)) + nodes = _nodes + + try: + pool = ThreadPoolExecutor(self.config['threads']) + pool.map(self._connect_to_node, nodes, chunksize=1) + pool.shutdown(wait=True) + + self.report_num = len(self.client_list) + if self.config['no_local'] and self.master.address == 'localhost': + self.report_num -= 1 + + self.console.info("\nBeginning collection of sosreports from %s " + "nodes, collecting a maximum of %s " + "concurrently\n" + % (self.report_num, self.config['threads']) + ) + + pool = ThreadPoolExecutor(self.config['threads']) + pool.map(self._collect, self.client_list, chunksize=1) + pool.shutdown(wait=True) + except KeyboardInterrupt: + self.log_error('Exiting on user cancel\n') + os._exit(130) + except Exception as err: + self.log_error('Could not connect to nodes: %s' % err) + os._exit(1) + + if hasattr(self.config['cluster'], 'run_extra_cmd'): + self.console.info('Collecting additional data from master node...') + files = self.config['cluster']._run_extra_cmd() + if files: + self.master.collect_extra_cmd(files) + msg = '\nSuccessfully captured %s of %s sosreports' + self.log_info(msg % (self.retrieved, self.report_num)) + self.close_all_connections() + if self.retrieved > 0: + self.create_cluster_archive() + else: + msg = 'No sosreports were 
collected, nothing to archive...' + self._exit(msg, 1) + + def _collect(self, client): + '''Runs sosreport on each node''' + try: + if not client.local: + client.sosreport() + else: + if not self.config['no_local']: + client.sosreport() + if client.retrieved: + self.retrieved += 1 + except Exception as err: + self.log_error("Error running sosreport: %s" % err) + + def close_all_connections(self): + '''Close all ssh sessions for nodes''' + for client in self.client_list: + self.log_debug('Closing SSH connection to %s' % client.address) + client.close_ssh_session() + + def create_cluster_archive(self): + '''Calls for creation of tar archive then cleans up the temporary + files created by sos-collector''' + self.log_info('Creating archive of sosreports...') + self.create_sos_archive() + if self.archive: + self.logger.info('Archive created as %s' % self.archive) + self.cleanup() + self.console.info('\nThe following archive has been created. ' + 'Please provide it to your support team.') + self.console.info(' %s' % self.archive) + + def create_sos_archive(self): + '''Creates a tar archive containing all collected sosreports''' + try: + self.archive = self._get_archive_path() + with tarfile.open(self.archive, "w:gz") as tar: + for host in self.client_list: + for fname in host.file_list: + try: + if '.md5' in fname: + arc_name = (self.arc_name + '/md5/' + + fname.split('/')[-1]) + else: + arc_name = (self.arc_name + '/' + + fname.split('/')[-1]) + tar.add( + os.path.join(self.config['tmp_dir'], fname), + arcname=arc_name + ) + except Exception as err: + self.log_error("Could not add %s to archive: %s" + % (arc_name, err)) + tar.add( + self.logfile.name, + arcname=self.arc_name + '/logs/sos-collector.log' + ) + tar.add( + self.console_log_file.name, + arcname=self.arc_name + '/logs/ui.log' + ) + tar.close() + except Exception as e: + msg = 'Could not create archive: %s' % e + self._exit(msg, 2) + + def cleanup(self): + ''' Removes the tmp dir and all sosarchives therein. + + If tmp dir was supplied by user, only the sos archives within + that dir are removed. + ''' + if self.config['tmp_dir_created']: + self.delete_tmp_dir() + else: + for f in os.listdir(self.config['tmp_dir']): + if re.search('sosreport-*tar*', f): + os.remove(os.path.join(self.config['tmp_dir'], f)) diff --git a/sos/collector/clusters/__init__.py b/sos/collector/clusters/__init__.py new file mode 100644 index 00000000..7c3d01a8 --- /dev/null +++ b/sos/collector/clusters/__init__.py @@ -0,0 +1,223 @@ +# Copyright Red Hat 2020, Jake Hunsaker <jhunsake@redhat.com> + +# This file is part of the sos project: https://github.com/sosreport/sos +# +# This copyrighted material is made available to anyone wishing to use, +# modify, copy, or redistribute it subject to the terms and conditions of +# version 2 of the GNU General Public License. +# +# See the LICENSE file in the source distribution for further information. + +import logging +import subprocess + +from sos.collector.configuration import ClusterOption + + +class Cluster(object): + + option_list = [] + packages = ('',) + sos_plugins = [] + sos_plugin_options = {} + sos_preset = '' + cluster_name = None + + def __init__(self, config): + '''This is the class that cluster profile should subclass in order to + add support for different clustering technologies and environments to + sos-collector. 
+ + A profile should at minimum define a package that indicates the node is + configured for the type of cluster the profile is intended to serve and + then additionall be able to return a list of enumerated nodes via the + get_nodes() method + ''' + + self.master = None + self.config = config + self.cluster_type = [self.__class__.__name__] + for cls in self.__class__.__bases__: + if cls.__name__ != 'Cluster': + self.cluster_type.append(cls.__name__) + self.node_list = None + self.logger = logging.getLogger('sos_collector') + self.console = logging.getLogger('sos_collector_console') + self.options = [] + self._get_options() + + @classmethod + def name(cls): + '''Returns the cluster's name as a string. + ''' + if cls.cluster_name: + return cls.cluster_name + return cls.__name__.lower() + + def _get_options(self): + '''Loads the options defined by a cluster and sets the default value''' + for opt in self.option_list: + option = ClusterOption(name=opt[0], opt_type=opt[1].__class__, + value=opt[1], cluster=self.cluster_type, + description=opt[2]) + self.options.append(option) + + def _fmt_msg(self, msg): + return '[%s] %s' % (self.cluster_type, msg) + + def log_info(self, msg): + '''Used to print info messages''' + self.logger.info(self._fmt_msg(msg)) + self.console.info(msg) + + def log_error(self, msg): + '''Used to print error messages''' + self.logger.error(self._fmt_msg(msg)) + self.console.error(msg) + + def log_debug(self, msg): + '''Used to print debug messages''' + self.logger.debug(self._fmt_msg(msg)) + if self.config['verbose']: + self.console.debug(self._fmt_msg(msg)) + + def log_warn(self, msg): + '''Used to print warning messages''' + self.logger.warn(self._fmt_msg(msg)) + self.console.warn(msg) + + def get_option(self, option): + '''This is used to by clusters to check if a cluster option was + supplied to sos-collector. + ''' + # check CLI before defaults + for opt in self.config['cluster_options']: + if opt.name == option and opt.cluster in self.cluster_type: + return opt.value + # provide defaults otherwise + for opt in self.options: + if opt.name == option: + return opt.value + return False + + def exec_master_cmd(self, cmd, need_root=False): + '''Used to retrieve output from a (master) node in a cluster''' + res = self.master.run_command(cmd, get_pty=True, need_root=need_root) + if res['stdout']: + res['stdout'] = res['stdout'].replace('Password:', '') + return res + + def setup(self): + '''This MAY be used by a cluster to do prep work in case there are + extra commands to be run even if a node list is given by the user, and + thus get_nodes() would not be called + ''' + pass + + def check_enabled(self): + '''This may be overridden by clusters + + This is called by sos-collector on each cluster type that exists, and + is meant to return True when the cluster type matches a criteria + that indicates that is the cluster type is in use. + + Only the first cluster type to determine a match is run + ''' + for pkg in self.packages: + if self.master.is_installed(pkg): + return True + return False + + def get_nodes(self): + '''This MUST be overridden by a cluster. 
+ A cluster should use this method to return a list or string that + contains all the nodes that a report should be collected from + ''' + pass + + def _get_nodes(self): + try: + return self.format_node_list() + except Exception as e: + self.log_debug('Failed to get node list: %s' % e) + return [] + + def get_node_label(self, node): + '''Used by SosNode() to retrieve the appropriate label from the cluster + as set by set_node_label() in the cluster profile. + ''' + return self.set_node_label(node) + + def set_node_label(self, node): + '''This may be overridden by clusters. + + If there is a distinction between masters and nodes, or types of nodes, + then this can be used to label the sosreport archive differently. + ''' + return '' + + def modify_sos_cmd(self): + '''This is used to modify the sosreport command run on the nodes. + By default, sosreport is run without any options, using this will + allow the profile to specify what plugins to run or not and what + options to use. + + This will NOT override user supplied options. + ''' + if self.sos_preset: + if not self.config['preset']: + self.config['preset'] = self.sos_preset + else: + self.log_debug('Cluster specified preset %s but user has also ' + 'defined a preset. Using user specification.' + % self.sos_preset) + if self.sos_plugins: + for plug in self.sos_plugins: + if plug not in self.config['sos_cmd']: + self.config['enable_plugins'].append(plug) + if self.sos_plugin_options: + for opt in self.sos_plugin_options: + if not any(opt in o for o in self.config['plugin_options']): + option = '%s=%s' % (opt, self.sos_plugin_options[opt]) + self.config['plugin_options'].append(option) + + def format_node_list(self): + '''Format the returned list of nodes from a cluster into a known + format. This being a list that contains no duplicates + ''' + try: + nodes = self.get_nodes() + except Exception as e: + self.log_error('\n%s failed to enumerate nodes: %s' + % (self.cluster_type, e)) + raise + if isinstance(nodes, list): + node_list = [n.strip() for n in nodes if n] + elif isinstance(nodes, str): + node_list = [n.split(',').strip() for n in nodes] + node_list = list(set(node_list)) + for node in node_list: + if node.startswith(('-', '_', '(', ')', '[', ']', '/', '\\')): + node_list.remove(node) + return node_list + + def _run_extra_cmd(self): + '''Ensures that any files returned by a cluster's run_extra_cmd() + method are properly typed as a list for iterative collection. If any + of the files are an additional sosreport (e.g. the ovirt db dump) then + the md5 sum file is automatically added to the list + ''' + files = [] + try: + res = self.run_extra_cmd() + if res: + if not isinstance(res, list): + res = [res] + for extra_file in res: + extra_file = extra_file.strip() + files.append(extra_file) + if 'sosreport' in extra_file: + files.append(extra_file + '.md5') + except AttributeError: + pass + return files diff --git a/sos/collector/clusters/jbon.py b/sos/collector/clusters/jbon.py new file mode 100644 index 00000000..488fbd16 --- /dev/null +++ b/sos/collector/clusters/jbon.py @@ -0,0 +1,30 @@ +# Copyright Red Hat 2020, Jake Hunsaker <jhunsake@redhat.com> + +# This file is part of the sos project: https://github.com/sosreport/sos +# +# This copyrighted material is made available to anyone wishing to use, +# modify, copy, or redistribute it subject to the terms and conditions of +# version 2 of the GNU General Public License. +# +# See the LICENSE file in the source distribution for further information. 
+ +from sos.collector.clusters import Cluster + + +class jbon(Cluster): + '''Just a Bunch of Nodes + + Used when --cluster-type=none (or jbon), to avoid cluster checks, and just + use the provided --nodes list + ''' + + cluster_name = 'Just a Bunch Of Nodes (no cluster)' + packages = None + + def get_nodes(self): + return [] + + def check_enabled(self): + # This should never be called, but as insurance explicitly never + # allow this to be enabled via the determine_cluster() path + return False diff --git a/sos/collector/clusters/kubernetes.py b/sos/collector/clusters/kubernetes.py new file mode 100644 index 00000000..6a867e31 --- /dev/null +++ b/sos/collector/clusters/kubernetes.py @@ -0,0 +1,54 @@ +# Copyright Red Hat 2020, Jake Hunsaker <jhunsake@redhat.com> + +# This file is part of the sos project: https://github.com/sosreport/sos +# +# This copyrighted material is made available to anyone wishing to use, +# modify, copy, or redistribute it subject to the terms and conditions of +# version 2 of the GNU General Public License. +# +# See the LICENSE file in the source distribution for further information. + +from pipes import quote +from sos.collector.clusters import Cluster + + +class kubernetes(Cluster): + + cluster_name = 'Community Kubernetes' + packages = ('kubernetes-master',) + sos_plugins = ['kubernetes'] + sos_plugin_options = {'kubernetes.all': 'on'} + + cmd = 'kubectl' + + option_list = [ + ('label', '', 'Filter node list to those with matching label'), + ('role', '', 'Filter node list to those with matching role') + ] + + def get_nodes(self): + self.cmd += ' get nodes' + if self.get_option('label'): + self.cmd += ' -l %s ' % quote(self.get_option('label')) + res = self.exec_master_cmd(self.cmd) + if res['status'] == 0: + nodes = [] + roles = [x for x in self.get_option('role').split(',') if x] + for nodeln in res['stdout'].splitlines()[1:]: + node = nodeln.split() + if not roles: + nodes.append(node[0]) + else: + if node[2] in roles: + nodes.append(node[0]) + return nodes + else: + raise Exception('Node enumeration did not return usable output') + + +class openshift(kubernetes): + + cluster_name = 'OpenShift Container Platform' + packages = ('atomic-openshift',) + sos_preset = 'ocp' + cmd = 'oc' diff --git a/sos/collector/clusters/ovirt.py b/sos/collector/clusters/ovirt.py new file mode 100644 index 00000000..5b34f480 --- /dev/null +++ b/sos/collector/clusters/ovirt.py @@ -0,0 +1,181 @@ +# Copyright Red Hat 2020, Jake Hunsaker <jhunsake@redhat.com> + +# This file is part of the sos project: https://github.com/sosreport/sos +# +# This copyrighted material is made available to anyone wishing to use, +# modify, copy, or redistribute it subject to the terms and conditions of +# version 2 of the GNU General Public License. +# +# See the LICENSE file in the source distribution for further information. 
+ +import fnmatch + +from pipes import quote +from sos.collector.clusters import Cluster + +ENGINE_KEY = '/etc/pki/ovirt-engine/keys/engine_id_rsa' + + +class ovirt(Cluster): + + cluster_name = 'Community oVirt' + packages = ('ovirt-engine',) + db_exec = '/usr/share/ovirt-engine/dbscripts/engine-psql.sh -c' + + option_list = [ + ('no-database', False, 'Do not collect a database dump'), + ('cluster', '', 'Only collect from hosts in this cluster'), + ('datacenter', '', 'Only collect from hosts in this datacenter'), + ('no-hypervisors', False, 'Do not collect from hypervisors'), + ('spm-only', False, 'Only collect from SPM host(s)') + ] + + def _run_db_query(self, query): + ''' + Wrapper for running DB queries on the master. Any scrubbing of the + query should be done _before_ passing the query to this method. + ''' + cmd = "%s %s" % (self.db_exec, quote(query)) + return self.exec_master_cmd(cmd, need_root=True) + + def _sql_scrub(self, val): + ''' + Manually sanitize SQL queries since we can't leave this up to the + driver since we do not have an actual DB connection + ''' + if not val: + return '%' + + invalid_chars = ['\x00', '\\', '\n', '\r', '\032', '"', '\''] + if any(x in invalid_chars for x in val): + self.log_warn("WARNING: Cluster option \'%s\' contains invalid " + "characters. Using '%%' instead." % val) + return '%' + + return val + + def _check_for_engine_keys(self): + ''' + Checks for the presence of the VDSM ssh keys the manager uses for + communication with hypervisors. + + This only runs if we're locally on the RHV-M, *and* if no ssh-keys are + called out on the command line, *and* no --password option is given. + ''' + if self.master.local: + if not any([self.config['ssh_key'], self.config['password'], + self.config['password_per_node']]): + if self.master.file_exists(ENGINE_KEY): + self.config['ssh_key'] = ENGINE_KEY + self.log_debug("Found engine SSH key. 
User command line" + " does not specify a key or password, using" + " engine key.") + + def setup(self): + self.pg_pass = False + if not self.get_option('no-database'): + self.conf = self.parse_db_conf() + self.format_db_cmd() + self._check_for_engine_keys() + + def format_db_cmd(self): + cluster = self._sql_scrub(self.get_option('cluster')) + datacenter = self._sql_scrub(self.get_option('datacenter')) + self.dbquery = ("SELECT host_name from vds where cluster_id in " + "(select cluster_id FROM cluster WHERE name like '%s'" + " and storage_pool_id in (SELECT id FROM storage_pool " + "WHERE name like '%s'))" % (cluster, datacenter)) + if self.get_option('spm-only'): + # spm_status is an integer with the following meanings + # 0 - Normal (not SPM) + # 1 - Contending (SPM election in progress, but is not SPM) + # 2 - SPM + self.dbquery += ' AND spm_status = 2' + self.log_debug('Query command for ovirt DB set to: %s' % self.dbquery) + + def get_nodes(self): + if self.get_option('no-hypervisors'): + return [] + res = self._run_db_query(self.dbquery) + if res['status'] == 0: + nodes = res['stdout'].splitlines()[2:-1] + return [n.split('(')[0].strip() for n in nodes] + else: + raise Exception('database query failed, return code: %s' + % res['status']) + + def run_extra_cmd(self): + if not self.get_option('no-database') and self.conf: + return self.collect_database() + return False + + def parse_db_conf(self): + conf = {} + engconf = '/etc/ovirt-engine/engine.conf.d/10-setup-database.conf' + res = self.exec_master_cmd('cat %s' % engconf, need_root=True) + if res['status'] == 0: + config = res['stdout'].splitlines() + for line in config: + try: + k = str(line.split('=')[0]) + v = str(line.split('=')[1].replace('"', '')) + conf[k] = v + except IndexError: + pass + return conf + return False + + def collect_database(self): + sos_opt = ( + '-k {plugin}.dbname={db} ' + '-k {plugin}.dbhost={dbhost} ' + '-k {plugin}.dbport={dbport} ' + '-k {plugin}.username={dbuser} ' + ).format(plugin='postgresql', + db=self.conf['ENGINE_DB_DATABASE'], + dbhost=self.conf['ENGINE_DB_HOST'], + dbport=self.conf['ENGINE_DB_PORT'], + dbuser=self.conf['ENGINE_DB_USER'] + ) + cmd = ('PGPASSWORD={} /usr/sbin/sosreport --name=postgresql ' + '--batch -o postgresql {}' + ).format(self.conf['ENGINE_DB_PASSWORD'], sos_opt) + db_sos = self.exec_master_cmd(cmd, need_root=True) + for line in db_sos['stdout'].splitlines(): + if fnmatch.fnmatch(line, '*sosreport-*tar*'): + return line.strip() + self.log_error('Failed to gather database dump') + return False + + +class rhv(ovirt): + + cluster_name = 'Red Hat Virtualization' + packages = ('rhevm', 'rhvm') + sos_preset = 'rhv' + + def set_node_label(self, node): + if node.address == self.master.address: + return 'manager' + if node.is_installed('ovirt-node-ng-nodectl'): + return 'rhvh' + else: + return 'rhelh' + + +class rhhi_virt(rhv): + + cluster_name = 'Red Hat Hyperconverged Infrastructure - Virtualization' + sos_plugins = ('gluster',) + sos_plugin_options = {'gluster.dump': 'on'} + sos_preset = 'rhv' + + def check_enabled(self): + return (self.master.is_installed('rhvm') and self._check_for_rhhiv()) + + def _check_for_rhhiv(self): + ret = self._run_db_query('SELECT count(server_id) FROM gluster_server') + if ret['status'] == 0: + # if there are any entries in this table, RHHI-V is in use + return ret['stdout'].splitlines()[2].strip() != '0' + return False diff --git a/sos/collector/clusters/pacemaker.py b/sos/collector/clusters/pacemaker.py new file mode 100644 index 00000000..c64ec654 
--- /dev/null +++ b/sos/collector/clusters/pacemaker.py @@ -0,0 +1,57 @@ +# Copyright Red Hat 2020, Jake Hunsaker <jhunsake@redhat.com> + +# This file is part of the sos project: https://github.com/sosreport/sos +# +# This copyrighted material is made available to anyone wishing to use, +# modify, copy, or redistribute it subject to the terms and conditions of +# version 2 of the GNU General Public License. +# +# See the LICENSE file in the source distribution for further information. + +from sos.collector.clusters import Cluster + + +class pacemaker(Cluster): + + cluster_name = 'Pacemaker High Availability Cluster Manager' + sos_plugins = ['pacemaker'] + packages = ('pacemaker',) + option_list = [ + ('online', True, 'Collect nodes listed as online'), + ('offline', True, 'Collect nodes listed as offline') + ] + + def get_nodes(self): + self.res = self.exec_master_cmd('pcs status') + if self.res['status'] != 0: + self.log_error('Cluster status could not be determined. Is the ' + 'cluster running on this node?') + return [] + if 'node names do not match' in self.res['stdout']: + self.log_warn('Warning: node name mismatch reported. Attempts to ' + 'connect to some nodes may fail.\n') + return self.parse_pcs_output() + + def parse_pcs_output(self): + nodes = [] + if self.get_option('online'): + nodes += self.get_online_nodes() + if self.get_option('offline'): + nodes += self.get_offline_nodes() + return nodes + + def get_online_nodes(self): + for line in self.res['stdout'].splitlines(): + if line.startswith('Online:'): + nodes = line.split('[')[1].split(']')[0] + return [n for n in nodes.split(' ') if n] + + def get_offline_nodes(self): + offline = [] + for line in self.res['stdout'].splitlines(): + if line.startswith('Node') and line.endswith('(offline)'): + offline.append(line.split()[1].replace(':', '')) + if line.startswith('OFFLINE:'): + nodes = line.split('[')[1].split(']')[0] + offline.extend([n for n in nodes.split(' ') if n]) + return offline diff --git a/sos/collector/clusters/satellite.py b/sos/collector/clusters/satellite.py new file mode 100644 index 00000000..fb666a40 --- /dev/null +++ b/sos/collector/clusters/satellite.py @@ -0,0 +1,39 @@ +# Copyright Red Hat 2020, Jake Hunsaker <jhunsake@redhat.com> + +# This file is part of the sos project: https://github.com/sosreport/sos +# +# This copyrighted material is made available to anyone wishing to use, +# modify, copy, or redistribute it subject to the terms and conditions of +# version 2 of the GNU General Public License. +# +# See the LICENSE file in the source distribution for further information. 
+ +from pipes import quote +from sos.collector.clusters import Cluster + + +class satellite(Cluster): + """Red Hat Satellite 6""" + + cluster_name = 'Red Hat Satellite 6' + packages = ('satellite', 'satellite-installer') + + def _psql_cmd(self, query): + _cmd = "su postgres -c %s" + _dbcmd = "psql foreman -c %s" + return _cmd % quote(_dbcmd % quote(query)) + + def get_nodes(self): + cmd = self._psql_cmd('select name from smart_proxies') + res = self.exec_master_cmd(cmd, need_root=True) + if res['status'] == 0: + idx = 2 + if 'could not change' in res['stdout']: + idx = 3 + nodes = [n.strip() for n in res['stdout'].splitlines()[idx:-1]] + return nodes + + def set_node_label(self, node): + if node.address == self.master.address: + return 'satellite' + return 'capsule' diff --git a/sos/collector/configuration.py b/sos/collector/configuration.py new file mode 100644 index 00000000..8ec63139 --- /dev/null +++ b/sos/collector/configuration.py @@ -0,0 +1,238 @@ +# Copyright Red Hat 2020, Jake Hunsaker <jhunsake@redhat.com> + +# This file is part of the sos project: https://github.com/sosreport/sos +# +# This copyrighted material is made available to anyone wishing to use, +# modify, copy, or redistribute it subject to the terms and conditions of +# version 2 of the GNU General Public License. +# +# See the LICENSE file in the source distribution for further information. + +import inspect +import os +import pipes +import re +import six +import socket + + +class Configuration(dict): + """ Dict subclass that is used to handle configuration information + needed by both SosCollector and the SosNode classes """ + + def __init__(self, args=None): + self.args = args + self.set_defaults() + self.parse_config() + self.parse_options() + self.check_user_privs() + self.parse_node_strings() + self['host_types'] = self._load_supported_hosts() + self['cluster_types'] = self._load_clusters() + + def set_defaults(self): + self['sos_mod'] = {} + self['master'] = '' + self['strip_sos_path'] = '' + self['ssh_port'] = 22 + self['ssh_user'] = 'root' + self['ssh_key'] = None + self['sos_cmd'] = 'sosreport --batch' + self['no_local'] = False + self['tmp_dir'] = '/var/tmp' + self['out_dir'] = '/var/tmp/' + self['nodes'] = [] + self['debug'] = False + self['tmp_dir_created'] = False + self['cluster_type'] = None + self['cluster'] = None + self['password'] = False + self['label'] = None + self['case_id'] = None + self['timeout'] = 300 + self['all_logs'] = False + self['alloptions'] = False + self['no_pkg_check'] = False + self['hostname'] = socket.gethostname() + ips = [i[4][0] for i in socket.getaddrinfo(socket.gethostname(), None)] + self['ip_addrs'] = list(set(ips)) + self['cluster_options'] = [] + self['image'] = None + self['skip_plugins'] = [] + self['enable_plugins'] = [] + self['plugin_options'] = [] + self['only_plugins'] = [] + self['list_options'] = False + self['hostlen'] = len(self['master']) or len(self['hostname']) + self['need_sudo'] = False + self['sudo_pw'] = '' + self['become_root'] = False + self['root_password'] = '' + self['threads'] = 4 + self['compression'] = '' + self['verify'] = False + self['chroot'] = '' + self['sysroot'] = '' + self['sos_opt_line'] = '' + self['batch'] = False + self['verbose'] = False + self['preset'] = '' + self['insecure_sudo'] = False + self['log_size'] = 0 + self['host_types'] = [] + self['password_per_node'] = False + self['group'] = None + self['save_group'] = '' + + def parse_node_strings(self): + ''' + Parses the given --nodes option(s) to properly format the regex + 
list that we use. We cannot blindly split on ',' chars since it is a + valid regex character, so we need to scan along the given strings and + check at each comma if we should use the preceeding string by itself + or not, based on if there is a valid regex at that index. + ''' + if not self['nodes']: + return + nodes = [] + if not isinstance(self['nodes'], list): + self['nodes'] = [self['nodes']] + for node in self['nodes']: + idxs = [i for i, m in enumerate(node) if m == ','] + idxs.append(len(node)) + start = 0 + pos = 0 + for idx in idxs: + try: + pos = idx + reg = node[start:idx] + re.compile(re.escape(reg)) + # make sure we aren't splitting a regex value + if '[' in reg and ']' not in reg: + continue + nodes.append(reg.lstrip(',')) + start = idx + except re.error: + continue + if pos != len(node): + nodes.append(node[pos+1:]) + self['nodes'] = nodes + + def parse_config(self): + for k in self.args: + if self.args[k]: + self[k] = self.args[k] + if self['sos_opt_line']: + self['sos_opt_line'] = pipes.quote(self['sos_opt_line']) + + def parse_cluster_options(self): + opts = [] + if not isinstance(self['cluster_options'], list): + self['cluster_options'] = [self['cluster_options']] + if self['cluster_options']: + for option in self['cluster_options']: + cluster = option.split('.')[0] + name = option.split('.')[1].split('=')[0] + try: + # there are no instances currently where any cluster option + # should contain a legitimate space. + value = option.split('=')[1].split()[0] + except IndexError: + # conversion to boolean is handled during validation + value = 'True' + + opts.append( + ClusterOption(name, value, value.__class__, cluster) + ) + self['cluster_options'] = opts + + def parse_options(self): + self.parse_cluster_options() + for opt in ['skip_plugins', 'enable_plugins', 'plugin_options', + 'only_plugins']: + if self[opt]: + opts = [] + if isinstance(self[opt], six.string_types): + self[opt] = [self[opt]] + for option in self[opt]: + opts += option.split(',') + self[opt] = opts + + def check_user_privs(self): + if not self['ssh_user'] == 'root': + self['need_sudo'] = True + + def _import_modules(self, modname): + '''Import and return all found classes in a module''' + mod_short_name = modname.split('.')[2] + module = __import__(modname, globals(), locals(), [mod_short_name]) + modules = inspect.getmembers(module, inspect.isclass) + for mod in modules: + if mod[0] in ('SosHost', 'Cluster'): + modules.remove(mod) + return modules + + def _find_modules_in_path(self, path, modulename): + '''Given a path and a module name, find everything that can be imported + and then import it + + path - the filesystem path of the package + modulename - the name of the module in the package + + E.G. 
+
+    def parse_config(self):
+        for k in self.args:
+            if self.args[k]:
+                self[k] = self.args[k]
+        if self['sos_opt_line']:
+            self['sos_opt_line'] = pipes.quote(self['sos_opt_line'])
+
+    def parse_cluster_options(self):
+        opts = []
+        if not isinstance(self['cluster_options'], list):
+            self['cluster_options'] = [self['cluster_options']]
+        if self['cluster_options']:
+            for option in self['cluster_options']:
+                cluster = option.split('.')[0]
+                name = option.split('.')[1].split('=')[0]
+                try:
+                    # there are no instances currently where any cluster
+                    # option should contain a legitimate space.
+                    value = option.split('=')[1].split()[0]
+                except IndexError:
+                    # conversion to boolean is handled during validation
+                    value = 'True'
+
+                opts.append(
+                    ClusterOption(name, value, value.__class__, cluster)
+                )
+        self['cluster_options'] = opts
+
+    def parse_options(self):
+        self.parse_cluster_options()
+        for opt in ['skip_plugins', 'enable_plugins', 'plugin_options',
+                    'only_plugins']:
+            if self[opt]:
+                opts = []
+                if isinstance(self[opt], six.string_types):
+                    self[opt] = [self[opt]]
+                for option in self[opt]:
+                    opts += option.split(',')
+                self[opt] = opts
+
+    def check_user_privs(self):
+        if self['ssh_user'] != 'root':
+            self['need_sudo'] = True
+
+    def _import_modules(self, modname):
+        '''Import and return all found classes in a module'''
+        # the trailing component of the dotted path is the module short name
+        mod_short_name = modname.split('.')[-1]
+        module = __import__(modname, globals(), locals(), [mod_short_name])
+        modules = inspect.getmembers(module, inspect.isclass)
+        # filter out the imported base classes, keeping only the subclasses
+        # the module itself defines
+        return [mod for mod in modules if mod[0] not in ('SosHost', 'Cluster')]
+
+    def _find_modules_in_path(self, path, modulename):
+        '''Given a path and a module name, find everything that can be
+        imported and then import it
+
+        path - the filesystem path of the package
+        modulename - the name of the module in the package
+
+        E.g. a path of 'clusters', and a modulename of 'ovirt' equates to
+        importing sos.collector.clusters.ovirt
+        '''
+        modules = []
+        if os.path.exists(path):
+            for pyfile in sorted(os.listdir(path)):
+                if not pyfile.endswith('.py'):
+                    continue
+                if '__' in pyfile:
+                    continue
+                fname, ext = os.path.splitext(pyfile)
+                modname = 'sos.collector.%s.%s' % (modulename, fname)
+                modules.extend(self._import_modules(modname))
+        return modules
+
+    def _load_modules(self, package, submod):
+        '''Helper to import cluster and host types'''
+        modules = []
+        for path in package.__path__:
+            if os.path.isdir(path):
+                modules.extend(self._find_modules_in_path(path, submod))
+        return modules
+
+    def _load_clusters(self):
+        '''Load an instance of each cluster so that sos-collector can later
+        determine what type of cluster is in use
+        '''
+        import sos.collector.clusters
+        package = sos.collector.clusters
+        supported_clusters = {}
+        clusters = self._load_modules(package, 'clusters')
+        for cluster in clusters:
+            supported_clusters[cluster[0]] = cluster[1](self)
+        return supported_clusters
+
+    def _load_supported_hosts(self):
+        '''Load all the supported/defined host types for sos-collector.
+        These will then be used to match against each node we run on
+        '''
+        import sos.collector.hosts
+        package = sos.collector.hosts
+        supported_hosts = {}
+        hosts = self._load_modules(package, 'hosts')
+        for host in hosts:
+            supported_hosts[host[0]] = host[1]
+        return supported_hosts
+
+
+class ClusterOption():
+    '''Used to store/manipulate options for cluster profiles.'''
+
+    def __init__(self, name, value, opt_type, cluster, description=None):
+        self.name = name
+        self.value = value
+        self.opt_type = opt_type
+        self.cluster = cluster
+        self.description = description
diff --git a/sos/collector/exceptions.py b/sos/collector/exceptions.py
new file mode 100644
index 00000000..1d1e76df
--- /dev/null
+++ b/sos/collector/exceptions.py
@@ -0,0 +1,108 @@
+# Copyright Red Hat 2020, Jake Hunsaker <jhunsake@redhat.com>
+
+# This file is part of the sos project: https://github.com/sosreport/sos
+#
+# This copyrighted material is made available to anyone wishing to use,
+# modify, copy, or redistribute it subject to the terms and conditions of
+# version 2 of the GNU General Public License.
+#
+# See the LICENSE file in the source distribution for further information.
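+
+# A minimal sketch of how these exceptions surface (node name hypothetical):
+# SosNode raises them while opening or using the SSH ControlPersist session,
+# and the collector frontend then decides whether to skip the node or abort.
+#
+#     try:
+#         node = SosNode('node1', config)
+#     except InvalidPasswordException:
+#         pass  # the supplied --password was rejected by the host
+#     except ConnectionTimeoutException:
+#         pass  # no response within the connection wait window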
+
+
+class InvalidPasswordException(Exception):
+    '''Raised when the provided password is rejected by the remote host'''
+
+    def __init__(self):
+        message = 'Invalid password provided'
+        super(InvalidPasswordException, self).__init__(message)
+
+
+class TimeoutPasswordAuthException(Exception):
+    '''Raised when a timeout is hit waiting for an auth reply using a password
+    '''
+
+    def __init__(self):
+        message = 'Timeout hit while waiting for password validation'
+        super(TimeoutPasswordAuthException, self).__init__(message)
+
+
+class PasswordRequestException(Exception):
+    '''Raised when the remote host requests a password that was not
+    anticipated
+    '''
+
+    def __init__(self):
+        message = 'Host requested password, but none provided'
+        super(PasswordRequestException, self).__init__(message)
+
+
+class AuthPermissionDeniedException(Exception):
+    '''Raised when authentication attempts return a permission error'''
+
+    def __init__(self):
+        message = 'Permission denied while trying to authenticate'
+        super(AuthPermissionDeniedException, self).__init__(message)
+
+
+class ConnectionException(Exception):
+    '''Raised when an attempt to connect fails'''
+
+    def __init__(self, address='', port=''):
+        message = ("Could not connect to host %s on specified port %s"
+                   % (address, port))
+        super(ConnectionException, self).__init__(message)
+
+
+class CommandTimeoutException(Exception):
+    '''Raised when a timeout expires'''
+
+    def __init__(self, command=None):
+        message = 'Timeout expired'
+        if command:
+            message += " executing %s" % command
+        super(CommandTimeoutException, self).__init__(message)
+
+
+class ConnectionTimeoutException(Exception):
+    '''Raised when a timeout expires while trying to connect to the host'''
+
+    def __init__(self):
+        message = 'Timeout expired while trying to connect'
+        super(ConnectionTimeoutException, self).__init__(message)
+
+
+class ControlSocketMissingException(Exception):
+    '''Raised when the SSH control socket is missing'''
+
+    def __init__(self, path=''):
+        message = "SSH control socket %s does not exist" % path
+        super(ControlSocketMissingException, self).__init__(message)
+
+
+class ControlPersistUnsupportedException(Exception):
+    '''Raised when SSH ControlPersist is unsupported locally'''
+
+    def __init__(self):
+        message = 'ControlPersist unsupported by local SSH installation'
+        super(ControlPersistUnsupportedException, self).__init__(message)
+
+
+class UnsupportedHostException(Exception):
+    '''Raised when the host type is unsupported or undetermined'''
+
+    def __init__(self):
+        message = 'Host did not match any supported distributions'
+        super(UnsupportedHostException, self).__init__(message)
+
+
+__all__ = [
+    'AuthPermissionDeniedException',
+    'CommandTimeoutException',
+    'ConnectionException',
+    'ConnectionTimeoutException',
+    'ControlPersistUnsupportedException',
+    'ControlSocketMissingException',
+    'InvalidPasswordException',
+    'PasswordRequestException',
+    'TimeoutPasswordAuthException',
+    'UnsupportedHostException'
+]
diff --git a/sos/collector/hosts/__init__.py b/sos/collector/hosts/__init__.py
new file mode 100644
index 00000000..c66ee44d
--- /dev/null
+++ b/sos/collector/hosts/__init__.py
@@ -0,0 +1,125 @@
+# Copyright Red Hat 2020, Jake Hunsaker <jhunsake@redhat.com>
+
+# This file is part of the sos project: https://github.com/sosreport/sos
+#
+# This copyrighted material is made available to anyone wishing to use,
+# modify, copy, or redistribute it subject to the terms and conditions of
+# version 2 of the GNU General Public License.
+# +# See the LICENSE file in the source distribution for further information. + + +class SosHost(): + '''Base class for defining host types - usually defined by distribution + + This should be subclassed for any distro/release that sos-collector can be + expected to run on. At minimum it needs to define a package manager and a + way to identify the node as a particular distribution - usually through + inspection of /etc/os-release or related file. + + The check_enabled() method should handle looking for the necessary string + inside the release_file, or any other way to uniquely identify the host + installation. + + The release_file should be set to an identifying file like /etc/os-release + that can be inspected. + + ''' + distribution = '' + release_file = '/etc/os-release' + package_manager = { + 'name': '', + 'query': '' + } + release = '' + containerized = False + container_runtime = None + container_image = None + sos_path_strip = None + sos_pkg_name = None # package name in deb/rpm/etc + sos_bin_path = None # path to sosreport binary + sos_container_name = 'sos-collector-tmp' + + def __init__(self, address): + self.address = address + + def _check_enabled(self, rel_string): + self.release = rel_string.strip() + return self.check_enabled(rel_string) + + def check_enabled(self, rel_string): + '''Should handle identifying the given host as being of the defined + distribution. + + MUST return either True or False. + ''' + return False + + def report_facts(self): + '''Assemble relevant information and return as a dict''' + facts = { + 'distribution': self.distribution, + 'release': self.release, + 'containerized': self.containerized, + 'container_runtime': self.container_runtime, + 'sos_prefix': self.set_sos_prefix() % { + 'image': self.container_image}, + 'cleanup_command': self.set_cleanup_cmd() + } + return facts + + def pkg_query(self, pkg): + '''Returns the command string to query a given package. + + Note that this DOES NOT run the query itself. That is left to the + SosNode instance that maintains the SSH connection. + ''' + return self.package_manager['query'] + ' %s' % pkg + + def set_sos_prefix(self): + '''If sosreport commands need to always be prefixed with something, + for example running in a specific container image, then it should be + defined here. + + If no prefix should be set, return an empty string instead of None. + ''' + return '' + + def set_cleanup_cmd(self): + '''If a host requires additional cleanup, the command should be set and + returned here + ''' + return '' + + def create_sos_container(self): + '''Returns the command that will create the container that will be + used for running commands inside a container on hosts that require it. + + This will use the container runtime defined for the host type to + launch a container. From there, we use the defined runtime to exec into + the container's namespace. + ''' + return '' + + def restart_sos_container(self): + '''Restarts the container created for sos-collector if it has stopped. + + This is called immediately after create_sos_container() as the command + to create the container will exit and the container will stop. For + current container runtimes, subsequently starting the container will + default to opening a bash shell in the container to keep it running, + thus allowing us to exec into it again. 
+ ''' + return "%s start %s" % (self.container_runtime, + self.sos_container_name) + + def format_container_command(self, cmd): + '''Returns the command that allows us to exec into the created + container for sos-collector. + ''' + if self.container_runtime: + return '%s exec %s %s' % (self.container_runtime, + self.sos_container_name, + cmd) + else: + return cmd diff --git a/sos/collector/hosts/debian.py b/sos/collector/hosts/debian.py new file mode 100644 index 00000000..bfe6e24e --- /dev/null +++ b/sos/collector/hosts/debian.py @@ -0,0 +1,31 @@ +# Copyright Red Hat 2020, Jake Hunsaker <jhunsake@redhat.com> + +# This file is part of the sos project: https://github.com/sosreport/sos +# +# This copyrighted material is made available to anyone wishing to use, +# modify, copy, or redistribute it subject to the terms and conditions of +# version 2 of the GNU General Public License. +# +# See the LICENSE file in the source distribution for further information. + +from sos.collector.hosts import SosHost + + +class DebianHost(SosHost): + '''Base class for defining Debian based systems''' + + distribution = 'Debian' + releases = ['ubuntu', 'debian'] + package_manager = { + 'name': 'dpkg', + 'query': "dpkg-query -W -f='${Package}-${Version}\\\n' " + } + sos_pkg_name = 'sosreport' + sos_bin_path = '/usr/bin/sosreport' + + def check_enabled(self, rel_string): + for release in self.releases: + if release in rel_string: + return True + return False +# vim:ts=4 et sw=4 diff --git a/sos/collector/hosts/redhat.py b/sos/collector/hosts/redhat.py new file mode 100644 index 00000000..967b6f5d --- /dev/null +++ b/sos/collector/hosts/redhat.py @@ -0,0 +1,83 @@ +# Copyright Red Hat 2020, Jake Hunsaker <jhunsake@redhat.com> + +# This file is part of the sos project: https://github.com/sosreport/sos +# +# This copyrighted material is made available to anyone wishing to use, +# modify, copy, or redistribute it subject to the terms and conditions of +# version 2 of the GNU General Public License. +# +# See the LICENSE file in the source distribution for further information. 
+ +from sos.collector.hosts import SosHost + + +class RedHatHost(SosHost): + '''Base class for defining Red Hat family systems''' + + distribution = 'Red Hat' + release_file = '/etc/redhat-release' + releases = ['fedora', 'red hat', 'centos'] + package_manager = { + 'name': 'rpm', + 'query': 'rpm -q' + } + sos_pkg_name = 'sos' + sos_bin_path = '/usr/sbin/sosreport' + + def check_enabled(self, rel_string): + for release in self.releases: + if release in rel_string.lower() and 'CoreOS' not in rel_string: + return True + return False + + +class RedHatAtomicHost(RedHatHost): + + containerized = True + container_runtime = 'docker' + container_image = 'registry.access.redhat.com/rhel7/support-tools' + sos_path_strip = '/host' + + def check_enabled(self, rel_string): + return 'Atomic Host' in rel_string + + def create_sos_container(self): + _cmd = ("{runtime} run -di --name {name} --privileged --ipc=host" + " --net=host --pid=host -e HOST=/host -e NAME={name} -e " + "IMAGE={image} -v /run:/run -v /var/log:/var/log -v " + "/etc/machine-id:/etc/machine-id -v " + "/etc/localtime:/etc/localtime -v /:/host {image}") + return _cmd.format( + runtime=self.container_runtime, + name=self.sos_container_name, + image=self.container_image + ) + + def set_cleanup_cmd(self): + return 'docker rm --force sos-collector-tmp' + + +class RedHatCoreOSHost(RedHatHost): + + containerized = True + container_runtime = 'podman' + container_image = 'registry.redhat.io/rhel8/support-tools' + sos_path_strip = '/host' + + def check_enabled(self, rel_string): + return 'CoreOS' in rel_string + + def create_sos_container(self): + _cmd = ("{runtime} run -di --name {name} --privileged --ipc=host" + " --net=host --pid=host -e HOST=/host -e NAME={name} -e " + "IMAGE={image} -v /run:/run -v /var/log:/var/log -v " + "/etc/machine-id:/etc/machine-id -v " + "/etc/localtime:/etc/localtime -v /:/host {image}") + return _cmd.format( + runtime=self.container_runtime, + name=self.sos_container_name, + image=self.container_image + ) + + def set_cleanup_cmd(self): + return 'podman rm --force %s' % self.sos_container_name diff --git a/sos/collector/sosnode.py b/sos/collector/sosnode.py new file mode 100644 index 00000000..50e4b3e2 --- /dev/null +++ b/sos/collector/sosnode.py @@ -0,0 +1,819 @@ +# Copyright Red Hat 2020, Jake Hunsaker <jhunsake@redhat.com> + +# This file is part of the sos project: https://github.com/sosreport/sos +# +# This copyrighted material is made available to anyone wishing to use, +# modify, copy, or redistribute it subject to the terms and conditions of +# version 2 of the GNU General Public License. +# +# See the LICENSE file in the source distribution for further information. 
+ +import fnmatch +import inspect +import logging +import os +import pexpect +import re +import shutil + +from distutils.version import LooseVersion +from pipes import quote +from sos.collector.exceptions import * + + +class SosNode(): + + def __init__(self, address, config, password=None, force=False, + load_facts=True): + self.address = address.strip() + self.local = False + self.hostname = None + self.config = config + self._password = password or self.config['password'] + self.sos_path = None + self.retrieved = False + self.hash_retrieved = False + self.file_list = [] + self.sos_info = { + 'version': None, + 'enabled': [], + 'disabled': [], + 'options': [], + 'presets': [] + } + filt = ['localhost', '127.0.0.1', self.config['hostname']] + self.logger = logging.getLogger('sos_collector') + self.console = logging.getLogger('sos_collector_console') + self.control_path = ("%s/.sos-collector-%s" + % (self.config['tmp_dir'], self.address)) + self.ssh_cmd = self._create_ssh_command() + if self.address not in filt or force: + try: + self.connected = self._create_ssh_session() + except Exception as err: + self.log_error('Unable to open SSH session: %s' % err) + raise + else: + self.connected = True + self.local = True + if self.connected and load_facts: + self.host = self.determine_host() + if not self.host: + self.connected = False + self.close_ssh_session() + return None + if self.local: + if self.check_in_container(): + self.host.containerized = False + self.log_debug("Host facts found to be %s" % + self.host.report_facts()) + self.get_hostname() + if self.host.containerized: + self.create_sos_container() + self._load_sos_info() + + def _create_ssh_command(self): + '''Build the complete ssh command for this node''' + cmd = "ssh -oControlPath=%s " % self.control_path + cmd += "%s@%s " % (self.config['ssh_user'], self.address) + return cmd + + def _fmt_msg(self, msg): + return '{:<{}} : {}'.format(self._hostname, self.config['hostlen'] + 1, + msg) + + def check_in_container(self): + ''' + Tries to identify if we are currently running in a container or not. + ''' + if os.path.exists('/run/.containerenv'): + self.log_debug('Found /run/.containerenv. Running in container.') + return True + if os.environ.get('container') is not None: + self.log_debug("Found env var 'container'. 
Running in container") + return True + return False + + def create_sos_container(self): + '''If the host is containerized, create the container we'll be using + ''' + if self.host.containerized: + res = self.run_command(self.host.create_sos_container()) + if res['status'] in [0, 125]: # 125 means container exists + ret = self.run_command(self.host.restart_sos_container()) + if ret['status'] == 0: + self.log_debug("Temporary container %s created" + % self.host.sos_container_name) + return True + else: + self.log_error("Could not start container after create: %s" + % ret['stdout']) + raise Exception + else: + self.log_error("Could not create container on host: %s" + % res['stdout']) + raise Exception + + def file_exists(self, fname): + '''Checks for the presence of fname on the remote node''' + if not self.local: + try: + res = self.run_command("stat %s" % fname) + return res['status'] == 0 + except Exception as err: + return False + else: + try: + os.stat(fname) + return True + except Exception: + return False + + @property + def _hostname(self): + if self.hostname and 'localhost' not in self.hostname: + return self.hostname + return self.address + + @property + def control_socket_exists(self): + '''Check if the SSH control socket exists + + The control socket is automatically removed by the SSH daemon in the + event that the last connection to the node was greater than the timeout + set by the ControlPersist option. This can happen for us if we are + collecting from a large number of nodes, and the timeout expires before + we start collection. + ''' + return os.path.exists(self.control_path) + + def _sanitize_log_msg(self, msg): + '''Attempts to obfuscate sensitive information in log messages such as + passwords''' + reg = r'(?P<var>(pass|key|secret|PASS|KEY|SECRET).*?=)(?P<value>.*?\s)' + return re.sub(reg, r'\g<var>****** ', msg) + + def log_info(self, msg): + '''Used to print and log info messages''' + caller = inspect.stack()[1][3] + lmsg = '[%s:%s] %s' % (self._hostname, caller, msg) + self.logger.info(lmsg) + self.console.info(self._fmt_msg(msg)) + + def log_error(self, msg): + '''Used to print and log error messages''' + caller = inspect.stack()[1][3] + lmsg = '[%s:%s] %s' % (self._hostname, caller, msg) + self.logger.error(lmsg) + self.console.error(self._fmt_msg(msg)) + + def log_debug(self, msg): + '''Used to print and log debug messages''' + msg = self._sanitize_log_msg(msg) + caller = inspect.stack()[1][3] + msg = '[%s:%s] %s' % (self._hostname, caller, msg) + self.logger.debug(msg) + if self.config['verbose']: + self.console.debug(msg) + + def get_hostname(self): + '''Get the node's hostname''' + sout = self.run_command('hostname') + self.hostname = sout['stdout'].strip() + self.log_debug( + 'Hostname set to %s' % self.hostname) + + def _format_cmd(self, cmd): + '''If we need to provide a sudo or root password to a command, then + here we prefix the command with the correct bits + ''' + if self.config['become_root']: + return "su -c %s" % quote(cmd) + if self.config['need_sudo']: + return "sudo -S %s" % cmd + return cmd + + def _fmt_output(self, output=None, rc=0): + '''Formats the returned output from a command into a dict''' + if rc == 0: + stdout = output + stderr = '' + else: + stdout = '' + stderr = output + res = {'status': rc, + 'stdout': stdout, + 'stderr': stderr} + return res + + def _load_sos_info(self): + '''Queries the node for information about the installed version of sos + ''' + cmd = self.host.pkg_query(self.host.sos_pkg_name) + res = self.run_command(cmd, 
use_container=True) + if res['status'] == 0: + ver = res['stdout'].splitlines()[-1].split('-')[1] + self.sos_info['version'] = ver + self.log_debug('sos version is %s' % self.sos_info['version']) + else: + self.log_error('sos is not installed on this node') + self.connected = False + return False + cmd = 'sosreport -l' + sosinfo = self.run_command(cmd, use_container=True) + if sosinfo['status'] == 0: + self._load_sos_plugins(sosinfo['stdout']) + if self.check_sos_version('3.6'): + self._load_sos_presets() + + def _load_sos_presets(self): + cmd = 'sosreport --list-presets' + res = self.run_command(cmd, use_container=True) + if res['status'] == 0: + for line in res['stdout'].splitlines(): + if line.strip().startswith('name:'): + pname = line.split('name:')[1].strip() + self.sos_info['presets'].append(pname) + + def _load_sos_plugins(self, sosinfo): + ENABLED = 'The following plugins are currently enabled:' + DISABLED = 'The following plugins are currently disabled:' + OPTIONS = 'The following plugin options are available:' + PROFILES = 'Profiles:' + + enablereg = ENABLED + '(.*?)' + DISABLED + disreg = DISABLED + '(.*?)' + OPTIONS + optreg = OPTIONS + '(.*?)' + PROFILES + proreg = PROFILES + '(.*?)' + '\n\n' + + self.sos_info['enabled'] = self._regex_sos_help(enablereg, sosinfo) + self.sos_info['disabled'] = self._regex_sos_help(disreg, sosinfo) + self.sos_info['options'] = self._regex_sos_help(optreg, sosinfo) + self.sos_info['profiles'] = self._regex_sos_help(proreg, sosinfo, True) + + def _regex_sos_help(self, regex, sosinfo, is_list=False): + res = [] + for result in re.findall(regex, sosinfo, re.S): + for line in result.splitlines(): + if not is_list: + try: + res.append(line.split()[0]) + except Exception: + pass + else: + r = line.split(',') + res.extend(p.strip() for p in r if p.strip()) + return res + + def read_file(self, to_read): + '''Reads the specified file and returns the contents''' + try: + self.log_debug("Reading file %s" % to_read) + if not self.local: + res = self.run_command("cat %s" % to_read, timeout=5) + if res['status'] == 0: + return res['stdout'] + else: + if 'No such file' in res['stdout']: + self.log_debug("File %s does not exist on node" + % to_read) + else: + self.log_error("Error reading %s: %s" % + (to_read, res['stdout'].split(':')[1:])) + return '' + else: + with open(to_read, 'r') as rfile: + return rfile.read() + except Exception as err: + self.log_error("Exception while reading %s: %s" % (to_read, err)) + return '' + + def determine_host(self): + '''Attempts to identify the host installation against supported + distributions + ''' + for host_type in self.config['host_types']: + host = self.config['host_types'][host_type](self.address) + rel_string = self.read_file(host.release_file) + if host._check_enabled(rel_string): + self.log_debug("Host installation found to be %s" % + host.distribution) + return host + self.log_error('Unable to determine host installation. Ignoring node') + raise UnsupportedHostException + + def check_sos_version(self, ver): + '''Checks to see if the sos installation on the node is AT LEAST the + given ver. 
This means that if the installed version is greater than + ver, this will still return True + ''' + return LooseVersion(self.sos_info['version']) >= ver + + def is_installed(self, pkg): + '''Checks if a given package is installed on the node''' + cmd = self.host.pkg_query(pkg) + res = self.run_command(cmd) + if res['status'] == 0: + return True + return False + + def run_command(self, cmd, timeout=180, get_pty=False, need_root=False, + force_local=False, use_container=False): + '''Runs a given cmd, either via the SSH session or locally + + Arguments: + cmd - the full command to be run + timeout - time in seconds to wait for the command to complete + get_pty - If a shell is absolutely needed to run a command, set + this to True + need_root - if a command requires root privileges, setting this to + True tells sos-collector to format the command with + sudo or su - as appropriate and to input the password + force_local - force a command to run locally. Mainly used for scp. + use_container - Run this command in a container *IF* the host is + containerized + ''' + if not self.control_socket_exists and not self.local: + self.log_debug('Control socket does not exist, attempting to ' + 're-create') + try: + _sock = self._create_ssh_session() + if not _sock: + self.log_debug('Failed to re-create control socket') + raise ControlSocketMissingException + except Exception as err: + self.log_error('Cannot run command: control socket does not ' + 'exist') + self.log_debug("Error while trying to create new SSH control " + "socket: %s" % err) + raise + if cmd.startswith('sosreport'): + cmd = cmd.replace('sosreport', self.host.sos_bin_path) + need_root = True + if need_root: + get_pty = True + cmd = self._format_cmd(cmd) + if use_container and self.host.containerized: + cmd = self.host.format_container_command(cmd) + self.log_debug('Running command %s' % cmd) + if 'atomic' in cmd: + get_pty = True + if not self.local and not force_local: + cmd = "%s %s" % (self.ssh_cmd, quote(cmd)) + else: + if get_pty: + cmd = "/bin/bash -c %s" % quote(cmd) + res = pexpect.spawn(cmd, encoding='utf-8') + if need_root: + if self.config['need_sudo']: + res.sendline(self.config['sudo_pw']) + if self.config['become_root']: + res.sendline(self.config['root_password']) + output = res.expect([pexpect.EOF, pexpect.TIMEOUT], + timeout=timeout) + if output == 0: + out = res.before + res.close() + rc = res.exitstatus + return {'status': rc, 'stdout': out} + elif output == 1: + raise CommandTimeoutException(cmd) + + def sosreport(self): + '''Run a sosreport on the node, then collect it''' + self.finalize_sos_cmd() + self.log_debug('Final sos command set to %s' % self.sos_cmd) + try: + path = self.execute_sos_command() + if path: + self.finalize_sos_path(path) + else: + self.log_error('Unable to determine path of sos archive') + if self.sos_path: + self.retrieved = self.retrieve_sosreport() + except Exception: + pass + self.cleanup() + + def _create_ssh_session(self): + ''' + Using ControlPersist, create the initial connection to the node. + + This will generate an OpenSSH ControlPersist socket within the tmp + directory created or specified for sos-collector to use. + + At most, we will wait 30 seconds for a connection. This involves a 15 + second wait for the initial connection attempt, and a subsequent 15 + second wait for a response when we supply a password. + + Since we connect to nodes in parallel (using the --threads value), this + means that the time between 'Connecting to nodes...' 
and 'Beginning + collection of sosreports' that users see can be up to an amount of time + equal to 30*(num_nodes/threads) seconds. + + Returns + True if session is successfully opened, else raise Exception + ''' + # Don't use self.ssh_cmd here as we need to add a few additional + # parameters to establish the initial connection + self.log_debug('Opening SSH session to create control socket') + connected = False + ssh_key = '' + ssh_port = '' + if self.config['ssh_port'] != 22: + ssh_port = "-p%s " % self.config['ssh_port'] + if self.config['ssh_key']: + ssh_key = "-i%s" % self.config['ssh_key'] + cmd = ("ssh %s %s -oControlPersist=600 -oControlMaster=auto " + "-oStrictHostKeyChecking=no -oControlPath=%s %s@%s " + "\"echo Connected\"" % (ssh_key, + ssh_port, + self.control_path, + self.config['ssh_user'], + self.address)) + res = pexpect.spawn(cmd, encoding='utf-8') + + connect_expects = [ + u'Connected', + u'password:', + u'.*Permission denied.*', + u'.* port .*: No route to host', + u'.*Could not resolve hostname.*', + pexpect.TIMEOUT + ] + + index = res.expect(connect_expects, timeout=15) + if index == 0: + connected = True + elif index == 1: + if self._password: + pass_expects = [ + u'Connected', + u'Permission denied, please try again.', + pexpect.TIMEOUT + ] + res.sendline(self._password) + pass_index = res.expect(pass_expects, timeout=15) + if pass_index == 0: + connected = True + elif pass_index == 1: + # Note that we do not get an exitstatus here, so matching + # this line means an invalid password will be reported for + # both invalid passwords and invalid user names + raise InvalidPasswordException + elif pass_index == 2: + raise TimeoutPasswordAuthException + else: + raise PasswordRequestException + elif index == 2: + raise AuthPermissionDeniedException + elif index == 3: + raise ConnectionException(self.address, self.config['ssh_port']) + elif index == 4: + raise ConnectionException(self.address) + elif index == 5: + raise ConnectionTimeoutException + else: + raise Exception("Unknown error, client returned %s" % res.before) + if connected: + self.log_debug("Successfully created control socket at %s" + % self.control_path) + return True + return False + + def close_ssh_session(self): + '''Remove the control socket to effectively terminate the session''' + if self.local: + return True + try: + res = self.run_command("rm -f %s" % self.control_path, + force_local=True) + if res['status'] == 0: + return True + self.log_error("Could not remove ControlPath %s: %s" + % (self.control_path, res['stdout'])) + return False + except Exception as e: + self.log_error('Error closing SSH session: %s' % e) + return False + + def _preset_exists(self, preset): + '''Verifies if the given preset exists on the node''' + return preset in self.sos_info['presets'] + + def _plugin_exists(self, plugin): + '''Verifies if the given plugin exists on the node''' + return any(plugin in s for s in [self.sos_info['enabled'], + self.sos_info['disabled']]) + + def _check_enabled(self, plugin): + '''Checks to see if the plugin is default enabled on node''' + return plugin in self.sos_info['enabled'] + + def _check_disabled(self, plugin): + '''Checks to see if the plugin is default disabled on node''' + return plugin in self.sos_info['disabled'] + + def _plugin_option_exists(self, opt): + '''Attempts to verify that the given option is available on the node. 
+
+        Note that we only get available options for enabled plugins, so if a
+        plugin has been force-enabled we cannot validate if the plugin option
+        is correct or not'''
+        plug = opt.split('.')[0]
+        if not self._plugin_exists(plug):
+            return False
+        if (self._check_disabled(plug) and
+                plug not in self.config['enable_plugins']):
+            return False
+        if self._check_enabled(plug):
+            return opt in self.sos_info['options']
+        # plugin exists, but is normally disabled. Assume user knows option is
+        # valid when enabling the plugin
+        return True
+
+    def _fmt_sos_opt_list(self, opts):
+        '''Returns a comma delimited list for sos plugins that are confirmed
+        to exist on the node'''
+        return ','.join(o for o in opts if self._plugin_exists(o))
+
+    def finalize_sos_cmd(self):
+        '''Use host facts and compare to the cluster type to modify the sos
+        command if needed'''
+        self.sos_cmd = self.config['sos_cmd']
+        label = self.determine_sos_label()
+        if label:
+            self.sos_cmd = '%s %s' % (self.sos_cmd, quote(label))
+
+        if self.config['sos_opt_line']:
+            return True
+
+        if self.config['only_plugins']:
+            plugs = [o for o in self.config['only_plugins']
+                     if self._plugin_exists(o)]
+            if len(plugs) != len(self.config['only_plugins']):
+                not_only = list(set(self.config['only_plugins']) - set(plugs))
+                self.log_debug('Requested plugins %s do not exist on the '
+                               'node' % not_only)
+            only = self._fmt_sos_opt_list(self.config['only_plugins'])
+            if only:
+                self.sos_cmd += ' --only-plugins=%s' % quote(only)
+            return True
+
+        if self.config['skip_plugins']:
+            # only run skip-plugins for plugins that are enabled
+            skip = [o for o in self.config['skip_plugins']
+                    if self._check_enabled(o)]
+            if len(skip) != len(self.config['skip_plugins']):
+                not_skip = list(set(self.config['skip_plugins']) - set(skip))
+                self.log_debug('Requested to skip plugins %s, but those '
+                               'plugins are not enabled' % not_skip)
+            skipln = self._fmt_sos_opt_list(skip)
+            if skipln:
+                self.sos_cmd += ' --skip-plugins=%s' % quote(skipln)
+
+        if self.config['enable_plugins']:
+            # only run enable for plugins that are disabled
+            opts = [o for o in self.config['enable_plugins']
+                    if o not in self.config['skip_plugins']
+                    and self._check_disabled(o) and self._plugin_exists(o)]
+            if len(opts) != len(self.config['enable_plugins']):
+                not_on = list(set(self.config['enable_plugins']) - set(opts))
+                self.log_debug('Requested to enable plugins %s, but they are '
+                               'already enabled or do not exist' % not_on)
+            enable = self._fmt_sos_opt_list(opts)
+            if enable:
+                self.sos_cmd += ' --enable-plugins=%s' % quote(enable)
+
+        if self.config['plugin_options']:
+            opts = [o for o in self.config['plugin_options']
+                    if self._plugin_exists(o.split('.')[0])
+                    and self._plugin_option_exists(o.split('=')[0])]
+            if opts:
+                self.sos_cmd += ' -k %s' % quote(','.join(opts))
+
+        if self.config['preset']:
+            if self._preset_exists(self.config['preset']):
+                self.sos_cmd += ' --preset=%s' % quote(self.config['preset'])
+            else:
+                self.log_debug('Requested to enable preset %s but preset '
+                               'does not exist on node' % self.config['preset'])
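+
+    # As an illustration (plugin names hypothetical), with the default base
+    # command of 'sosreport --batch', a config of skip_plugins=['kvm'] and
+    # enable_plugins=['logs'] finalizes to something like:
+    #
+    #     sosreport --batch --skip-plugins=kvm --enable-plugins=logs
+    #
+    # where each option is appended only after the node's own plugin list
+    # confirms it applies.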
+
+    def determine_sos_label(self):
+        '''Determine what, if any, label should be added to the sosreport'''
+        label = ''
+        label += self.config['cluster'].get_node_label(self)
+
+        if self.config['label']:
+            label += ('%s' % self.config['label'] if not label
+                      else '-%s' % self.config['label'])
+
+        if not label:
+            return None
+
+        self.log_debug('Label for sosreport set to %s' % label)
+        if self.check_sos_version('3.6'):
+            lcmd = '--label'
+        else:
+            lcmd = '--name'
+            label = '%s-%s' % (self.address.split('.')[0], label)
+        return '%s=%s' % (lcmd, label)
+
+    def finalize_sos_path(self, path):
+        '''Use host facts to determine if we need to change the sos path
+        we are retrieving from'''
+        pstrip = self.host.sos_path_strip
+        if pstrip:
+            path = path.replace(pstrip, '')
+        path = path.split()[0]
+        self.log_debug('Final sos path: %s' % path)
+        self.sos_path = path
+        self.archive = path.split('/')[-1]
+
+    def determine_sos_error(self, rc, stdout):
+        if rc == -1:
+            return 'sosreport process received SIGKILL on node'
+        if rc == 1:
+            if 'sudo' in stdout:
+                return 'sudo attempt failed'
+        if rc == 127:
+            return 'sosreport terminated unexpectedly. Check disk space'
+        if len(stdout) > 0:
+            return stdout.split('\n')[0]
+        else:
+            return 'sos exited with code %s' % rc
+
+    def execute_sos_command(self):
+        '''Run sosreport and capture the resulting file path'''
+        self.log_info("Generating sosreport...")
+        try:
+            path = False
+            res = self.run_command(self.sos_cmd,
+                                   timeout=self.config['timeout'],
+                                   get_pty=True, need_root=True,
+                                   use_container=True)
+            if res['status'] == 0:
+                for line in res['stdout'].splitlines():
+                    if fnmatch.fnmatch(line, '*sosreport-*tar*'):
+                        path = line.strip()
+            else:
+                err = self.determine_sos_error(res['status'], res['stdout'])
+                self.log_debug("Error running sosreport. rc = %s msg = %s"
+                               % (res['status'], res['stdout'] or
+                                  res['stderr']))
+                raise Exception(err)
+            return path
+        except CommandTimeoutException:
+            self.log_error('Timeout exceeded')
+            raise
+        except Exception as e:
+            self.log_error('Error running sosreport: %s' % e)
+            raise
+
+    def retrieve_file(self, path):
+        '''Copies the specified file from the host to our temp dir'''
+        destdir = self.config['tmp_dir'] + '/'
+        dest = destdir + path.split('/')[-1]
+        try:
+            if not self.local:
+                if self.file_exists(path):
+                    self.log_debug("Copying remote %s to local %s" %
+                                   (path, destdir))
+                    cmd = "/usr/bin/scp -oControlPath=%s %s@%s:%s %s" % (
+                        self.control_path,
+                        self.config['ssh_user'],
+                        self.address,
+                        path,
+                        destdir
+                    )
+                    res = self.run_command(cmd, force_local=True)
+                    return res['status'] == 0
+                else:
+                    self.log_debug("Attempting to copy remote file %s, but "
+                                   "it does not exist on filesystem" % path)
+                    return False
+            else:
+                self.log_debug("Moving %s to %s" % (path, destdir))
+                shutil.copy(path, dest)
+                return True
+        except Exception as err:
+            self.log_debug("Failed to retrieve %s: %s" % (path, err))
+            return False
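+
+    # For a hypothetical node 'node1' with the default root user and tmp dir,
+    # retrieve_file() builds a command along the lines of:
+    #
+    #     /usr/bin/scp -oControlPath=/var/tmp/.sos-collector-node1 \
+    #         root@node1:/var/tmp/sosreport-node1.tar.xz /var/tmp/
+    #
+    # reusing the existing ControlPersist socket so that no new
+    # authentication round-trip is needed.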
+
+    def remove_file(self, path):
+        '''Removes the specified file from the host. This should only be used
+        after we have retrieved the file already
+        '''
+        path = ''.join(path.split())
+        try:
+            if len(path) <= 2:  # ensure we have a non '/' path
+                self.log_debug("Refusing to remove path %s: appears to be "
+                               "incorrect and possibly dangerous" % path)
+                return False
+            if self.file_exists(path):
+                self.log_debug("Removing file %s" % path)
+                cmd = "rm -f %s" % path
+                res = self.run_command(cmd, need_root=True)
+                return res['status'] == 0
+            else:
+                self.log_debug("Attempting to remove remote file %s, but it "
+                               "does not exist on filesystem" % path)
+                return False
+        except Exception as e:
+            self.log_debug('Failed to remove %s: %s' % (path, e))
+            return False
+
+    def retrieve_sosreport(self):
+        '''Collect the sosreport archive from the node'''
+        if self.sos_path:
+            if self.config['need_sudo'] or self.config['become_root']:
+                try:
+                    self.make_archive_readable(self.sos_path)
+                except Exception:
+                    self.log_error('Failed to make archive readable')
+                    return False
+                try:
+                    self.make_archive_readable(self.sos_path + '.md5')
+                except Exception:
+                    self.log_debug('Failed to make md5 readable')
+            self.logger.info('Retrieving sosreport from %s' % self.address)
+            self.log_info('Retrieving sosreport...')
+            ret = self.retrieve_file(self.sos_path)
+            if ret:
+                self.log_info('Successfully collected sosreport')
+                self.file_list.append(self.sos_path.split('/')[-1])
+            else:
+                self.log_error('Failed to retrieve sosreport')
+                raise SystemExit
+            self.hash_retrieved = self.retrieve_file(self.sos_path + '.md5')
+            if self.hash_retrieved:
+                self.file_list.append(self.sos_path.split('/')[-1] + '.md5')
+            return True
+        else:
+            # sos sometimes fails but still returns a 0 exit code
+            err = self.stderr.read()
+            if not err:
+                err = [x.strip() for x in self.stdout.readlines()
+                       if x.strip()][-1]
+            self.logger.error(
+                'Failed to run sosreport on %s: %s' % (self.address, err))
+            self.log_error('Failed to run sosreport. %s' % err)
+            return False
Not " + "attempting to remove path" % self.sos_path) + return + removed = self.remove_file(self.sos_path) + if not removed: + self.log_error('Failed to remove sosreport') + + def cleanup(self): + '''Remove the sos archive from the node once we have it locally''' + self.remove_sos_archive() + if self.hash_retrieved: + self.remove_file(self.sos_path + '.md5') + cleanup = self.host.set_cleanup_cmd() + if cleanup: + self.run_command(cleanup) + + def collect_extra_cmd(self, filenames): + '''Collect the file created by a cluster outside of sos''' + for filename in filenames: + try: + if self.config['need_sudo'] or self.config['become_root']: + try: + self.make_archive_readable(filename) + except Exception as err: + self.log_error("Unable to retrieve file %s" % filename) + self.log_debug("Failed to make file %s readable: %s" + % (filename, err)) + continue + ret = self.retrieve_file(filename) + if ret: + self.file_list.append(filename.split('/')[-1]) + self.remove_file(filename) + else: + self.log_error("Unable to retrieve file %s" % filename) + except Exception as e: + msg = 'Error collecting additional data from master: %s' % e + self.log_error(msg) + + def make_archive_readable(self, filepath): + '''Used to make the given archive world-readable, which is slightly + better than changing the ownership outright. + + This is only used when we're not connecting as root. + ''' + cmd = 'chmod o+r %s' % filepath + res = self.run_command(cmd, timeout=10, need_root=True) + if res['status'] == 0: + return True + else: + msg = "Exception while making %s readable. Return code was %s" + self.log_error(msg % (filepath, res['status'])) + raise Exception |