diff options
-rw-r--r-- | man/en/sos-collect.1 | 1 | ||||
-rw-r--r-- | sos/collector/clusters/ocp.py | 14 | ||||
-rw-r--r-- | sos/collector/sosnode.py | 8 | ||||
-rw-r--r-- | sos/collector/transports/__init__.py | 20 | ||||
-rw-r--r-- | sos/collector/transports/oc.py | 220 |
5 files changed, 257 insertions, 6 deletions
diff --git a/man/en/sos-collect.1 b/man/en/sos-collect.1 index 8ad4fe5e..a1f6c10e 100644 --- a/man/en/sos-collect.1 +++ b/man/en/sos-collect.1 @@ -364,6 +364,7 @@ The types of transports supported are currently as follows: \fBauto\fR Allow the cluster type to determine the transport used \fBcontrol_persist\fR Use OpenSSH's ControlPersist feature. This is the default behavior + \fBoc\fR Use a \fBlocally\fR configured \fBoc\fR binary to deploy collection pods on OCP nodes .TP \fB\-\-tmp\-dir\fR TMP_DIR diff --git a/sos/collector/clusters/ocp.py b/sos/collector/clusters/ocp.py index ad97587f..a9357dbf 100644 --- a/sos/collector/clusters/ocp.py +++ b/sos/collector/clusters/ocp.py @@ -12,6 +12,7 @@ import os from pipes import quote from sos.collector.clusters import Cluster +from sos.utilities import is_executable class ocp(Cluster): @@ -83,6 +84,19 @@ class ocp(Cluster): nodes[_node[0]][column] = _node[idx[column]] return nodes + def set_transport_type(self): + if is_executable('oc'): + return 'oc' + self.log_info("Local installation of 'oc' not found or is not " + "correctly configured. Will use ControlPersist") + self.ui_log.warn( + "Preferred transport 'oc' not available, will fallback to SSH." 
+ ) + if not self.opts.batch: + input("Press ENTER to continue connecting with SSH, or Ctrl+C to" + " abort.") + return 'control_persist' + def get_nodes(self): nodes = [] self.node_dict = {} diff --git a/sos/collector/sosnode.py b/sos/collector/sosnode.py index 5c5c7201..8a9dbd7a 100644 --- a/sos/collector/sosnode.py +++ b/sos/collector/sosnode.py @@ -20,6 +20,7 @@ from sos.policies import load from sos.policies.init_systems import InitSystem from sos.collector.transports.control_persist import SSHControlPersist from sos.collector.transports.local import LocalTransport +from sos.collector.transports.oc import OCTransport from sos.collector.exceptions import (CommandTimeoutException, ConnectionException, UnsupportedHostException, @@ -28,6 +29,7 @@ from sos.collector.exceptions import (CommandTimeoutException, TRANSPORTS = { 'local': LocalTransport, 'control_persist': SSHControlPersist, + 'oc': OCTransport } @@ -421,13 +423,11 @@ class SosNode(): if 'atomic' in cmd: get_pty = True - if get_pty: - cmd = "/bin/bash -c %s" % quote(cmd) - if env: _cmd_env = self.env_vars env = _cmd_env.update(env) - return self._transport.run_command(cmd, timeout, need_root, env) + return self._transport.run_command(cmd, timeout, need_root, env, + get_pty) def sosreport(self): """Run an sos report on the node, then collect it""" diff --git a/sos/collector/transports/__init__.py b/sos/collector/transports/__init__.py index 5be7dc6d..7bffee62 100644 --- a/sos/collector/transports/__init__.py +++ b/sos/collector/transports/__init__.py @@ -144,7 +144,8 @@ class RemoteTransport(): raise NotImplementedError("Transport %s does not define disconnect" % self.name) - def run_command(self, cmd, timeout=180, need_root=False, env=None): + def run_command(self, cmd, timeout=180, need_root=False, env=None, + get_pty=False): """Run a command on the node, returning its output and exit code. 
This should return the exit code of the command being executed, not the exit code of whatever mechanism the transport uses to execute that @@ -165,10 +166,15 @@ class RemoteTransport(): :param env: Specify env vars to be passed to the ``cmd`` :type env: ``dict`` + :param get_pty: Does ``cmd`` require execution with a pty? + :type get_pty: ``bool`` + :returns: Output of ``cmd`` and the exit code :rtype: ``dict`` with keys ``output`` and ``status`` """ self.log_debug('Running command %s' % cmd) + if get_pty: + cmd = "/bin/bash -c %s" % quote(cmd) # currently we only use/support the use of pexpect for handling the # execution of these commands, as opposed to directly invoking # subprocess.Popen() in conjunction with tools like sshpass. @@ -212,6 +218,13 @@ class RemoteTransport(): :type env: ``dict`` """ cmd = self._format_cmd_for_exec(cmd) + + # if for any reason env is empty, set it to None as otherwise + # pexpect interprets this to mean "run this command with no env vars of + # any kind" + if not env: + env = None + result = pexpect.spawn(cmd, encoding='utf-8', env=env) _expects = [pexpect.EOF, pexpect.TIMEOUT] @@ -268,6 +281,9 @@ class RemoteTransport(): _out = self.run_command('hostname') if _out['status'] == 0: self._hostname = _out['output'].strip() + + if not self._hostname: + self._hostname = self.address self.log_info("Hostname set to %s" % self._hostname) return self._hostname @@ -302,7 +318,7 @@ class RemoteTransport(): return self._read_file(fname) def _read_file(self, fname): - res = self.run_command("cat %s" % fname, timeout=5) + res = self.run_command("cat %s" % fname, timeout=10) if res['status'] == 0: return res['output'] else: diff --git a/sos/collector/transports/oc.py b/sos/collector/transports/oc.py new file mode 100644 index 00000000..649037b9 --- /dev/null +++ b/sos/collector/transports/oc.py @@ -0,0 +1,220 @@ +# Copyright Red Hat 2021, Jake Hunsaker <jhunsake@redhat.com> + +# This file is part of the sos project: 
# https://github.com/sosreport/sos
#
# This copyrighted material is made available to anyone wishing to use,
# modify, copy, or redistribute it subject to the terms and conditions of
# version 2 of the GNU General Public License.
#
# See the LICENSE file in the source distribution for further information.

import json
import tempfile
import os

from sos.collector.transports import RemoteTransport
from sos.utilities import (is_executable, sos_get_command_output,
                           SoSTimeoutError)


class OCTransport(RemoteTransport):
    """This transport leverages the execution of commands via a locally
    available and configured ``oc`` binary for OCPv4 environments.

    OCPv4 clusters generally discourage the use of SSH, so this transport may
    be used to remove our use of SSH in favor of the environment provided
    method of connecting to nodes and executing commands via debug pods.

    Note that this approach will generate multiple debug pods over the course
    of our execution
    """

    name = 'oc'
    # namespace every ``oc`` invocation and the pod definition are scoped to
    project = 'sos-collect-tmp'
    # both are assigned by _connect(); they stay None until a pod has been
    # defined so that connected/_disconnect are safe to use before then
    pod_name = None
    pod_tmp_conf = None

    def run_oc(self, cmd, **kwargs):
        """Format and run a command with `oc` in the project defined for our
        execution

        :param cmd: The ``oc`` subcommand and arguments, without the leading
                    ``oc``
        :type cmd: ``str``

        :param kwargs: Passed through unchanged to
                       ``sos_get_command_output()``

        :returns: The result of ``sos_get_command_output()``; callers in this
                  class read its ``status`` and ``output`` keys
        """
        return sos_get_command_output(
            "oc -n %s %s" % (self.project, cmd),
            **kwargs
        )

    @property
    def connected(self):
        """Whether our pod currently reports the ``ready`` condition

        :returns: ``True`` if ``oc wait`` exits 0 for the pod, else ``False``
        :rtype: ``bool``
        """
        if not self.pod_name:
            # no pod has been defined yet, so there is nothing to poll
            return False
        up = self.run_oc(
            "wait --timeout=0s --for=condition=ready pod/%s" % self.pod_name
        )
        return up['status'] == 0

    def get_node_pod_config(self):
        """Based on our template for the debug container, add the node-specific
        items so that we can deploy one of these on each node we're collecting
        from

        The node-specific items are the pod name, built from the first
        dot-separated label of ``self.address``, and ``nodeName``, set to the
        full ``self.address``.

        :returns: The pod definition, ready to be serialized as JSON
        :rtype: ``dict``
        """
        return {
            "kind": "Pod",
            "apiVersion": "v1",
            "metadata": {
                "name": "%s-sos-collector" % self.address.split('.')[0],
                "namespace": self.project
            },
            # NOTE(review): the Kubernetes API defines priorityClassName as a
            # field of the pod's spec, not of the Pod object itself - confirm
            # whether it is honoured here before relying on it or moving it
            "priorityClassName": "system-cluster-critical",
            "spec": {
                "volumes": [
                    {
                        "name": "host",
                        "hostPath": {
                            "path": "/",
                            "type": "Directory"
                        }
                    },
                    {
                        "name": "run",
                        "hostPath": {
                            "path": "/run",
                            "type": "Directory"
                        }
                    },
                    {
                        "name": "varlog",
                        "hostPath": {
                            "path": "/var/log",
                            "type": "Directory"
                        }
                    },
                    {
                        "name": "machine-id",
                        "hostPath": {
                            "path": "/etc/machine-id",
                            "type": "File"
                        }
                    }
                ],
                "containers": [
                    {
                        "name": "sos-collector-tmp",
                        "image": "registry.redhat.io/rhel8/support-tools",
                        "command": [
                            "/bin/bash"
                        ],
                        "env": [
                            {
                                "name": "HOST",
                                "value": "/host"
                            }
                        ],
                        "resources": {},
                        "volumeMounts": [
                            {
                                "name": "host",
                                "mountPath": "/host"
                            },
                            {
                                "name": "run",
                                "mountPath": "/run"
                            },
                            {
                                "name": "varlog",
                                "mountPath": "/var/log"
                            },
                            {
                                "name": "machine-id",
                                "mountPath": "/etc/machine-id"
                            }
                        ],
                        "securityContext": {
                            "privileged": True,
                            "runAsUser": 0
                        },
                        "stdin": True,
                        "stdinOnce": True,
                        "tty": True
                    }
                ],
                "restartPolicy": "Never",
                "nodeName": self.address,
                "hostNetwork": True,
                "hostPID": True,
                "hostIPC": True
            }
        }

    def _connect(self, password):
        """Deploy the collection pod for this node and wait for it to become
        ready

        :param password: Not used by this transport
        :type password: ``str``

        :returns: ``True`` if the pod was created and reported ready, else
                  ``False``
        :rtype: ``bool``
        """
        # the oc binary must be _locally_ available for this to work
        if not is_executable('oc'):
            return False

        # deploy the debug container we'll exec into
        podconf = self.get_node_pod_config()
        self.pod_name = podconf['metadata']['name']
        fd, self.pod_tmp_conf = tempfile.mkstemp(dir=self.tmpdir)
        # opening by descriptor means the context manager also closes the fd
        # handed back by mkstemp
        with open(fd, 'w') as cfile:
            json.dump(podconf, cfile)
        self.log_debug("Starting sos collector container '%s'" % self.pod_name)
        # this specifically does not need to run with a project definition
        out = sos_get_command_output(
            "oc create -f %s" % self.pod_tmp_conf
        )
        if (out['status'] != 0 or "pod/%s created" % self.pod_name not in
                out['output']):
            self.log_error("Unable to deploy sos collect pod")
            self.log_debug("Debug pod deployment failed: %s" % out['output'])
            return False
        self.log_debug("Pod '%s' successfully deployed, waiting for pod to "
                       "enter ready state" % self.pod_name)

        # wait for the pod to report as running
        try:
            up = self.run_oc("wait --for=condition=Ready pod/%s --timeout=30s"
                             % self.pod_name,
                             # timeout is for local safety, not oc
                             timeout=40)
            if up['status'] != 0:
                self.log_error("Pod not available after 30 seconds")
                return False
        except SoSTimeoutError:
            self.log_error("Timeout while polling for pod readiness")
            return False
        except Exception as err:
            # any other failure is reported as a failed connection rather
            # than being propagated to the caller
            self.log_error("Error while waiting for pod to be ready: %s"
                           % err)
            return False

        return True

    def _format_cmd_for_exec(self, cmd):
        """Build the full local command line used to execute ``cmd``

        Commands beginning with ``oc`` are exec'd in the pod under
        ``chroot /host``; everything else is left to the parent class.

        :param cmd: The command to be run on the node
        :type cmd: ``str``

        :returns: The command line to execute locally
        :rtype: ``str``
        """
        # NOTE(review): presumably this is so the node's own `oc` is used -
        # confirm. The prefix test also matches any other command whose name
        # merely begins with "oc".
        if cmd.startswith('oc'):
            return ("oc -n %s exec --request-timeout=0 %s -- chroot /host %s"
                    % (self.project, self.pod_name, cmd))
        return super(OCTransport, self)._format_cmd_for_exec(cmd)

    def run_command(self, cmd, timeout=180, need_root=False, env=None,
                    get_pty=False):
        """Run a command on the node via the collection pod

        :param cmd: The command to run
        :type cmd: ``str``

        :param timeout: Seconds to allow ``cmd`` to run; any non-zero value
                        is extended by 10 seconds
        :type timeout: ``int``

        :param need_root: Passed through to the parent implementation
        :type need_root: ``bool``

        :param env: Passed through to the parent implementation
        :type env: ``dict``

        :param get_pty: Accepted for interface compatibility, always ignored
        :type get_pty: ``bool``

        :returns: Whatever the parent ``run_command()`` returns
        """
        # debug pod setup is slow, extend all timeouts to account for this
        if timeout:
            timeout += 10

        # since we always execute within a bash shell, force disable get_pty
        # to avoid double-quoting
        return super(OCTransport, self).run_command(cmd, timeout, need_root,
                                                    env, False)

    def _disconnect(self):
        """Remove the local pod definition file and delete the pod

        :returns: ``True`` if the pod was deleted, or if no pod was ever
                  defined by ``_connect()``; ``False`` if the delete failed
        :rtype: ``bool``
        """
        # removing the local file is best-effort: failing here must never
        # stop us from deleting the pod that is running on the cluster
        if self.pod_tmp_conf:
            try:
                os.unlink(self.pod_tmp_conf)
            except OSError as err:
                self.log_debug("Could not remove pod definition file '%s': %s"
                               % (self.pod_tmp_conf, err))
        if not self.pod_name:
            # _connect() never got as far as defining a pod
            return True
        removed = self.run_oc("delete pod %s" % self.pod_name)
        if "deleted" not in removed['output']:
            self.log_debug("Calling delete on pod '%s' failed: %s"
                           % (self.pod_name, removed))
            return False
        return True

    @property
    def remote_exec(self):
        """The ``oc exec`` prefix that runs a command string through
        ``/bin/bash -c`` inside our pod

        :rtype: ``str``
        """
        return ("oc -n %s exec --request-timeout=0 %s -- /bin/bash -c"
                % (self.project, self.pod_name))

    def _retrieve_file(self, fname, dest):
        """Copy a file out of the pod with ``oc cp``

        :param fname: Path of the file inside the pod
        :type fname: ``str``

        :param dest: Local path to copy the file to
        :type dest: ``str``

        :returns: ``True`` if ``oc cp`` exited 0, else ``False``
        :rtype: ``bool``
        """
        cmd = self.run_oc("cp %s:%s %s" % (self.pod_name, fname, dest))
        return cmd['status'] == 0