# Copyright Red Hat 2021, Jake Hunsaker <jhunsake@redhat.com>

# This file is part of the sos project: https://github.com/sosreport/sos
#
# This copyrighted material is made available to anyone wishing to use,
# modify, copy, or redistribute it subject to the terms and conditions of
# version 2 of the GNU General Public License.
#
# See the LICENSE file in the source distribution for further information.

import os

from shlex import quote
from sos.collector.clusters import Cluster
from sos.utilities import is_executable


class ocp(Cluster):
    """
    This profile is for use with OpenShift Container Platform (v4) clusters
    instead of the kubernetes profile.

    This profile will favor using the `oc` transport type, which means it will
    leverage a locally installed `oc` binary. This is also how node enumeration
    is done. To instead use SSH to connect to the nodes, use the
    '--transport=control_persist' option.

    Thus, a functional `oc` binary for the user executing sos collect is
    required. Functional here means that the user can run `oc` commands with
    clusterAdmin privileges.

    If this requires the use of a secondary configuration file, specify that
    path with the 'kubeconfig' cluster option. This config file will also be
    used on a single master node to perform API collections if the `with-api`
    option is enabled (default disabled). If no `kubeconfig` option is given,
    but `with-api` is enabled, the cluster profile will attempt to use a
    well-known default kubeconfig file if it is available on the host.

    Alternatively, provide a clusterAdmin access token either via the 'token'
    cluster option or, preferably, the SOSOCPTOKEN environment variable.

    By default, this profile will enumerate only master nodes within the
    cluster, and this may be changed by overriding the 'role' cluster option.
    To collect from all nodes in the cluster regardless of role, use the form
    -c ocp.role=''.

    Filtering nodes by a label applied to that node is also possible via the
    label cluster option, though be aware that this is _combined_ with the role
    option mentioned above.

    To avoid redundant collections of OCP API information (e.g. 'oc get'
    commands), this profile will attempt to enable the API collections on only
    a single master node. If none of the master nodes have a functional
    'oc' binary available *and* the --no-local option is used, then no API
    data will be collected.
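
    Illustrative invocations (the role list and kubeconfig path are examples
    only, not defaults enforced by this profile):

        sos collect --cluster-type ocp -c ocp.role='master:worker'
        sos collect --cluster-type ocp -c ocp.kubeconfig=/path/to/kubeconfig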
    """

    cluster_name = 'OpenShift Container Platform v4'
    packages = ('openshift-hyperkube', 'openshift-clients')

    api_collect_enabled = False
    token = None
    project = 'sos-collect-tmp'
    oc_cluster_admin = None
    _oc_cmd = ''

    option_list = [
        ('label', '', 'Colon delimited list of labels to select nodes with'),
        ('role', 'master', 'Colon delimited list of roles to filter on'),
        ('kubeconfig', '', 'Path to the kubeconfig file'),
        ('token', '', 'Service account token to use for oc authorization'),
        ('with-api', False, 'Collect OCP API data from a master node'),
        ('api-url', '', 'Alternate API URL of an external control-plane'),
    ]

    @property
    def oc_cmd(self):
        if not self._oc_cmd:
            self._oc_cmd = 'oc'
            if self.primary.host.in_container():
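                # when sos itself runs in a container, locate 'oc' within the
                # host's sysroot so the host binary can be invoked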
                _oc_path = self.primary.run_command(
                    'which oc', chroot=self.primary.host.sysroot
                )
                if _oc_path['status'] == 0:
                    self._oc_cmd = os.path.join(
                        self.primary.host.sysroot,
                        _oc_path['output'].strip().lstrip('/')
                    )
                else:
                    self.log_warn(
                        "Unable to to determine PATH for 'oc' command, "
                        "node enumeration may fail."
                    )
                    self.log_debug(
                        f"Locating 'oc' failed: {_oc_path['output']}")
            if self.get_option('kubeconfig'):
                self._oc_cmd += " --kubeconfig " \
                        f"{self.get_option('kubeconfig')}"
            self.log_debug(f"oc base command set to {self._oc_cmd}")
        return self._oc_cmd

    def fmt_oc_cmd(self, cmd):
        """Format the oc command to optionall include the kubeconfig file if
        one is specified
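
        For example, with the 'kubeconfig' cluster option set to
        /path/to/kubeconfig, fmt_oc_cmd('get nodes') would return
        'oc --kubeconfig /path/to/kubeconfig get nodes'.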
        """
        return f"{self.oc_cmd} {cmd}"

    def _attempt_oc_login(self):
        """Attempt to login to the API using the oc command using a provided
        token
        """
        _res = self.exec_primary_cmd(
            self.fmt_oc_cmd("login --insecure-skip-tls-verify=True "
                            f"--token={self.token} "
                            f"{self.get_option('api-url')}")
        )
        return _res['status'] == 0

    def check_enabled(self):
        if super().check_enabled():
            return True
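        # the profile was not auto-detected; fall back to checking whether
        # 'oc' can authenticate against the cluster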
        self.token = self.get_option('token') or os.getenv('SOSOCPTOKEN', None)
        if self.token:
            self._attempt_oc_login()
        _who = self.fmt_oc_cmd('whoami')
        return self.exec_primary_cmd(_who)['status'] == 0

    def setup(self):
        """Create the project that we will be executing in for any nodes'
        collection via a container image
        """
        if self.set_transport_type() != 'oc':
            return None

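        # 'auth can-i' across all verbs and resources succeeds only for
        # users with cluster-admin level permissions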
        out = self.exec_primary_cmd(self.fmt_oc_cmd("auth can-i '*' '*'"))
        self.oc_cluster_admin = out['status'] == 0
        if not self.oc_cluster_admin:
            self.log_debug("Check for cluster-admin privileges returned false,"
                           " cannot create project in OCP cluster")
            raise Exception("Insufficient permissions to create temporary "
                            "collection project.\nAborting...")

        self.log_info(f"Creating new temporary project '{self.project}'")
        ret = self.exec_primary_cmd(
            self.fmt_oc_cmd(f"new-project {self.project}")
        )
        if ret['status'] == 0:
            self._label_sos_project()
            return True

        self.log_debug(f"Failed to create project: {ret['output']}")
        raise Exception("Failed to create temporary project for collection. "
                        "\nAborting...")

    def _label_sos_project(self):
        """Add pertinent labels to the temporary project we've created so that
        our privileged containers can properly run.
        """
        labels = [
            "security.openshift.io/scc.podSecurityLabelSync=false",
            "pod-security.kubernetes.io/enforce=privileged"
        ]
        for label in labels:
            ret = self.exec_primary_cmd(
                self.fmt_oc_cmd(
                    f"label namespace {self.project} {label} --overwrite"
                )
            )
            if ret['status'] != 0:
                raise Exception(
                    f"Error applying namespace labels: {ret['output']}"
                )

    def cleanup(self):
        """Remove the project we created to execute within
        """
        if self.project:
            try:
                ret = self.exec_primary_cmd(
                    self.fmt_oc_cmd(f"delete project {self.project}"),
                    timeout=30
                )
                if ret['status'] != 0:
                    self.log_error(
                        f"Error deleting temporary project: {ret['output']}"
                    )
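                # ensure the namespace is fully removed before moving on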
                ret = self.exec_primary_cmd(
                    self.fmt_oc_cmd(
                        f"wait namespace/{self.project} --for=delete "
                        f"--timeout=30s"
                    )
                )
                if ret['status'] != 0:
                    self.log_error(
                        f"Error waiting for temporary project to be deleted: "
                        f"{ret['output']}"
                    )
            except Exception as err:
                self.log_error(
                    f"Failed attempting to remove temporary project "
                    f"'sos-collect-tmp': {err}\n"
                    f"Please manually remove the temporary project."
                )
            # don't leave the current project set to the now-deleted one
            self.exec_primary_cmd(self.fmt_oc_cmd("project default"))
            self.project = None
        return True

    def _build_dict(self, nodelist):
        """From the output of get_nodes(), construct an easier-to-reference
        dict of nodes that will be used in determining labels, primary status,
        etc...

        :param nodelist:        The split output of `oc get nodes`
        :type nodelist:         ``list``

        :returns:           A dict of nodes with `get nodes` columns as keys
        :rtype:             ``dict``
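
        As an illustration (names and values are examples only), output whose
        header line is 'NAME  STATUS  ROLES  AGE  VERSION' followed by a line
        for a node 'master-0' would yield an entry resembling:

            {'master-0': {'status': 'Ready',
                          'roles': 'control-plane,master',
                          'version': 'v1.25.0'}}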
        """
        nodes = {}
        if 'NAME' in nodelist[0]:
            # get the index of the fields
            statline = nodelist.pop(0).split()
            idx = {}
            for state in ['status', 'roles', 'version', 'os-image']:
                try:
                    idx[state] = statline.index(state.upper())
                except Exception:
                    # label is not available, which is not fatal for our dict
                    # construction here
                    pass
            for node in nodelist:
                _node = node.split()
                nodes[_node[0]] = {}
                for column in idx:
                    nodes[_node[0]][column] = _node[idx[column]]
        return nodes

    def set_transport_type(self):
        if self.opts.transport != 'auto':
            return self.opts.transport
        if is_executable('oc', sysroot=self.primary.host.sysroot):
            return 'oc'
        self.log_info("Local installation of 'oc' not found or is not "
                      "correctly configured. Will use ControlPersist.")
        self.ui_log.warning(
            "Preferred transport 'oc' not available, will fallback to SSH."
        )
        if not self.opts.batch:
            input("Press ENTER to continue connecting with SSH, or Ctrl+C to"
                  "abort.")
        return 'control_persist'

    def get_nodes(self):
        nodes = []
        self.node_dict = {}
        cmd = 'get nodes -o wide'
        if self.get_option('label'):
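            # the cluster option is colon-delimited, but oc expects a
            # comma-separated label selector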
            labels = ','.join(self.get_option('label').split(':'))
            cmd += f" -l {quote(labels)}"
        res = self.exec_primary_cmd(self.fmt_oc_cmd(cmd))
        if res['status'] == 0:
            if self.get_option('role') == 'master':
                self.log_warn("NOTE: By default, only master nodes are listed."
                              "\nTo collect from all/more nodes, override the "
                              "role option with '-c ocp.role=role1:role2'")
            roles = self.get_option('role').split(':')
            self.node_dict = self._build_dict(res['output'].splitlines())
            for node_name, node in self.node_dict.items():
                if roles:
                    for role in roles:
                        if role in node['roles']:
                            nodes.append(node_name)
                            break
                else:
                    nodes.append(node_name)
        else:
            msg = "'oc' command failed"
            if 'Missing or incomplete' in res['output']:
                msg = ("'oc' failed due to missing kubeconfig on primary node."
                       " Specify one via '-c ocp.kubeconfig=<path>'")
            raise Exception(msg)
        return nodes

    def set_node_label(self, node):
        if node.address not in self.node_dict:
            return ''
        for label in ['master', 'worker']:
            if label in self.node_dict[node.address]['roles']:
                return label
        return ''

    def check_node_is_primary(self, sosnode):
        if sosnode.address not in self.node_dict:
            return False
        return 'master' in self.node_dict[sosnode.address]['roles']

    def _toggle_api_opt(self, node, use_api):
        """In earlier versions of sos, the openshift plugin option that is
        used to toggle the API collections was called `no-oc` rather than
        `with-api`. This older plugin option had the inverse logic of the
        current `with-api` option.

        Use this to toggle the correct plugin option given the node's sos
        version. Note that the use of version 4.2 here is tied to the RHEL
        release (the only use case for this cluster profile) rather than
        the upstream version, given the backports in that downstream release.

        :param node:    The node being inspected for API collections
        :type node:     ``SoSNode``

        :param use_api: Should this node enable API collections?
        :type use_api:  ``bool``
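
        For example, a node reporting sos 4.2-16 or later with use_api=True
        gets 'openshift.with-api=on' appended to its plugin options, while an
        older node would instead get 'openshift.no-oc=off'.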
        """
        if node.check_sos_version('4.2-16'):
            _opt = 'with-api'
            _val = 'on' if use_api else 'off'
        else:
            _opt = 'no-oc'
            _val = 'off' if use_api else 'on'
        node.plugopts.append(f"openshift.{_opt}={_val}")

    def set_primary_options(self, node):

        node.enable_plugins.append('openshift')
        if not self.get_option('with-api'):
            self._toggle_api_opt(node, False)
            return
        if self.api_collect_enabled:
            # a primary has already been enabled for API collection, disable
            # it among others
            self._toggle_api_opt(node, False)
        else:
            # running in a container, so reference the /host mount point
            master_kube = (
                '/host/etc/kubernetes/static-pod-resources/'
                'kube-apiserver-certs/secrets/node-kubeconfigs/'
                'localhost.kubeconfig'
            )
            _optconfig = self.get_option('kubeconfig')
            if _optconfig and not _optconfig.startswith('/host'):
                _optconfig = '/host/' + _optconfig
            _kubeconfig = _optconfig or master_kube
            _oc_cmd = 'oc'
            if node.host.containerized:
                _oc_cmd = '/host/bin/oc'
                # when run from a container, the oc command does not inherit
                # the default config, so if one is present then pass it here
                # to detect a functional oc command. This is sidestepped in
                # sos report by chrooting the `oc` execution, which we cannot
                # do remotely
                if node.file_exists('/root/.kube/config', need_root=True):
                    _oc_cmd += ' --kubeconfig /host/root/.kube/config'
            can_oc = node.run_command(f"{_oc_cmd} whoami",
                                      use_container=node.host.containerized,
                                      # container is available only to root
                                      # and on RHEL sos needs to run as root
                                      # anyway, which will run oc as root
                                      need_root=True)
            if can_oc['status'] == 0:
                # the primary node can already access the API
                self._toggle_api_opt(node, True)
                self.api_collect_enabled = True
            elif self.token:
                node.sos_env_vars['SOSOCPTOKEN'] = self.token
                self._toggle_api_opt(node, True)
                self.api_collect_enabled = True
            elif node.file_exists(_kubeconfig):
                # if the file exists, then the openshift sos plugin will use it
                # if the with-api option is turned on
                if _kubeconfig != master_kube:
                    node.plugopts.append(
                        f"openshift.kubeconfig={_kubeconfig}"
                    )
                self._toggle_api_opt(node, True)
                self.api_collect_enabled = True
            if self.api_collect_enabled:
                msg = (f"API collections will be performed on {node.address}\n"
                       "Note: API collections may extend runtime by 10s of "
                       "minutes\n")
                self.soslog.info(msg)
                self.ui_log.info(msg)

    def set_node_options(self, node):
        # don't attempt OC API collections on non-primary nodes
        self._toggle_api_opt(node, False)

# vim: set et ts=4 sw=4 :