diff options
author | Jake Hunsaker <jhunsake@redhat.com> | 2022-03-18 16:25:35 -0400 |
---|---|---|
committer | Jake Hunsaker <jhunsake@redhat.com> | 2022-03-28 12:42:25 -0400 |
commit | 3b84b4ccfa9e4924a5a3829d3810568dfb69bf63 (patch) | |
tree | 06338718dccbc86e166d5dbf7f03d91d76988927 | |
parent | 1e12325efaa500d304dcbfbeeb50e72ed0f938f5 (diff) | |
download | sos-3b84b4ccfa9e4924a5a3829d3810568dfb69bf63.tar.gz |
[pacemaker] Redesign node enumeration logic
It has been found that `pcs status` output is liable to change, which
ends up breaking our parsing of node lists when using it on newer
versions.
Instead, first try to parse `crm_mon` output, which is what `pcs
status` uses under the hood, but which is available in a stable and reliable XML format.
Failing that, for example if the `--primary` node is not functioning as
part of the cluster, source the node list from `/etc/corosync/corosync.conf` instead.
Related: RHBZ2065805
Related: RHBZ2065811
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
-rw-r--r-- | sos/collector/clusters/pacemaker.py | 110 |
1 files changed, 76 insertions, 34 deletions
diff --git a/sos/collector/clusters/pacemaker.py b/sos/collector/clusters/pacemaker.py index 55024314..49d0ce51 100644 --- a/sos/collector/clusters/pacemaker.py +++ b/sos/collector/clusters/pacemaker.py @@ -8,7 +8,11 @@ # # See the LICENSE file in the source distribution for further information. +import re + from sos.collector.clusters import Cluster +from setuptools._vendor.packaging import version +from xml.etree import ElementTree class pacemaker(Cluster): @@ -18,42 +22,80 @@ class pacemaker(Cluster): packages = ('pacemaker',) option_list = [ ('online', True, 'Collect nodes listed as online'), - ('offline', True, 'Collect nodes listed as offline') + ('offline', True, 'Collect nodes listed as offline'), + ('only-corosync', False, 'Only use corosync.conf to enumerate nodes') ] def get_nodes(self): - self.res = self.exec_primary_cmd('pcs status') - if self.res['status'] != 0: - self.log_error('Cluster status could not be determined. Is the ' - 'cluster running on this node?') - return [] - if 'node names do not match' in self.res['output']: - self.log_warn('Warning: node name mismatch reported. 
Attempts to ' - 'connect to some nodes may fail.\n') - return self.parse_pcs_output() - - def parse_pcs_output(self): - nodes = [] - if self.get_option('online'): - nodes += self.get_online_nodes() - if self.get_option('offline'): - nodes += self.get_offline_nodes() - return nodes - - def get_online_nodes(self): - for line in self.res['output'].splitlines(): - if line.startswith('Online:'): - nodes = line.split('[')[1].split(']')[0] - return [n for n in nodes.split(' ') if n] - - def get_offline_nodes(self): - offline = [] - for line in self.res['output'].splitlines(): - if line.startswith('Node') and line.endswith('(offline)'): - offline.append(line.split()[1].replace(':', '')) - if line.startswith('OFFLINE:'): - nodes = line.split('[')[1].split(']')[0] - offline.extend([n for n in nodes.split(' ') if n]) - return offline + self.nodes = [] + # try crm_mon first + try: + if not self.get_option('only-corosync'): + try: + self.get_nodes_from_crm() + except Exception as err: + self.log_warn("Falling back to sourcing corosync.conf. " + "Could not parse crm_mon output: %s" % err) + if not self.nodes: + # fallback to corosync.conf, in case the node we're inspecting + # is offline from the cluster + self.get_nodes_from_corosync() + except Exception as err: + self.log_error("Could not determine nodes from cluster: %s" % err) + + _shorts = [n for n in self.nodes if '.' not in n] + if _shorts: + self.log_warn( + "WARNING: Node addresses '%s' may not resolve locally if you " + "are not running on a node in the cluster. Try using option " + "'-c pacemaker.only-corosync' if these connections fail." + % ','.join(_shorts) + ) + return self.nodes + + def get_nodes_from_crm(self): + """ + Try to parse crm_mon output for node list and status. 
+ """ + xmlopt = '--output-as=xml' + # older pacemaker had a different option for xml output + _ver = self.exec_primary_cmd('crm_mon --version') + if _ver['status'] == 0: + cver = _ver['output'].split()[1].split('-')[0] + if not version.parse(cver) > version.parse('2.0.3'): + xmlopt = '--as-xml' + else: + return + _out = self.exec_primary_cmd( + "crm_mon --one-shot --inactive %s" % xmlopt, + need_root=True + ) + if _out['status'] == 0: + self.parse_crm_xml(_out['output']) + + def parse_crm_xml(self, xmlstring): + """ + Parse the xml output string provided by crm_mon + """ + _xml = ElementTree.fromstring(xmlstring) + nodes = _xml.find('nodes') + for node in nodes: + _node = node.attrib + if self.get_option('online') and _node['online'] == 'true': + self.nodes.append(_node['name']) + elif self.get_option('offline') and _node['online'] == 'false': + self.nodes.append(_node['name']) + + def get_nodes_from_corosync(self): + """ + As a fallback measure, read corosync.conf to get the node list. Note + that this prevents us from separating online nodes from offline nodes. + """ + self.log_warn("WARNING: unable to distinguish online nodes from " + "offline nodes when sourcing from corosync.conf") + cc = self.primary.read_file('/etc/corosync/corosync.conf') + nodes = re.findall(r'((\sring0_addr:)(.*))', cc) + for node in nodes: + self.nodes.append(node[-1].strip()) # vim: set et ts=4 sw=4 : |