You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
231 lines
8.9 KiB
231 lines
8.9 KiB
2 years ago
|
From 3b84b4ccfa9e4924a5a3829d3810568dfb69bf63 Mon Sep 17 00:00:00 2001
|
||
|
From: Jake Hunsaker <jhunsake@redhat.com>
|
||
|
Date: Fri, 18 Mar 2022 16:25:35 -0400
|
||
|
Subject: [PATCH 1/2] [pacemaker] Redesign node enumeration logic
|
||
|
|
||
|
It has been found that `pcs status` output is liable to change, which
|
||
|
ends up breaking our parsing of node lists when using it on newer
|
||
|
versions.
|
||
|
|
||
|
Instead, first try to parse through `crm_mon` output, which is what `pcs
|
||
|
status` uses under the hood, but as a stable and reliable xml format.
|
||
|
|
||
|
Failing that, for example if the `--primary` node is not functioning as
|
||
|
part of the cluster, source `/etc/corosync/corosync.conf` instead.
|
||
|
|
||
|
Related: RHBZ2065805
|
||
|
Related: RHBZ2065811
|
||
|
|
||
|
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
|
||
|
---
|
||
|
sos/collector/clusters/pacemaker.py | 110 +++++++++++++++++++---------
|
||
|
1 file changed, 76 insertions(+), 34 deletions(-)
|
||
|
|
||
|
diff --git a/sos/collector/clusters/pacemaker.py b/sos/collector/clusters/pacemaker.py
|
||
|
index 55024314..49d0ce51 100644
|
||
|
--- a/sos/collector/clusters/pacemaker.py
|
||
|
+++ b/sos/collector/clusters/pacemaker.py
|
||
|
@@ -8,7 +8,11 @@
|
||
|
#
|
||
|
# See the LICENSE file in the source distribution for further information.
|
||
|
|
||
|
+import re
|
||
|
+
|
||
|
from sos.collector.clusters import Cluster
|
||
|
+from setuptools._vendor.packaging import version
|
||
|
+from xml.etree import ElementTree
|
||
|
|
||
|
|
||
|
class pacemaker(Cluster):
|
||
|
@@ -18,42 +22,80 @@ class pacemaker(Cluster):
|
||
|
packages = ('pacemaker',)
|
||
|
option_list = [
|
||
|
('online', True, 'Collect nodes listed as online'),
|
||
|
- ('offline', True, 'Collect nodes listed as offline')
|
||
|
+ ('offline', True, 'Collect nodes listed as offline'),
|
||
|
+ ('only-corosync', False, 'Only use corosync.conf to enumerate nodes')
|
||
|
]
|
||
|
|
||
|
def get_nodes(self):
|
||
|
- self.res = self.exec_primary_cmd('pcs status')
|
||
|
- if self.res['status'] != 0:
|
||
|
- self.log_error('Cluster status could not be determined. Is the '
|
||
|
- 'cluster running on this node?')
|
||
|
- return []
|
||
|
- if 'node names do not match' in self.res['output']:
|
||
|
- self.log_warn('Warning: node name mismatch reported. Attempts to '
|
||
|
- 'connect to some nodes may fail.\n')
|
||
|
- return self.parse_pcs_output()
|
||
|
-
|
||
|
- def parse_pcs_output(self):
|
||
|
- nodes = []
|
||
|
- if self.get_option('online'):
|
||
|
- nodes += self.get_online_nodes()
|
||
|
- if self.get_option('offline'):
|
||
|
- nodes += self.get_offline_nodes()
|
||
|
- return nodes
|
||
|
-
|
||
|
- def get_online_nodes(self):
|
||
|
- for line in self.res['output'].splitlines():
|
||
|
- if line.startswith('Online:'):
|
||
|
- nodes = line.split('[')[1].split(']')[0]
|
||
|
- return [n for n in nodes.split(' ') if n]
|
||
|
-
|
||
|
- def get_offline_nodes(self):
|
||
|
- offline = []
|
||
|
- for line in self.res['output'].splitlines():
|
||
|
- if line.startswith('Node') and line.endswith('(offline)'):
|
||
|
- offline.append(line.split()[1].replace(':', ''))
|
||
|
- if line.startswith('OFFLINE:'):
|
||
|
- nodes = line.split('[')[1].split(']')[0]
|
||
|
- offline.extend([n for n in nodes.split(' ') if n])
|
||
|
- return offline
|
||
|
+ self.nodes = []
|
||
|
+ # try crm_mon first
|
||
|
+ try:
|
||
|
+ if not self.get_option('only-corosync'):
|
||
|
+ try:
|
||
|
+ self.get_nodes_from_crm()
|
||
|
+ except Exception as err:
|
||
|
+ self.log_warn("Falling back to sourcing corosync.conf. "
|
||
|
+ "Could not parse crm_mon output: %s" % err)
|
||
|
+ if not self.nodes:
|
||
|
+ # fallback to corosync.conf, in case the node we're inspecting
|
||
|
+ # is offline from the cluster
|
||
|
+ self.get_nodes_from_corosync()
|
||
|
+ except Exception as err:
|
||
|
+ self.log_error("Could not determine nodes from cluster: %s" % err)
|
||
|
+
|
||
|
+ _shorts = [n for n in self.nodes if '.' not in n]
|
||
|
+ if _shorts:
|
||
|
+ self.log_warn(
|
||
|
+ "WARNING: Node addresses '%s' may not resolve locally if you "
|
||
|
+ "are not running on a node in the cluster. Try using option "
|
||
|
+ "'-c pacemaker.only-corosync' if these connections fail."
|
||
|
+ % ','.join(_shorts)
|
||
|
+ )
|
||
|
+ return self.nodes
|
||
|
+
|
||
|
+ def get_nodes_from_crm(self):
|
||
|
+ """
|
||
|
+ Try to parse crm_mon output for node list and status.
|
||
|
+ """
|
||
|
+ xmlopt = '--output-as=xml'
|
||
|
+ # older pacemaker had a different option for xml output
|
||
|
+ _ver = self.exec_primary_cmd('crm_mon --version')
|
||
|
+ if _ver['status'] == 0:
|
||
|
+ cver = _ver['output'].split()[1].split('-')[0]
|
||
|
+ if not version.parse(cver) > version.parse('2.0.3'):
|
||
|
+ xmlopt = '--as-xml'
|
||
|
+ else:
|
||
|
+ return
|
||
|
+ _out = self.exec_primary_cmd(
|
||
|
+ "crm_mon --one-shot --inactive %s" % xmlopt,
|
||
|
+ need_root=True
|
||
|
+ )
|
||
|
+ if _out['status'] == 0:
|
||
|
+ self.parse_crm_xml(_out['output'])
|
||
|
+
|
||
|
+ def parse_crm_xml(self, xmlstring):
|
||
|
+ """
|
||
|
+ Parse the xml output string provided by crm_mon
|
||
|
+ """
|
||
|
+ _xml = ElementTree.fromstring(xmlstring)
|
||
|
+ nodes = _xml.find('nodes')
|
||
|
+ for node in nodes:
|
||
|
+ _node = node.attrib
|
||
|
+ if self.get_option('online') and _node['online'] == 'true':
|
||
|
+ self.nodes.append(_node['name'])
|
||
|
+ elif self.get_option('offline') and _node['online'] == 'false':
|
||
|
+ self.nodes.append(_node['name'])
|
||
|
+
|
||
|
+ def get_nodes_from_corosync(self):
|
||
|
+ """
|
||
|
+ As a fallback measure, read corosync.conf to get the node list. Note
|
||
|
+ that this prevents us from separating online nodes from offline nodes.
|
||
|
+ """
|
||
|
+ self.log_warn("WARNING: unable to distinguish online nodes from "
|
||
|
+ "offline nodes when sourcing from corosync.conf")
|
||
|
+ cc = self.primary.read_file('/etc/corosync/corosync.conf')
|
||
|
+ nodes = re.findall(r'((\sring0_addr:)(.*))', cc)
|
||
|
+ for node in nodes:
|
||
|
+ self.nodes.append(node[-1].strip())
|
||
|
|
||
|
# vim: set et ts=4 sw=4 :
|
||
|
--
|
||
|
2.34.3
|
||
|
|
||
|
|
||
|
From 6701a7d77ecc998b018b54ecc00f9fd102ae9518 Mon Sep 17 00:00:00 2001
|
||
|
From: Jake Hunsaker <jhunsake@redhat.com>
|
||
|
Date: Mon, 21 Mar 2022 12:05:59 -0400
|
||
|
Subject: [PATCH 2/2] [clusters] Allow clusters to not add localhost to node
|
||
|
list
|
||
|
|
||
|
For most of our supported clusters, we end up needing to add the
|
||
|
local host executing `sos collect` to the node list (unless `--no-local`
|
||
|
is used) as that accounts for the primary node that may otherwise be
|
||
|
left off. However, this is not helpful for clusters that may reports
|
||
|
node names as something other than resolveable names. In those cases,
|
||
|
such as with pacemaker, adding the local hostname may result in
|
||
|
duplicate collections.
|
||
|
|
||
|
Add a toggle to cluster profiles via a new `strict_node_list` class attr
|
||
|
that, if True, will skip this addition. This toggle is default `False`
|
||
|
to preserve existing behavior, and is now enabled for `pacemaker`
|
||
|
specifically.
|
||
|
|
||
|
Related: RHBZ#2065821
|
||
|
|
||
|
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
|
||
|
---
|
||
|
sos/collector/__init__.py | 3 ++-
|
||
|
sos/collector/clusters/__init__.py | 4 ++++
|
||
|
sos/collector/clusters/pacemaker.py | 1 +
|
||
|
3 files changed, 7 insertions(+), 1 deletion(-)
|
||
|
|
||
|
diff --git a/sos/collector/__init__.py b/sos/collector/__init__.py
|
||
|
index a8bb0064..d898ca34 100644
|
||
|
--- a/sos/collector/__init__.py
|
||
|
+++ b/sos/collector/__init__.py
|
||
|
@@ -1073,7 +1073,8 @@ class SoSCollector(SoSComponent):
|
||
|
for node in self.node_list:
|
||
|
if host == node.split('.')[0]:
|
||
|
self.node_list.remove(node)
|
||
|
- self.node_list.append(self.hostname)
|
||
|
+ if not self.cluster.strict_node_list:
|
||
|
+ self.node_list.append(self.hostname)
|
||
|
self.reduce_node_list()
|
||
|
try:
|
||
|
_node_max = len(max(self.node_list, key=len))
|
||
|
diff --git a/sos/collector/clusters/__init__.py b/sos/collector/clusters/__init__.py
|
||
|
index f3f550ad..f00677b8 100644
|
||
|
--- a/sos/collector/clusters/__init__.py
|
||
|
+++ b/sos/collector/clusters/__init__.py
|
||
|
@@ -57,6 +57,10 @@ class Cluster():
|
||
|
sos_plugin_options = {}
|
||
|
sos_preset = ''
|
||
|
cluster_name = None
|
||
|
+ # set this to True if the local host running collect should *not* be
|
||
|
+ # forcibly added to the node list. This can be helpful in situations where
|
||
|
+ # the host's fqdn and the name the cluster uses are different
|
||
|
+ strict_node_list = False
|
||
|
|
||
|
def __init__(self, commons):
|
||
|
self.primary = None
|
||
|
diff --git a/sos/collector/clusters/pacemaker.py b/sos/collector/clusters/pacemaker.py
|
||
|
index 49d0ce51..bebcb265 100644
|
||
|
--- a/sos/collector/clusters/pacemaker.py
|
||
|
+++ b/sos/collector/clusters/pacemaker.py
|
||
|
@@ -20,6 +20,7 @@ class pacemaker(Cluster):
|
||
|
cluster_name = 'Pacemaker High Availability Cluster Manager'
|
||
|
sos_plugins = ['pacemaker']
|
||
|
packages = ('pacemaker',)
|
||
|
+ strict_node_list = True
|
||
|
option_list = [
|
||
|
('online', True, 'Collect nodes listed as online'),
|
||
|
('offline', True, 'Collect nodes listed as offline'),
|
||
|
--
|
||
|
2.34.3
|
||
|
|