commit
90e6c311af
@ -0,0 +1,2 @@
|
|||||||
|
SOURCES/nagios-agents-metadata-105ab8a.tar.gz
|
||||||
|
SOURCES/pacemaker-a3f4479.tar.gz
|
@ -0,0 +1,2 @@
|
|||||||
|
ea6c0a27fd0ae8ce02f84a11f08a0d79377041c3 SOURCES/nagios-agents-metadata-105ab8a.tar.gz
|
||||||
|
883efa27f94c6a07942f51cf7c8959c5fbb624fe SOURCES/pacemaker-a3f4479.tar.gz
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,98 @@
|
|||||||
|
From d8e08729ad5e3dc62f774172f992210902fc0ed4 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Ken Gaillot <kgaillot@redhat.com>
|
||||||
|
Date: Mon, 23 Jan 2023 14:25:56 -0600
|
||||||
|
Subject: [PATCH] High: executor: fix regression in remote node shutdown
|
||||||
|
|
||||||
|
This reverts the essential part of d61494347, which was based on misdiagnosing
|
||||||
|
a remote node shutdown issue. Initially, it was thought that a "TLS server
|
||||||
|
session ended" log just after a remote node requested shutdown indicated that
|
||||||
|
the proxy connection coincidentally dropped at that moment. It actually is the
|
||||||
|
routine stopping of accepting new proxy connections, and exiting when that
|
||||||
|
happens makes the remote node exit immediately without waiting for the
|
||||||
|
all-clear from the cluster.
|
||||||
|
|
||||||
|
Fixes T361
|
||||||
|
---
|
||||||
|
daemons/execd/pacemaker-execd.c | 19 +------------------
|
||||||
|
daemons/execd/pacemaker-execd.h | 3 +--
|
||||||
|
daemons/execd/remoted_tls.c | 6 +-----
|
||||||
|
3 files changed, 3 insertions(+), 25 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/daemons/execd/pacemaker-execd.c b/daemons/execd/pacemaker-execd.c
|
||||||
|
index db12674f13..491808974a 100644
|
||||||
|
--- a/daemons/execd/pacemaker-execd.c
|
||||||
|
+++ b/daemons/execd/pacemaker-execd.c
|
||||||
|
@@ -1,5 +1,5 @@
|
||||||
|
/*
|
||||||
|
- * Copyright 2012-2022 the Pacemaker project contributors
|
||||||
|
+ * Copyright 2012-2023 the Pacemaker project contributors
|
||||||
|
*
|
||||||
|
* The version control history for this file may have further details.
|
||||||
|
*
|
||||||
|
@@ -305,23 +305,6 @@ lrmd_exit(gpointer data)
|
||||||
|
return FALSE;
|
||||||
|
}
|
||||||
|
|
||||||
|
-/*!
|
||||||
|
- * \internal
|
||||||
|
- * \brief Clean up and exit if shutdown has started
|
||||||
|
- *
|
||||||
|
- * \return Doesn't return
|
||||||
|
- */
|
||||||
|
-void
|
||||||
|
-execd_exit_if_shutting_down(void)
|
||||||
|
-{
|
||||||
|
-#ifdef PCMK__COMPILE_REMOTE
|
||||||
|
- if (shutting_down) {
|
||||||
|
- crm_warn("exit because TLS connection was closed and 'shutting_down' set");
|
||||||
|
- lrmd_exit(NULL);
|
||||||
|
- }
|
||||||
|
-#endif
|
||||||
|
-}
|
||||||
|
-
|
||||||
|
/*!
|
||||||
|
* \internal
|
||||||
|
* \brief Request cluster shutdown if appropriate, otherwise exit immediately
|
||||||
|
diff --git a/daemons/execd/pacemaker-execd.h b/daemons/execd/pacemaker-execd.h
|
||||||
|
index 6646ae29e3..f78e8dcdde 100644
|
||||||
|
--- a/daemons/execd/pacemaker-execd.h
|
||||||
|
+++ b/daemons/execd/pacemaker-execd.h
|
||||||
|
@@ -1,5 +1,5 @@
|
||||||
|
/*
|
||||||
|
- * Copyright 2012-2022 the Pacemaker project contributors
|
||||||
|
+ * Copyright 2012-2023 the Pacemaker project contributors
|
||||||
|
*
|
||||||
|
* The version control history for this file may have further details.
|
||||||
|
*
|
||||||
|
@@ -105,6 +105,5 @@ void remoted_spawn_pidone(int argc, char **argv, char **envp);
|
||||||
|
int process_lrmd_alert_exec(pcmk__client_t *client, uint32_t id,
|
||||||
|
xmlNode *request);
|
||||||
|
void lrmd_drain_alerts(GMainLoop *mloop);
|
||||||
|
-void execd_exit_if_shutting_down(void);
|
||||||
|
|
||||||
|
#endif // PACEMAKER_EXECD__H
|
||||||
|
diff --git a/daemons/execd/remoted_tls.c b/daemons/execd/remoted_tls.c
|
||||||
|
index 6f4b2d0062..c65e3f394d 100644
|
||||||
|
--- a/daemons/execd/remoted_tls.c
|
||||||
|
+++ b/daemons/execd/remoted_tls.c
|
||||||
|
@@ -1,5 +1,5 @@
|
||||||
|
/*
|
||||||
|
- * Copyright 2012-2022 the Pacemaker project contributors
|
||||||
|
+ * Copyright 2012-2023 the Pacemaker project contributors
|
||||||
|
*
|
||||||
|
* The version control history for this file may have further details.
|
||||||
|
*
|
||||||
|
@@ -250,10 +250,6 @@ static void
|
||||||
|
tls_server_dropped(gpointer user_data)
|
||||||
|
{
|
||||||
|
crm_notice("TLS server session ended");
|
||||||
|
- /* If we are in the process of shutting down, then we should actually exit.
|
||||||
|
- * bz#1804259
|
||||||
|
- */
|
||||||
|
- execd_exit_if_shutting_down();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
--
|
||||||
|
2.31.1
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,107 @@
|
|||||||
|
From 45617b727e280cac384a28ae3d96145e066e6197 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Reid Wahl <nrwahl@protonmail.com>
|
||||||
|
Date: Fri, 3 Feb 2023 12:08:57 -0800
|
||||||
|
Subject: [PATCH 01/02] Fix: fencer: Prevent double g_source_remove of op_timer_one
|
||||||
|
|
||||||
|
QE observed a rarely reproducible core dump in the fencer during
|
||||||
|
Pacemaker shutdown, in which we try to g_source_remove() an op timer
|
||||||
|
that's already been removed.
|
||||||
|
|
||||||
|
free_stonith_remote_op_list()
|
||||||
|
-> g_hash_table_destroy()
|
||||||
|
-> g_hash_table_remove_all_nodes()
|
||||||
|
-> clear_remote_op_timers()
|
||||||
|
-> g_source_remove()
|
||||||
|
-> crm_glib_handler()
|
||||||
|
-> "Source ID 190 was not found when attempting to remove it"
|
||||||
|
|
||||||
|
The likely cause is that request_peer_fencing() doesn't set
|
||||||
|
op->op_timer_one to 0 after calling g_source_remove() on it, so if that
|
||||||
|
op is still in the stonith_remote_op_list at shutdown with the same
|
||||||
|
timer, clear_remote_op_timers() tries to remove the source for
|
||||||
|
op_timer_one again.
|
||||||
|
|
||||||
|
There are only five locations that call g_source_remove() on a
|
||||||
|
remote_fencing_op_t timer.
|
||||||
|
* Three of them are in clear_remote_op_timers(), which first 0-checks
|
||||||
|
the timer and then sets it to 0 after g_source_remove().
|
||||||
|
* One is in remote_op_query_timeout(), which does the same.
|
||||||
|
* The last is the one we fix here in request_peer_fencing().
|
||||||
|
|
||||||
|
I don't know all the conditions of QE's test scenario at this point.
|
||||||
|
What I do know:
|
||||||
|
* have-watchdog=true
|
||||||
|
* stonith-watchdog-timeout=10
|
||||||
|
* no explicit topology
|
||||||
|
* fence agent script is missing for the configured fence device
|
||||||
|
* requested fencing of one node
|
||||||
|
* cluster shutdown
|
||||||
|
|
||||||
|
Fixes RHBZ#2166967
|
||||||
|
|
||||||
|
Signed-off-by: Reid Wahl <nrwahl@protonmail.com>
|
||||||
|
---
|
||||||
|
daemons/fenced/fenced_remote.c | 1 +
|
||||||
|
1 file changed, 1 insertion(+)
|
||||||
|
|
||||||
|
diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c
|
||||||
|
index d61b5bd..b7426ff 100644
|
||||||
|
--- a/daemons/fenced/fenced_remote.c
|
||||||
|
+++ b/daemons/fenced/fenced_remote.c
|
||||||
|
@@ -1825,6 +1825,7 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer)
|
||||||
|
op->state = st_exec;
|
||||||
|
if (op->op_timer_one) {
|
||||||
|
g_source_remove(op->op_timer_one);
|
||||||
|
+ op->op_timer_one = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!((stonith_watchdog_timeout_ms > 0)
|
||||||
|
--
|
||||||
|
2.31.1
|
||||||
|
|
||||||
|
From 0291db4750322ec7f01ae6a4a2a30abca9d8e19e Mon Sep 17 00:00:00 2001
|
||||||
|
From: Reid Wahl <nrwahl@protonmail.com>
|
||||||
|
Date: Wed, 15 Feb 2023 22:30:27 -0800
|
||||||
|
Subject: [PATCH 02/02] Fix: fencer: Avoid double source remove of op_timer_total
|
||||||
|
|
||||||
|
remote_op_timeout() returns G_SOURCE_REMOVE, which tells GLib to remove
|
||||||
|
the source from the main loop after returning. Currently this function
|
||||||
|
is used as the callback only when creating op->op_timer_total.
|
||||||
|
|
||||||
|
If we don't set op->op_timer_total to 0 before returning from
|
||||||
|
remote_op_timeout(), then we can get an assertion and core dump from
|
||||||
|
GLib when the op's timers are being cleared (either during op
|
||||||
|
finalization or during fencer shutdown). This is because
|
||||||
|
clear_remote_op_timers() sees that op->op_timer_total != 0 and tries to
|
||||||
|
remove the source, but the source has already been removed.
|
||||||
|
|
||||||
|
Note that we're already (correctly) zeroing op->op_timer_one and
|
||||||
|
op->query_timeout as appropriate in their respective callback functions.
|
||||||
|
|
||||||
|
Fortunately, GLib doesn't care whether the source has already been
|
||||||
|
removed before we return G_SOURCE_REMOVE from a callback. So it's safe
|
||||||
|
to call finalize_op() (which removes all the op's timer sources) from
|
||||||
|
within a callback.
|
||||||
|
|
||||||
|
Fixes RHBZ#2166967
|
||||||
|
|
||||||
|
Signed-off-by: Reid Wahl <nrwahl@protonmail.com>
|
||||||
|
---
|
||||||
|
daemons/fenced/fenced_remote.c | 2 ++
|
||||||
|
1 file changed, 2 insertions(+)
|
||||||
|
|
||||||
|
diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c
|
||||||
|
index b7426ff88..adea3d7d8 100644
|
||||||
|
--- a/daemons/fenced/fenced_remote.c
|
||||||
|
+++ b/daemons/fenced/fenced_remote.c
|
||||||
|
@@ -718,6 +718,8 @@ remote_op_timeout(gpointer userdata)
|
||||||
|
{
|
||||||
|
remote_fencing_op_t *op = userdata;
|
||||||
|
|
||||||
|
+ op->op_timer_total = 0;
|
||||||
|
+
|
||||||
|
if (op->state == st_done) {
|
||||||
|
crm_debug("Action '%s' targeting %s for client %s already completed "
|
||||||
|
CRM_XS " id=%.8s",
|
||||||
|
--
|
||||||
|
2.39.0
|
@ -0,0 +1,151 @@
|
|||||||
|
From 0d15568a538349ac41028db6b506d13dd23e8732 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Chris Lumens <clumens@redhat.com>
|
||||||
|
Date: Tue, 14 Feb 2023 14:00:37 -0500
|
||||||
|
Subject: [PATCH] High: libcrmcommon: Fix handling node=NULL in
|
||||||
|
pcmk__attrd_api_query.
|
||||||
|
|
||||||
|
According to the header file, if node is NULL, pcmk__attrd_api_query
|
||||||
|
should query the value of the given attribute on all cluster nodes.
|
||||||
|
This is also what the server expects and how attrd_updater is supposed
|
||||||
|
to work.
|
||||||
|
|
||||||
|
However, pcmk__attrd_api_query has no way of letting callers decide
|
||||||
|
whether they want to query all nodes or whether they want to use the
|
||||||
|
local node. We were passing NULL for the node name, which it took to
|
||||||
|
mean it should look up the local node name. This calls
|
||||||
|
pcmk__node_attr_target, which probes the local cluster node name and returns
|
||||||
|
that to pcmk__attrd_api_query. If it returns non-NULL, that value will
|
||||||
|
then be put into the XML IPC call which means the server will only
|
||||||
|
return the value for that node.
|
||||||
|
|
||||||
|
In testing this was usually fine. However, in practice, the methods
|
||||||
|
pcmk__node_attr_target uses to figure out the local cluster node name
|
||||||
|
involve checking the OCF_RESKEY_CRM_meta_on_node environment variable
|
||||||
|
among others.
|
||||||
|
|
||||||
|
This variable was never set in testing, but can be set in the real
|
||||||
|
world. This leads to circumstances where the user did "attrd_updater -QA"
|
||||||
|
expecting to get the values on all nodes, but instead only got the value
|
||||||
|
on the local cluster node.
|
||||||
|
|
||||||
|
In pacemaker-2.1.4 and prior, pcmk__node_attr_target was simply never
|
||||||
|
called if the node was NULL but was called otherwise.
|
||||||
|
|
||||||
|
The fix is to modify pcmk__attrd_api_query to take an option for
|
||||||
|
querying all nodes. If that's present, we'll query all nodes. If it's
|
||||||
|
not present, we'll look at the given node name - NULL means look it up,
|
||||||
|
anything else means just that node.
|
||||||
|
|
||||||
|
Regression in 2.1.5 introduced by eb20a65577
|
||||||
|
---
|
||||||
|
include/crm/common/attrd_internal.h | 6 +++++-
|
||||||
|
include/crm/common/ipc_attrd_internal.h | 7 +++++--
|
||||||
|
lib/common/ipc_attrd.c | 12 ++++++++----
|
||||||
|
tools/attrd_updater.c | 5 +++--
|
||||||
|
4 files changed, 21 insertions(+), 9 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/include/crm/common/attrd_internal.h b/include/crm/common/attrd_internal.h
|
||||||
|
index 389be48..7337c38 100644
|
||||||
|
--- a/include/crm/common/attrd_internal.h
|
||||||
|
+++ b/include/crm/common/attrd_internal.h
|
||||||
|
@@ -1,5 +1,5 @@
|
||||||
|
/*
|
||||||
|
- * Copyright 2004-2022 the Pacemaker project contributors
|
||||||
|
+ * Copyright 2004-2023 the Pacemaker project contributors
|
||||||
|
*
|
||||||
|
* The version control history for this file may have further details.
|
||||||
|
*
|
||||||
|
@@ -25,6 +25,10 @@ enum pcmk__node_attr_opts {
|
||||||
|
pcmk__node_attr_perm = (1 << 5),
|
||||||
|
pcmk__node_attr_sync_local = (1 << 6),
|
||||||
|
pcmk__node_attr_sync_cluster = (1 << 7),
|
||||||
|
+ // pcmk__node_attr_utilization is 8, but that has not been backported.
|
||||||
|
+ // I'm leaving the gap here in case we backport that in the future and
|
||||||
|
+ // also to avoid problems on mixed-version clusters.
|
||||||
|
+ pcmk__node_attr_query_all = (1 << 9),
|
||||||
|
};
|
||||||
|
|
||||||
|
#define pcmk__set_node_attr_flags(node_attr_flags, flags_to_set) do { \
|
||||||
|
diff --git a/include/crm/common/ipc_attrd_internal.h b/include/crm/common/ipc_attrd_internal.h
|
||||||
|
index 2c6713f..b1b7584 100644
|
||||||
|
--- a/include/crm/common/ipc_attrd_internal.h
|
||||||
|
+++ b/include/crm/common/ipc_attrd_internal.h
|
||||||
|
@@ -1,5 +1,5 @@
|
||||||
|
/*
|
||||||
|
- * Copyright 2022 the Pacemaker project contributors
|
||||||
|
+ * Copyright 2022-2023 the Pacemaker project contributors
|
||||||
|
*
|
||||||
|
* The version control history for this file may have further details.
|
||||||
|
*
|
||||||
|
@@ -110,10 +110,13 @@ int pcmk__attrd_api_purge(pcmk_ipc_api_t *api, const char *node);
|
||||||
|
*
|
||||||
|
* \param[in,out] api Connection to pacemaker-attrd
|
||||||
|
* \param[in] node Look up the attribute for this node
|
||||||
|
- * (or NULL for all nodes)
|
||||||
|
+ * (or NULL for the local node)
|
||||||
|
* \param[in] name Attribute name
|
||||||
|
* \param[in] options Bitmask of pcmk__node_attr_opts
|
||||||
|
*
|
||||||
|
+ * \note Passing pcmk__node_attr_query_all will cause the function to query
|
||||||
|
+ * the value of \p name on all nodes, regardless of the value of \p node.
|
||||||
|
+ *
|
||||||
|
* \return Standard Pacemaker return code
|
||||||
|
*/
|
||||||
|
int pcmk__attrd_api_query(pcmk_ipc_api_t *api, const char *node, const char *name,
|
||||||
|
diff --git a/lib/common/ipc_attrd.c b/lib/common/ipc_attrd.c
|
||||||
|
index 4606509..dece49b 100644
|
||||||
|
--- a/lib/common/ipc_attrd.c
|
||||||
|
+++ b/lib/common/ipc_attrd.c
|
||||||
|
@@ -1,5 +1,5 @@
|
||||||
|
/*
|
||||||
|
- * Copyright 2011-2022 the Pacemaker project contributors
|
||||||
|
+ * Copyright 2011-2023 the Pacemaker project contributors
|
||||||
|
*
|
||||||
|
* The version control history for this file may have further details.
|
||||||
|
*
|
||||||
|
@@ -332,10 +332,14 @@ pcmk__attrd_api_query(pcmk_ipc_api_t *api, const char *node, const char *name,
|
||||||
|
return EINVAL;
|
||||||
|
}
|
||||||
|
|
||||||
|
- target = pcmk__node_attr_target(node);
|
||||||
|
+ if (pcmk_is_set(options, pcmk__node_attr_query_all)) {
|
||||||
|
+ node = NULL;
|
||||||
|
+ } else {
|
||||||
|
+ target = pcmk__node_attr_target(node);
|
||||||
|
|
||||||
|
- if (target != NULL) {
|
||||||
|
- node = target;
|
||||||
|
+ if (target != NULL) {
|
||||||
|
+ node = target;
|
||||||
|
+ }
|
||||||
|
}
|
||||||
|
|
||||||
|
request = create_attrd_op(NULL);
|
||||||
|
diff --git a/tools/attrd_updater.c b/tools/attrd_updater.c
|
||||||
|
index 3cd766d..cbd341d 100644
|
||||||
|
--- a/tools/attrd_updater.c
|
||||||
|
+++ b/tools/attrd_updater.c
|
||||||
|
@@ -376,6 +376,7 @@ attrd_event_cb(pcmk_ipc_api_t *attrd_api, enum pcmk_ipc_event event_type,
|
||||||
|
static int
|
||||||
|
send_attrd_query(pcmk__output_t *out, const char *attr_name, const char *attr_node, gboolean query_all)
|
||||||
|
{
|
||||||
|
+ uint32_t options = pcmk__node_attr_none;
|
||||||
|
pcmk_ipc_api_t *attrd_api = NULL;
|
||||||
|
int rc = pcmk_rc_ok;
|
||||||
|
|
||||||
|
@@ -400,10 +401,10 @@ send_attrd_query(pcmk__output_t *out, const char *attr_name, const char *attr_no
|
||||||
|
|
||||||
|
/* Decide which node(s) to query */
|
||||||
|
if (query_all == TRUE) {
|
||||||
|
- attr_node = NULL;
|
||||||
|
+ options |= pcmk__node_attr_query_all;
|
||||||
|
}
|
||||||
|
|
||||||
|
- rc = pcmk__attrd_api_query(attrd_api, attr_node, attr_name, 0);
|
||||||
|
+ rc = pcmk__attrd_api_query(attrd_api, attr_node, attr_name, options);
|
||||||
|
|
||||||
|
if (rc != pcmk_rc_ok) {
|
||||||
|
g_set_error(&error, PCMK__RC_ERROR, rc, "Could not query value of %s: %s (%d)",
|
||||||
|
--
|
||||||
|
2.31.1
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in new issue