You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1567 lines
46 KiB
1567 lines
46 KiB
From 6045e383f65432084cd07032eb5515cb8231dc04 Mon Sep 17 00:00:00 2001
|
|
From: Hideo Yamauchi <renayama19661014@ybb.ne.jp>
|
|
Date: Mon, 24 Jul 2023 06:46:23 +0900
|
|
Subject: [PATCH 1/4] Mid: storage-mon: Functionalization of test_device call
|
|
processing.
|
|
|
|
---
|
|
tools/storage_mon.c | 141 +++++++++++++++++++++++---------------------
|
|
1 file changed, 75 insertions(+), 66 deletions(-)
|
|
|
|
diff --git a/tools/storage_mon.c b/tools/storage_mon.c
|
|
index f829c50814..b0e277cbe0 100644
|
|
--- a/tools/storage_mon.c
|
|
+++ b/tools/storage_mon.c
|
|
@@ -146,18 +146,87 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
|
|
exit(-1);
|
|
}
|
|
|
|
+static int test_device_main(size_t device_count, char *devices[MAX_DEVICES], int scores[MAX_DEVICES], int verbose, int inject_error_percent, int timeout)
|
|
+{
|
|
+ pid_t test_forks[MAX_DEVICES];
|
|
+ size_t i;
|
|
+ struct timespec ts;
|
|
+ time_t start_time;
|
|
+ size_t finished_count = 0;
|
|
+ int final_score = 0;
|
|
+
|
|
+ memset(test_forks, 0, sizeof(test_forks));
|
|
+ for (i=0; i<device_count; i++) {
|
|
+ test_forks[i] = fork();
|
|
+ if (test_forks[i] < 0) {
|
|
+ fprintf(stderr, "Error spawning fork for %s: %s\n", devices[i], strerror(errno));
|
|
+ syslog(LOG_ERR, "Error spawning fork for %s: %s\n", devices[i], strerror(errno));
|
|
+ /* Just test the devices we have */
|
|
+ break;
|
|
+ }
|
|
+ /* child */
|
|
+ if (test_forks[i] == 0) {
|
|
+ test_device(devices[i], verbose, inject_error_percent);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* See if they have finished */
|
|
+ clock_gettime(CLOCK_REALTIME, &ts);
|
|
+ start_time = ts.tv_sec;
|
|
+
|
|
+ while ((finished_count < device_count) && ((start_time + timeout) > ts.tv_sec)) {
|
|
+ for (i=0; i<device_count; i++) {
|
|
+ int wstatus;
|
|
+ pid_t w;
|
|
+
|
|
+ if (test_forks[i] > 0) {
|
|
+ w = waitpid(test_forks[i], &wstatus, WUNTRACED | WNOHANG | WCONTINUED);
|
|
+ if (w < 0) {
|
|
+ fprintf(stderr, "waitpid on %s failed: %s\n", devices[i], strerror(errno));
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ if (w == test_forks[i]) {
|
|
+ if (WIFEXITED(wstatus)) {
|
|
+ if (WEXITSTATUS(wstatus) != 0) {
|
|
+ syslog(LOG_ERR, "Error reading from device %s", devices[i]);
|
|
+ final_score += scores[i];
|
|
+ }
|
|
+
|
|
+ finished_count++;
|
|
+ test_forks[i] = 0;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ usleep(100000);
|
|
+
|
|
+ clock_gettime(CLOCK_REALTIME, &ts);
|
|
+ }
|
|
+
|
|
+ /* See which threads have not finished */
|
|
+ for (i=0; i<device_count; i++) {
|
|
+ if (test_forks[i] != 0) {
|
|
+ syslog(LOG_ERR, "Reading from device %s did not complete in %d seconds timeout", devices[i], timeout);
|
|
+ fprintf(stderr, "Thread for device %s did not complete in time\n", devices[i]);
|
|
+ final_score += scores[i];
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (verbose) {
|
|
+ printf("Final score is %d\n", final_score);
|
|
+ }
|
|
+ return final_score;
|
|
+}
|
|
+
|
|
int main(int argc, char *argv[])
|
|
{
|
|
char *devices[MAX_DEVICES];
|
|
int scores[MAX_DEVICES];
|
|
- pid_t test_forks[MAX_DEVICES];
|
|
size_t device_count = 0;
|
|
size_t score_count = 0;
|
|
- size_t finished_count = 0;
|
|
int timeout = DEFAULT_TIMEOUT;
|
|
- struct timespec ts;
|
|
- time_t start_time;
|
|
- size_t i;
|
|
int final_score = 0;
|
|
int opt, option_index;
|
|
int verbose = 0;
|
|
@@ -237,67 +306,7 @@ int main(int argc, char *argv[])
|
|
|
|
openlog("storage_mon", 0, LOG_DAEMON);
|
|
|
|
- memset(test_forks, 0, sizeof(test_forks));
|
|
- for (i=0; i<device_count; i++) {
|
|
- test_forks[i] = fork();
|
|
- if (test_forks[i] < 0) {
|
|
- fprintf(stderr, "Error spawning fork for %s: %s\n", devices[i], strerror(errno));
|
|
- syslog(LOG_ERR, "Error spawning fork for %s: %s\n", devices[i], strerror(errno));
|
|
- /* Just test the devices we have */
|
|
- break;
|
|
- }
|
|
- /* child */
|
|
- if (test_forks[i] == 0) {
|
|
- test_device(devices[i], verbose, inject_error_percent);
|
|
- }
|
|
- }
|
|
|
|
- /* See if they have finished */
|
|
- clock_gettime(CLOCK_REALTIME, &ts);
|
|
- start_time = ts.tv_sec;
|
|
-
|
|
- while ((finished_count < device_count) && ((start_time + timeout) > ts.tv_sec)) {
|
|
- for (i=0; i<device_count; i++) {
|
|
- int wstatus;
|
|
- pid_t w;
|
|
-
|
|
- if (test_forks[i] > 0) {
|
|
- w = waitpid(test_forks[i], &wstatus, WUNTRACED | WNOHANG | WCONTINUED);
|
|
- if (w < 0) {
|
|
- fprintf(stderr, "waitpid on %s failed: %s\n", devices[i], strerror(errno));
|
|
- return -1;
|
|
- }
|
|
-
|
|
- if (w == test_forks[i]) {
|
|
- if (WIFEXITED(wstatus)) {
|
|
- if (WEXITSTATUS(wstatus) != 0) {
|
|
- syslog(LOG_ERR, "Error reading from device %s", devices[i]);
|
|
- final_score += scores[i];
|
|
- }
|
|
-
|
|
- finished_count++;
|
|
- test_forks[i] = 0;
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- usleep(100000);
|
|
-
|
|
- clock_gettime(CLOCK_REALTIME, &ts);
|
|
- }
|
|
-
|
|
- /* See which threads have not finished */
|
|
- for (i=0; i<device_count; i++) {
|
|
- if (test_forks[i] != 0) {
|
|
- syslog(LOG_ERR, "Reading from device %s did not complete in %d seconds timeout", devices[i], timeout);
|
|
- fprintf(stderr, "Thread for device %s did not complete in time\n", devices[i]);
|
|
- final_score += scores[i];
|
|
- }
|
|
- }
|
|
-
|
|
- if (verbose) {
|
|
- printf("Final score is %d\n", final_score);
|
|
- }
|
|
+ final_score = test_device_main(device_count, devices, scores, verbose, inject_error_percent, timeout);
|
|
return final_score;
|
|
}
|
|
|
|
From 437162be482462047502b4098d7d2c1328d453a4 Mon Sep 17 00:00:00 2001
|
|
From: Hideo Yamauchi <renayama19661014@ybb.ne.jp>
|
|
Date: Mon, 24 Jul 2023 06:47:20 +0900
|
|
Subject: [PATCH 2/4] Mid: storage-mon: Added daemon/client mode.
|
|
|
|
---
|
|
configure.ac | 1 +
|
|
heartbeat/storage-mon.in | 222 +++++++++---
|
|
resource-agents.spec.in | 2 +-
|
|
tools/Makefile.am | 3 +-
|
|
tools/storage_mon.c | 724 +++++++++++++++++++++++++++++++++++----
|
|
5 files changed, 828 insertions(+), 124 deletions(-)
|
|
|
|
diff --git a/configure.ac b/configure.ac
|
|
index 7b5faff584..74766899b8 100644
|
|
--- a/configure.ac
|
|
+++ b/configure.ac
|
|
@@ -620,6 +620,7 @@ fi
|
|
PKG_CHECK_MODULES([GLIB], [$GPKGNAME])
|
|
CPPFLAGS="$CPPFLAGS $GLIB_CFLAGS"
|
|
LIBS="$LIBS $GLIB_LIBS"
|
|
+PKG_CHECK_MODULES([LIBQB], "libqb")
|
|
|
|
dnl ========================================================================
|
|
dnl Headers
|
|
diff --git a/heartbeat/storage-mon.in b/heartbeat/storage-mon.in
|
|
index d764b49d7c..81d8f5bcec 100644
|
|
--- a/heartbeat/storage-mon.in
|
|
+++ b/heartbeat/storage-mon.in
|
|
@@ -48,20 +48,26 @@
|
|
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
|
|
|
|
#
|
|
-STORAGEMON=$HA_BIN/storage_mon
|
|
-ATTRDUP=/usr/sbin/attrd_updater
|
|
+STORAGEMON=${HA_BIN}/storage_mon
|
|
+ATTRDUP=${HA_SBIN_DIR}/attrd_updater
|
|
+PIDFILE=${HA_VARRUN}/storage-mon-${OCF_RESOURCE_INSTANCE}.pid
|
|
+ATTRNAME="#health-${OCF_RESOURCE_INSTANCE}"
|
|
|
|
OCF_RESKEY_CRM_meta_interval_default="0"
|
|
OCF_RESKEY_io_timeout_default="10"
|
|
+OCF_RESKEY_check_interval_default="30"
|
|
OCF_RESKEY_inject_errors_default=""
|
|
OCF_RESKEY_state_file_default="${HA_RSCTMP%%/}/storage-mon-${OCF_RESOURCE_INSTANCE}.state"
|
|
+OCF_RESKEY_daemonize_default=""
|
|
|
|
# Explicitly list all environment variables used, to make static analysis happy
|
|
: ${OCF_RESKEY_CRM_meta_interval:=${OCF_RESKEY_CRM_meta_interval_default}}
|
|
: ${OCF_RESKEY_drives:=""}
|
|
: ${OCF_RESKEY_io_timeout:=${OCF_RESKEY_io_timeout_default}}
|
|
+: ${OCF_RESKEY_check_interval:=${OCF_RESKEY_check_interval_default}}
|
|
: ${OCF_RESKEY_inject_errors:=${OCF_RESKEY_inject_errors_default}}
|
|
: ${OCF_RESKEY_state_file:=${OCF_RESKEY_state_file_default}}
|
|
+: ${OCF_RESKEY_daemonize:=${OCF_RESKEY_daemonize_default}}
|
|
|
|
#######################################################################
|
|
|
|
@@ -106,6 +112,14 @@ Specify disk I/O timeout in seconds. Minimum 1, recommended 10 (default).
|
|
<content type="integer" default="${OCF_RESKEY_io_timeout_default}" />
|
|
</parameter>
|
|
|
|
+<parameter name="check_interval" unique="0">
|
|
+<longdesc lang="en">
|
|
+Specify interval between I/O checks in seconds.(Only supported with the damonize option.)
|
|
+</longdesc>
|
|
+<shortdesc lang="en">I/O check interval</shortdesc>
|
|
+<content type="integer" default="${OCF_RESKEY_check_interval_default}" />
|
|
+</parameter>
|
|
+
|
|
<parameter name="inject_errors" unique="0">
|
|
<longdesc lang="en">
|
|
Used only for testing! Specify % of I/O errors to simulate drives failures.
|
|
@@ -114,6 +128,14 @@ Used only for testing! Specify % of I/O errors to simulate drives failures.
|
|
<content type="integer" default="${OCF_RESKEY_inject_errors_default}" />
|
|
</parameter>
|
|
|
|
+<parameter name="daemonize" unique="0">
|
|
+<longdesc lang="en">
|
|
+Specifies to start storage-mon as a daemon and check for devices.
|
|
+</longdesc>
|
|
+<shortdesc lang="en">start storage-mon with daemon</shortdesc>
|
|
+<content type="string" default="" />
|
|
+</parameter>
|
|
+
|
|
</parameters>
|
|
|
|
<actions>
|
|
@@ -146,6 +168,11 @@ storage-mon_init() {
|
|
exit $OCF_ERR_INSTALLED
|
|
fi
|
|
|
|
+ if [ ! -x "$ATTRDUP" ] ; then
|
|
+ ocf_log err "${ATTRDUP} not installed."
|
|
+ exit $OCF_ERR_INSTALLED
|
|
+ fi
|
|
+
|
|
i=0
|
|
for DRIVE in ${OCF_RESKEY_drives}; do
|
|
if [ ! -e "$DRIVE" ] ; then
|
|
@@ -161,7 +188,12 @@ storage-mon_init() {
|
|
fi
|
|
|
|
if [ "${OCF_RESKEY_io_timeout}" -lt "1" ]; then
|
|
- ocf_log err "Minimum timeout is 1. Recommended 10 (default)."
|
|
+ ocf_log err "Minimum timeout is 1. Recommended ${OCF_RESKEY_io_timeout_default} (default)."
|
|
+ exit $OCF_ERR_CONFIGURED
|
|
+ fi
|
|
+
|
|
+ if [ "${OCF_RESKEY_check_interval}" -lt "1" ]; then
|
|
+ ocf_log err "Minimum interval to check is 1. default ${OCF_RESKEY_check_interval_default}."
|
|
exit $OCF_ERR_CONFIGURED
|
|
fi
|
|
|
|
@@ -173,63 +205,147 @@ storage-mon_init() {
|
|
fi
|
|
}
|
|
|
|
-storage-mon_validate() {
|
|
- storage-mon_init
|
|
-
|
|
- # Is the state directory writable?
|
|
- state_dir=$(dirname "$OCF_RESKEY_state_file")
|
|
- touch "$state_dir/$$"
|
|
- if [ $? -ne 0 ]; then
|
|
- return $OCF_ERR_CONFIGURED
|
|
- fi
|
|
- rm "$state_dir/$$"
|
|
-
|
|
- return $OCF_SUCCESS
|
|
-}
|
|
-
|
|
storage-mon_monitor() {
|
|
- storage-mon_init
|
|
+ if [ -z "$OCF_RESKEY_daemonize" ]; then
|
|
+ storage-mon_init
|
|
|
|
- # Monitor _MUST!_ differentiate correctly between running
|
|
- # (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING).
|
|
- # That is THREE states, not just yes/no.
|
|
+ # Monitor _MUST!_ differentiate correctly between running
|
|
+ # (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING).
|
|
+ # That is THREE states, not just yes/no.
|
|
|
|
- if [ ! -f "${OCF_RESKEY_state_file}" ]; then
|
|
- return $OCF_NOT_RUNNING
|
|
- fi
|
|
+ if [ ! -f "${OCF_RESKEY_state_file}" ]; then
|
|
+ return $OCF_NOT_RUNNING
|
|
+ fi
|
|
|
|
- # generate command line
|
|
- cmdline=""
|
|
- for DRIVE in ${OCF_RESKEY_drives}; do
|
|
- cmdline="$cmdline --device $DRIVE --score 1"
|
|
- done
|
|
- cmdline="$cmdline --timeout ${OCF_RESKEY_io_timeout}"
|
|
- if [ -n "${OCF_RESKEY_inject_errors}" ]; then
|
|
- cmdline="$cmdline --inject-errors-percent ${OCF_RESKEY_inject_errors}"
|
|
- fi
|
|
- $STORAGEMON $cmdline
|
|
- if [ $? -ne 0 ]; then
|
|
- status="red"
|
|
+ # generate command line
|
|
+ cmdline=""
|
|
+ for DRIVE in ${OCF_RESKEY_drives}; do
|
|
+ cmdline="$cmdline --device $DRIVE --score 1"
|
|
+ done
|
|
+ cmdline="$cmdline --timeout ${OCF_RESKEY_io_timeout}"
|
|
+ if [ -n "${OCF_RESKEY_inject_errors}" ]; then
|
|
+ cmdline="$cmdline --inject-errors-percent ${OCF_RESKEY_inject_errors}"
|
|
+ fi
|
|
+ $STORAGEMON $cmdline
|
|
+ if [ $? -ne 0 ]; then
|
|
+ status="red"
|
|
+ else
|
|
+ status="green"
|
|
+ fi
|
|
+
|
|
+ "$ATTRDUP" -n ${ATTRNAME} -U "$status" -d "5s"
|
|
+ return $OCF_SUCCESS
|
|
else
|
|
- status="green"
|
|
- fi
|
|
+ ocf_pidfile_status "${PIDFILE}" > /dev/null 2>&1
|
|
+ case "$?" in
|
|
+ 0) rc=$OCF_SUCCESS;;
|
|
+ 1|2) rc=$OCF_NOT_RUNNING;;
|
|
+ *) rc=$OCF_ERR_GENERIC;;
|
|
+ esac
|
|
+
|
|
+ if [ $rc -ne $OCF_SUCCESS ]; then
|
|
+ return "$rc"
|
|
+ fi
|
|
+ if [ "$1" = "pid_check_only" ]; then
|
|
+ return "$rc"
|
|
+ fi
|
|
|
|
- "$ATTRDUP" -n "#health-${OCF_RESOURCE_INSTANCE}" -U "$status" -d "5s"
|
|
- return $OCF_SUCCESS
|
|
+ # generate client command line
|
|
+ cmdline=""
|
|
+ cmdline="$cmdline --client --attrname ${ATTRNAME}"
|
|
+ while :
|
|
+ do
|
|
+ # 0 : Normal.
|
|
+ # greater than 0 : monitoring error.
|
|
+ # 255(-1) : communication system error.
|
|
+ # 254(-2) : The initial check of all devices has not completed yet in daemon mode.
|
|
+ $STORAGEMON $cmdline
|
|
+ rc=$?
|
|
+ case "$rc" in
|
|
+ 254|255)
|
|
+ # If there is a communication error or the initial check of all devices has not been completed,
|
|
+ # it will loop and try to reconnect.
|
|
+ # If every attempt keeps ending with a communication error, the monitor operation eventually times out.
|
|
+ ocf_log debug "client monitor error : $rc"
|
|
+ ;;
|
|
+ 0)
|
|
+ status="green"
|
|
+ break
|
|
+ ;;
|
|
+ *)
|
|
+ status="red"
|
|
+ break
|
|
+ ;;
|
|
+ esac
|
|
+ done
|
|
+
|
|
+ "$ATTRDUP" -n ${ATTRNAME} -U "$status" -d "5s"
|
|
+ return $OCF_SUCCESS
|
|
+ fi
|
|
}
|
|
|
|
storage-mon_start() {
|
|
- storage-mon_monitor
|
|
- if [ $? -eq $OCF_SUCCESS ]; then
|
|
- return $OCF_SUCCESS
|
|
+ if [ -z "$OCF_RESKEY_daemonize" ]; then
|
|
+ storage-mon_monitor
|
|
+ if [ $? -eq $OCF_SUCCESS ]; then
|
|
+ return $OCF_SUCCESS
|
|
+ fi
|
|
+ touch "${OCF_RESKEY_state_file}"
|
|
+ else
|
|
+ storage-mon_init
|
|
+ # generate command line
|
|
+ cmdline=""
|
|
+ for DRIVE in ${OCF_RESKEY_drives}; do
|
|
+ cmdline="$cmdline --device $DRIVE --score 1"
|
|
+ done
|
|
+ #cmdline="$cmdline --daemonize --timeout ${OCF_RESKEY_io_timeout} --interval ${OCF_RESKEY_check_interval} --pidfile ${PIDFILE} --attrname ${ATTRNAME} --ha-sbin-dir ${HA_SBIN_DIR}"
|
|
+ cmdline="$cmdline --daemonize --timeout ${OCF_RESKEY_io_timeout} --interval ${OCF_RESKEY_check_interval} --pidfile ${PIDFILE} --attrname ${ATTRNAME}"
|
|
+ if [ -n "${OCF_RESKEY_inject_errors}" ]; then
|
|
+ cmdline="$cmdline --inject-errors-percent ${OCF_RESKEY_inject_errors}"
|
|
+ fi
|
|
+ $STORAGEMON $cmdline
|
|
+ if [ "$?" -ne 0 ]; then
|
|
+ return $OCF_ERR_GENERIC
|
|
+ fi
|
|
fi
|
|
- touch "${OCF_RESKEY_state_file}"
|
|
}
|
|
|
|
storage-mon_stop() {
|
|
storage-mon_monitor
|
|
- if [ $? -eq $OCF_SUCCESS ]; then
|
|
- rm "${OCF_RESKEY_state_file}"
|
|
+ rc=$?
|
|
+
|
|
+ if [ -z "$OCF_RESKEY_daemonize" ]; then
|
|
+ if [ $rc -eq $OCF_SUCCESS ]; then
|
|
+ rm "${OCF_RESKEY_state_file}"
|
|
+ fi
|
|
+ else
|
|
+ case "$rc" in
|
|
+ $OCF_SUCCESS)
|
|
+ ;;
|
|
+ $OCF_NOT_RUNNING)
|
|
+ return "$OCF_SUCCESS";;
|
|
+ *)
|
|
+ return "$rc";;
|
|
+ esac
|
|
+
|
|
+ kill -TERM $(cat "${PIDFILE}")
|
|
+ if [ "$?" -ne 0 ]; then
|
|
+ return $OCF_ERR_GENERIC
|
|
+ fi
|
|
+
|
|
+ while true; do
|
|
+ storage-mon_monitor pid_check_only
|
|
+ rc="$?"
|
|
+ case "$rc" in
|
|
+ $OCF_SUCCESS)
|
|
+ ;;
|
|
+ $OCF_NOT_RUNNING)
|
|
+ return "$OCF_SUCCESS";;
|
|
+ *)
|
|
+ return "$rc";;
|
|
+ esac
|
|
+ sleep 1
|
|
+ done
|
|
fi
|
|
return $OCF_SUCCESS
|
|
}
|
|
@@ -237,13 +353,15 @@ storage-mon_stop() {
|
|
storage-mon_validate() {
|
|
storage-mon_init
|
|
|
|
- # Is the state directory writable?
|
|
- state_dir=$(dirname "${OCF_RESKEY_state_file}")
|
|
- touch "$state_dir/$$"
|
|
- if [ $? -ne 0 ]; then
|
|
- return $OCF_ERR_CONFIGURED
|
|
+ if [ -z "$OCF_RESKEY_daemonize" ]; then
|
|
+ # Is the state directory writable?
|
|
+ state_dir=$(dirname "${OCF_RESKEY_state_file}")
|
|
+ touch "$state_dir/$$"
|
|
+ if [ $? -ne 0 ]; then
|
|
+ return $OCF_ERR_CONFIGURED
|
|
+ fi
|
|
+ rm "$state_dir/$$"
|
|
fi
|
|
- rm "$state_dir/$$"
|
|
|
|
return $OCF_SUCCESS
|
|
}
|
|
diff --git a/resource-agents.spec.in b/resource-agents.spec.in
|
|
index 2ffa00d946..1cbf28c033 100644
|
|
--- a/resource-agents.spec.in
|
|
+++ b/resource-agents.spec.in
|
|
@@ -55,7 +55,7 @@ Provides: heartbeat-resources = %{version}
|
|
BuildRequires: make
|
|
BuildRequires: automake autoconf pkgconfig gcc
|
|
BuildRequires: perl
|
|
-BuildRequires: libxslt glib2-devel
|
|
+BuildRequires: libxslt glib2-devel libqb-devel
|
|
BuildRequires: systemd
|
|
BuildRequires: which
|
|
|
|
diff --git a/tools/Makefile.am b/tools/Makefile.am
|
|
index 08323fee3a..55e292cec5 100644
|
|
--- a/tools/Makefile.am
|
|
+++ b/tools/Makefile.am
|
|
@@ -74,7 +74,8 @@ sfex_stat_LDADD = $(GLIBLIB) -lplumb -lplumbgpl
|
|
findif_SOURCES = findif.c
|
|
|
|
storage_mon_SOURCES = storage_mon.c
|
|
-storage_mon_CFLAGS = -D_GNU_SOURCE
|
|
+storage_mon_CFLAGS = -D_GNU_SOURCE ${LIBQB_CFLAGS}
|
|
+storage_mon_LDADD = ${LIBQB_LIBS}
|
|
|
|
if BUILD_TICKLE
|
|
halib_PROGRAMS += tickle_tcp
|
|
diff --git a/tools/storage_mon.c b/tools/storage_mon.c
|
|
index b0e277cbe0..1231570c85 100644
|
|
--- a/tools/storage_mon.c
|
|
+++ b/tools/storage_mon.c
|
|
@@ -16,9 +16,87 @@
|
|
#ifdef __FreeBSD__
|
|
#include <sys/disk.h>
|
|
#endif
|
|
+#include <config.h>
|
|
+#include <glib.h>
|
|
+#include <libgen.h>
|
|
+
|
|
+#include <qb/qbdefs.h>
|
|
+#include <qb/qblog.h>
|
|
+#include <qb/qbloop.h>
|
|
+#include <qb/qbutil.h>
|
|
+#include <qb/qbipcs.h>
|
|
+#include <qb/qbipcc.h>
|
|
|
|
#define MAX_DEVICES 25
|
|
#define DEFAULT_TIMEOUT 10
|
|
+#define DEFAULT_INTERVAL 30
|
|
+#define DEFAULT_PIDFILE HA_VARRUNDIR "storage_mon.pid"
|
|
+#define DEFAULT_ATTRNAME "#health-storage_mon"
|
|
+#define SMON_GET_RESULT_COMMAND "get_check_value"
|
|
+#define SMON_RESULT_OK "green"
|
|
+#define SMON_RESULT_NG "red"
|
|
+#define SMON_RESULT_COMMAND_ERROR "unknown command"
|
|
+#define SMON_BUFF_1MEG 1048576
|
|
+#define SMON_MAX_IPCSNAME 256
|
|
+#define SMON_MAX_MSGSIZE 128
|
|
+#define SMON_MAX_RESP_SIZE 100
|
|
+
|
|
+#define PRINT_STORAGE_MON_ERR(fmt, ...) if (!daemonize) { \
|
|
+ fprintf(stderr, fmt"\n", __VA_ARGS__); \
|
|
+ } else { \
|
|
+ syslog(LOG_ERR, fmt, __VA_ARGS__); \
|
|
+ }
|
|
+#define PRINT_STORAGE_MON_ERR_NOARGS(str) if (!daemonize) { \
|
|
+ fprintf(stderr, str"\n"); \
|
|
+ } else { \
|
|
+ syslog(LOG_ERR, str); \
|
|
+ }
|
|
+
|
|
+#define PRINT_STORAGE_MON_INFO(fmt, ...) if (!daemonize) { \
|
|
+ printf(fmt"\n", __VA_ARGS__); \
|
|
+ } else { \
|
|
+ syslog(LOG_INFO, fmt, __VA_ARGS__); \
|
|
+ }
|
|
+
|
|
+struct storage_mon_timer_data {
|
|
+ int interval;
|
|
+};
|
|
+
|
|
+struct storage_mon_check_value_req {
|
|
+ struct qb_ipc_request_header hdr;
|
|
+ char message[SMON_MAX_MSGSIZE];
|
|
+};
|
|
+
|
|
+
|
|
+struct storage_mon_check_value_res {
|
|
+ struct qb_ipc_response_header hdr;
|
|
+ char message[SMON_MAX_MSGSIZE];
|
|
+};
|
|
+
|
|
+
|
|
+char *devices[MAX_DEVICES];
|
|
+int scores[MAX_DEVICES];
|
|
+size_t device_count = 0;
|
|
+int timeout = DEFAULT_TIMEOUT;
|
|
+int verbose = 0;
|
|
+int inject_error_percent = 0;
|
|
+const char *attrname = DEFAULT_ATTRNAME;
|
|
+gboolean daemonize = FALSE;
|
|
+int shutting_down = FALSE;
|
|
+static qb_ipcs_service_t *ipcs;
|
|
+int final_score = 0;
|
|
+int response_final_score = 0;
|
|
+pid_t test_forks[MAX_DEVICES];
|
|
+size_t finished_count = 0;
|
|
+gboolean daemon_check_first_all_devices = FALSE;
|
|
+
|
|
+static qb_loop_t *storage_mon_poll_handle;
|
|
+static qb_loop_timer_handle timer_handle;
|
|
+static qb_loop_timer_handle expire_handle;
|
|
+static struct storage_mon_timer_data timer_d;
|
|
+
|
|
+static int test_device_main(gpointer data);
|
|
+static void wrap_test_device_main(void *data);
|
|
|
|
static void usage(char *name, FILE *f)
|
|
{
|
|
@@ -27,6 +105,11 @@ static void usage(char *name, FILE *f)
|
|
fprintf(f, " --score <n> score if device fails the test. Must match --device count\n");
|
|
fprintf(f, " --timeout <n> max time to wait for a device test to come back. in seconds (default %d)\n", DEFAULT_TIMEOUT);
|
|
fprintf(f, " --inject-errors-percent <n> Generate EIO errors <n>%% of the time (for testing only)\n");
|
|
+ fprintf(f, " --daemonize test run in daemons.\n");
|
|
+ fprintf(f, " --client client connection to daemon. requires the attrname option.\n");
|
|
+ fprintf(f, " --interval <n> interval to test. in seconds (default %d)(for daemonize only)\n", DEFAULT_INTERVAL);
|
|
+ fprintf(f, " --pidfile <path> file path to record pid (default %s)(for daemonize only)\n", DEFAULT_PIDFILE);
|
|
+ fprintf(f, " --attrname <attr> attribute name to update test result (default %s)(for daemonize/client only)\n", DEFAULT_ATTRNAME);
|
|
fprintf(f, " --verbose emit extra output to stdout\n");
|
|
fprintf(f, " --help print this message\n");
|
|
}
|
|
@@ -47,13 +130,13 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
|
|
device_fd = open(device, flags);
|
|
if (device_fd < 0) {
|
|
if (errno != EINVAL) {
|
|
- fprintf(stderr, "Failed to open %s: %s\n", device, strerror(errno));
|
|
+ PRINT_STORAGE_MON_ERR("Failed to open %s: %s", device, strerror(errno));
|
|
exit(-1);
|
|
}
|
|
flags &= ~O_DIRECT;
|
|
device_fd = open(device, flags);
|
|
if (device_fd < 0) {
|
|
- fprintf(stderr, "Failed to open %s: %s\n", device, strerror(errno));
|
|
+ PRINT_STORAGE_MON_ERR("Failed to open %s: %s", device, strerror(errno));
|
|
exit(-1);
|
|
}
|
|
}
|
|
@@ -63,11 +146,11 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
|
|
res = ioctl(device_fd, BLKGETSIZE64, &devsize);
|
|
#endif
|
|
if (res < 0) {
|
|
- fprintf(stderr, "Failed to get device size for %s: %s\n", device, strerror(errno));
|
|
+ PRINT_STORAGE_MON_ERR("Failed to get device size for %s: %s", device, strerror(errno));
|
|
goto error;
|
|
}
|
|
if (verbose) {
|
|
- printf("%s: opened %s O_DIRECT, size=%zu\n", device, (flags & O_DIRECT)?"with":"without", devsize);
|
|
+ PRINT_STORAGE_MON_INFO("%s: opened %s O_DIRECT, size=%zu", device, (flags & O_DIRECT)?"with":"without", devsize);
|
|
}
|
|
|
|
/* Don't fret about real randomness */
|
|
@@ -76,11 +159,11 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
|
|
seek_spot = (rand() % (devsize-1024)) & 0xFFFFFFFFFFFFFE00;
|
|
res = lseek(device_fd, seek_spot, SEEK_SET);
|
|
if (res < 0) {
|
|
- fprintf(stderr, "Failed to seek %s: %s\n", device, strerror(errno));
|
|
+ PRINT_STORAGE_MON_ERR("Failed to seek %s: %s", device, strerror(errno));
|
|
goto error;
|
|
}
|
|
if (verbose) {
|
|
- printf("%s: reading from pos %ld\n", device, seek_spot);
|
|
+ PRINT_STORAGE_MON_INFO("%s: reading from pos %ld", device, seek_spot);
|
|
}
|
|
|
|
if (flags & O_DIRECT) {
|
|
@@ -93,22 +176,22 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
|
|
res = ioctl(device_fd, BLKSSZGET, &sec_size);
|
|
#endif
|
|
if (res < 0) {
|
|
- fprintf(stderr, "Failed to get block device sector size for %s: %s\n", device, strerror(errno));
|
|
+ PRINT_STORAGE_MON_ERR("Failed to get block device sector size for %s: %s", device, strerror(errno));
|
|
goto error;
|
|
}
|
|
|
|
if (posix_memalign(&buffer, sysconf(_SC_PAGESIZE), sec_size) != 0) {
|
|
- fprintf(stderr, "Failed to allocate aligned memory: %s\n", strerror(errno));
|
|
+ PRINT_STORAGE_MON_ERR("Failed to allocate aligned memory: %s", strerror(errno));
|
|
goto error;
|
|
}
|
|
res = read(device_fd, buffer, sec_size);
|
|
free(buffer);
|
|
if (res < 0) {
|
|
- fprintf(stderr, "Failed to read %s: %s\n", device, strerror(errno));
|
|
+ PRINT_STORAGE_MON_ERR("Failed to read %s: %s", device, strerror(errno));
|
|
goto error;
|
|
}
|
|
if (res < sec_size) {
|
|
- fprintf(stderr, "Failed to read %d bytes from %s, got %d\n", sec_size, device, res);
|
|
+ PRINT_STORAGE_MON_ERR("Failed to read %d bytes from %s, got %d", sec_size, device, res);
|
|
goto error;
|
|
}
|
|
} else {
|
|
@@ -116,28 +199,28 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
|
|
|
|
res = read(device_fd, buffer, sizeof(buffer));
|
|
if (res < 0) {
|
|
- fprintf(stderr, "Failed to read %s: %s\n", device, strerror(errno));
|
|
+ PRINT_STORAGE_MON_ERR("Failed to read %s: %s", device, strerror(errno));
|
|
goto error;
|
|
}
|
|
if (res < (int)sizeof(buffer)) {
|
|
- fprintf(stderr, "Failed to read %ld bytes from %s, got %d\n", sizeof(buffer), device, res);
|
|
+ PRINT_STORAGE_MON_ERR("Failed to read %ld bytes from %s, got %d", sizeof(buffer), device, res);
|
|
goto error;
|
|
}
|
|
}
|
|
|
|
/* Fake an error */
|
|
if (inject_error_percent && ((rand() % 100) < inject_error_percent)) {
|
|
- fprintf(stderr, "People, please fasten your seatbelts, injecting errors!\n");
|
|
+ PRINT_STORAGE_MON_ERR_NOARGS("People, please fasten your seatbelts, injecting errors!");
|
|
goto error;
|
|
}
|
|
res = close(device_fd);
|
|
if (res != 0) {
|
|
- fprintf(stderr, "Failed to close %s: %s\n", device, strerror(errno));
|
|
+ PRINT_STORAGE_MON_ERR("Failed to close %s: %s", device, strerror(errno));
|
|
exit(-1);
|
|
}
|
|
|
|
if (verbose) {
|
|
- printf("%s: done\n", device);
|
|
+ PRINT_STORAGE_MON_INFO("%s: done", device);
|
|
}
|
|
exit(0);
|
|
|
|
@@ -146,101 +229,563 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
|
|
exit(-1);
|
|
}
|
|
|
|
-static int test_device_main(size_t device_count, char *devices[MAX_DEVICES], int scores[MAX_DEVICES], int verbose, int inject_error_percent, int timeout)
|
|
+static gboolean is_child_runnning(void)
|
|
{
|
|
- pid_t test_forks[MAX_DEVICES];
|
|
size_t i;
|
|
- struct timespec ts;
|
|
- time_t start_time;
|
|
- size_t finished_count = 0;
|
|
- int final_score = 0;
|
|
|
|
- memset(test_forks, 0, sizeof(test_forks));
|
|
for (i=0; i<device_count; i++) {
|
|
- test_forks[i] = fork();
|
|
- if (test_forks[i] < 0) {
|
|
- fprintf(stderr, "Error spawning fork for %s: %s\n", devices[i], strerror(errno));
|
|
- syslog(LOG_ERR, "Error spawning fork for %s: %s\n", devices[i], strerror(errno));
|
|
- /* Just test the devices we have */
|
|
- break;
|
|
- }
|
|
- /* child */
|
|
- if (test_forks[i] == 0) {
|
|
- test_device(devices[i], verbose, inject_error_percent);
|
|
+ if (test_forks[i] != 0) {
|
|
+ return TRUE;
|
|
}
|
|
}
|
|
+ return FALSE;
|
|
+}
|
|
+
|
|
+static void stop_child(pid_t pid, int signal)
|
|
+{
|
|
+ errno = 0;
|
|
|
|
- /* See if they have finished */
|
|
- clock_gettime(CLOCK_REALTIME, &ts);
|
|
- start_time = ts.tv_sec;
|
|
+ if (kill(pid, signal) == 0) {
|
|
+ syslog(LOG_DEBUG, "Stopping chilg sent signal %d to process %lld", signal, (long long) pid);
|
|
+ } else {
|
|
+ syslog(LOG_ERR, "Could not stop child (process %lld) with signal %d: %s", (long long) pid, signal, strerror(errno));
|
|
+ }
|
|
+}
|
|
+
|
|
+static int32_t sigterm_handler(int num, void *data)
|
|
+{
|
|
+ size_t i;
|
|
+ shutting_down = TRUE;
|
|
|
|
- while ((finished_count < device_count) && ((start_time + timeout) > ts.tv_sec)) {
|
|
+ /* If there is an unfired timer, stop it. */
|
|
+ qb_loop_timer_del(storage_mon_poll_handle, timer_handle);
|
|
+
|
|
+ /* Send SIGTERM to non-terminating device monitoring processes. */
|
|
+ if (is_child_runnning()) {
|
|
+ /* Signal every child process that is still running */
|
|
for (i=0; i<device_count; i++) {
|
|
- int wstatus;
|
|
- pid_t w;
|
|
+ if (test_forks[i] > 0 ) {
|
|
+ stop_child(test_forks[i], SIGTERM);
|
|
+ }
|
|
+ }
|
|
|
|
+ }
|
|
+
|
|
+ /* Set a timer for termination. */
|
|
+ qb_loop_timer_add(storage_mon_poll_handle, QB_LOOP_HIGH, 0, NULL, wrap_test_device_main, &timer_handle);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static size_t find_child_pid(int pid)
|
|
+{
|
|
+ size_t i;
|
|
+
|
|
+ for (i=0; i<device_count; i++) {
|
|
+ if (test_forks[i] > 0 ) {
|
|
+ if (test_forks[i] == pid) {
|
|
+ return i;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ return -1;
|
|
+}
|
|
+
|
|
+static int32_t sigchld_handler(int32_t sig, void *data)
|
|
+{
|
|
+ pid_t pid;
|
|
+ size_t index;
|
|
+ int status;
|
|
+
|
|
+ if (is_child_runnning()) {
|
|
+ while(1) {
|
|
+ pid = waitpid(-1, &status, WNOHANG);
|
|
+ if (pid > 0) {
|
|
+ if (WIFEXITED(status)) {
|
|
+ index = find_child_pid(pid);
|
|
+ if (index >= 0) {
|
|
+ /* If the expire timer is running, no timeout has occurred, */
|
|
+ /* so add the final_score from the exit code of the terminated child process. */
|
|
+ if (qb_loop_timer_is_running(storage_mon_poll_handle, expire_handle)) {
|
|
+ if (WEXITSTATUS(status) !=0) {
|
|
+ final_score += scores[index];
|
|
+
|
|
+ /* Update response values immediately in preparation for inquiries from clients. */
|
|
+ response_final_score = final_score;
|
|
+
|
|
+ /* Even during the first daemon-mode check, if a device has failed, set */
|
|
+ /* the flag to return the response to the client without waiting for all devices to finish. */
|
|
+ daemon_check_first_all_devices = TRUE;
|
|
+ }
|
|
+ }
|
|
+#if 0
|
|
+ if (shutting_down == FALSE) {
|
|
+ finished_count++;
|
|
+ test_forks[index] = 0;
|
|
+ }
|
|
+#endif
|
|
+ finished_count++;
|
|
+ test_forks[index] = 0;
|
|
+
|
|
+ }
|
|
+ }
|
|
+ } else {
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static void child_shutdown(int nsig)
|
|
+{
|
|
+ exit(1);
|
|
+}
|
|
+
|
|
+static int write_pid_file(const char *pidfile)
|
|
+{
|
|
+ char *pid;
|
|
+ char *dir, *str = NULL;
|
|
+ int fd = -1;
|
|
+ int rc = -1;
|
|
+ int i, len;
|
|
+
|
|
+ if (asprintf(&pid, "%jd", (intmax_t)getpid()) < 0) {
|
|
+ syslog(LOG_ERR, "Failed to allocate memory to store PID");
|
|
+ pid = NULL;
|
|
+ goto done;
|
|
+ }
|
|
+
|
|
+ str = strdup(pidfile);
|
|
+ if (str == NULL) {
|
|
+ syslog(LOG_ERR, "Failed to duplicate string ['%s']", pidfile);
|
|
+ goto done;
|
|
+ }
|
|
+ dir = dirname(str);
|
|
+ for (i = 1, len = strlen(dir); i < len; i++) {
|
|
+ if (dir[i] == '/') {
|
|
+ dir[i] = 0;
|
|
+ if ((mkdir(dir, 0640) < 0) && (errno != EEXIST)) {
|
|
+ syslog(LOG_ERR, "Failed to create directory %s: %s", dir, strerror(errno));
|
|
+ goto done;
|
|
+ }
|
|
+ dir[i] = '/';
|
|
+ }
|
|
+ }
|
|
+ if ((mkdir(dir, 0640) < 0) && (errno != EEXIST)) {
|
|
+ syslog(LOG_ERR, "Failed to create directory %s: %s", dir, strerror(errno));
|
|
+ goto done;
|
|
+ }
|
|
+
|
|
+ fd = open(pidfile, O_CREAT | O_WRONLY, 0640);
|
|
+ if (fd < 0) {
|
|
+ syslog(LOG_ERR, "Failed to open %s: %s", pidfile, strerror(errno));
|
|
+ goto done;
|
|
+ }
|
|
+
|
|
+ if (write(fd, pid, strlen(pid)) != strlen(pid)) {
|
|
+ syslog(LOG_ERR, "Failed to write '%s' to %s: %s", pid, pidfile, strerror(errno));
|
|
+ goto done;
|
|
+ }
|
|
+ close(fd);
|
|
+ rc = 0;
|
|
+done:
|
|
+ if (pid != NULL) {
|
|
+ free(pid);
|
|
+ }
|
|
+ if (str != NULL) {
|
|
+ free(str);
|
|
+ }
|
|
+ return rc;
|
|
+}
|
|
+
|
|
+static void child_timeout_handler(void *data)
|
|
+{
|
|
+ size_t i;
|
|
+
|
|
+ if (is_child_runnning()) {
|
|
+ for (i=0; i<device_count; i++) {
|
|
if (test_forks[i] > 0) {
|
|
- w = waitpid(test_forks[i], &wstatus, WUNTRACED | WNOHANG | WCONTINUED);
|
|
- if (w < 0) {
|
|
- fprintf(stderr, "waitpid on %s failed: %s\n", devices[i], strerror(errno));
|
|
- return -1;
|
|
+ /* If timeout occurs before SIGCHLD, add child process failure score to final_score. */
|
|
+ final_score += scores[i];
|
|
+
|
|
+ /* Update response values immediately in preparation for inquiries from clients. */
|
|
+ response_final_score = final_score;
|
|
+
|
|
+ /* Even during the first daemon-mode check, if a device has failed, set */
|
|
+ /* the flag to return the response to the client without waiting for all devices to finish. */
|
|
+ daemon_check_first_all_devices = TRUE;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+static void wrap_test_device_main(void *data)
|
|
+{
|
|
+ struct storage_mon_timer_data *timer_data = (struct storage_mon_timer_data*)data;
|
|
+ test_device_main((timer_data != NULL) ? &timer_data->interval : NULL);
|
|
+}
|
|
+
|
|
+static int test_device_main(gpointer data)
|
|
+{
|
|
+ size_t i;
|
|
+ struct timespec ts;
|
|
+ time_t start_time;
|
|
+ gboolean device_check = TRUE;
|
|
+
|
|
+ if (daemonize) {
|
|
+ if (shutting_down == TRUE) {
|
|
+ goto done;
|
|
+ }
|
|
+
|
|
+ /* In the case of daemon mode, it is avoided that the timer is triggered and the number of */
|
|
+ /* child processes increases while the device monitoring child process is not completed. */
|
|
+ if (is_child_runnning()) {
|
|
+ device_check = FALSE;
|
|
+ }
|
|
+
|
|
+ if (device_count == finished_count && device_check) {
|
|
+ /* Update the result value for the client response once all checks have completed. */
|
|
+ response_final_score = final_score;
|
|
+
|
|
+ if (!daemon_check_first_all_devices) {
|
|
+ daemon_check_first_all_devices = TRUE;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (device_check) {
|
|
+ /* Reset final_score, finished_count, test_forks[] */
|
|
+ final_score = 0;
|
|
+ finished_count = 0;
|
|
+
|
|
+ memset(test_forks, 0, sizeof(test_forks));
|
|
+ for (i=0; i<device_count; i++) {
|
|
+ test_forks[i] = fork();
|
|
+ if (test_forks[i] < 0) {
|
|
+ PRINT_STORAGE_MON_ERR("Error spawning fork for %s: %s\n", devices[i], strerror(errno));
|
|
+ /* Just test the devices we have */
|
|
+ break;
|
|
+ }
|
|
+ /* child */
|
|
+ if (test_forks[i] == 0) {
|
|
+ if (daemonize) {
|
|
+ signal(SIGTERM, &child_shutdown);
|
|
}
|
|
+ test_device(devices[i], verbose, inject_error_percent);
|
|
+ }
|
|
+ }
|
|
|
|
- if (w == test_forks[i]) {
|
|
- if (WIFEXITED(wstatus)) {
|
|
- if (WEXITSTATUS(wstatus) != 0) {
|
|
- syslog(LOG_ERR, "Error reading from device %s", devices[i]);
|
|
- final_score += scores[i];
|
|
+ if (!daemonize) {
|
|
+ /* See if they have finished */
|
|
+ clock_gettime(CLOCK_REALTIME, &ts);
|
|
+ start_time = ts.tv_sec;
|
|
+
|
|
+ while ((finished_count < device_count) && ((start_time + timeout) > ts.tv_sec)) {
|
|
+ for (i=0; i<device_count; i++) {
|
|
+ int wstatus;
|
|
+ pid_t w;
|
|
+
|
|
+ if (test_forks[i] > 0) {
|
|
+ w = waitpid(test_forks[i], &wstatus, WUNTRACED | WNOHANG | WCONTINUED);
|
|
+ if (w < 0) {
|
|
+ PRINT_STORAGE_MON_ERR("waitpid on %s failed: %s", devices[i], strerror(errno));
|
|
+ return -1;
|
|
}
|
|
|
|
- finished_count++;
|
|
- test_forks[i] = 0;
|
|
+ if (w == test_forks[i]) {
|
|
+ if (WIFEXITED(wstatus)) {
|
|
+ if (WEXITSTATUS(wstatus) != 0) {
|
|
+ syslog(LOG_ERR, "Error reading from device %s", devices[i]);
|
|
+ final_score += scores[i];
|
|
+ }
|
|
+
|
|
+ finished_count++;
|
|
+ test_forks[i] = 0;
|
|
+ }
|
|
+ }
|
|
}
|
|
}
|
|
+
|
|
+ usleep(100000);
|
|
+
|
|
+ clock_gettime(CLOCK_REALTIME, &ts);
|
|
+ }
|
|
+
|
|
+ /* See which threads have not finished */
|
|
+ for (i=0; i<device_count; i++) {
|
|
+ if (test_forks[i] != 0) {
|
|
+ syslog(LOG_ERR, "Reading from device %s did not complete in %d seconds timeout", devices[i], timeout);
|
|
+ fprintf(stderr, "Thread for device %s did not complete in time\n", devices[i]);
|
|
+ final_score += scores[i];
|
|
+ }
|
|
}
|
|
+ } else {
|
|
+ /* Run the child process timeout watch timer. */
|
|
+ qb_loop_timer_add(storage_mon_poll_handle, QB_LOOP_MED, timeout * QB_TIME_NS_IN_SEC, NULL, child_timeout_handler, &expire_handle);
|
|
+ }
|
|
+ }
|
|
+ if (!daemonize) {
|
|
+ if (verbose) {
|
|
+ printf("Final score is %d\n", final_score);
|
|
+ }
|
|
+ return final_score;
|
|
+ } else {
|
|
+ if (data != NULL) {
|
|
+ /* Sets the device check to run on the next timer. */
|
|
+ qb_loop_timer_add(storage_mon_poll_handle, QB_LOOP_MED, timer_d.interval * QB_TIME_NS_IN_SEC, &timer_d, wrap_test_device_main, &timer_handle);
|
|
}
|
|
+ return TRUE;
|
|
+ }
|
|
+done:
|
|
+ qb_loop_stop(storage_mon_poll_handle);
|
|
+ return FALSE;
|
|
+}
|
|
+
|
|
+static int32_t
|
|
+storage_mon_job_add(enum qb_loop_priority p, void *data, qb_loop_job_dispatch_fn fn)
|
|
+{
|
|
+ return qb_loop_job_add(storage_mon_poll_handle, p, data, fn);
|
|
+}
|
|
+
|
|
+static int32_t
|
|
+storage_mon_dispatch_add(enum qb_loop_priority p, int32_t fd, int32_t evts,
|
|
+ void *data, qb_ipcs_dispatch_fn_t fn)
|
|
+{
|
|
+ return qb_loop_poll_add(storage_mon_poll_handle, p, fd, evts, data, fn);
|
|
+}
|
|
+
|
|
+static int32_t
|
|
+storage_mon_dispatch_mod(enum qb_loop_priority p, int32_t fd, int32_t evts,
|
|
+ void *data, qb_ipcs_dispatch_fn_t fn)
|
|
+{
|
|
+ return qb_loop_poll_mod(storage_mon_poll_handle, p, fd, evts, data, fn);
|
|
+}
|
|
+
|
|
+static int32_t
|
|
+storage_mon_dispatch_del(int32_t fd)
|
|
+{
|
|
+ return qb_loop_poll_del(storage_mon_poll_handle, fd);
|
|
+}
|
|
|
|
- usleep(100000);
|
|
+static int32_t
|
|
+storage_mon_ipcs_connection_accept_fn(qb_ipcs_connection_t * c, uid_t uid, gid_t gid)
|
|
+{
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static void
|
|
+storage_mon_ipcs_connection_created_fn(qb_ipcs_connection_t *c)
|
|
+{
|
|
+ struct qb_ipcs_stats srv_stats;
|
|
+
|
|
+ qb_ipcs_stats_get(ipcs, &srv_stats, QB_FALSE);
|
|
+ syslog(LOG_DEBUG, "Connection created (active:%d, closed:%d)",
|
|
+ srv_stats.active_connections, srv_stats.closed_connections);
|
|
+}
|
|
+
|
|
+static void
|
|
+storage_mon_ipcs_connection_destroyed_fn(qb_ipcs_connection_t *c)
|
|
+{
|
|
+ syslog(LOG_DEBUG, "Connection about to be freed");
|
|
+}
|
|
+
|
|
+static int32_t
|
|
+storage_mon_ipcs_connection_closed_fn(qb_ipcs_connection_t *c)
|
|
+{
|
|
+ struct qb_ipcs_connection_stats stats;
|
|
+ struct qb_ipcs_stats srv_stats;
|
|
+
|
|
+ qb_ipcs_stats_get(ipcs, &srv_stats, QB_FALSE);
|
|
+ qb_ipcs_connection_stats_get(c, &stats, QB_FALSE);
|
|
+
|
|
+ syslog(LOG_DEBUG,
|
|
+ "Connection to pid:%d destroyed (active:%d, closed:%d)",
|
|
+ stats.client_pid, srv_stats.active_connections,
|
|
+ srv_stats.closed_connections);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
|
|
- clock_gettime(CLOCK_REALTIME, &ts);
|
|
+static int32_t
|
|
+storage_mon_ipcs_msg_process_fn(qb_ipcs_connection_t *c, void *data, size_t size)
|
|
+{
|
|
+ struct storage_mon_check_value_req *request;
|
|
+ struct qb_ipc_response_header resps;
|
|
+ ssize_t res;
|
|
+ struct iovec iov[2];
|
|
+ char resp[SMON_MAX_RESP_SIZE];
|
|
+ int32_t rc;
|
|
+ int send_score = response_final_score;
|
|
+
|
|
+ request = (struct storage_mon_check_value_req *)data;
|
|
+ syslog(LOG_DEBUG, "msg received (id:%d, size:%d, data:%s)",
|
|
+ request->hdr.id, request->hdr.size, request->message);
|
|
+
|
|
+ if (strcmp(request->message, SMON_GET_RESULT_COMMAND) != 0) {
|
|
+ syslog(LOG_DEBUG, "request command is unknown.");
|
|
+ send_score = -1;
|
|
+ } else if (!daemon_check_first_all_devices) {
|
|
+ send_score = -2;
|
|
}
|
|
|
|
- /* See which threads have not finished */
|
|
- for (i=0; i<device_count; i++) {
|
|
- if (test_forks[i] != 0) {
|
|
- syslog(LOG_ERR, "Reading from device %s did not complete in %d seconds timeout", devices[i], timeout);
|
|
- fprintf(stderr, "Thread for device %s did not complete in time\n", devices[i]);
|
|
- final_score += scores[i];
|
|
+ resps.size = sizeof(struct qb_ipc_response_header);
|
|
+ resps.id = 13;
|
|
+ resps.error = 0;
|
|
+
|
|
+ rc = snprintf(resp, SMON_MAX_RESP_SIZE, "%d", send_score) + 1;
|
|
+ iov[0].iov_len = sizeof(resps);
|
|
+ iov[0].iov_base = &resps;
|
|
+ iov[1].iov_len = rc;
|
|
+ iov[1].iov_base = resp;
|
|
+ resps.size += rc;
|
|
+
|
|
+ res = qb_ipcs_response_sendv(c, iov, 2);
|
|
+ if (res < 0) {
|
|
+ errno = -res;
|
|
+ syslog(LOG_ERR, "qb_ipcs_response_send : errno = %d", errno);
|
|
+ }
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int32_t
|
|
+storage_mon_client(void)
|
|
+{
|
|
+ struct storage_mon_check_value_req request;
|
|
+ struct storage_mon_check_value_res response;
|
|
+ qb_ipcc_connection_t *conn;
|
|
+ char ipcs_name[SMON_MAX_IPCSNAME];
|
|
+ int32_t rc;
|
|
+
|
|
+
|
|
+ snprintf(ipcs_name, SMON_MAX_IPCSNAME, "storage_mon_%s", attrname);
|
|
+ conn = qb_ipcc_connect(ipcs_name, 0);
|
|
+ if (conn == NULL) {
|
|
+ syslog(LOG_ERR, "qb_ipcc_connect error\n");
|
|
+ return(-1);
|
|
+ }
|
|
+
|
|
+ snprintf(request.message, SMON_MAX_MSGSIZE, "%s", SMON_GET_RESULT_COMMAND);
|
|
+ request.hdr.id = 0;
|
|
+ request.hdr.size = sizeof(struct storage_mon_check_value_req);
|
|
+ rc = qb_ipcc_send(conn, &request, request.hdr.size);
|
|
+ if (rc < 0) {
|
|
+ syslog(LOG_ERR, "qb_ipcc_send error : %d\n", rc);
|
|
+ return(-1);
|
|
+ }
|
|
+ if (rc > 0) {
|
|
+ rc = qb_ipcc_recv(conn, &response, sizeof(response), -1);
|
|
+ if (rc < 0) {
|
|
+ syslog(LOG_ERR, "qb_ipcc_recv error : %d\n", rc);
|
|
+ return(-1);
|
|
}
|
|
}
|
|
|
|
- if (verbose) {
|
|
- printf("Final score is %d\n", final_score);
|
|
+ qb_ipcc_disconnect(conn);
|
|
+
|
|
+ /* Set score to result */
|
|
+ /* 0 : Normal. */
|
|
+ /* greater than 0 : monitoring error. */
|
|
+ /* -1 : communication system error. */
|
|
+ /* -2 : The first check of all devices has not completed yet in daemon mode. */
|
|
+ rc = atoi(response.message);
|
|
+
|
|
+ syslog(LOG_DEBUG, "daemon response[%d]: %s \n", response.hdr.id, response.message);
|
|
+
|
|
+ return(rc);
|
|
+}
|
|
+
|
|
+static int32_t
|
|
+storage_mon_daemon(int interval, const char *pidfile)
|
|
+{
|
|
+ int32_t rc;
|
|
+ char ipcs_name[SMON_MAX_IPCSNAME];
|
|
+
|
|
+ struct qb_ipcs_service_handlers service_handle = {
|
|
+ .connection_accept = storage_mon_ipcs_connection_accept_fn,
|
|
+ .connection_created = storage_mon_ipcs_connection_created_fn,
|
|
+ .msg_process = storage_mon_ipcs_msg_process_fn,
|
|
+ .connection_destroyed = storage_mon_ipcs_connection_destroyed_fn,
|
|
+ .connection_closed = storage_mon_ipcs_connection_closed_fn,
|
|
+ };
|
|
+
|
|
+ struct qb_ipcs_poll_handlers poll_handle = {
|
|
+ .job_add = storage_mon_job_add,
|
|
+ .dispatch_add = storage_mon_dispatch_add,
|
|
+ .dispatch_mod = storage_mon_dispatch_mod,
|
|
+ .dispatch_del = storage_mon_dispatch_del,
|
|
+ };
|
|
+
|
|
+ if (daemon(0, 0) < 0) {
|
|
+ syslog(LOG_ERR, "Failed to daemonize: %s", strerror(errno));
|
|
+ return -1;
|
|
}
|
|
- return final_score;
|
|
+
|
|
+ umask(S_IWGRP | S_IWOTH | S_IROTH);
|
|
+
|
|
+ if (write_pid_file(pidfile) < 0) {
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ snprintf(ipcs_name, SMON_MAX_IPCSNAME, "storage_mon_%s", attrname);
|
|
+ ipcs = qb_ipcs_create(ipcs_name, 0, QB_IPC_NATIVE, &service_handle);
|
|
+ if (ipcs == 0) {
|
|
+ syslog(LOG_ERR, "qb_ipcs_create");
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ qb_ipcs_enforce_buffer_size(ipcs, SMON_BUFF_1MEG);
|
|
+
|
|
+ storage_mon_poll_handle = qb_loop_create();
|
|
+
|
|
+ qb_ipcs_poll_handlers_set(ipcs, &poll_handle);
|
|
+ rc = qb_ipcs_run(ipcs);
|
|
+ if (rc != 0) {
|
|
+ errno = -rc;
|
|
+ syslog(LOG_ERR, "qb_ipcs_run");
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ qb_loop_signal_add(storage_mon_poll_handle, QB_LOOP_HIGH,
|
|
+ SIGTERM, NULL, sigterm_handler, NULL);
|
|
+
|
|
+ qb_loop_signal_add(storage_mon_poll_handle, QB_LOOP_MED,
|
|
+ SIGCHLD, NULL, sigchld_handler, NULL);
|
|
+
|
|
+ timer_d.interval = interval;
|
|
+ qb_loop_timer_add(storage_mon_poll_handle, QB_LOOP_MED, 0, &timer_d, wrap_test_device_main, &timer_handle);
|
|
+
|
|
+ qb_loop_run(storage_mon_poll_handle);
|
|
+ qb_loop_destroy(storage_mon_poll_handle);
|
|
+
|
|
+ unlink(pidfile);
|
|
+
|
|
+ return 0;
|
|
}
|
|
|
|
int main(int argc, char *argv[])
|
|
{
|
|
- char *devices[MAX_DEVICES];
|
|
- int scores[MAX_DEVICES];
|
|
- size_t device_count = 0;
|
|
size_t score_count = 0;
|
|
- int timeout = DEFAULT_TIMEOUT;
|
|
- int final_score = 0;
|
|
int opt, option_index;
|
|
- int verbose = 0;
|
|
- int inject_error_percent = 0;
|
|
+ int interval = DEFAULT_INTERVAL;
|
|
+ const char *pidfile = DEFAULT_PIDFILE;
|
|
+ gboolean client = FALSE;
|
|
struct option long_options[] = {
|
|
{"timeout", required_argument, 0, 't' },
|
|
{"device", required_argument, 0, 'd' },
|
|
{"score", required_argument, 0, 's' },
|
|
{"inject-errors-percent", required_argument, 0, 0 },
|
|
+ {"daemonize", no_argument, 0, 0 },
|
|
+ {"client", no_argument, 0, 0 },
|
|
+ {"interval", required_argument, 0, 'i' },
|
|
+ {"pidfile", required_argument, 0, 'p' },
|
|
+ {"attrname", required_argument, 0, 'a' },
|
|
{"verbose", no_argument, 0, 'v' },
|
|
{"help", no_argument, 0, 'h' },
|
|
{0, 0, 0, 0 }
|
|
};
|
|
- while ( (opt = getopt_long(argc, argv, "hvt:d:s:",
|
|
+
|
|
+ while ( (opt = getopt_long(argc, argv, "hvt:d:s:i:p:a:",
|
|
long_options, &option_index)) != -1 ) {
|
|
switch (opt) {
|
|
case 0: /* Long-only options */
|
|
@@ -251,6 +796,16 @@ int main(int argc, char *argv[])
|
|
return -1;
|
|
}
|
|
}
|
|
+ if (strcmp(long_options[option_index].name, "daemonize") == 0) {
|
|
+ daemonize = TRUE;
|
|
+ }
|
|
+ if (strcmp(long_options[option_index].name, "client") == 0) {
|
|
+ client = TRUE;
|
|
+ }
|
|
+ if (daemonize && client) {
|
|
+ fprintf(stderr,"The daemonize option and client option cannot be specified at the same time.");
|
|
+ return -1;
|
|
+ }
|
|
break;
|
|
case 'd':
|
|
if (device_count < MAX_DEVICES) {
|
|
@@ -287,6 +842,27 @@ int main(int argc, char *argv[])
|
|
usage(argv[0], stdout);
|
|
return 0;
|
|
break;
|
|
+ case 'i':
|
|
+ interval = atoi(optarg);
|
|
+ if (interval < 1) {
|
|
+ fprintf(stderr, "invalid interval %d. Min 1, default is %d\n", interval, DEFAULT_INTERVAL);
|
|
+ return -1;
|
|
+ }
|
|
+ break;
|
|
+ case 'p':
|
|
+ pidfile = strdup(optarg);
|
|
+ if (pidfile == NULL) {
|
|
+ fprintf(stderr, "Failed to duplicate string ['%s']\n", optarg);
|
|
+ return -1;
|
|
+ }
|
|
+ break;
|
|
+ case 'a':
|
|
+ attrname = strdup(optarg);
|
|
+ if (attrname == NULL) {
|
|
+ fprintf(stderr, "Failed to duplicate string ['%s']\n", optarg);
|
|
+ return -1;
|
|
+ }
|
|
+ break;
|
|
default:
|
|
usage(argv[0], stderr);
|
|
return -1;
|
|
@@ -294,6 +870,11 @@ int main(int argc, char *argv[])
|
|
}
|
|
|
|
}
|
|
+
|
|
+ if (client) {
|
|
+ return(storage_mon_client());
|
|
+ }
|
|
+
|
|
if (device_count == 0) {
|
|
fprintf(stderr, "No devices to test, use the -d or --device argument\n");
|
|
return -1;
|
|
@@ -306,7 +887,10 @@ int main(int argc, char *argv[])
|
|
|
|
openlog("storage_mon", 0, LOG_DAEMON);
|
|
|
|
-
|
|
- final_score = test_device_main(device_count, devices, scores, verbose, inject_error_percent, timeout);
|
|
+ if (!daemonize) {
|
|
+ final_score = test_device_main(NULL);
|
|
+ } else {
|
|
+ return(storage_mon_daemon(interval, pidfile));
|
|
+ }
|
|
return final_score;
|
|
}
|
|
|
|
From 406ff43a6caeb0add7493892236753acee293f27 Mon Sep 17 00:00:00 2001
|
|
From: Hideo Yamauchi <renayama19661014@ybb.ne.jp>
|
|
Date: Mon, 24 Jul 2023 06:47:39 +0900
|
|
Subject: [PATCH 3/4] Mid: storage-mon: Retry failed attrd_updater.
|
|
|
|
---
|
|
heartbeat/storage-mon.in | 27 +++++++++++++++++++++++----
|
|
1 file changed, 23 insertions(+), 4 deletions(-)
|
|
|
|
diff --git a/heartbeat/storage-mon.in b/heartbeat/storage-mon.in
|
|
index 81d8f5bcec..9662e06dbb 100644
|
|
--- a/heartbeat/storage-mon.in
|
|
+++ b/heartbeat/storage-mon.in
|
|
@@ -205,6 +205,25 @@ storage-mon_init() {
|
|
fi
|
|
}
|
|
|
|
+storage-mon_update_attribute() {
|
|
+
|
|
+ while :
|
|
+ do
|
|
+ "$ATTRDUP" -n ${ATTRNAME} -U "$1" -d "5s"
|
|
+ rc=$?
|
|
+ if [ $rc -eq 0 ]; then
|
|
+ break
|
|
+ fi
|
|
+
|
|
+ ocf_log debug "${1} attribute by attrd_updater failed"
|
|
+ if [ "$1" = "red" ]; then
|
|
+ # If the attrd_updater fails with the red attribute, return an error to let pacemaker handle the failure immediately.
|
|
+ return $OCF_ERR_GENERIC
|
|
+ fi
|
|
+ done
|
|
+ return $OCF_SUCCESS
|
|
+}
|
|
+
|
|
storage-mon_monitor() {
|
|
if [ -z "$OCF_RESKEY_daemonize" ]; then
|
|
storage-mon_init
|
|
@@ -233,8 +252,8 @@ storage-mon_monitor() {
|
|
status="green"
|
|
fi
|
|
|
|
- "$ATTRDUP" -n ${ATTRNAME} -U "$status" -d "5s"
|
|
- return $OCF_SUCCESS
|
|
+ storage-mon_update_attribute $status
|
|
+ return "$?"
|
|
else
|
|
ocf_pidfile_status "${PIDFILE}" > /dev/null 2>&1
|
|
case "$?" in
|
|
@@ -279,8 +298,8 @@ storage-mon_monitor() {
|
|
esac
|
|
done
|
|
|
|
- "$ATTRDUP" -n ${ATTRNAME} -U "$status" -d "5s"
|
|
- return $OCF_SUCCESS
|
|
+ storage-mon_update_attribute $status
|
|
+ return "$?"
|
|
fi
|
|
}
|
|
|
|
|
|
From d1cf0b42f1eb6c41ef5887cb7d9ce055f3bbcb3a Mon Sep 17 00:00:00 2001
|
|
From: Hideo Yamauchi <renayama19661014@ybb.ne.jp>
|
|
Date: Thu, 17 Aug 2023 17:18:53 +0900
|
|
Subject: [PATCH 4/4] Mid: storage-mon RA: Changed OCF_RESKEY_daemonize_default
|
|
and OCF_RESKEY_daemonize default and judgment part.
|
|
|
|
---
|
|
heartbeat/storage-mon.in | 13 ++++++-------
|
|
1 file changed, 6 insertions(+), 7 deletions(-)
|
|
|
|
diff --git a/heartbeat/storage-mon.in b/heartbeat/storage-mon.in
|
|
index 9662e06dbb..284dec30f2 100644
|
|
--- a/heartbeat/storage-mon.in
|
|
+++ b/heartbeat/storage-mon.in
|
|
@@ -58,7 +58,7 @@ OCF_RESKEY_io_timeout_default="10"
|
|
OCF_RESKEY_check_interval_default="30"
|
|
OCF_RESKEY_inject_errors_default=""
|
|
OCF_RESKEY_state_file_default="${HA_RSCTMP%%/}/storage-mon-${OCF_RESOURCE_INSTANCE}.state"
|
|
-OCF_RESKEY_daemonize_default=""
|
|
+OCF_RESKEY_daemonize_default="false"
|
|
|
|
# Explicitly list all environment variables used, to make static analysis happy
|
|
: ${OCF_RESKEY_CRM_meta_interval:=${OCF_RESKEY_CRM_meta_interval_default}}
|
|
@@ -133,7 +133,7 @@ Used only for testing! Specify % of I/O errors to simulate drives failures.
|
|
Specifies to start storage-mon as a daemon and check for devices.
|
|
</longdesc>
|
|
<shortdesc lang="en">start storage-mon with daemon</shortdesc>
|
|
-<content type="string" default="" />
|
|
+<content type="boolean" default="${OCF_RESKEY_daemonize_default}" />
|
|
</parameter>
|
|
|
|
</parameters>
|
|
@@ -225,7 +225,7 @@ storage-mon_update_attribute() {
|
|
}
|
|
|
|
storage-mon_monitor() {
|
|
- if [ -z "$OCF_RESKEY_daemonize" ]; then
|
|
+ if ! ocf_is_true "$OCF_RESKEY_daemonize"; then
|
|
storage-mon_init
|
|
|
|
# Monitor _MUST!_ differentiate correctly between running
|
|
@@ -304,7 +304,7 @@ storage-mon_monitor() {
|
|
}
|
|
|
|
storage-mon_start() {
|
|
- if [ -z "$OCF_RESKEY_daemonize" ]; then
|
|
+ if ! ocf_is_true "$OCF_RESKEY_daemonize"; then
|
|
storage-mon_monitor
|
|
if [ $? -eq $OCF_SUCCESS ]; then
|
|
return $OCF_SUCCESS
|
|
@@ -317,7 +317,6 @@ storage-mon_start() {
|
|
for DRIVE in ${OCF_RESKEY_drives}; do
|
|
cmdline="$cmdline --device $DRIVE --score 1"
|
|
done
|
|
- #cmdline="$cmdline --daemonize --timeout ${OCF_RESKEY_io_timeout} --interval ${OCF_RESKEY_check_interval} --pidfile ${PIDFILE} --attrname ${ATTRNAME} --ha-sbin-dir ${HA_SBIN_DIR}"
|
|
cmdline="$cmdline --daemonize --timeout ${OCF_RESKEY_io_timeout} --interval ${OCF_RESKEY_check_interval} --pidfile ${PIDFILE} --attrname ${ATTRNAME}"
|
|
if [ -n "${OCF_RESKEY_inject_errors}" ]; then
|
|
cmdline="$cmdline --inject-errors-percent ${OCF_RESKEY_inject_errors}"
|
|
@@ -333,7 +332,7 @@ storage-mon_stop() {
|
|
storage-mon_monitor
|
|
rc=$?
|
|
|
|
- if [ -z "$OCF_RESKEY_daemonize" ]; then
|
|
+ if ! ocf_is_true "$OCF_RESKEY_daemonize"; then
|
|
if [ $rc -eq $OCF_SUCCESS ]; then
|
|
rm "${OCF_RESKEY_state_file}"
|
|
fi
|
|
@@ -372,7 +371,7 @@ storage-mon_stop() {
|
|
storage-mon_validate() {
|
|
storage-mon_init
|
|
|
|
- if [ -z "$OCF_RESKEY_daemonize" ]; then
|
|
+ if ! ocf_is_true "$OCF_RESKEY_daemonize"; then
|
|
# Is the state directory writable?
|
|
state_dir=$(dirname "${OCF_RESKEY_state_file}")
|
|
touch "$state_dir/$$"
|