You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
220 lines
8.0 KiB
220 lines
8.0 KiB
3 days ago
|
From e503d6466ec8dd6c51b5891bd52f6f4076210f8b Mon Sep 17 00:00:00 2001
|
||
|
From: Peter Xu <peterx@redhat.com>
|
||
|
Date: Wed, 19 Jun 2024 18:30:46 -0400
|
||
|
Subject: [PATCH 11/11] tests/migration-tests: Cover postcopy failure on
|
||
|
reconnect
|
||
|
|
||
|
RH-Author: Juraj Marcin <None>
|
||
|
RH-MergeRequest: 419: migration: New postcopy state, and some cleanups [rhel-9.5.z]
|
||
|
RH-Jira: RHEL-63874
|
||
|
RH-Acked-by: Peter Xu <peterx@redhat.com>
|
||
|
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||
|
RH-Commit: [11/11] 0295f627c0e8e7bae6dcd695c063b17717c7590f
|
||
|
|
||
|
Make sure there will be an event for postcopy recovery, regardless of
|
||
|
whether the reconnect succeeds, or when the failure happens.
|
||
|
|
||
|
The added new case is to fail early in postcopy recovery, in which case it
|
||
|
didn't even reach RECOVER stage on src (and in real life it'll be the same
|
||
|
to dest, but the test case is just slightly more involved due to the dual
|
||
|
socketpair setup).
|
||
|
|
||
|
To do that, rename the postcopy_recovery_test_fail to reflect either stage
|
||
|
to fail, instead of a boolean.
|
||
|
|
||
|
Reviewed-by: Fabiano Rosas <farosas@suse.de>
|
||
|
Signed-off-by: Peter Xu <peterx@redhat.com>
|
||
|
Signed-off-by: Fabiano Rosas <farosas@suse.de>
|
||
|
|
||
|
(cherry picked from commit 6cf56a87baf8b99c4296a943d220eb8276ca035a)
|
||
|
|
||
|
JIRA: https://issues.redhat.com/browse/RHEL-63874
|
||
|
Y-JIRA: https://issues.redhat.com/browse/RHEL-38485
|
||
|
|
||
|
Signed-off-by: Juraj Marcin <jmarcin@redhat.com>
|
||
|
---
|
||
|
tests/qtest/migration-test.c | 95 +++++++++++++++++++++++++++++-------
|
||
|
1 file changed, 77 insertions(+), 18 deletions(-)
|
||
|
|
||
|
diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
|
||
|
index afe8270dd0..d903e3e0fa 100644
|
||
|
--- a/tests/qtest/migration-test.c
|
||
|
+++ b/tests/qtest/migration-test.c
|
||
|
@@ -74,6 +74,17 @@ static QTestMigrationState dst_state;
|
||
|
#define QEMU_ENV_SRC "QTEST_QEMU_BINARY_SRC"
|
||
|
#define QEMU_ENV_DST "QTEST_QEMU_BINARY_DST"
|
||
|
|
||
|
+typedef enum PostcopyRecoveryFailStage {
|
||
|
+ /*
|
||
|
+ * "no failure" must be 0 as it's the default. OTOH, real failure
|
||
|
+ * cases must be >0 to make sure they trigger by a "if" test.
|
||
|
+ */
|
||
|
+ POSTCOPY_FAIL_NONE = 0,
|
||
|
+ POSTCOPY_FAIL_CHANNEL_ESTABLISH,
|
||
|
+ POSTCOPY_FAIL_RECOVERY,
|
||
|
+ POSTCOPY_FAIL_MAX
|
||
|
+} PostcopyRecoveryFailStage;
|
||
|
+
|
||
|
#if defined(__linux__)
|
||
|
#include <sys/syscall.h>
|
||
|
#include <sys/vfs.h>
|
||
|
@@ -753,7 +764,7 @@ typedef struct {
|
||
|
/* Postcopy specific fields */
|
||
|
void *postcopy_data;
|
||
|
bool postcopy_preempt;
|
||
|
- bool postcopy_recovery_test_fail;
|
||
|
+ PostcopyRecoveryFailStage postcopy_recovery_fail_stage;
|
||
|
} MigrateCommon;
|
||
|
|
||
|
static int test_migrate_start(QTestState **from, QTestState **to,
|
||
|
@@ -1467,12 +1478,16 @@ static void wait_for_postcopy_status(QTestState *one, const char *status)
|
||
|
"completed", NULL });
|
||
|
}
|
||
|
|
||
|
-static void postcopy_recover_fail(QTestState *from, QTestState *to)
|
||
|
+static void postcopy_recover_fail(QTestState *from, QTestState *to,
|
||
|
+ PostcopyRecoveryFailStage stage)
|
||
|
{
|
||
|
#ifndef _WIN32
|
||
|
+ bool fail_early = (stage == POSTCOPY_FAIL_CHANNEL_ESTABLISH);
|
||
|
int ret, pair1[2], pair2[2];
|
||
|
char c;
|
||
|
|
||
|
+ g_assert(stage > POSTCOPY_FAIL_NONE && stage < POSTCOPY_FAIL_MAX);
|
||
|
+
|
||
|
/* Create two unrelated socketpairs */
|
||
|
ret = qemu_socketpair(PF_LOCAL, SOCK_STREAM, 0, pair1);
|
||
|
g_assert_cmpint(ret, ==, 0);
|
||
|
@@ -1506,6 +1521,14 @@ static void postcopy_recover_fail(QTestState *from, QTestState *to)
|
||
|
ret = send(pair2[1], &c, 1, 0);
|
||
|
g_assert_cmpint(ret, ==, 1);
|
||
|
|
||
|
+ if (stage == POSTCOPY_FAIL_CHANNEL_ESTABLISH) {
|
||
|
+ /*
|
||
|
+ * This will make src QEMU fail at an early stage when trying to
|
||
|
+ * resume later, where it shouldn't reach RECOVER stage at all.
|
||
|
+ */
|
||
|
+ close(pair1[1]);
|
||
|
+ }
|
||
|
+
|
||
|
migrate_recover(to, "fd:fd-mig");
|
||
|
migrate_qmp(from, "fd:fd-mig", "{'resume': true}");
|
||
|
|
||
|
@@ -1515,28 +1538,53 @@ static void postcopy_recover_fail(QTestState *from, QTestState *to)
|
||
|
*/
|
||
|
migration_event_wait(from, "postcopy-recover-setup");
|
||
|
|
||
|
+ if (fail_early) {
|
||
|
+ /*
|
||
|
+ * When it fails at reconnection, src QEMU will automatically go
|
||
|
+ * back to PAUSED state. Making sure there is an event in this
|
||
|
+ * case: Libvirt relies on this to detect early reconnection
|
||
|
+ * errors.
|
||
|
+ */
|
||
|
+ migration_event_wait(from, "postcopy-paused");
|
||
|
+ } else {
|
||
|
+ /*
|
||
|
+ * We want to test "fail later" at RECOVER stage here. Make sure
|
||
|
+ * both QEMU instances will go into RECOVER stage first, then test
|
||
|
+ * kicking them out using migrate-pause.
|
||
|
+ *
|
||
|
+ * Explicitly check the RECOVER event on src, that's what Libvirt
|
||
|
+ * relies on, rather than polling.
|
||
|
+ */
|
||
|
+ migration_event_wait(from, "postcopy-recover");
|
||
|
+ wait_for_postcopy_status(from, "postcopy-recover");
|
||
|
+
|
||
|
+ /* Need an explicit kick on src QEMU in this case */
|
||
|
+ migrate_pause(from);
|
||
|
+ }
|
||
|
+
|
||
|
/*
|
||
|
- * Make sure both QEMU instances will go into RECOVER stage, then test
|
||
|
- * kicking them out using migrate-pause.
|
||
|
+ * For all failure cases, we'll reach such states on both sides now.
|
||
|
+ * Check them.
|
||
|
*/
|
||
|
- wait_for_postcopy_status(from, "postcopy-recover");
|
||
|
+ wait_for_postcopy_status(from, "postcopy-paused");
|
||
|
wait_for_postcopy_status(to, "postcopy-recover");
|
||
|
|
||
|
/*
|
||
|
- * This would be issued by the admin upon noticing the hang, we should
|
||
|
- * make sure we're able to kick this out.
|
||
|
+ * Kick dest QEMU out too. This is normally not needed in reality
|
||
|
+ * because when the channel is shutdown it should also happen on src.
|
||
|
+ * However here we used separate socket pairs so we need to do that
|
||
|
+ * explicitly.
|
||
|
*/
|
||
|
- migrate_pause(from);
|
||
|
- wait_for_postcopy_status(from, "postcopy-paused");
|
||
|
-
|
||
|
- /* Do the same test on dest */
|
||
|
migrate_pause(to);
|
||
|
wait_for_postcopy_status(to, "postcopy-paused");
|
||
|
|
||
|
close(pair1[0]);
|
||
|
- close(pair1[1]);
|
||
|
close(pair2[0]);
|
||
|
close(pair2[1]);
|
||
|
+
|
||
|
+ if (stage != POSTCOPY_FAIL_CHANNEL_ESTABLISH) {
|
||
|
+ close(pair1[1]);
|
||
|
+ }
|
||
|
#endif
|
||
|
}
|
||
|
|
||
|
@@ -1578,12 +1626,12 @@ static void test_postcopy_recovery_common(MigrateCommon *args)
|
||
|
wait_for_postcopy_status(to, "postcopy-paused");
|
||
|
wait_for_postcopy_status(from, "postcopy-paused");
|
||
|
|
||
|
- if (args->postcopy_recovery_test_fail) {
|
||
|
+ if (args->postcopy_recovery_fail_stage) {
|
||
|
/*
|
||
|
* Test when a wrong socket specified for recover, and then the
|
||
|
* ability to kick it out, and continue with a correct socket.
|
||
|
*/
|
||
|
- postcopy_recover_fail(from, to);
|
||
|
+ postcopy_recover_fail(from, to, args->postcopy_recovery_fail_stage);
|
||
|
/* continue with a good recovery */
|
||
|
}
|
||
|
|
||
|
@@ -1623,10 +1671,19 @@ static void test_postcopy_recovery_compress(void)
|
||
|
test_postcopy_recovery_common(&args);
|
||
|
}
|
||
|
|
||
|
-static void test_postcopy_recovery_double_fail(void)
|
||
|
+static void test_postcopy_recovery_fail_handshake(void)
|
||
|
+{
|
||
|
+ MigrateCommon args = {
|
||
|
+ .postcopy_recovery_fail_stage = POSTCOPY_FAIL_RECOVERY,
|
||
|
+ };
|
||
|
+
|
||
|
+ test_postcopy_recovery_common(&args);
|
||
|
+}
|
||
|
+
|
||
|
+static void test_postcopy_recovery_fail_reconnect(void)
|
||
|
{
|
||
|
MigrateCommon args = {
|
||
|
- .postcopy_recovery_test_fail = true,
|
||
|
+ .postcopy_recovery_fail_stage = POSTCOPY_FAIL_CHANNEL_ESTABLISH,
|
||
|
};
|
||
|
|
||
|
test_postcopy_recovery_common(&args);
|
||
|
@@ -3604,8 +3661,10 @@ int main(int argc, char **argv)
|
||
|
migration_test_add("/migration/postcopy/recovery/compress/plain",
|
||
|
test_postcopy_recovery_compress);
|
||
|
}
|
||
|
- migration_test_add("/migration/postcopy/recovery/double-failures",
|
||
|
- test_postcopy_recovery_double_fail);
|
||
|
+ migration_test_add("/migration/postcopy/recovery/double-failures/handshake",
|
||
|
+ test_postcopy_recovery_fail_handshake);
|
||
|
+ migration_test_add("/migration/postcopy/recovery/double-failures/reconnect",
|
||
|
+ test_postcopy_recovery_fail_reconnect);
|
||
|
if (is_x86) {
|
||
|
migration_test_add("/migration/postcopy/suspend",
|
||
|
test_postcopy_suspend);
|
||
|
--
|
||
|
2.39.3
|
||
|
|