diff --git a/SOURCES/kvm-migration-Cleanup-incoming-migration-setup-state-cha.patch b/SOURCES/kvm-migration-Cleanup-incoming-migration-setup-state-cha.patch new file mode 100644 index 0000000..265ae8b --- /dev/null +++ b/SOURCES/kvm-migration-Cleanup-incoming-migration-setup-state-cha.patch @@ -0,0 +1,89 @@ +From d913ecc85156d25f2df5317615eef7144aa26af5 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Wed, 19 Jun 2024 18:30:39 -0400 +Subject: [PATCH 04/11] migration: Cleanup incoming migration setup state + change + +RH-Author: Juraj Marcin +RH-MergeRequest: 419: migration: New postcopy state, and some cleanups [rhel-9.5.z] +RH-Jira: RHEL-63874 +RH-Acked-by: Peter Xu +RH-Acked-by: Miroslav Rezanina +RH-Commit: [4/11] d485ab0a9d091ce98f1487fd8a3882f8b0130747 + +Destination QEMU can setup incoming ports for two purposes: either a fresh +new incoming migration, in which QEMU will switch to SETUP for channel +establishment, or a paused postcopy migration, in which QEMU will stay in +POSTCOPY_PAUSED until kicking off the RECOVER phase. + +Now the state machine worked on dest node for the latter, only because +migrate_set_state() implicitly will become a noop if the current state +check failed. It wasn't clear at all. + +Clean it up by providing a helper migration_incoming_state_setup() doing +proper checks over current status. Postcopy-paused will be explicitly +checked now, and then we can bail out for unknown states. 
+ +Reviewed-by: Fabiano Rosas +Signed-off-by: Peter Xu +Signed-off-by: Fabiano Rosas + +(cherry picked from commit 4dd5f7b8d568116b3ce594b0055a47c6db50f49c) + +JIRA: https://issues.redhat.com/browse/RHEL-63874 +Y-JIRA: https://issues.redhat.com/browse/RHEL-38485 + +Signed-off-by: Juraj Marcin +--- + migration/migration.c | 28 ++++++++++++++++++++++++++-- + 1 file changed, 26 insertions(+), 2 deletions(-) + +diff --git a/migration/migration.c b/migration/migration.c +index b6cf04e043..21f20a8e1c 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -595,6 +595,29 @@ bool migrate_uri_parse(const char *uri, MigrationChannel **channel, + return true; + } + ++static bool ++migration_incoming_state_setup(MigrationIncomingState *mis, Error **errp) ++{ ++ MigrationStatus current = mis->state; ++ ++ if (current == MIGRATION_STATUS_POSTCOPY_PAUSED) { ++ /* ++ * Incoming postcopy migration will stay in PAUSED state even if ++ * reconnection happened. ++ */ ++ return true; ++ } ++ ++ if (current != MIGRATION_STATUS_NONE) { ++ error_setg(errp, "Illegal migration incoming state: %s", ++ MigrationStatus_str(current)); ++ return false; ++ } ++ ++ migrate_set_state(&mis->state, current, MIGRATION_STATUS_SETUP); ++ return true; ++} ++ + static void qemu_start_incoming_migration(const char *uri, bool has_channels, + MigrationChannelList *channels, + Error **errp) +@@ -633,8 +656,9 @@ static void qemu_start_incoming_migration(const char *uri, bool has_channels, + return; + } + +- migrate_set_state(&mis->state, MIGRATION_STATUS_NONE, +- MIGRATION_STATUS_SETUP); ++ if (!migration_incoming_state_setup(mis, errp)) { ++ return; ++ } + + if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) { + SocketAddress *saddr = &addr->u.socket; +-- +2.39.3 + diff --git a/SOURCES/kvm-migration-Rename-thread-debug-names.patch b/SOURCES/kvm-migration-Rename-thread-debug-names.patch new file mode 100644 index 0000000..e674991 --- /dev/null +++ 
b/SOURCES/kvm-migration-Rename-thread-debug-names.patch @@ -0,0 +1,157 @@ +From 210d413ed90983f8a29576cd13c02b8598dc3b2b Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Wed, 19 Jun 2024 18:30:37 -0400 +Subject: [PATCH 02/11] migration: Rename thread debug names + +RH-Author: Juraj Marcin +RH-MergeRequest: 419: migration: New postcopy state, and some cleanups [rhel-9.5.z] +RH-Jira: RHEL-63874 +RH-Acked-by: Peter Xu +RH-Acked-by: Miroslav Rezanina +RH-Commit: [2/11] b038b81af86b7b18642f81a3e23528912d1bd4ea + +The postcopy thread names on dest QEMU are slightly confusing, partly I'll +need to blame myself on 36f62f11e4 ("migration: Postcopy preemption +preparation on channel creation"). E.g., "fault-fast" reads like a fast +version of "fault-default", but it's actually the fast version of +"postcopy/listen". + +Taking this chance, rename all the migration threads with proper rules. +Considering we only have 15 chars usable, prefix all threads with "mig/", +meanwhile identify src/dst threads properly this time. So now most thread +names will look like "mig/DIR/xxx", where DIR will be "src"/"dst", except +the bg-snapshot thread which doesn't have a direction. + +For multifd threads, making them "mig/{src|dst}/{send|recv}_%d". + +We used to have "live_migration" thread for a very long time, now it's +called "mig/src/main". We may hope to have "mig/dst/main" soon but not +yet. 
+ +Reviewed-by: Fabiano Rosas +Reviewed-by: Zhijian Li (Fujitsu) +Signed-off-by: Peter Xu +Signed-off-by: Fabiano Rosas + +(cherry picked from commit 60ce47675d74ddae3f13a32767d097d9fecbda4b) + +JIRA: https://issues.redhat.com/browse/RHEL-63874 +Y-JIRA: https://issues.redhat.com/browse/RHEL-38485 + +Signed-off-by: Juraj Marcin +--- + migration/colo.c | 2 +- + migration/migration.c | 6 +++--- + migration/multifd.c | 6 +++--- + migration/postcopy-ram.c | 4 ++-- + migration/savevm.c | 2 +- + 5 files changed, 10 insertions(+), 10 deletions(-) + +diff --git a/migration/colo.c b/migration/colo.c +index 84632a603e..560f910fb0 100644 +--- a/migration/colo.c ++++ b/migration/colo.c +@@ -938,7 +938,7 @@ int coroutine_fn colo_incoming_co(void) + return -EINVAL; + } + +- qemu_thread_create(&th, "COLO incoming", colo_process_incoming_thread, ++ qemu_thread_create(&th, "mig/dst/colo", colo_process_incoming_thread, + mis, QEMU_THREAD_JOINABLE); + + mis->colo_incoming_co = qemu_coroutine_self(); +diff --git a/migration/migration.c b/migration/migration.c +index 86bf76e925..4e9d3522be 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -2447,7 +2447,7 @@ static int open_return_path_on_source(MigrationState *ms) + + trace_open_return_path_on_source(); + +- qemu_thread_create(&ms->rp_state.rp_thread, "return path", ++ qemu_thread_create(&ms->rp_state.rp_thread, "mig/src/rp-thr", + source_return_path_thread, ms, QEMU_THREAD_JOINABLE); + ms->rp_state.rp_thread_created = true; + +@@ -3755,10 +3755,10 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) + } + + if (migrate_background_snapshot()) { +- qemu_thread_create(&s->thread, "bg_snapshot", ++ qemu_thread_create(&s->thread, "mig/snapshot", + bg_migration_thread, s, QEMU_THREAD_JOINABLE); + } else { +- qemu_thread_create(&s->thread, "live_migration", ++ qemu_thread_create(&s->thread, "mig/src/main", + migration_thread, s, QEMU_THREAD_JOINABLE); + } + s->migration_thread_running = true; +diff --git 
a/migration/multifd.c b/migration/multifd.c +index 2802afe79d..8b5be2a17e 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -1058,7 +1058,7 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, + args->p = p; + + p->tls_thread_created = true; +- qemu_thread_create(&p->tls_thread, "multifd-tls-handshake-worker", ++ qemu_thread_create(&p->tls_thread, "mig/src/tls", + multifd_tls_handshake_thread, args, + QEMU_THREAD_JOINABLE); + return true; +@@ -1184,7 +1184,7 @@ bool multifd_send_setup(void) + } else { + p->iov = g_new0(struct iovec, page_count); + } +- p->name = g_strdup_printf("multifdsend_%d", i); ++ p->name = g_strdup_printf("mig/src/send_%d", i); + p->page_size = qemu_target_page_size(); + p->page_count = page_count; + p->write_flags = 0; +@@ -1600,7 +1600,7 @@ int multifd_recv_setup(Error **errp) + + sizeof(uint64_t) * page_count; + p->packet = g_malloc0(p->packet_len); + } +- p->name = g_strdup_printf("multifdrecv_%d", i); ++ p->name = g_strdup_printf("mig/dst/recv_%d", i); + p->iov = g_new0(struct iovec, page_count); + p->normal = g_new0(ram_addr_t, page_count); + p->zero = g_new0(ram_addr_t, page_count); +diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c +index eccff499cb..ef184d8d08 100644 +--- a/migration/postcopy-ram.c ++++ b/migration/postcopy-ram.c +@@ -1238,7 +1238,7 @@ int postcopy_ram_incoming_setup(MigrationIncomingState *mis) + return -1; + } + +- postcopy_thread_create(mis, &mis->fault_thread, "fault-default", ++ postcopy_thread_create(mis, &mis->fault_thread, "mig/dst/fault", + postcopy_ram_fault_thread, QEMU_THREAD_JOINABLE); + mis->have_fault_thread = true; + +@@ -1258,7 +1258,7 @@ int postcopy_ram_incoming_setup(MigrationIncomingState *mis) + * This thread needs to be created after the temp pages because + * it'll fetch RAM_CHANNEL_POSTCOPY PostcopyTmpPage immediately. 
+ */ +- postcopy_thread_create(mis, &mis->postcopy_prio_thread, "fault-fast", ++ postcopy_thread_create(mis, &mis->postcopy_prio_thread, "mig/dst/preempt", + postcopy_preempt_thread, QEMU_THREAD_JOINABLE); + mis->preempt_thread_status = PREEMPT_THREAD_CREATED; + } +diff --git a/migration/savevm.c b/migration/savevm.c +index e7c1215671..5aa595e365 100644 +--- a/migration/savevm.c ++++ b/migration/savevm.c +@@ -2127,7 +2127,7 @@ static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis) + } + + mis->have_listen_thread = true; +- postcopy_thread_create(mis, &mis->listen_thread, "postcopy/listen", ++ postcopy_thread_create(mis, &mis->listen_thread, "mig/dst/listen", + postcopy_ram_listen_thread, QEMU_THREAD_DETACHED); + trace_loadvm_postcopy_handle_listen("return"); + +-- +2.39.3 + diff --git a/SOURCES/kvm-migration-Use-MigrationStatus-instead-of-int.patch b/SOURCES/kvm-migration-Use-MigrationStatus-instead-of-int.patch new file mode 100644 index 0000000..9808d92 --- /dev/null +++ b/SOURCES/kvm-migration-Use-MigrationStatus-instead-of-int.patch @@ -0,0 +1,146 @@ +From 04b4c59f81eb7547c6baa5e269c795b98ddce3ef Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Wed, 19 Jun 2024 18:30:38 -0400 +Subject: [PATCH 03/11] migration: Use MigrationStatus instead of int + +RH-Author: Juraj Marcin +RH-MergeRequest: 419: migration: New postcopy state, and some cleanups [rhel-9.5.z] +RH-Jira: RHEL-63874 +RH-Acked-by: Peter Xu +RH-Acked-by: Miroslav Rezanina +RH-Commit: [3/11] 47e144753584750732f716b13c172cd67806cb17 + +QEMU uses "int" in most cases even if it stores MigrationStatus. I don't +know why, so let's try to do that right and see what blows up.. 
+ +Reviewed-by: Fabiano Rosas +Signed-off-by: Peter Xu +Signed-off-by: Fabiano Rosas + +(cherry picked from commit a5c24e13e9f176901058b460e61425756322f3e8) + +JIRA: https://issues.redhat.com/browse/RHEL-63874 +Y-JIRA: https://issues.redhat.com/browse/RHEL-38485 + +Signed-off-by: Juraj Marcin +--- + migration/migration.c | 24 +++++++----------------- + migration/migration.h | 9 +++++---- + 2 files changed, 12 insertions(+), 21 deletions(-) + +diff --git a/migration/migration.c b/migration/migration.c +index 4e9d3522be..b6cf04e043 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -390,7 +390,7 @@ void migration_incoming_state_destroy(void) + yank_unregister_instance(MIGRATION_YANK_INSTANCE); + } + +-static void migrate_generate_event(int new_state) ++static void migrate_generate_event(MigrationStatus new_state) + { + if (migrate_events()) { + qapi_event_send_migration(new_state); +@@ -1294,8 +1294,6 @@ static void fill_destination_migration_info(MigrationInfo *info) + } + + switch (mis->state) { +- case MIGRATION_STATUS_NONE: +- return; + case MIGRATION_STATUS_SETUP: + case MIGRATION_STATUS_CANCELLING: + case MIGRATION_STATUS_CANCELLED: +@@ -1311,6 +1309,8 @@ static void fill_destination_migration_info(MigrationInfo *info) + info->has_status = true; + fill_destination_postcopy_migration_info(info); + break; ++ default: ++ return; + } + info->status = mis->state; + } +@@ -1349,7 +1349,8 @@ void qmp_migrate_start_postcopy(Error **errp) + + /* shared migration helpers */ + +-void migrate_set_state(int *state, int old_state, int new_state) ++void migrate_set_state(MigrationStatus *state, MigrationStatus old_state, ++ MigrationStatus new_state) + { + assert(new_state < MIGRATION_STATUS__MAX); + if (qatomic_cmpxchg(state, old_state, new_state) == old_state) { +@@ -1555,7 +1556,7 @@ bool migration_in_postcopy(void) + } + } + +-bool migration_postcopy_is_alive(int state) ++bool migration_postcopy_is_alive(MigrationStatus state) + { + switch (state) { + 
case MIGRATION_STATUS_POSTCOPY_ACTIVE: +@@ -1600,20 +1601,9 @@ bool migration_is_idle(void) + case MIGRATION_STATUS_COMPLETED: + case MIGRATION_STATUS_FAILED: + return true; +- case MIGRATION_STATUS_SETUP: +- case MIGRATION_STATUS_CANCELLING: +- case MIGRATION_STATUS_ACTIVE: +- case MIGRATION_STATUS_POSTCOPY_ACTIVE: +- case MIGRATION_STATUS_COLO: +- case MIGRATION_STATUS_PRE_SWITCHOVER: +- case MIGRATION_STATUS_DEVICE: +- case MIGRATION_STATUS_WAIT_UNPLUG: ++ default: + return false; +- case MIGRATION_STATUS__MAX: +- g_assert_not_reached(); + } +- +- return false; + } + + bool migration_is_active(void) +diff --git a/migration/migration.h b/migration/migration.h +index 8045e39c26..bc9c802595 100644 +--- a/migration/migration.h ++++ b/migration/migration.h +@@ -160,7 +160,7 @@ struct MigrationIncomingState { + /* PostCopyFD's for external userfaultfds & handlers of shared memory */ + GArray *postcopy_remote_fds; + +- int state; ++ MigrationStatus state; + + /* + * The incoming migration coroutine, non-NULL during qemu_loadvm_state(). 
+@@ -298,7 +298,7 @@ struct MigrationState { + /* params from 'migrate-set-parameters' */ + MigrationParameters parameters; + +- int state; ++ MigrationStatus state; + + /* State related to return path */ + struct { +@@ -467,7 +467,8 @@ struct MigrationState { + bool rdma_migration; + }; + +-void migrate_set_state(int *state, int old_state, int new_state); ++void migrate_set_state(MigrationStatus *state, MigrationStatus old_state, ++ MigrationStatus new_state); + + void migration_fd_process_incoming(QEMUFile *f); + void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp); +@@ -487,7 +488,7 @@ int migrate_init(MigrationState *s, Error **errp); + bool migration_is_blocked(Error **errp); + /* True if outgoing migration has entered postcopy phase */ + bool migration_in_postcopy(void); +-bool migration_postcopy_is_alive(int state); ++bool migration_postcopy_is_alive(MigrationStatus state); + MigrationState *migrate_get_current(void); + bool migration_has_failed(MigrationState *); + bool migrate_mode_is_cpr(MigrationState *); +-- +2.39.3 + diff --git a/SOURCES/kvm-migration-docs-Update-postcopy-recover-session-for-S.patch b/SOURCES/kvm-migration-docs-Update-postcopy-recover-session-for-S.patch new file mode 100644 index 0000000..6abd957 --- /dev/null +++ b/SOURCES/kvm-migration-docs-Update-postcopy-recover-session-for-S.patch @@ -0,0 +1,94 @@ +From a35f4af0c143c0b6655bb1123e1734a5a9dd890e Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Wed, 19 Jun 2024 18:30:41 -0400 +Subject: [PATCH 06/11] migration/docs: Update postcopy recover session for + SETUP phase + +RH-Author: Juraj Marcin +RH-MergeRequest: 419: migration: New postcopy state, and some cleanups [rhel-9.5.z] +RH-Jira: RHEL-63874 +RH-Acked-by: Peter Xu +RH-Acked-by: Miroslav Rezanina +RH-Commit: [6/11] f84c228f019a30f23313cbfe7cb39ca8aa0aee84 + +Firstly, the "Paused" state was added in the wrong place before. The state +machine section was describing PostcopyState, rather than MigrationStatus. 
+Drop the Paused state descriptions. + +Then in the postcopy recover session, add more information on the state +machine for MigrationStatus in the lines. Add the new RECOVER_SETUP phase. + +Reviewed-by: Fabiano Rosas +Signed-off-by: Peter Xu +[fix typo s/reconnects/reconnect] +Signed-off-by: Fabiano Rosas + +(cherry picked from commit 21e89f7ad526f0dddfc722e615bfb0fcdb705c87) + +JIRA: https://issues.redhat.com/browse/RHEL-63874 +Y-JIRA: https://issues.redhat.com/browse/RHEL-38485 + +Signed-off-by: Juraj Marcin +--- + docs/devel/migration/postcopy.rst | 31 ++++++++++++++++--------------- + 1 file changed, 16 insertions(+), 15 deletions(-) + +diff --git a/docs/devel/migration/postcopy.rst b/docs/devel/migration/postcopy.rst +index 6c51e96d79..82e7a848c6 100644 +--- a/docs/devel/migration/postcopy.rst ++++ b/docs/devel/migration/postcopy.rst +@@ -99,17 +99,6 @@ ADVISE->DISCARD->LISTEN->RUNNING->END + (although it can't do the cleanup it would do as it + finishes a normal migration). + +- - Paused +- +- Postcopy can run into a paused state (normally on both sides when +- happens), where all threads will be temporarily halted mostly due to +- network errors. When reaching paused state, migration will make sure +- the qemu binary on both sides maintain the data without corrupting +- the VM. To continue the migration, the admin needs to fix the +- migration channel using the QMP command 'migrate-recover' on the +- destination node, then resume the migration using QMP command 'migrate' +- again on source node, with resume=true flag set. +- + - End + + The listen thread can now quit, and perform the cleanup of migration +@@ -221,7 +210,8 @@ paused postcopy migration. + + The recovery phase normally contains a few steps: + +- - When network issue occurs, both QEMU will go into PAUSED state ++ - When network issue occurs, both QEMU will go into **POSTCOPY_PAUSED** ++ migration state. 
+ + - When the network is recovered (or a new network is provided), the admin + can setup the new channel for migration using QMP command +@@ -229,9 +219,20 @@ The recovery phase normally contains a few steps: + + - On source host, the admin can continue the interrupted postcopy + migration using QMP command 'migrate' with resume=true flag set. +- +- - After the connection is re-established, QEMU will continue the postcopy +- migration on both sides. ++ Source QEMU will go into **POSTCOPY_RECOVER_SETUP** state trying to ++ re-establish the channels. ++ ++ - When both sides of QEMU successfully reconnect using a new or fixed up ++ channel, they will go into **POSTCOPY_RECOVER** state, some handshake ++ procedure will be needed to properly synchronize the VM states between ++ the two QEMUs to continue the postcopy migration. For example, there ++ can be pages sent right during the window when the network is ++ interrupted, then the handshake will guarantee pages lost in-flight ++ will be resent again. ++ ++ - After a proper handshake synchronization, QEMU will continue the ++ postcopy migration on both sides and go back to **POSTCOPY_ACTIVE** ++ state. Postcopy migration will continue. 
+ + During a paused postcopy migration, the VM can logically still continue + running, and it will not be impacted from any page access to pages that +-- +2.39.3 + diff --git a/SOURCES/kvm-migration-multifd-Avoid-the-final-FLUSH-in-complete.patch b/SOURCES/kvm-migration-multifd-Avoid-the-final-FLUSH-in-complete.patch new file mode 100644 index 0000000..8756f72 --- /dev/null +++ b/SOURCES/kvm-migration-multifd-Avoid-the-final-FLUSH-in-complete.patch @@ -0,0 +1,50 @@ +From ee276dfcc7d4b25214ec6745ebf55c4666b3bd0a Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Wed, 19 Jun 2024 18:30:36 -0400 +Subject: [PATCH 01/11] migration/multifd: Avoid the final FLUSH in complete() + +RH-Author: Juraj Marcin +RH-MergeRequest: 419: migration: New postcopy state, and some cleanups [rhel-9.5.z] +RH-Jira: RHEL-63874 +RH-Acked-by: Peter Xu +RH-Acked-by: Miroslav Rezanina +RH-Commit: [1/11] 028e310f65eaad098ef62bdb8a5d30b9a5cd32e2 + +We always do the flush when finishing one round of scan, and during +complete() phase we should scan one more round making sure no dirty page +existed. In that case we shouldn't need one explicit FLUSH at the end of +complete(), as when reaching there all pages should have been flushed. 
+ +Reviewed-by: Fabiano Rosas +Tested-by: Fabiano Rosas +Signed-off-by: Peter Xu +Signed-off-by: Fabiano Rosas + +(cherry picked from commit 637280aeb242517ede480aa2d5ba1c29d41eac11) + +JIRA: https://issues.redhat.com/browse/RHEL-63874 +Y-JIRA: https://issues.redhat.com/browse/RHEL-38485 + +Signed-off-by: Juraj Marcin +--- + migration/ram.c | 4 ---- + 1 file changed, 4 deletions(-) + +diff --git a/migration/ram.c b/migration/ram.c +index 8deb84984f..3ef84e7036 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -3383,10 +3383,6 @@ static int ram_save_complete(QEMUFile *f, void *opaque) + } + } + +- if (migrate_multifd() && !migrate_multifd_flush_after_each_section() && +- !migrate_mapped_ram()) { +- qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); +- } + qemu_put_be64(f, RAM_SAVE_FLAG_EOS); + return qemu_fflush(f); + } +-- +2.39.3 + diff --git a/SOURCES/kvm-migration-postcopy-Add-postcopy-recover-setup-phase.patch b/SOURCES/kvm-migration-postcopy-Add-postcopy-recover-setup-phase.patch new file mode 100644 index 0000000..064d1c1 --- /dev/null +++ b/SOURCES/kvm-migration-postcopy-Add-postcopy-recover-setup-phase.patch @@ -0,0 +1,280 @@ +From c9eb5f8e86d031060c72aeb9d995844c6f842c58 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Wed, 19 Jun 2024 18:30:40 -0400 +Subject: [PATCH 05/11] migration/postcopy: Add postcopy-recover-setup phase + +RH-Author: Juraj Marcin +RH-MergeRequest: 419: migration: New postcopy state, and some cleanups [rhel-9.5.z] +RH-Jira: RHEL-63874 +RH-Acked-by: Peter Xu +RH-Acked-by: Miroslav Rezanina +RH-Commit: [5/11] ce81d3b247b9f9541a75265a07082394ce419f3a + +This patch adds a migration state on src called "postcopy-recover-setup". +The new state will describe the intermediate step starting from when the +src QEMU received a postcopy recovery request, until the migration channels +are properly established, but before the recovery process take place. 
+ +The request came from Libvirt where Libvirt currently relies on the migration +state events to detect migration state changes. That works for most of the +migration process, except for postcopy recovery failures at the beginning. + +Currently postcopy recovery only has two major states: + + - postcopy-paused: this is the state that both sides of QEMU will be in + for a long time as long as the migration channel was interrupted. + + - postcopy-recover: this is the state where both sides of QEMU handshake + with each other, preparing for a continuation of postcopy which used to + be interrupted. + +The issue here is when the recovery port is invalid, the src QEMU will take +the URI/channels, noticing the ports are not valid, and it'll silently stay +in the postcopy-paused state, with no event sent to Libvirt. In this case, +the only thing Libvirt can do is to poll the migration status with a proper +interval, however that's less optimal. + +Considering that this is the only case where Libvirt won't get a +notification from QEMU on such events, let's add postcopy-recover-setup +state to mimic what we have with the "setup" state of a newly initialized +migration, describing the phase of connection establishment. + +With that, postcopy recovery will have two paths to go now, and either path +will guarantee that an event is generated. Now the events will look like this +during a recovery process on src QEMU: + + - Initially when the recovery is initiated on src, QEMU will go from + "postcopy-paused" -> "postcopy-recover-setup". Old QEMUs don't have + this event. + + - Depending on whether the channel re-establishment succeeds: + + - In the success case, src QEMU will move from "postcopy-recover-setup" + to "postcopy-recover". Old QEMUs also have this event. + + - In the failure case, src QEMU will move from "postcopy-recover-setup" to + "postcopy-paused" again. Old QEMUs don't have this event. 
+ +This guarantees that Libvirt will always receive a notification for +recovery process properly. + +One thing to mention is, such new status is only needed on src QEMU not +both. On dest QEMU, the state machine doesn't change. Hence the events +don't change either. It's done like so because dest QEMU may not have an +explicit point of setup start. E.g., it can happen that when dest QEMUs +doesn't use migrate-recover command to use a new URI/channel, but the old +URI/channels can be reused in recovery, in which case the old ports simply +can work again after the network routes are fixed up. + +Add a new helper postcopy_is_paused() detecting whether postcopy is still +paused, taking RECOVER_SETUP into account too. When using it on both +src/dst, a slight change is done altogether to always wait for the +semaphore before checking the status, because for both sides a sem_post() +will be required for a recovery. + +Cc: Jiri Denemark +Cc: Prasad Pandit +Reviewed-by: Fabiano Rosas +Buglink: https://issues.redhat.com/browse/RHEL-38485 +Signed-off-by: Peter Xu +Signed-off-by: Fabiano Rosas + +(cherry picked from commit 4146b77ec7640d3c30d42558e13423594b114385) + +JIRA: https://issues.redhat.com/browse/RHEL-63874 +Y-JIRA: https://issues.redhat.com/browse/RHEL-38485 + +Signed-off-by: Juraj Marcin +--- + migration/migration.c | 40 ++++++++++++++++++++++++++++++++++------ + migration/postcopy-ram.c | 6 ++++++ + migration/postcopy-ram.h | 3 +++ + migration/savevm.c | 4 ++-- + qapi/migration.json | 4 ++++ + 5 files changed, 49 insertions(+), 8 deletions(-) + +diff --git a/migration/migration.c b/migration/migration.c +index 21f20a8e1c..03e151a045 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -1100,6 +1100,7 @@ bool migration_is_setup_or_active(void) + case MIGRATION_STATUS_ACTIVE: + case MIGRATION_STATUS_POSTCOPY_ACTIVE: + case MIGRATION_STATUS_POSTCOPY_PAUSED: ++ case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP: + case MIGRATION_STATUS_POSTCOPY_RECOVER: + 
case MIGRATION_STATUS_SETUP: + case MIGRATION_STATUS_PRE_SWITCHOVER: +@@ -1122,6 +1123,7 @@ bool migration_is_running(void) + case MIGRATION_STATUS_ACTIVE: + case MIGRATION_STATUS_POSTCOPY_ACTIVE: + case MIGRATION_STATUS_POSTCOPY_PAUSED: ++ case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP: + case MIGRATION_STATUS_POSTCOPY_RECOVER: + case MIGRATION_STATUS_SETUP: + case MIGRATION_STATUS_PRE_SWITCHOVER: +@@ -1273,6 +1275,7 @@ static void fill_source_migration_info(MigrationInfo *info) + case MIGRATION_STATUS_PRE_SWITCHOVER: + case MIGRATION_STATUS_DEVICE: + case MIGRATION_STATUS_POSTCOPY_PAUSED: ++ case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP: + case MIGRATION_STATUS_POSTCOPY_RECOVER: + /* TODO add some postcopy stats */ + populate_time_info(info, s); +@@ -1469,10 +1472,31 @@ static void migrate_error_free(MigrationState *s) + + static void migrate_fd_error(MigrationState *s, const Error *error) + { ++ MigrationStatus current = s->state; ++ MigrationStatus next; ++ + trace_migrate_fd_error(error_get_pretty(error)); + assert(s->to_dst_file == NULL); +- migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, +- MIGRATION_STATUS_FAILED); ++ ++ switch (current) { ++ case MIGRATION_STATUS_SETUP: ++ next = MIGRATION_STATUS_FAILED; ++ break; ++ case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP: ++ /* Never fail a postcopy migration; switch back to PAUSED instead */ ++ next = MIGRATION_STATUS_POSTCOPY_PAUSED; ++ break; ++ default: ++ /* ++ * This really shouldn't happen. Just be careful to not crash a VM ++ * just for this. Instead, dump something. 
++ */ ++ error_report("%s: Illegal migration status (%s) detected", ++ __func__, MigrationStatus_str(current)); ++ return; ++ } ++ ++ migrate_set_state(&s->state, current, next); + migrate_set_error(s, error); + } + +@@ -1573,6 +1597,7 @@ bool migration_in_postcopy(void) + switch (s->state) { + case MIGRATION_STATUS_POSTCOPY_ACTIVE: + case MIGRATION_STATUS_POSTCOPY_PAUSED: ++ case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP: + case MIGRATION_STATUS_POSTCOPY_RECOVER: + return true; + default: +@@ -1965,6 +1990,9 @@ static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc, + return false; + } + ++ migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_PAUSED, ++ MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP); ++ + /* This is a resume, skip init status */ + return true; + } +@@ -3020,9 +3048,9 @@ static MigThrError postcopy_pause(MigrationState *s) + * We wait until things fixed up. Then someone will setup the + * status back for us. + */ +- while (s->state == MIGRATION_STATUS_POSTCOPY_PAUSED) { ++ do { + qemu_sem_wait(&s->postcopy_pause_sem); +- } ++ } while (postcopy_is_paused(s->state)); + + if (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) { + /* Woken up by a recover procedure. 
Give it a shot */ +@@ -3687,7 +3715,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) + { + Error *local_err = NULL; + uint64_t rate_limit; +- bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED; ++ bool resume = (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP); + int ret; + + /* +@@ -3754,7 +3782,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) + + if (resume) { + /* Wakeup the main migration thread to do the recovery */ +- migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_PAUSED, ++ migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP, + MIGRATION_STATUS_POSTCOPY_RECOVER); + qemu_sem_post(&s->postcopy_pause_sem); + return; +diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c +index ef184d8d08..be10611048 100644 +--- a/migration/postcopy-ram.c ++++ b/migration/postcopy-ram.c +@@ -1770,3 +1770,9 @@ void *postcopy_preempt_thread(void *opaque) + + return NULL; + } ++ ++bool postcopy_is_paused(MigrationStatus status) ++{ ++ return status == MIGRATION_STATUS_POSTCOPY_PAUSED || ++ status == MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP; ++} +diff --git a/migration/postcopy-ram.h b/migration/postcopy-ram.h +index ecae941211..a6df1b2811 100644 +--- a/migration/postcopy-ram.h ++++ b/migration/postcopy-ram.h +@@ -13,6 +13,8 @@ + #ifndef QEMU_POSTCOPY_RAM_H + #define QEMU_POSTCOPY_RAM_H + ++#include "qapi/qapi-types-migration.h" ++ + /* Return true if the host supports everything we need to do postcopy-ram */ + bool postcopy_ram_supported_by_host(MigrationIncomingState *mis, + Error **errp); +@@ -193,5 +195,6 @@ enum PostcopyChannels { + void postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file); + void postcopy_preempt_setup(MigrationState *s); + int postcopy_preempt_establish_channel(MigrationState *s); ++bool postcopy_is_paused(MigrationStatus status); + + #endif +diff --git a/migration/savevm.c b/migration/savevm.c +index 5aa595e365..a0f7a9dceb 100644 +--- a/migration/savevm.c ++++ 
b/migration/savevm.c +@@ -2860,9 +2860,9 @@ static bool postcopy_pause_incoming(MigrationIncomingState *mis) + error_report("Detected IO failure for postcopy. " + "Migration paused."); + +- while (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) { ++ do { + qemu_sem_wait(&mis->postcopy_pause_sem_dst); +- } ++ } while (postcopy_is_paused(mis->state)); + + trace_postcopy_pause_incoming_continued(); + +diff --git a/qapi/migration.json b/qapi/migration.json +index 8c65b90328..e518563f67 100644 +--- a/qapi/migration.json ++++ b/qapi/migration.json +@@ -150,6 +150,9 @@ + # + # @postcopy-paused: during postcopy but paused. (since 3.0) + # ++# @postcopy-recover-setup: setup phase for a postcopy recovery process, ++# preparing for a recovery phase to start. (since 9.1) ++# + # @postcopy-recover: trying to recover from a paused postcopy. (since + # 3.0) + # +@@ -174,6 +177,7 @@ + { 'enum': 'MigrationStatus', + 'data': [ 'none', 'setup', 'cancelling', 'cancelled', + 'active', 'postcopy-active', 'postcopy-paused', ++ 'postcopy-recover-setup', + 'postcopy-recover', 'completed', 'failed', 'colo', + 'pre-switchover', 'device', 'wait-unplug' ] } + ## +-- +2.39.3 + diff --git a/SOURCES/kvm-tests-migration-tests-Always-enable-migration-events.patch b/SOURCES/kvm-tests-migration-tests-Always-enable-migration-events.patch new file mode 100644 index 0000000..902d067 --- /dev/null +++ b/SOURCES/kvm-tests-migration-tests-Always-enable-migration-events.patch @@ -0,0 +1,69 @@ +From 16ba989e9c7606719bb1ab4d5511bac6c2c0d625 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Wed, 19 Jun 2024 18:30:43 -0400 +Subject: [PATCH 08/11] tests/migration-tests: Always enable migration events + +RH-Author: Juraj Marcin +RH-MergeRequest: 419: migration: New postcopy state, and some cleanups [rhel-9.5.z] +RH-Jira: RHEL-63874 +RH-Acked-by: Peter Xu +RH-Acked-by: Miroslav Rezanina +RH-Commit: [8/11] 02182a6c1e9b492ca90b86e0568657c55bac121d + +Libvirt should always enable it, so it'll be nice qtest also cover 
that for +all tests on both sides. migrate_incoming_qmp() used to enable it only on +dst, now we enable them on both, as we'll start to sanity check events even +on the src QEMU. + +We'll need to leave the one in migrate_incoming_qmp(), because +virtio-net-failover test uses that one only, and it relies on the events to +work. + +Signed-off-by: Peter Xu +Reviewed-by: Fabiano Rosas +Signed-off-by: Fabiano Rosas + +(cherry picked from commit cd313b66f203381f2f2f984d5155d7942d26725d) + +JIRA: https://issues.redhat.com/browse/RHEL-63874 +Y-JIRA: https://issues.redhat.com/browse/RHEL-38485 + +Signed-off-by: Juraj Marcin +--- + tests/qtest/migration-helpers.c | 1 + + tests/qtest/migration-test.c | 7 +++++++ + 2 files changed, 8 insertions(+) + +diff --git a/tests/qtest/migration-helpers.c b/tests/qtest/migration-helpers.c +index e451dbdbed..50a6bc2569 100644 +--- a/tests/qtest/migration-helpers.c ++++ b/tests/qtest/migration-helpers.c +@@ -107,6 +107,7 @@ void migrate_incoming_qmp(QTestState *to, const char *uri, const char *fmt, ...) + g_assert(!qdict_haskey(args, "uri")); + qdict_put_str(args, "uri", uri); + ++ /* This function relies on the event to work, make sure it's enabled */ + migrate_set_capability(to, "events", true); + + rsp = qtest_qmp(to, "{ 'execute': 'migrate-incoming', 'arguments': %p}", +diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c +index 0808300f5b..9f29f4e4f3 100644 +--- a/tests/qtest/migration-test.c ++++ b/tests/qtest/migration-test.c +@@ -908,6 +908,13 @@ static int test_migrate_start(QTestState **from, QTestState **to, + unlink(shmem_path); + } + ++ /* ++ * Always enable migration events. Libvirt always uses it, let's try ++ * to mimic as closer as that. 
++ */ ++ migrate_set_capability(*from, "events", true); ++ migrate_set_capability(*to, "events", true); ++ + return 0; + } + +-- +2.39.3 + diff --git a/SOURCES/kvm-tests-migration-tests-Cover-postcopy-failure-on-reco.patch b/SOURCES/kvm-tests-migration-tests-Cover-postcopy-failure-on-reco.patch new file mode 100644 index 0000000..6411ea1 --- /dev/null +++ b/SOURCES/kvm-tests-migration-tests-Cover-postcopy-failure-on-reco.patch @@ -0,0 +1,219 @@ +From e503d6466ec8dd6c51b5891bd52f6f4076210f8b Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Wed, 19 Jun 2024 18:30:46 -0400 +Subject: [PATCH 11/11] tests/migration-tests: Cover postcopy failure on + reconnect + +RH-Author: Juraj Marcin +RH-MergeRequest: 419: migration: New postcopy state, and some cleanups [rhel-9.5.z] +RH-Jira: RHEL-63874 +RH-Acked-by: Peter Xu +RH-Acked-by: Miroslav Rezanina +RH-Commit: [11/11] 0295f627c0e8e7bae6dcd695c063b17717c7590f + +Make sure there will be an event for postcopy recovery, irrelevant of +whether the reconnect will success, or when the failure happens. + +The added new case is to fail early in postcopy recovery, in which case it +didn't even reach RECOVER stage on src (and in real life it'll be the same +to dest, but the test case is just slightly more involved due to the dual +socketpair setup). + +To do that, rename the postcopy_recovery_test_fail to reflect either stage +to fail, instead of a boolean. 
+ +Reviewed-by: Fabiano Rosas +Signed-off-by: Peter Xu +Signed-off-by: Fabiano Rosas + +(cherry picked from commit 6cf56a87baf8b99c4296a943d220eb8276ca035a) + +JIRA: https://issues.redhat.com/browse/RHEL-63874 +Y-JIRA: https://issues.redhat.com/browse/RHEL-38485 + +Signed-off-by: Juraj Marcin +--- + tests/qtest/migration-test.c | 95 +++++++++++++++++++++++++++++------- + 1 file changed, 77 insertions(+), 18 deletions(-) + +diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c +index afe8270dd0..d903e3e0fa 100644 +--- a/tests/qtest/migration-test.c ++++ b/tests/qtest/migration-test.c +@@ -74,6 +74,17 @@ static QTestMigrationState dst_state; + #define QEMU_ENV_SRC "QTEST_QEMU_BINARY_SRC" + #define QEMU_ENV_DST "QTEST_QEMU_BINARY_DST" + ++typedef enum PostcopyRecoveryFailStage { ++ /* ++ * "no failure" must be 0 as it's the default. OTOH, real failure ++ * cases must be >0 to make sure they trigger by a "if" test. ++ */ ++ POSTCOPY_FAIL_NONE = 0, ++ POSTCOPY_FAIL_CHANNEL_ESTABLISH, ++ POSTCOPY_FAIL_RECOVERY, ++ POSTCOPY_FAIL_MAX ++} PostcopyRecoveryFailStage; ++ + #if defined(__linux__) + #include + #include +@@ -753,7 +764,7 @@ typedef struct { + /* Postcopy specific fields */ + void *postcopy_data; + bool postcopy_preempt; +- bool postcopy_recovery_test_fail; ++ PostcopyRecoveryFailStage postcopy_recovery_fail_stage; + } MigrateCommon; + + static int test_migrate_start(QTestState **from, QTestState **to, +@@ -1467,12 +1478,16 @@ static void wait_for_postcopy_status(QTestState *one, const char *status) + "completed", NULL }); + } + +-static void postcopy_recover_fail(QTestState *from, QTestState *to) ++static void postcopy_recover_fail(QTestState *from, QTestState *to, ++ PostcopyRecoveryFailStage stage) + { + #ifndef _WIN32 ++ bool fail_early = (stage == POSTCOPY_FAIL_CHANNEL_ESTABLISH); + int ret, pair1[2], pair2[2]; + char c; + ++ g_assert(stage > POSTCOPY_FAIL_NONE && stage < POSTCOPY_FAIL_MAX); ++ + /* Create two unrelated socketpairs */ + ret 
= qemu_socketpair(PF_LOCAL, SOCK_STREAM, 0, pair1); + g_assert_cmpint(ret, ==, 0); +@@ -1506,6 +1521,14 @@ static void postcopy_recover_fail(QTestState *from, QTestState *to) + ret = send(pair2[1], &c, 1, 0); + g_assert_cmpint(ret, ==, 1); + ++ if (stage == POSTCOPY_FAIL_CHANNEL_ESTABLISH) { ++ /* ++ * This will make src QEMU to fail at an early stage when trying to ++ * resume later, where it shouldn't reach RECOVER stage at all. ++ */ ++ close(pair1[1]); ++ } ++ + migrate_recover(to, "fd:fd-mig"); + migrate_qmp(from, "fd:fd-mig", "{'resume': true}"); + +@@ -1515,28 +1538,53 @@ static void postcopy_recover_fail(QTestState *from, QTestState *to) + */ + migration_event_wait(from, "postcopy-recover-setup"); + ++ if (fail_early) { ++ /* ++ * When fails at reconnection, src QEMU will automatically goes ++ * back to PAUSED state. Making sure there is an event in this ++ * case: Libvirt relies on this to detect early reconnection ++ * errors. ++ */ ++ migration_event_wait(from, "postcopy-paused"); ++ } else { ++ /* ++ * We want to test "fail later" at RECOVER stage here. Make sure ++ * both QEMU instances will go into RECOVER stage first, then test ++ * kicking them out using migrate-pause. ++ * ++ * Explicitly check the RECOVER event on src, that's what Libvirt ++ * relies on, rather than polling. ++ */ ++ migration_event_wait(from, "postcopy-recover"); ++ wait_for_postcopy_status(from, "postcopy-recover"); ++ ++ /* Need an explicit kick on src QEMU in this case */ ++ migrate_pause(from); ++ } ++ + /* +- * Make sure both QEMU instances will go into RECOVER stage, then test +- * kicking them out using migrate-pause. ++ * For all failure cases, we'll reach such states on both sides now. ++ * Check them. 
+ */ +- wait_for_postcopy_status(from, "postcopy-recover"); ++ wait_for_postcopy_status(from, "postcopy-paused"); + wait_for_postcopy_status(to, "postcopy-recover"); + + /* +- * This would be issued by the admin upon noticing the hang, we should +- * make sure we're able to kick this out. ++ * Kick dest QEMU out too. This is normally not needed in reality ++ * because when the channel is shutdown it should also happen on src. ++ * However here we used separate socket pairs so we need to do that ++ * explicitly. + */ +- migrate_pause(from); +- wait_for_postcopy_status(from, "postcopy-paused"); +- +- /* Do the same test on dest */ + migrate_pause(to); + wait_for_postcopy_status(to, "postcopy-paused"); + + close(pair1[0]); +- close(pair1[1]); + close(pair2[0]); + close(pair2[1]); ++ ++ if (stage != POSTCOPY_FAIL_CHANNEL_ESTABLISH) { ++ close(pair1[1]); ++ } + #endif + } + +@@ -1578,12 +1626,12 @@ static void test_postcopy_recovery_common(MigrateCommon *args) + wait_for_postcopy_status(to, "postcopy-paused"); + wait_for_postcopy_status(from, "postcopy-paused"); + +- if (args->postcopy_recovery_test_fail) { ++ if (args->postcopy_recovery_fail_stage) { + /* + * Test when a wrong socket specified for recover, and then the + * ability to kick it out, and continue with a correct socket. 
+ */ +- postcopy_recover_fail(from, to); ++ postcopy_recover_fail(from, to, args->postcopy_recovery_fail_stage); + /* continue with a good recovery */ + } + +@@ -1623,10 +1671,19 @@ static void test_postcopy_recovery_compress(void) + test_postcopy_recovery_common(&args); + } + +-static void test_postcopy_recovery_double_fail(void) ++static void test_postcopy_recovery_fail_handshake(void) ++{ ++ MigrateCommon args = { ++ .postcopy_recovery_fail_stage = POSTCOPY_FAIL_RECOVERY, ++ }; ++ ++ test_postcopy_recovery_common(&args); ++} ++ ++static void test_postcopy_recovery_fail_reconnect(void) + { + MigrateCommon args = { +- .postcopy_recovery_test_fail = true, ++ .postcopy_recovery_fail_stage = POSTCOPY_FAIL_CHANNEL_ESTABLISH, + }; + + test_postcopy_recovery_common(&args); +@@ -3604,8 +3661,10 @@ int main(int argc, char **argv) + migration_test_add("/migration/postcopy/recovery/compress/plain", + test_postcopy_recovery_compress); + } +- migration_test_add("/migration/postcopy/recovery/double-failures", +- test_postcopy_recovery_double_fail); ++ migration_test_add("/migration/postcopy/recovery/double-failures/handshake", ++ test_postcopy_recovery_fail_handshake); ++ migration_test_add("/migration/postcopy/recovery/double-failures/reconnect", ++ test_postcopy_recovery_fail_reconnect); + if (is_x86) { + migration_test_add("/migration/postcopy/suspend", + test_postcopy_suspend); +-- +2.39.3 + diff --git a/SOURCES/kvm-tests-migration-tests-Drop-most-WIN32-ifdefs-for-pos.patch b/SOURCES/kvm-tests-migration-tests-Drop-most-WIN32-ifdefs-for-pos.patch new file mode 100644 index 0000000..8d61a71 --- /dev/null +++ b/SOURCES/kvm-tests-migration-tests-Drop-most-WIN32-ifdefs-for-pos.patch @@ -0,0 +1,102 @@ +From 13f85a7187ed25b41c6064b94fdcc798e3bf61a0 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Wed, 19 Jun 2024 18:30:42 -0400 +Subject: [PATCH 07/11] tests/migration-tests: Drop most WIN32 ifdefs for + postcopy failure tests + +RH-Author: Juraj Marcin +RH-MergeRequest: 419: 
migration: New postcopy state, and some cleanups [rhel-9.5.z] +RH-Jira: RHEL-63874 +RH-Acked-by: Peter Xu +RH-Acked-by: Miroslav Rezanina +RH-Commit: [7/11] 86500403d1f4baef170d3bc6f6a9cd34862c9076 + +Most of them are not needed, we can stick with one ifdef inside +postcopy_recover_fail() so as to cover the scm right tricks only. +The tests won't run on windows anyway due to has_uffd always false. + +Reviewed-by: Fabiano Rosas +Signed-off-by: Peter Xu +Signed-off-by: Fabiano Rosas + +(cherry picked from commit 0fd397359540a6622c5f2164e76fc2cefd811f2a) + +JIRA: https://issues.redhat.com/browse/RHEL-63874 +Y-JIRA: https://issues.redhat.com/browse/RHEL-38485 + +Signed-off-by: Juraj Marcin +--- + tests/qtest/migration-test.c | 10 ++-------- + 1 file changed, 2 insertions(+), 8 deletions(-) + +diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c +index 1d2cee87ea..0808300f5b 100644 +--- a/tests/qtest/migration-test.c ++++ b/tests/qtest/migration-test.c +@@ -1460,9 +1460,9 @@ static void wait_for_postcopy_status(QTestState *one, const char *status) + "completed", NULL }); + } + +-#ifndef _WIN32 + static void postcopy_recover_fail(QTestState *from, QTestState *to) + { ++#ifndef _WIN32 + int ret, pair1[2], pair2[2]; + char c; + +@@ -1524,8 +1524,8 @@ static void postcopy_recover_fail(QTestState *from, QTestState *to) + close(pair1[1]); + close(pair2[0]); + close(pair2[1]); ++#endif + } +-#endif /* _WIN32 */ + + static void test_postcopy_recovery_common(MigrateCommon *args) + { +@@ -1565,7 +1565,6 @@ static void test_postcopy_recovery_common(MigrateCommon *args) + wait_for_postcopy_status(to, "postcopy-paused"); + wait_for_postcopy_status(from, "postcopy-paused"); + +-#ifndef _WIN32 + if (args->postcopy_recovery_test_fail) { + /* + * Test when a wrong socket specified for recover, and then the +@@ -1574,7 +1573,6 @@ static void test_postcopy_recovery_common(MigrateCommon *args) + postcopy_recover_fail(from, to); + /* continue with a good recovery */ + } 
+-#endif /* _WIN32 */ + + /* + * Create a new socket to emulate a new channel that is different +@@ -1612,7 +1610,6 @@ static void test_postcopy_recovery_compress(void) + test_postcopy_recovery_common(&args); + } + +-#ifndef _WIN32 + static void test_postcopy_recovery_double_fail(void) + { + MigrateCommon args = { +@@ -1621,7 +1618,6 @@ static void test_postcopy_recovery_double_fail(void) + + test_postcopy_recovery_common(&args); + } +-#endif /* _WIN32 */ + + #ifdef CONFIG_GNUTLS + static void test_postcopy_recovery_tls_psk(void) +@@ -3595,10 +3591,8 @@ int main(int argc, char **argv) + migration_test_add("/migration/postcopy/recovery/compress/plain", + test_postcopy_recovery_compress); + } +-#ifndef _WIN32 + migration_test_add("/migration/postcopy/recovery/double-failures", + test_postcopy_recovery_double_fail); +-#endif /* _WIN32 */ + if (is_x86) { + migration_test_add("/migration/postcopy/suspend", + test_postcopy_suspend); +-- +2.39.3 + diff --git a/SOURCES/kvm-tests-migration-tests-Verify-postcopy-recover-setup-.patch b/SOURCES/kvm-tests-migration-tests-Verify-postcopy-recover-setup-.patch new file mode 100644 index 0000000..bbfcc73 --- /dev/null +++ b/SOURCES/kvm-tests-migration-tests-Verify-postcopy-recover-setup-.patch @@ -0,0 +1,50 @@ +From f779d9def0b1f3446054842373b994c3f60cec41 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Wed, 19 Jun 2024 18:30:45 -0400 +Subject: [PATCH 10/11] tests/migration-tests: Verify postcopy-recover-setup + status + +RH-Author: Juraj Marcin +RH-MergeRequest: 419: migration: New postcopy state, and some cleanups [rhel-9.5.z] +RH-Jira: RHEL-63874 +RH-Acked-by: Peter Xu +RH-Acked-by: Miroslav Rezanina +RH-Commit: [10/11] 67ebd3ec3714510483101c84253d0b71ddb5632a + +Making sure the postcopy-recover-setup status is present in the postcopy +failure unit test. Note that it only applies to src QEMU not dest. 
+ +Signed-off-by: Peter Xu +Reviewed-by: Fabiano Rosas +Signed-off-by: Fabiano Rosas + +(cherry picked from commit 8dbd24d3aa6d67b2d3576da016fb631fd1edfc2c) + +JIRA: https://issues.redhat.com/browse/RHEL-63874 +Y-JIRA: https://issues.redhat.com/browse/RHEL-38485 + +Signed-off-by: Juraj Marcin +--- + tests/qtest/migration-test.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c +index 9f29f4e4f3..afe8270dd0 100644 +--- a/tests/qtest/migration-test.c ++++ b/tests/qtest/migration-test.c +@@ -1509,6 +1509,12 @@ static void postcopy_recover_fail(QTestState *from, QTestState *to) + migrate_recover(to, "fd:fd-mig"); + migrate_qmp(from, "fd:fd-mig", "{'resume': true}"); + ++ /* ++ * Source QEMU has an extra RECOVER_SETUP phase, dest doesn't have it. ++ * Make sure it appears along the way. ++ */ ++ migration_event_wait(from, "postcopy-recover-setup"); ++ + /* + * Make sure both QEMU instances will go into RECOVER stage, then test + * kicking them out using migrate-pause. +-- +2.39.3 + diff --git a/SOURCES/kvm-tests-migration-tests-migration_event_wait.patch b/SOURCES/kvm-tests-migration-tests-migration_event_wait.patch new file mode 100644 index 0000000..69fc0e9 --- /dev/null +++ b/SOURCES/kvm-tests-migration-tests-migration_event_wait.patch @@ -0,0 +1,98 @@ +From 0f824a811ff30b2d8bd78eb97ee835598c6be65f Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Wed, 19 Jun 2024 18:30:44 -0400 +Subject: [PATCH 09/11] tests/migration-tests: migration_event_wait() + +RH-Author: Juraj Marcin +RH-MergeRequest: 419: migration: New postcopy state, and some cleanups [rhel-9.5.z] +RH-Jira: RHEL-63874 +RH-Acked-by: Peter Xu +RH-Acked-by: Miroslav Rezanina +RH-Commit: [9/11] 417c600dfc6acbc125a82c1b56c9637041555c15 + +Introduce a small helper to wait for a migration event, generalized from +the incoming migration path. Make the helper easier to use by allowing it +to keep waiting until the expected event is received. 
+ +Signed-off-by: Peter Xu +Reviewed-by: Fabiano Rosas +Signed-off-by: Fabiano Rosas + +(cherry picked from commit d444e5673c223241bd2edbc207b02cc1b2114b71) + +JIRA: https://issues.redhat.com/browse/RHEL-63874 +Y-JIRA: https://issues.redhat.com/browse/RHEL-38485 + +Signed-off-by: Juraj Marcin +--- + tests/qtest/migration-helpers.c | 31 ++++++++++++++++++++++--------- + tests/qtest/migration-helpers.h | 2 ++ + 2 files changed, 24 insertions(+), 9 deletions(-) + +diff --git a/tests/qtest/migration-helpers.c b/tests/qtest/migration-helpers.c +index 50a6bc2569..31d83ab970 100644 +--- a/tests/qtest/migration-helpers.c ++++ b/tests/qtest/migration-helpers.c +@@ -98,7 +98,7 @@ void migrate_set_capability(QTestState *who, const char *capability, + void migrate_incoming_qmp(QTestState *to, const char *uri, const char *fmt, ...) + { + va_list ap; +- QDict *args, *rsp, *data; ++ QDict *args, *rsp; + + va_start(ap, fmt); + args = qdict_from_vjsonf_nofail(fmt, ap); +@@ -121,14 +121,7 @@ void migrate_incoming_qmp(QTestState *to, const char *uri, const char *fmt, ...) + g_assert(qdict_haskey(rsp, "return")); + qobject_unref(rsp); + +- rsp = qtest_qmp_eventwait_ref(to, "MIGRATION"); +- g_assert(qdict_haskey(rsp, "data")); +- +- data = qdict_get_qdict(rsp, "data"); +- g_assert(qdict_haskey(data, "status")); +- g_assert_cmpstr(qdict_get_str(data, "status"), ==, "setup"); +- +- qobject_unref(rsp); ++ migration_event_wait(to, "setup"); + } + + /* +@@ -324,3 +317,23 @@ void migration_test_add(const char *path, void (*fn)(void)) + qtest_add_data_func_full(path, test, migration_test_wrapper, + migration_test_destroy); + } ++ ++/* ++ * Wait for a "MIGRATION" event. This is what Libvirt uses to track ++ * migration status changes. 
++ */ ++void migration_event_wait(QTestState *s, const char *target) ++{ ++ QDict *response, *data; ++ const char *status; ++ bool found; ++ ++ do { ++ response = qtest_qmp_eventwait_ref(s, "MIGRATION"); ++ data = qdict_get_qdict(response, "data"); ++ g_assert(data); ++ status = qdict_get_str(data, "status"); ++ found = (strcmp(status, target) == 0); ++ qobject_unref(response); ++ } while (!found); ++} +diff --git a/tests/qtest/migration-helpers.h b/tests/qtest/migration-helpers.h +index 3bf7ded1b9..83f277c054 100644 +--- a/tests/qtest/migration-helpers.h ++++ b/tests/qtest/migration-helpers.h +@@ -53,4 +53,6 @@ char *find_common_machine_version(const char *mtype, const char *var1, + char *resolve_machine_version(const char *alias, const char *var1, + const char *var2); + void migration_test_add(const char *path, void (*fn)(void)); ++void migration_event_wait(QTestState *s, const char *target); ++ + #endif /* MIGRATION_HELPERS_H */ +-- +2.39.3 + diff --git a/SOURCES/kvm-vhost-fail-device-start-if-iotlb-update-fails.patch b/SOURCES/kvm-vhost-fail-device-start-if-iotlb-update-fails.patch new file mode 100644 index 0000000..94395d6 --- /dev/null +++ b/SOURCES/kvm-vhost-fail-device-start-if-iotlb-update-fails.patch @@ -0,0 +1,62 @@ +From 2052d94ffccde5d6eb5af8cca77aaf8bba650c68 Mon Sep 17 00:00:00 2001 +From: Prasad Pandit +Date: Thu, 7 Nov 2024 17:02:47 +0530 +Subject: [PATCH] vhost: fail device start if iotlb update fails + +RH-Author: Prasad Pandit +RH-MergeRequest: 426: vhost: fail device start if iotlb update fails +RH-Jira: RHEL-73006 +RH-Acked-by: Stefano Garzarella +RH-Commit: [1/1] a96478385297d0559dd7dcaebd1834141bb5fb75 + +While starting a vhost device, updating iotlb entries +via 'vhost_device_iotlb_miss' may return an error. + + qemu-kvm: vhost_device_iotlb_miss: + 700871,700871: Fail to update device iotlb + +Fail device start when such an error occurs. 
+ +Jira: https://issues.redhat.com/browse/RHEL-73006 +Signed-off-by: Prasad Pandit +Message-Id: <20241107113247.46532-1-ppandit@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +Reviewed-by: Stefano Garzarella +(cherry picked from commit 571bdc97b83646dfd3746ec56fb2f70bca55b9a2) +Signed-off-by: Prasad Pandit +--- + hw/virtio/vhost.c | 13 ++++++++++++- + 1 file changed, 12 insertions(+), 1 deletion(-) + +diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c +index f50180e60e..da0f10c4dc 100644 +--- a/hw/virtio/vhost.c ++++ b/hw/virtio/vhost.c +@@ -2074,11 +2074,22 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings) + * vhost-kernel code requires for this.*/ + for (i = 0; i < hdev->nvqs; ++i) { + struct vhost_virtqueue *vq = hdev->vqs + i; +- vhost_device_iotlb_miss(hdev, vq->used_phys, true); ++ r = vhost_device_iotlb_miss(hdev, vq->used_phys, true); ++ if (r) { ++ goto fail_iotlb; ++ } + } + } + vhost_start_config_intr(hdev); + return 0; ++fail_iotlb: ++ if (vhost_dev_has_iommu(hdev) && ++ hdev->vhost_ops->vhost_set_iotlb_callback) { ++ hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false); ++ } ++ if (hdev->vhost_ops->vhost_dev_start) { ++ hdev->vhost_ops->vhost_dev_start(hdev, false); ++ } + fail_start: + if (vrings) { + vhost_dev_set_vring_enable(hdev, false); +-- +2.39.3 + diff --git a/SPECS/qemu-kvm.spec b/SPECS/qemu-kvm.spec index cc7b200..c1f98a0 100644 --- a/SPECS/qemu-kvm.spec +++ b/SPECS/qemu-kvm.spec @@ -149,7 +149,7 @@ Obsoletes: %{name}-block-ssh <= %{epoch}:%{version} \ Summary: QEMU is a machine emulator and virtualizer Name: qemu-kvm Version: 9.0.0 -Release: 10%{?rcrel}%{?dist}%{?cc_suffix} +Release: 10%{?rcrel}%{?dist}%{?cc_suffix}.2 # Epoch because we pushed a qemu-1.0 package. 
AIUI this can't ever be dropped # Epoch 15 used for RHEL 8 # Epoch 17 used for RHEL 9 (due to release versioning offset in RHEL 8.5) @@ -432,6 +432,30 @@ Patch136: kvm-nbd-server-CVE-2024-7409-Close-stray-clients-at-serv.patch Patch137: kvm-qemu-guest-agent-Update-the-logfile-path-of-qga-fsfr.patch # For RHEL-52617 - CVE-2024-7409 qemu-kvm: Denial of Service via Improper Synchronization in QEMU NBD Server During Socket Closure [rhel-9.5] Patch138: kvm-nbd-server-CVE-2024-7409-Avoid-use-after-free-when-c.patch +# For RHEL-63874 - Failure to resume paused post-copy migration is undetectable [rhel-9.5.z] +Patch139: kvm-migration-multifd-Avoid-the-final-FLUSH-in-complete.patch +# For RHEL-63874 - Failure to resume paused post-copy migration is undetectable [rhel-9.5.z] +Patch140: kvm-migration-Rename-thread-debug-names.patch +# For RHEL-63874 - Failure to resume paused post-copy migration is undetectable [rhel-9.5.z] +Patch141: kvm-migration-Use-MigrationStatus-instead-of-int.patch +# For RHEL-63874 - Failure to resume paused post-copy migration is undetectable [rhel-9.5.z] +Patch142: kvm-migration-Cleanup-incoming-migration-setup-state-cha.patch +# For RHEL-63874 - Failure to resume paused post-copy migration is undetectable [rhel-9.5.z] +Patch143: kvm-migration-postcopy-Add-postcopy-recover-setup-phase.patch +# For RHEL-63874 - Failure to resume paused post-copy migration is undetectable [rhel-9.5.z] +Patch144: kvm-migration-docs-Update-postcopy-recover-session-for-S.patch +# For RHEL-63874 - Failure to resume paused post-copy migration is undetectable [rhel-9.5.z] +Patch145: kvm-tests-migration-tests-Drop-most-WIN32-ifdefs-for-pos.patch +# For RHEL-63874 - Failure to resume paused post-copy migration is undetectable [rhel-9.5.z] +Patch146: kvm-tests-migration-tests-Always-enable-migration-events.patch +# For RHEL-63874 - Failure to resume paused post-copy migration is undetectable [rhel-9.5.z] +Patch147: kvm-tests-migration-tests-migration_event_wait.patch +# For 
RHEL-63874 - Failure to resume paused post-copy migration is undetectable [rhel-9.5.z] +Patch148: kvm-tests-migration-tests-Verify-postcopy-recover-setup-.patch +# For RHEL-63874 - Failure to resume paused post-copy migration is undetectable [rhel-9.5.z] +Patch149: kvm-tests-migration-tests-Cover-postcopy-failure-on-reco.patch +# For RHEL-73006 - qemu-kvm: vhost: reports error while updating IOTLB entries [rhel-9.5.z] +Patch150: kvm-vhost-fail-device-start-if-iotlb-update-fails.patch %if %{have_clang} BuildRequires: clang @@ -1498,6 +1522,26 @@ useradd -r -u 107 -g qemu -G kvm -d / -s /sbin/nologin \ %endif %changelog +* Wed Jan 15 2025 Miroslav Rezanina - 9.0.0-10.el9_5.2 +- kvm-vhost-fail-device-start-if-iotlb-update-fails.patch [RHEL-73006] +- Resolves: RHEL-73006 + (qemu-kvm: vhost: reports error while updating IOTLB entries [rhel-9.5.z]) + +* Mon Nov 04 2024 Miroslav Rezanina - 9.0.0-10.el9_5.1 +- kvm-migration-multifd-Avoid-the-final-FLUSH-in-complete.patch [RHEL-63874] +- kvm-migration-Rename-thread-debug-names.patch [RHEL-63874] +- kvm-migration-Use-MigrationStatus-instead-of-int.patch [RHEL-63874] +- kvm-migration-Cleanup-incoming-migration-setup-state-cha.patch [RHEL-63874] +- kvm-migration-postcopy-Add-postcopy-recover-setup-phase.patch [RHEL-63874] +- kvm-migration-docs-Update-postcopy-recover-session-for-S.patch [RHEL-63874] +- kvm-tests-migration-tests-Drop-most-WIN32-ifdefs-for-pos.patch [RHEL-63874] +- kvm-tests-migration-tests-Always-enable-migration-events.patch [RHEL-63874] +- kvm-tests-migration-tests-migration_event_wait.patch [RHEL-63874] +- kvm-tests-migration-tests-Verify-postcopy-recover-setup-.patch [RHEL-63874] +- kvm-tests-migration-tests-Cover-postcopy-failure-on-reco.patch [RHEL-63874] +- Resolves: RHEL-63874 + (Failure to resume paused post-copy migration is undetectable [rhel-9.5.z]) + * Mon Sep 02 2024 Miroslav Rezanina - 9.0.0-10 - kvm-nbd-server-CVE-2024-7409-Avoid-use-after-free-when-c.patch [RHEL-52617] - Resolves: RHEL-52617