You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
252 lines
9.1 KiB
252 lines
9.1 KiB
From c77a30265b8d0db43174b040ea82103f8fdb9911 Mon Sep 17 00:00:00 2001
|
|
From: Peter Xu <peterx@redhat.com>
|
|
Date: Tue, 17 Sep 2024 12:38:32 -0400
|
|
Subject: [PATCH 5/9] KVM: Dynamic sized kvm memslots array
|
|
|
|
RH-Author: Peter Xu <peterx@redhat.com>
|
|
RH-MergeRequest: 285: KVM: Dynamic sized kvm memslots array
|
|
RH-Jira: RHEL-57685
|
|
RH-Acked-by: Juraj Marcin <None>
|
|
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
|
RH-Commit: [4/7] 46d4abec352a92112e593ea61b7cbf5ce5f94cdc (peterx/qemu-kvm)
|
|
|
|
Zhiyi reported an infinite loop issue in VFIO use case. The cause of that
|
|
was a separate discussion, however during that I found a regression of
|
|
dirty sync slowness when profiling.
|
|
|
|
Each KVMMemoryListener maintains an array of kvm memslots. Currently it's
|
|
statically allocated to be the max supported by the kernel. However after
|
|
Linux commit 4fc096a99e ("KVM: Raise the maximum number of user memslots"),
|
|
the max supported memslots reported now grows to some number large enough
|
|
so that it may not be wise to always statically allocate with the max
|
|
reported.
|
|
|
|
What's worse, QEMU kvm code still walks all the allocated memslots entries
|
|
to do any form of lookups. It can drastically slow down all memslot
|
|
operations because each such loop can run over 32K times on the new
|
|
kernels.
|
|
|
|
Fix this issue by making the memslots to be allocated dynamically.
|
|
|
|
Here the initial size was set to 16 because it should cover the basic VM
|
|
usages, so that the hope is the majority VM use case may not even need to
|
|
grow at all (e.g. if one starts a VM with ./qemu-system-x86_64 by default
|
|
it'll consume 9 memslots), however not too large to waste memory.
|
|
|
|
There can also be an even better way to address this, but so far this is the
|
|
simplest and should be already better even than before we grow the max
|
|
supported memslots. For example, in the case of above issue when VFIO was
|
|
attached on a 32GB system, there are only ~10 memslots used. So it could
|
|
be good enough as of now.
|
|
|
|
In the above VFIO context, measurement shows that the precopy dirty sync
|
|
shrank from ~86ms to ~3ms after this patch was applied. It should also apply
|
|
to any KVM enabled VM even without VFIO.
|
|
|
|
NOTE: we don't have a FIXES tag for this patch because there's no real
|
|
commit that regressed this in QEMU. Such behavior existed for a long time,
|
|
but only starts to be a problem when the kernel reports a very large
|
|
nr_slots_max value. However that's pretty common now (the kernel change
|
|
was merged in 2021) so we attached cc:stable because we'll want this change
|
|
to be backported to stable branches.
|
|
|
|
Cc: qemu-stable <qemu-stable@nongnu.org>
|
|
Reported-by: Zhiyi Guo <zhguo@redhat.com>
|
|
Tested-by: Zhiyi Guo <zhguo@redhat.com>
|
|
Signed-off-by: Peter Xu <peterx@redhat.com>
|
|
Acked-by: David Hildenbrand <david@redhat.com>
|
|
Reviewed-by: Fabiano Rosas <farosas@suse.de>
|
|
Link: https://lore.kernel.org/r/20240917163835.194664-2-peterx@redhat.com
|
|
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
|
|
(cherry picked from commit 5504a8126115d173687b37e657312a8ffe29fc0c)
|
|
Signed-off-by: Peter Xu <peterx@redhat.com>
|
|
---
|
|
accel/kvm/kvm-all.c | 87 +++++++++++++++++++++++++++++++++-------
|
|
accel/kvm/trace-events | 1 +
|
|
include/sysemu/kvm_int.h | 1 +
|
|
3 files changed, 74 insertions(+), 15 deletions(-)
|
|
|
|
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
|
|
index 8187ad3964..38393bc86b 100644
|
|
--- a/accel/kvm/kvm-all.c
|
|
+++ b/accel/kvm/kvm-all.c
|
|
@@ -69,6 +69,9 @@
|
|
#define KVM_GUESTDBG_BLOCKIRQ 0
|
|
#endif
|
|
|
|
+/* Default num of memslots to be allocated when VM starts */
|
|
+#define KVM_MEMSLOTS_NR_ALLOC_DEFAULT 16
|
|
+
|
|
struct KVMParkedVcpu {
|
|
unsigned long vcpu_id;
|
|
int kvm_fd;
|
|
@@ -165,6 +168,57 @@ void kvm_resample_fd_notify(int gsi)
|
|
}
|
|
}
|
|
|
|
+/**
|
|
+ * kvm_slots_grow(): Grow the slots[] array in the KVMMemoryListener
|
|
+ *
|
|
+ * @kml: The KVMMemoryListener* to grow the slots[] array
|
|
+ * @nr_slots_new: The new size of slots[] array
|
|
+ *
|
|
+ * Returns: True if the array grows larger, false otherwise.
|
|
+ */
|
|
+static bool kvm_slots_grow(KVMMemoryListener *kml, unsigned int nr_slots_new)
|
|
+{
|
|
+ unsigned int i, cur = kml->nr_slots_allocated;
|
|
+ KVMSlot *slots;
|
|
+
|
|
+ if (nr_slots_new > kvm_state->nr_slots) {
|
|
+ nr_slots_new = kvm_state->nr_slots;
|
|
+ }
|
|
+
|
|
+ if (cur >= nr_slots_new) {
|
|
+ /* Big enough, no need to grow, or we reached max */
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ if (cur == 0) {
|
|
+ slots = g_new0(KVMSlot, nr_slots_new);
|
|
+ } else {
|
|
+ assert(kml->slots);
|
|
+ slots = g_renew(KVMSlot, kml->slots, nr_slots_new);
|
|
+ /*
|
|
+ * g_renew() doesn't initialize extended buffers, however kvm
|
|
+ * memslots require fields to be zero-initialized. E.g. pointers,
|
|
+ * memory_size field, etc.
|
|
+ */
|
|
+ memset(&slots[cur], 0x0, sizeof(slots[0]) * (nr_slots_new - cur));
|
|
+ }
|
|
+
|
|
+ for (i = cur; i < nr_slots_new; i++) {
|
|
+ slots[i].slot = i;
|
|
+ }
|
|
+
|
|
+ kml->slots = slots;
|
|
+ kml->nr_slots_allocated = nr_slots_new;
|
|
+ trace_kvm_slots_grow(cur, nr_slots_new);
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static bool kvm_slots_double(KVMMemoryListener *kml)
|
|
+{
|
|
+ return kvm_slots_grow(kml, kml->nr_slots_allocated * 2);
|
|
+}
|
|
+
|
|
unsigned int kvm_get_max_memslots(void)
|
|
{
|
|
KVMState *s = KVM_STATE(current_accel());
|
|
@@ -193,15 +247,26 @@ unsigned int kvm_get_free_memslots(void)
|
|
/* Called with KVMMemoryListener.slots_lock held */
|
|
static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
|
|
{
|
|
- KVMState *s = kvm_state;
|
|
+ unsigned int n;
|
|
int i;
|
|
|
|
- for (i = 0; i < s->nr_slots; i++) {
|
|
+ for (i = 0; i < kml->nr_slots_allocated; i++) {
|
|
if (kml->slots[i].memory_size == 0) {
|
|
return &kml->slots[i];
|
|
}
|
|
}
|
|
|
|
+ /*
|
|
+ * If no free slots, try to grow first by doubling. Cache the old size
|
|
+ * here to avoid another round of search: if the grow succeeded, it
|
|
+ * means slots[] now must have the existing "n" slots occupied,
|
|
+ * followed by one or more free slots starting from slots[n].
|
|
+ */
|
|
+ n = kml->nr_slots_allocated;
|
|
+ if (kvm_slots_double(kml)) {
|
|
+ return &kml->slots[n];
|
|
+ }
|
|
+
|
|
return NULL;
|
|
}
|
|
|
|
@@ -222,10 +287,9 @@ static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml,
|
|
hwaddr start_addr,
|
|
hwaddr size)
|
|
{
|
|
- KVMState *s = kvm_state;
|
|
int i;
|
|
|
|
- for (i = 0; i < s->nr_slots; i++) {
|
|
+ for (i = 0; i < kml->nr_slots_allocated; i++) {
|
|
KVMSlot *mem = &kml->slots[i];
|
|
|
|
if (start_addr == mem->start_addr && size == mem->memory_size) {
|
|
@@ -267,7 +331,7 @@ int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
|
|
int i, ret = 0;
|
|
|
|
kvm_slots_lock();
|
|
- for (i = 0; i < s->nr_slots; i++) {
|
|
+ for (i = 0; i < kml->nr_slots_allocated; i++) {
|
|
KVMSlot *mem = &kml->slots[i];
|
|
|
|
if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
|
|
@@ -1071,7 +1135,7 @@ static int kvm_physical_log_clear(KVMMemoryListener *kml,
|
|
|
|
kvm_slots_lock();
|
|
|
|
- for (i = 0; i < s->nr_slots; i++) {
|
|
+ for (i = 0; i < kml->nr_slots_allocated; i++) {
|
|
mem = &kml->slots[i];
|
|
/* Discard slots that are empty or do not overlap the section */
|
|
if (!mem->memory_size ||
|
|
@@ -1719,12 +1783,8 @@ static void kvm_log_sync_global(MemoryListener *l, bool last_stage)
|
|
/* Flush all kernel dirty addresses into KVMSlot dirty bitmap */
|
|
kvm_dirty_ring_flush();
|
|
|
|
- /*
|
|
- * TODO: make this faster when nr_slots is big while there are
|
|
- * only a few used slots (small VMs).
|
|
- */
|
|
kvm_slots_lock();
|
|
- for (i = 0; i < s->nr_slots; i++) {
|
|
+ for (i = 0; i < kml->nr_slots_allocated; i++) {
|
|
mem = &kml->slots[i];
|
|
if (mem->memory_size && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
|
|
kvm_slot_sync_dirty_pages(mem);
|
|
@@ -1839,12 +1899,9 @@ void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
|
|
{
|
|
int i;
|
|
|
|
- kml->slots = g_new0(KVMSlot, s->nr_slots);
|
|
kml->as_id = as_id;
|
|
|
|
- for (i = 0; i < s->nr_slots; i++) {
|
|
- kml->slots[i].slot = i;
|
|
- }
|
|
+ kvm_slots_grow(kml, KVM_MEMSLOTS_NR_ALLOC_DEFAULT);
|
|
|
|
QSIMPLEQ_INIT(&kml->transaction_add);
|
|
QSIMPLEQ_INIT(&kml->transaction_del);
|
|
diff --git a/accel/kvm/trace-events b/accel/kvm/trace-events
|
|
index 37626c1ac5..ad2ae6fca5 100644
|
|
--- a/accel/kvm/trace-events
|
|
+++ b/accel/kvm/trace-events
|
|
@@ -36,3 +36,4 @@ kvm_io_window_exit(void) ""
|
|
kvm_run_exit_system_event(int cpu_index, uint32_t event_type) "cpu_index %d, system_even_type %"PRIu32
|
|
kvm_convert_memory(uint64_t start, uint64_t size, const char *msg) "start 0x%" PRIx64 " size 0x%" PRIx64 " %s"
|
|
kvm_memory_fault(uint64_t start, uint64_t size, uint64_t flags) "start 0x%" PRIx64 " size 0x%" PRIx64 " flags 0x%" PRIx64
|
|
+kvm_slots_grow(unsigned int old, unsigned int new) "%u -> %u"
|
|
diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h
|
|
index 1d8fb1473b..48e496b3d4 100644
|
|
--- a/include/sysemu/kvm_int.h
|
|
+++ b/include/sysemu/kvm_int.h
|
|
@@ -46,6 +46,7 @@ typedef struct KVMMemoryListener {
|
|
MemoryListener listener;
|
|
KVMSlot *slots;
|
|
unsigned int nr_used_slots;
|
|
+ unsigned int nr_slots_allocated;
|
|
int as_id;
|
|
QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_add;
|
|
QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_del;
|
|
--
|
|
2.39.3
|
|
|