Version update to 9.0.0 (Migrated from the i9c branch)

i9-spice
ebasov 2 months ago
parent 1e9369d713
commit daf27afeaa
No known key found for this signature in database
GPG Key ID: 3DE9E7A44B2D38F6

2
.gitignore vendored

@ -1 +1 @@
SOURCES/qemu-8.2.0.tar.xz
SOURCES/qemu-9.0.0.tar.xz

@ -1 +1 @@
1615e59b1bd68324e0819245fe003e33c14a52f9 SOURCES/qemu-8.2.0.tar.xz
6699bb03d6da21159b89668bca01c6c958b95d07 SOURCES/qemu-9.0.0.tar.xz

@ -1,4 +1,4 @@
From faae70a870156f86a5cf55ca967b15d7612941ff Mon Sep 17 00:00:00 2001
From ea7dff3dbf979d7d8a85a16cf5187235143e1048 Mon Sep 17 00:00:00 2001
From: Miroslav Rezanina <mrezanin@redhat.com>
Date: Wed, 26 May 2021 10:56:02 +0200
Subject: Initial redhat build
@ -13,7 +13,7 @@ several issues are fixed in QEMU tree:
We disable make check due to issues with some of the tests.
This rebase is based on qemu-kvm-8.1.0-5.el9
This rebase is based on qemu-kvm-8.2.0-11.el9
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
--
@ -83,6 +83,12 @@ Rebase changes (8.2.0):
- Added --disable-plugins configure option
- Fixing frh.py strings
Rebase notes (9.0.0):
- Fixed qemu-kvm binary location change
- Remove hppa-firmware64.img
- Package stp files for utilities
- Download subprojects on local build
Merged patches (6.0.0):
- 605758c902 Limit build on Power to qemu-img and qemu-ga only
@ -193,14 +199,17 @@ Merged patches (8.1.0):
Merged patches (8.2.0):
- cd9efa221d Enable qemu-kvm-device-usb-redirec for aarch64
Merged patches (9.0.0 rc0):
- 25de053dbf spec: Enable zstd
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
---
.distro/Makefile | 100 +
.distro/Makefile | 101 +
.distro/Makefile.common | 42 +
.distro/README.tests | 39 +
.distro/modules-load.conf | 4 +
.distro/qemu-guest-agent.service | 1 -
.distro/qemu-kvm.spec.template | 4909 +++++++++++++++++++++++
.distro/qemu-kvm.spec.template | 5170 +++++++++++++++++++++++
.distro/rpminspect.yaml | 6 +-
.distro/scripts/extract_build_cmd.py | 12 +
.distro/scripts/frh.py | 4 +-
@ -211,7 +220,7 @@ Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
scripts/systemtap/conf.d/qemu_kvm.conf | 4 +
scripts/systemtap/script.d/qemu_kvm.stp | 1 +
ui/vnc-auth-sasl.c | 2 +-
16 files changed, 5168 insertions(+), 6 deletions(-)
16 files changed, 5430 insertions(+), 6 deletions(-)
create mode 100644 .distro/Makefile
create mode 100644 .distro/Makefile.common
create mode 100644 .distro/README.tests

@ -1,4 +1,4 @@
From 048067b4618ba1fa7c8c517185d4cd3a675eba72 Mon Sep 17 00:00:00 2001
From 780c39975b059deaee106775b6e3a240155acea3 Mon Sep 17 00:00:00 2001
From: Miroslav Rezanina <mrezanin@redhat.com>
Date: Wed, 7 Dec 2022 03:05:48 -0500
Subject: Enable/disable devices for RHEL
@ -47,6 +47,12 @@ Rebase notes (8.2.0):
- Disable new neoverse-v2
- Removed CONFIG_OPENGL from x86_64 config file
Rebase notes (9.0.0 rc0):
- Split CONFIG_IDE_QDEV to CONFIG_IDE_DEV and CONFIG_IDE_BUS (upstream change)
Rebase notes (9.0.0 rc1):
- Do not compile armv7 cpu types
Merged patches (6.1.0):
- c51bf45304 Remove SPICE and QXL from x86_64-rh-devices.mak
- 02fc745601 aarch64-rh-devices: add CONFIG_PVPANIC_PCI
@ -74,36 +80,41 @@ Merged patches (8.1.0):
Merged patches (8.2.0):
- b29f66431f Enable igb on x86_64
Merged patches (9.0.0 rc0):
- 3889ede5d9 Compile IOMMUFD on x86_64
- 0beb18451f Compile IOMMUFD on s390x
- 2b4b13f70d Compile IOMMUFD object on aarch64
---
.distro/qemu-kvm.spec.template | 18 +--
.../aarch64-softmmu/aarch64-rh-devices.mak | 41 +++++++
.../aarch64-softmmu/aarch64-rh-devices.mak | 42 +++++++
.../ppc64-softmmu/ppc64-rh-devices.mak | 37 ++++++
configs/devices/rh-virtio.mak | 10 ++
.../s390x-softmmu/s390x-rh-devices.mak | 18 +++
.../x86_64-softmmu/x86_64-rh-devices.mak | 110 ++++++++++++++++++
.../s390x-softmmu/s390x-rh-devices.mak | 19 +++
.../x86_64-softmmu/x86_64-rh-devices.mak | 112 ++++++++++++++++++
hw/arm/virt.c | 2 +
hw/block/fdc.c | 10 ++
hw/cpu/meson.build | 3 +-
hw/cxl/meson.build | 3 +-
hw/display/cirrus_vga.c | 4 +
hw/ide/piix.c | 5 +-
hw/ide/qdev.c | 9 ++
hw/input/pckbd.c | 2 +
hw/net/e1000.c | 2 +
hw/ppc/spapr_cpu_core.c | 2 +
hw/usb/meson.build | 2 +-
hw/virtio/meson.build | 5 +-
hw/virtio/meson.build | 6 +-
target/arm/arm-qmp-cmds.c | 2 +
target/arm/cpu.c | 4 +
target/arm/cpu.h | 3 +
target/arm/cpu64.c | 12 +-
target/arm/tcg/cpu32.c | 2 +
target/arm/tcg/cpu64.c | 8 ++
target/arm/tcg/meson.build | 4 +-
target/ppc/cpu-models.c | 9 ++
target/s390x/cpu_models_sysemu.c | 3 +
target/s390x/kvm/kvm.c | 8 ++
tests/qtest/arm-cpu-features.c | 4 +
28 files changed, 323 insertions(+), 15 deletions(-)
28 files changed, 321 insertions(+), 17 deletions(-)
create mode 100644 configs/devices/aarch64-softmmu/aarch64-rh-devices.mak
create mode 100644 configs/devices/ppc64-softmmu/ppc64-rh-devices.mak
create mode 100644 configs/devices/rh-virtio.mak
@ -112,10 +123,10 @@ Merged patches (8.2.0):
diff --git a/configs/devices/aarch64-softmmu/aarch64-rh-devices.mak b/configs/devices/aarch64-softmmu/aarch64-rh-devices.mak
new file mode 100644
index 0000000000..aec1831199
index 0000000000..b0191d3c69
--- /dev/null
+++ b/configs/devices/aarch64-softmmu/aarch64-rh-devices.mak
@@ -0,0 +1,43 @@
@@ -0,0 +1,42 @@
+include ../rh-virtio.mak
+
+CONFIG_ARM_GIC_KVM=y
@ -126,13 +137,11 @@ index 0000000000..aec1831199
+CONFIG_CXL=y
+CONFIG_CXL_MEM_DEVICE=y
+CONFIG_EDID=y
+CONFIG_IVSHMEM_DEVICE=y
+CONFIG_PCIE_PORT=y
+CONFIG_PCIE_PCI_BRIDGE=y
+CONFIG_PCI_DEVICES=y
+CONFIG_PCI_TESTDEV=y
+CONFIG_PFLASH_CFI01=y
+CONFIG_QXL=y
+CONFIG_SCSI=y
+CONFIG_SEMIHOSTING=y
+CONFIG_USB=y
@ -159,6 +168,7 @@ index 0000000000..aec1831199
+CONFIG_VHOST_VSOCK=y
+CONFIG_VHOST_USER_VSOCK=y
+CONFIG_VHOST_USER_FS=y
+CONFIG_IOMMUFD=y
diff --git a/configs/devices/ppc64-softmmu/ppc64-rh-devices.mak b/configs/devices/ppc64-softmmu/ppc64-rh-devices.mak
new file mode 100644
index 0000000000..dbb7d30829
@ -220,10 +230,10 @@ index 0000000000..94ede1b5f6
+CONFIG_VIRTIO_SERIAL=y
diff --git a/configs/devices/s390x-softmmu/s390x-rh-devices.mak b/configs/devices/s390x-softmmu/s390x-rh-devices.mak
new file mode 100644
index 0000000000..69a799adbd
index 0000000000..24cf6dbd03
--- /dev/null
+++ b/configs/devices/s390x-softmmu/s390x-rh-devices.mak
@@ -0,0 +1,18 @@
@@ -0,0 +1,19 @@
+include ../rh-virtio.mak
+
+CONFIG_PCI=y
@ -242,9 +252,10 @@ index 0000000000..69a799adbd
+CONFIG_VHOST_VSOCK=y
+CONFIG_VHOST_USER_VSOCK=y
+CONFIG_VHOST_USER_FS=y
+CONFIG_IOMMUFD=y
diff --git a/configs/devices/x86_64-softmmu/x86_64-rh-devices.mak b/configs/devices/x86_64-softmmu/x86_64-rh-devices.mak
new file mode 100644
index 0000000000..ce5be73633
index 0000000000..d60ff1bcfc
--- /dev/null
+++ b/configs/devices/x86_64-softmmu/x86_64-rh-devices.mak
@@ -0,0 +1,112 @@
@ -285,14 +296,14 @@ index 0000000000..ce5be73633
+CONFIG_IDE_CORE=y
+CONFIG_IDE_PCI=y
+CONFIG_IDE_PIIX=y
+CONFIG_IDE_QDEV=y
+CONFIG_IDE_DEV=y
+CONFIG_IDE_BUS=y
+CONFIG_IGB_PCI_EXPRESS=y
+CONFIG_IOAPIC=y
+CONFIG_IOH3420=y
+CONFIG_ISA_BUS=y
+CONFIG_ISA_DEBUG=y
+CONFIG_ISA_TESTDEV=y
+CONFIG_IVSHMEM_DEVICE=y
+CONFIG_LPC_ICH9=y
+CONFIG_MC146818RTC=y
+CONFIG_MEM_DEVICE=y
@ -314,7 +325,6 @@ index 0000000000..ce5be73633
+CONFIG_PFLASH_CFI01=y
+CONFIG_PVPANIC_ISA=y
+CONFIG_PXB=y
+CONFIG_QXL=y
+CONFIG_Q35=y
+CONFIG_RTL8139_PCI=y
+CONFIG_SCSI=y
@ -360,28 +370,29 @@ index 0000000000..ce5be73633
+CONFIG_VHOST_VSOCK=y
+CONFIG_VHOST_USER_VSOCK=y
+CONFIG_VHOST_USER_FS=y
+CONFIG_IOMMUFD=y
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index be2856c018..af9ea4dd1c 100644
index a9a913aead..6c6d155002 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -205,6 +205,7 @@ static const int a15irqmap[] = {
};
static const char *valid_cpus[] = {
@@ -2954,6 +2954,7 @@ static void virt_machine_class_init(ObjectClass *oc, void *data)
MachineClass *mc = MACHINE_CLASS(oc);
HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc);
static const char * const valid_cpu_types[] = {
+#if 0 /* Disabled for Red Hat Enterprise Linux */
#ifdef CONFIG_TCG
ARM_CPU_TYPE_NAME("cortex-a7"),
ARM_CPU_TYPE_NAME("cortex-a15"),
@@ -219,6 +220,7 @@ static const char *valid_cpus[] = {
ARM_CPU_TYPE_NAME("neoverse-n2"),
#endif
ARM_CPU_TYPE_NAME("cortex-a53"),
ARM_CPU_TYPE_NAME("cortex-a7"),
ARM_CPU_TYPE_NAME("cortex-a15"),
@@ -2971,6 +2972,7 @@ static void virt_machine_class_init(ObjectClass *oc, void *data)
#endif /* CONFIG_TCG */
#ifdef TARGET_AARCH64
ARM_CPU_TYPE_NAME("cortex-a53"),
+#endif /* disabled for RHEL */
ARM_CPU_TYPE_NAME("cortex-a57"),
ARM_CPU_TYPE_NAME("host"),
ARM_CPU_TYPE_NAME("max"),
ARM_CPU_TYPE_NAME("cortex-a57"),
#if defined(CONFIG_KVM) || defined(CONFIG_HVF)
ARM_CPU_TYPE_NAME("host"),
diff --git a/hw/block/fdc.c b/hw/block/fdc.c
index d7cc4d3ec1..12d0a60905 100644
index 6dd94e98bc..a05757fc9a 100644
--- a/hw/block/fdc.c
+++ b/hw/block/fdc.c
@@ -49,6 +49,8 @@
@ -409,7 +420,7 @@ index d7cc4d3ec1..12d0a60905 100644
error_setg(errp, "Cannot choose a fallback FDrive type of 'auto'");
return;
diff --git a/hw/cpu/meson.build b/hw/cpu/meson.build
index 6d319947ca..91962fd863 100644
index 38cdcfbe57..e588ecfd42 100644
--- a/hw/cpu/meson.build
+++ b/hw/cpu/meson.build
@@ -1,4 +1,5 @@
@ -420,7 +431,7 @@ index 6d319947ca..91962fd863 100644
system_ss.add(when: 'CONFIG_ARM11MPCORE', if_true: files('arm11mpcore.c'))
system_ss.add(when: 'CONFIG_REALVIEW', if_true: files('realview_mpcore.c'))
diff --git a/hw/cxl/meson.build b/hw/cxl/meson.build
index ea0aebf6e3..6878f06974 100644
index 3e375f61a9..613adb3ebb 100644
--- a/hw/cxl/meson.build
+++ b/hw/cxl/meson.build
@@ -6,7 +6,8 @@ system_ss.add(when: 'CONFIG_CXL',
@ -434,7 +445,7 @@ index ea0aebf6e3..6878f06974 100644
if_false: files(
'cxl-host-stubs.c',
diff --git a/hw/display/cirrus_vga.c b/hw/display/cirrus_vga.c
index b80f98b6c4..0370cf8a64 100644
index 150883a971..497365bd80 100644
--- a/hw/display/cirrus_vga.c
+++ b/hw/display/cirrus_vga.c
@@ -36,6 +36,7 @@
@ -456,10 +467,10 @@ index b80f98b6c4..0370cf8a64 100644
* Follow real hardware, cirrus card emulated has 4 MB video memory.
* Also accept 8 MB/16 MB for backward compatibility.
diff --git a/hw/ide/piix.c b/hw/ide/piix.c
index 4e5e12935f..03ca06bb17 100644
index 80efc633d3..9cb82b8eea 100644
--- a/hw/ide/piix.c
+++ b/hw/ide/piix.c
@@ -190,7 +190,8 @@ static void piix3_ide_class_init(ObjectClass *klass, void *data)
@@ -191,7 +191,8 @@ static void piix3_ide_class_init(ObjectClass *klass, void *data)
k->device_id = PCI_DEVICE_ID_INTEL_82371SB_1;
k->class_id = PCI_CLASS_STORAGE_IDE;
set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
@ -469,7 +480,7 @@ index 4e5e12935f..03ca06bb17 100644
}
static const TypeInfo piix3_ide_info = {
@@ -214,6 +215,8 @@ static void piix4_ide_class_init(ObjectClass *klass, void *data)
@@ -215,6 +216,8 @@ static void piix4_ide_class_init(ObjectClass *klass, void *data)
k->class_id = PCI_CLASS_STORAGE_IDE;
set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
dc->hotpluggable = false;
@ -478,57 +489,11 @@ index 4e5e12935f..03ca06bb17 100644
}
static const TypeInfo piix4_ide_info = {
diff --git a/hw/ide/qdev.c b/hw/ide/qdev.c
index 1b3b4da01d..454bfa5783 100644
--- a/hw/ide/qdev.c
+++ b/hw/ide/qdev.c
@@ -283,10 +283,13 @@ static void ide_cd_realize(IDEDevice *dev, Error **errp)
ide_dev_initfn(dev, IDE_CD, errp);
}
+/* Disabled for Red Hat Enterprise Linux */
+#if 0
static void ide_cf_realize(IDEDevice *dev, Error **errp)
{
ide_dev_initfn(dev, IDE_CFATA, errp);
}
+#endif
#define DEFINE_IDE_DEV_PROPERTIES() \
DEFINE_BLOCK_PROPERTIES(IDEDrive, dev.conf), \
@@ -346,6 +349,8 @@ static const TypeInfo ide_cd_info = {
.class_init = ide_cd_class_init,
};
+/* Disabled for Red Hat Enterprise Linux */
+#if 0
static Property ide_cf_properties[] = {
DEFINE_IDE_DEV_PROPERTIES(),
DEFINE_BLOCK_CHS_PROPERTIES(IDEDrive, dev.conf),
@@ -371,6 +376,7 @@ static const TypeInfo ide_cf_info = {
.instance_size = sizeof(IDEDrive),
.class_init = ide_cf_class_init,
};
+#endif
static void ide_device_class_init(ObjectClass *klass, void *data)
{
@@ -396,7 +402,10 @@ static void ide_register_types(void)
type_register_static(&ide_bus_info);
type_register_static(&ide_hd_info);
type_register_static(&ide_cd_info);
+/* Disabled for Red Hat Enterprise Linux */
+#if 0
type_register_static(&ide_cf_info);
+#endif
type_register_static(&ide_device_type_info);
}
diff --git a/hw/input/pckbd.c b/hw/input/pckbd.c
index b92b63bedc..3b6235dde6 100644
index 74f10b640f..2e85ecf476 100644
--- a/hw/input/pckbd.c
+++ b/hw/input/pckbd.c
@@ -957,6 +957,8 @@ static void i8042_class_initfn(ObjectClass *klass, void *data)
@@ -952,6 +952,8 @@ static void i8042_class_initfn(ObjectClass *klass, void *data)
dc->vmsd = &vmstate_kbd_isa;
adevc->build_dev_aml = i8042_build_aml;
set_bit(DEVICE_CATEGORY_INPUT, dc->categories);
@ -538,7 +503,7 @@ index b92b63bedc..3b6235dde6 100644
static const TypeInfo i8042_info = {
diff --git a/hw/net/e1000.c b/hw/net/e1000.c
index 8ffe1077f1..b3dfeeca4f 100644
index 43f3a4a701..267f182883 100644
--- a/hw/net/e1000.c
+++ b/hw/net/e1000.c
@@ -1746,6 +1746,7 @@ static const E1000Info e1000_devices[] = {
@ -558,10 +523,10 @@ index 8ffe1077f1..b3dfeeca4f 100644
static void e1000_register_types(void)
diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c
index 91fae56573..33e0c8724c 100644
index e7c9edd033..3b0a47a28c 100644
--- a/hw/ppc/spapr_cpu_core.c
+++ b/hw/ppc/spapr_cpu_core.c
@@ -386,10 +386,12 @@ static const TypeInfo spapr_cpu_core_type_infos[] = {
@@ -389,10 +389,12 @@ static const TypeInfo spapr_cpu_core_type_infos[] = {
.instance_size = sizeof(SpaprCpuCore),
.class_size = sizeof(SpaprCpuCoreClass),
},
@ -569,16 +534,16 @@ index 91fae56573..33e0c8724c 100644
DEFINE_SPAPR_CPU_CORE_TYPE("970_v2.2"),
DEFINE_SPAPR_CPU_CORE_TYPE("970mp_v1.0"),
DEFINE_SPAPR_CPU_CORE_TYPE("970mp_v1.1"),
DEFINE_SPAPR_CPU_CORE_TYPE("power5+_v2.1"),
DEFINE_SPAPR_CPU_CORE_TYPE("power5p_v2.1"),
+#endif
DEFINE_SPAPR_CPU_CORE_TYPE("power7_v2.3"),
DEFINE_SPAPR_CPU_CORE_TYPE("power7+_v2.1"),
DEFINE_SPAPR_CPU_CORE_TYPE("power7p_v2.1"),
DEFINE_SPAPR_CPU_CORE_TYPE("power8_v2.0"),
diff --git a/hw/usb/meson.build b/hw/usb/meson.build
index e94149ebde..4a8adbf3dc 100644
index aac3bb35f2..5411ff35df 100644
--- a/hw/usb/meson.build
+++ b/hw/usb/meson.build
@@ -52,7 +52,7 @@ system_ss.add(when: 'CONFIG_USB_SMARTCARD', if_true: files('dev-smartcard-reader
@@ -55,7 +55,7 @@ system_ss.add(when: 'CONFIG_USB_SMARTCARD', if_true: files('dev-smartcard-reader
if cacard.found()
usbsmartcard_ss = ss.source_set()
usbsmartcard_ss.add(when: 'CONFIG_USB_SMARTCARD',
@ -588,26 +553,34 @@ index e94149ebde..4a8adbf3dc 100644
endif
diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
index c0055a7832..12e1d6c67e 100644
index d7f18c96e6..aaabbb8b0b 100644
--- a/hw/virtio/meson.build
+++ b/hw/virtio/meson.build
@@ -17,8 +17,9 @@ if have_vhost
if have_vhost_user
# fixme - this really should be generic
specific_virtio_ss.add(files('vhost-user.c'))
@@ -20,7 +20,8 @@ if have_vhost
system_virtio_ss.add(files('vhost-user-base.c'))
# MMIO Stubs
- system_virtio_ss.add(files('vhost-user-device.c'))
- system_virtio_ss.add(when: 'CONFIG_VIRTIO_PCI', if_true: files('vhost-user-device-pci.c'))
+# Disabled for 8.2.0 rebase for RHEL 9.4.0
+# system_virtio_ss.add(files('vhost-user-device.c'))
system_virtio_ss.add(when: 'CONFIG_VHOST_USER_GPIO', if_true: files('vhost-user-gpio.c'))
system_virtio_ss.add(when: 'CONFIG_VHOST_USER_I2C', if_true: files('vhost-user-i2c.c'))
system_virtio_ss.add(when: 'CONFIG_VHOST_USER_RNG', if_true: files('vhost-user-rng.c'))
@@ -28,7 +29,8 @@ if have_vhost
system_virtio_ss.add(when: 'CONFIG_VHOST_USER_INPUT', if_true: files('vhost-user-input.c'))
# PCI Stubs
- system_virtio_ss.add(when: 'CONFIG_VIRTIO_PCI', if_true: files('vhost-user-device-pci.c'))
+# Disabled for 8.2.0 rebase for RHEL 9.4.0
+# system_virtio_ss.add(when: 'CONFIG_VIRTIO_PCI', if_true: files('vhost-user-device-pci.c'))
endif
if have_vhost_vdpa
system_virtio_ss.add(files('vhost-vdpa.c'))
system_virtio_ss.add(when: ['CONFIG_VIRTIO_PCI', 'CONFIG_VHOST_USER_GPIO'],
if_true: files('vhost-user-gpio-pci.c'))
system_virtio_ss.add(when: ['CONFIG_VIRTIO_PCI', 'CONFIG_VHOST_USER_I2C'],
diff --git a/target/arm/arm-qmp-cmds.c b/target/arm/arm-qmp-cmds.c
index b53d5efe13..64989a02d1 100644
index 3cc8cc738b..6f21fea1f5 100644
--- a/target/arm/arm-qmp-cmds.c
+++ b/target/arm/arm-qmp-cmds.c
@@ -231,6 +231,7 @@ CpuModelExpansionInfo *qmp_query_cpu_model_expansion(CpuModelExpansionType type,
@@ -223,6 +223,7 @@ CpuModelExpansionInfo *qmp_query_cpu_model_expansion(CpuModelExpansionType type,
static void arm_cpu_add_definition(gpointer data, gpointer user_data)
{
ObjectClass *oc = data;
@ -615,19 +588,19 @@ index b53d5efe13..64989a02d1 100644
CpuDefinitionInfoList **cpu_list = user_data;
CpuDefinitionInfo *info;
const char *typename;
@@ -240,6 +241,7 @@ static void arm_cpu_add_definition(gpointer data, gpointer user_data)
info->name = g_strndup(typename,
strlen(typename) - strlen("-" TYPE_ARM_CPU));
@@ -231,6 +232,7 @@ static void arm_cpu_add_definition(gpointer data, gpointer user_data)
info = g_malloc0(sizeof(*info));
info->name = cpu_model_from_type(typename);
info->q_typename = g_strdup(typename);
+ info->deprecated = !!cc->deprecation_note;
QAPI_LIST_PREPEND(*cpu_list, info);
}
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index efb22a87f9..a32521ada9 100644
index ab8d007a86..e5dce20f19 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -2524,6 +2524,10 @@ static void cpu_register_class_init(ObjectClass *oc, void *data)
@@ -2546,6 +2546,10 @@ static void cpu_register_class_init(ObjectClass *oc, void *data)
acc->info = data;
cc->gdb_core_xml_file = "arm-core.xml";
@ -639,10 +612,10 @@ index efb22a87f9..a32521ada9 100644
void arm_cpu_register(const ARMCPUInfo *info)
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index a0282e0d28..7e0f0dfea7 100644
index bc0c84873f..e9472c8bb8 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -34,6 +34,8 @@
@@ -37,6 +37,8 @@
#define KVM_HAVE_MCE_INJECTION 1
#endif
@ -651,7 +624,7 @@ index a0282e0d28..7e0f0dfea7 100644
#define EXCP_UDEF 1 /* undefined instruction */
#define EXCP_SWI 2 /* software interrupt */
#define EXCP_PREFETCH_ABORT 3
@@ -1120,6 +1122,7 @@ typedef struct ARMCPUInfo {
@@ -1092,6 +1094,7 @@ typedef struct ARMCPUInfo {
const char *name;
void (*initfn)(Object *obj);
void (*class_init)(ObjectClass *oc, void *data);
@ -660,7 +633,7 @@ index a0282e0d28..7e0f0dfea7 100644
/**
diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c
index 1e9c6c85ae..10be900803 100644
index 985b1efe16..46a4e80171 100644
--- a/target/arm/cpu64.c
+++ b/target/arm/cpu64.c
@@ -648,6 +648,7 @@ static void aarch64_a57_initfn(Object *obj)
@ -692,7 +665,7 @@ index 1e9c6c85ae..10be900803 100644
{ .name = "max", .initfn = aarch64_max_initfn },
#if defined(CONFIG_KVM) || defined(CONFIG_HVF)
{ .name = "host", .initfn = aarch64_host_initfn },
@@ -815,8 +820,13 @@ static void aarch64_cpu_instance_init(Object *obj)
@@ -814,8 +819,13 @@ static void aarch64_cpu_instance_init(Object *obj)
static void cpu_register_class_init(ObjectClass *oc, void *data)
{
ARMCPUClass *acc = ARM_CPU_CLASS(oc);
@ -707,24 +680,24 @@ index 1e9c6c85ae..10be900803 100644
void aarch64_cpu_register(const ARMCPUInfo *info)
diff --git a/target/arm/tcg/cpu32.c b/target/arm/tcg/cpu32.c
index d9e0e2a4dd..c5c639a6ea 100644
index de8f2be941..8896295ae3 100644
--- a/target/arm/tcg/cpu32.c
+++ b/target/arm/tcg/cpu32.c
@@ -98,6 +98,7 @@ void aa32_max_features(ARMCPU *cpu)
@@ -92,6 +92,7 @@ void aa32_max_features(ARMCPU *cpu)
cpu->isar.id_dfr1 = t;
}
+#if 0 /* Disabled for Red Hat Enterprise Linux */
/* CPU models. These are not needed for the AArch64 linux-user build. */
#if !defined(CONFIG_USER_ONLY) || !defined(TARGET_AARCH64)
+#if 0 /* Disabled for Red Hat Enterprise Linux */
#if !defined(CONFIG_USER_ONLY)
static bool arm_v7m_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
{
@@ -1189,3 +1190,4 @@ static void arm_tcg_cpu_register_types(void)
@@ -1037,3 +1038,4 @@ static void arm_tcg_cpu_register_types(void)
type_init(arm_tcg_cpu_register_types)
#endif /* !CONFIG_USER_ONLY || !TARGET_AARCH64 */
+#endif /* disabled for RHEL */
diff --git a/target/arm/tcg/cpu64.c b/target/arm/tcg/cpu64.c
index fcda99e158..bd5a993ff8 100644
index 9f7a9f3d2c..7ec6851c9c 100644
--- a/target/arm/tcg/cpu64.c
+++ b/target/arm/tcg/cpu64.c
@@ -29,6 +29,7 @@
@ -759,7 +732,7 @@ index fcda99e158..bd5a993ff8 100644
/*
* -cpu max: a CPU with as many features enabled as our emulation supports.
@@ -1259,6 +1263,7 @@ void aarch64_max_tcg_initfn(Object *obj)
@@ -1271,6 +1275,7 @@ void aarch64_max_tcg_initfn(Object *obj)
qdev_property_add_static(DEVICE(obj), &arm_cpu_lpa2_property);
}
@ -767,7 +740,7 @@ index fcda99e158..bd5a993ff8 100644
static const ARMCPUInfo aarch64_cpus[] = {
{ .name = "cortex-a35", .initfn = aarch64_a35_initfn },
{ .name = "cortex-a55", .initfn = aarch64_a55_initfn },
@@ -1270,14 +1275,17 @@ static const ARMCPUInfo aarch64_cpus[] = {
@@ -1282,14 +1287,17 @@ static const ARMCPUInfo aarch64_cpus[] = {
{ .name = "neoverse-v1", .initfn = aarch64_neoverse_v1_initfn },
{ .name = "neoverse-n2", .initfn = aarch64_neoverse_n2_initfn },
};
@ -785,8 +758,20 @@ index fcda99e158..bd5a993ff8 100644
}
type_init(aarch64_cpu_register_types)
diff --git a/target/arm/tcg/meson.build b/target/arm/tcg/meson.build
index 3b1a9f0fc5..6898b4de6f 100644
--- a/target/arm/tcg/meson.build
+++ b/target/arm/tcg/meson.build
@@ -56,5 +56,5 @@ arm_system_ss.add(files(
'psci.c',
))
-arm_system_ss.add(when: 'CONFIG_ARM_V7M', if_true: files('cpu-v7m.c'))
-arm_user_ss.add(when: 'TARGET_AARCH64', if_false: files('cpu-v7m.c'))
+#arm_system_ss.add(when: 'CONFIG_ARM_V7M', if_true: files('cpu-v7m.c'))
+#arm_user_ss.add(when: 'TARGET_AARCH64', if_false: files('cpu-v7m.c'))
diff --git a/target/ppc/cpu-models.c b/target/ppc/cpu-models.c
index 7dbb47de64..69fddb05bc 100644
index f2301b43f7..f77ebfcc81 100644
--- a/target/ppc/cpu-models.c
+++ b/target/ppc/cpu-models.c
@@ -66,6 +66,7 @@
@ -810,13 +795,13 @@ index 7dbb47de64..69fddb05bc 100644
POWERPC_DEF("970fx_v1.0", CPU_POWERPC_970FX_v10, 970,
@@ -718,6 +721,7 @@
"PowerPC 970MP v1.1")
POWERPC_DEF("power5+_v2.1", CPU_POWERPC_POWER5P_v21, POWER5P,
POWERPC_DEF("power5p_v2.1", CPU_POWERPC_POWER5P_v21, POWER5P,
"POWER5+ v2.1")
+#endif
POWERPC_DEF("power7_v2.3", CPU_POWERPC_POWER7_v23, POWER7,
"POWER7 v2.3")
POWERPC_DEF("power7+_v2.1", CPU_POWERPC_POWER7P_v21, POWER7,
@@ -898,12 +902,15 @@ PowerPCCPUAlias ppc_cpu_aliases[] = {
POWERPC_DEF("power7p_v2.1", CPU_POWERPC_POWER7P_v21, POWER7,
@@ -894,13 +898,16 @@ PowerPCCPUAlias ppc_cpu_aliases[] = {
{ "7447a", "7447a_v1.2" },
{ "7457a", "7457a_v1.2" },
{ "apollo7pm", "7457a_v1.0" },
@ -826,13 +811,14 @@ index 7dbb47de64..69fddb05bc 100644
{ "970", "970_v2.2" },
{ "970fx", "970fx_v3.1" },
{ "970mp", "970mp_v1.1" },
{ "power5+", "power5+_v2.1" },
{ "power5+", "power5p_v2.1" },
{ "power5+_v2.1", "power5p_v2.1" },
{ "power5gs", "power5+_v2.1" },
+#endif
{ "power7", "power7_v2.3" },
{ "power7+", "power7+_v2.1" },
{ "power8e", "power8e_v2.1" },
@@ -913,12 +920,14 @@ PowerPCCPUAlias ppc_cpu_aliases[] = {
{ "power7+", "power7p_v2.1" },
{ "power7+_v2.1", "power7p_v2.1" },
@@ -911,12 +918,14 @@ PowerPCCPUAlias ppc_cpu_aliases[] = {
{ "power10", "power10_v2.0" },
#endif
@ -848,10 +834,10 @@ index 7dbb47de64..69fddb05bc 100644
{ NULL, NULL }
};
diff --git a/target/s390x/cpu_models_sysemu.c b/target/s390x/cpu_models_sysemu.c
index 63981bf36b..87a4480c05 100644
index 2d99218069..0728bfcc20 100644
--- a/target/s390x/cpu_models_sysemu.c
+++ b/target/s390x/cpu_models_sysemu.c
@@ -35,6 +35,9 @@ static void check_unavailable_features(const S390CPUModel *max_model,
@@ -34,6 +34,9 @@ static void check_unavailable_features(const S390CPUModel *max_model,
(max_model->def->gen == model->def->gen &&
max_model->def->ec_ga < model->def->ec_ga)) {
list_add_feat("type", unavailable);
@ -862,10 +848,10 @@ index 63981bf36b..87a4480c05 100644
/* detect missing features if any to properly report them */
diff --git a/target/s390x/kvm/kvm.c b/target/s390x/kvm/kvm.c
index 33ab3551f4..912e493951 100644
index 4ce809c5d4..55fb4855b1 100644
--- a/target/s390x/kvm/kvm.c
+++ b/target/s390x/kvm/kvm.c
@@ -2567,6 +2567,14 @@ void kvm_s390_apply_cpu_model(const S390CPUModel *model, Error **errp)
@@ -2565,6 +2565,14 @@ void kvm_s390_apply_cpu_model(const S390CPUModel *model, Error **errp)
error_setg(errp, "KVM doesn't support CPU models");
return;
}
@ -881,10 +867,10 @@ index 33ab3551f4..912e493951 100644
prop.ibc = s390_ibc_from_cpu_model(model);
/* configure cpu features indicated via STFL(e) */
diff --git a/tests/qtest/arm-cpu-features.c b/tests/qtest/arm-cpu-features.c
index a8a4c668ad..2458cc527c 100644
index 9d6e6190d5..f822526acb 100644
--- a/tests/qtest/arm-cpu-features.c
+++ b/tests/qtest/arm-cpu-features.c
@@ -451,8 +451,10 @@ static void test_query_cpu_model_expansion(const void *data)
@@ -452,8 +452,10 @@ static void test_query_cpu_model_expansion(const void *data)
assert_error(qts, "host", "The CPU type 'host' requires KVM", NULL);
/* Test expected feature presence/absence for some cpu types */
@ -895,7 +881,7 @@ index a8a4c668ad..2458cc527c 100644
/* Enabling and disabling pmu should always work. */
assert_has_feature_enabled(qts, "max", "pmu");
@@ -469,6 +471,7 @@ static void test_query_cpu_model_expansion(const void *data)
@@ -470,6 +472,7 @@ static void test_query_cpu_model_expansion(const void *data)
assert_has_feature_enabled(qts, "cortex-a57", "pmu");
assert_has_feature_enabled(qts, "cortex-a57", "aarch64");
@ -903,7 +889,7 @@ index a8a4c668ad..2458cc527c 100644
assert_has_feature_enabled(qts, "a64fx", "pmu");
assert_has_feature_enabled(qts, "a64fx", "aarch64");
/*
@@ -481,6 +484,7 @@ static void test_query_cpu_model_expansion(const void *data)
@@ -482,6 +485,7 @@ static void test_query_cpu_model_expansion(const void *data)
"{ 'sve384': true }");
assert_error(qts, "a64fx", "cannot enable sve640",
"{ 'sve640': true }");

@ -1,4 +1,4 @@
From d9ff466c980d219ebf230ea24becce294c196f1f Mon Sep 17 00:00:00 2001
From 8e6a30073f9c1a5d6294b2d16556522453e227e7 Mon Sep 17 00:00:00 2001
From: Miroslav Rezanina <mrezanin@redhat.com>
Date: Fri, 11 Jan 2019 09:54:45 +0100
Subject: Machine type related general changes
@ -26,6 +26,12 @@ Rebase notes (7.1.0):
Rebase notes (8.1.0):
- Do not modify unused vga-isa.c
Rebase notes (9.0.0 rc0):
- Updated smbios handling
Rebase notes (9.0.0 rc4):
- Moving downstream compat changes
Merged patches (6.1.0):
- f2fb42a3c6 redhat: add missing entries in hw_compat_rhel_8_4
- 1949ec258e hw/arm/virt: Disable PL011 clock migration through hw_compat_rhel_8_3
@ -61,24 +67,27 @@ Merged patches (8.1.0):
Merged patches (8.2.0):
- 4ee284aca9 Add machine types compat bits. (partial)
Merged patches (9.0.0 rc0):
- 4b8fe42abc virtio-mem: default-enable "dynamic-memslots"
---
hw/acpi/piix4.c | 2 +-
hw/arm/virt.c | 2 +-
hw/core/machine.c | 267 +++++++++++++++++++++++++++++++++++
hw/i386/pc_piix.c | 2 +
hw/i386/pc_q35.c | 2 +
hw/core/machine.c | 269 +++++++++++++++++++++++++++++++++++
hw/i386/fw_cfg.c | 3 +-
hw/net/rtl8139.c | 4 +-
hw/smbios/smbios.c | 46 +++++-
hw/timer/i8254_common.c | 2 +-
hw/usb/hcd-xhci-pci.c | 59 ++++++--
hw/usb/hcd-xhci-pci.h | 1 +
hw/virtio/virtio-mem.c | 3 +-
include/hw/boards.h | 40 ++++++
include/hw/firmware/smbios.h | 5 +-
include/hw/firmware/smbios.h | 4 +-
include/hw/i386/pc.h | 3 +
13 files changed, 413 insertions(+), 22 deletions(-)
13 files changed, 414 insertions(+), 24 deletions(-)
diff --git a/hw/acpi/piix4.c b/hw/acpi/piix4.c
index dd523d2e4c..5050c0ba97 100644
index debe1adb84..e8ddcd716e 100644
--- a/hw/acpi/piix4.c
+++ b/hw/acpi/piix4.c
@@ -245,7 +245,7 @@ static bool vmstate_test_migrate_acpi_index(void *opaque, int version_id)
@ -88,28 +97,28 @@ index dd523d2e4c..5050c0ba97 100644
- .minimum_version_id = 3,
+ .minimum_version_id = 2,
.post_load = vmstate_acpi_post_load,
.fields = (VMStateField[]) {
.fields = (const VMStateField[]) {
VMSTATE_PCI_DEVICE(parent_obj, PIIX4PMState),
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index af9ea4dd1c..62f0f7d4d6 100644
index 6c6d155002..36e9b4b4e9 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -1638,7 +1638,7 @@ static void virt_build_smbios(VirtMachineState *vms)
@@ -1651,7 +1651,7 @@ static void virt_build_smbios(VirtMachineState *vms)
smbios_set_defaults("QEMU", product,
vmc->smbios_old_sys_ver ? "1.0" : mc->name, false,
- true, SMBIOS_ENTRY_POINT_TYPE_64);
+ true, NULL, NULL, SMBIOS_ENTRY_POINT_TYPE_64);
vmc->smbios_old_sys_ver ? "1.0" : mc->name,
- true);
+ true, NULL, NULL);
/* build the array of physical mem area from base_memmap */
mem_array.address = vms->memmap[VIRT_MEM].base;
diff --git a/hw/core/machine.c b/hw/core/machine.c
index 0c17398141..446601ee30 100644
index 37ede0e7d4..695cb89a46 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -57,6 +57,273 @@ GlobalProperty hw_compat_7_2[] = {
@@ -296,6 +296,275 @@ GlobalProperty hw_compat_2_1[] = {
};
const size_t hw_compat_7_2_len = G_N_ELEMENTS(hw_compat_7_2);
const size_t hw_compat_2_1_len = G_N_ELEMENTS(hw_compat_2_1);
+/*
+ * RHEL only: machine types for previous major releases are deprecated
@ -132,6 +141,8 @@ index 0c17398141..446601ee30 100644
+ { "vfio-pci-nohotplug", "x-ramfb-migrate", "off" },
+ /* hw_compat_rhel_9_4 from hw_compat_8_1 */
+ { "igb", "x-pcie-flr-init", "off" },
+ /* hw_compat_rhel_9_4 jira RHEL-24045 */
+ { "virtio-mem", "dynamic-memslots", "off" },
+};
+const size_t hw_compat_rhel_9_4_len = G_N_ELEMENTS(hw_compat_rhel_9_4);
+
@ -378,37 +389,25 @@ index 0c17398141..446601ee30 100644
+};
+const size_t hw_compat_rhel_7_6_len = G_N_ELEMENTS(hw_compat_rhel_7_6);
+
GlobalProperty hw_compat_7_1[] = {
{ "virtio-device", "queue_reset", "false" },
{ "virtio-rng-pci", "vectors", "0" },
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index eace854335..2a9f465619 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -238,6 +238,8 @@ static void pc_init1(MachineState *machine,
smbios_set_defaults("QEMU", mc->desc,
mc->name, pcmc->smbios_legacy_mode,
pcmc->smbios_uuid_encoded,
+ pcmc->smbios_stream_product,
+ pcmc->smbios_stream_version,
pcms->smbios_entry_point_type);
}
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 4f3e5412f6..912cb0c0dc 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -206,6 +206,8 @@ static void pc_q35_init(MachineState *machine)
smbios_set_defaults("QEMU", mc->desc,
mc->name, pcmc->smbios_legacy_mode,
pcmc->smbios_uuid_encoded,
+ pcmc->smbios_stream_product,
+ pcmc->smbios_stream_version,
pcms->smbios_entry_point_type);
MachineState *current_machine;
static char *machine_get_kernel(Object *obj, Error **errp)
diff --git a/hw/i386/fw_cfg.c b/hw/i386/fw_cfg.c
index d802d2787f..c7aa39a13e 100644
--- a/hw/i386/fw_cfg.c
+++ b/hw/i386/fw_cfg.c
@@ -64,7 +64,8 @@ void fw_cfg_build_smbios(PCMachineState *pcms, FWCfgState *fw_cfg,
if (pcmc->smbios_defaults) {
/* These values are guest ABI, do not change */
smbios_set_defaults("QEMU", mc->desc, mc->name,
- pcmc->smbios_uuid_encoded);
+ pcmc->smbios_uuid_encoded,
+ pcmc->smbios_stream_product, pcmc->smbios_stream_version);
}
/* tell smbios about cpuid version and features */
diff --git a/hw/net/rtl8139.c b/hw/net/rtl8139.c
index 4af8c66266..7dc12907ab 100644
index 897c86ec41..2d0db43f49 100644
--- a/hw/net/rtl8139.c
+++ b/hw/net/rtl8139.c
@@ -3169,7 +3169,7 @@ static int rtl8139_pre_save(void *opaque)
@ -431,20 +430,21 @@ index 4af8c66266..7dc12907ab 100644
VMSTATE_UINT16(tally_counters.TxUndrn, RTL8139State),
diff --git a/hw/smbios/smbios.c b/hw/smbios/smbios.c
index 2a90601ac5..7bde23e59d 100644
index eed5787b15..68608a3403 100644
--- a/hw/smbios/smbios.c
+++ b/hw/smbios/smbios.c
@@ -58,6 +58,9 @@ static bool smbios_legacy = true;
static bool smbios_uuid_encoded = true;
/* end: legacy structures & constants for <= 2.0 machines */
@@ -39,6 +39,10 @@ size_t usr_blobs_len;
static unsigned usr_table_max;
static unsigned usr_table_cnt;
+/* Set to true for modern Windows 10 HardwareID-6 compat */
+static bool smbios_type2_required;
+
+
uint8_t *smbios_tables;
size_t smbios_tables_len;
@@ -670,7 +673,7 @@ static void smbios_build_type_1_table(void)
unsigned smbios_table_max;
@@ -629,7 +633,7 @@ static void smbios_build_type_1_table(void)
static void smbios_build_type_2_table(void)
{
@ -453,21 +453,17 @@ index 2a90601ac5..7bde23e59d 100644
SMBIOS_TABLE_SET_STR(2, manufacturer_str, type2.manufacturer);
SMBIOS_TABLE_SET_STR(2, product_str, type2.product);
@@ -985,7 +988,10 @@ void smbios_set_cpuid(uint32_t version, uint32_t features)
@@ -1018,16 +1022,52 @@ void smbios_set_default_processor_family(uint16_t processor_family)
void smbios_set_defaults(const char *manufacturer, const char *product,
const char *version, bool legacy_mode,
- bool uuid_encoded, SmbiosEntryPointType ep_type)
const char *version,
- bool uuid_encoded)
+ bool uuid_encoded,
+ const char *stream_product,
+ const char *stream_version,
+ SmbiosEntryPointType ep_type)
+ const char *stream_version)
{
smbios_have_defaults = true;
smbios_legacy = legacy_mode;
@@ -1006,11 +1012,45 @@ void smbios_set_defaults(const char *manufacturer, const char *product,
g_free(smbios_entries);
}
smbios_uuid_encoded = uuid_encoded;
+ /*
+ * If @stream_product & @stream_version are non-NULL, then
@ -494,12 +490,12 @@ index 2a90601ac5..7bde23e59d 100644
+ *
+ * We get 'System Manufacturer' and 'Baseboard Manufacturer'
+ */
SMBIOS_SET_DEFAULT(type1.manufacturer, manufacturer);
SMBIOS_SET_DEFAULT(type1.product, product);
SMBIOS_SET_DEFAULT(type1.version, version);
+ SMBIOS_SET_DEFAULT(type1.family, "Red Hat Enterprise Linux");
SMBIOS_SET_DEFAULT(smbios_type1.manufacturer, manufacturer);
SMBIOS_SET_DEFAULT(smbios_type1.product, product);
SMBIOS_SET_DEFAULT(smbios_type1.version, version);
+ SMBIOS_SET_DEFAULT(smbios_type1.family, "Red Hat Enterprise Linux");
+ if (stream_version != NULL) {
+ SMBIOS_SET_DEFAULT(type1.sku, stream_version);
+ SMBIOS_SET_DEFAULT(smbios_type1.sku, stream_version);
+ }
SMBIOS_SET_DEFAULT(type2.manufacturer, manufacturer);
- SMBIOS_SET_DEFAULT(type2.product, product);
@ -513,20 +509,20 @@ index 2a90601ac5..7bde23e59d 100644
SMBIOS_SET_DEFAULT(type3.manufacturer, manufacturer);
SMBIOS_SET_DEFAULT(type3.version, version);
diff --git a/hw/timer/i8254_common.c b/hw/timer/i8254_common.c
index b25da448c8..0331e84398 100644
index 28fdabc321..bad13ec224 100644
--- a/hw/timer/i8254_common.c
+++ b/hw/timer/i8254_common.c
@@ -229,7 +229,7 @@ static const VMStateDescription vmstate_pit_common = {
.pre_save = pit_dispatch_pre_save,
.post_load = pit_dispatch_post_load,
.fields = (VMStateField[]) {
.fields = (const VMStateField[]) {
- VMSTATE_UINT32_V(channels[0].irq_disabled, PITCommonState, 3),
+ VMSTATE_UINT32(channels[0].irq_disabled, PITCommonState), /* qemu-kvm's v2 had 'flags' here */
VMSTATE_STRUCT_ARRAY(channels, PITCommonState, 3, 2,
vmstate_pit_channel, PITChannelState),
VMSTATE_INT64(channels[0].next_transition_time,
diff --git a/hw/usb/hcd-xhci-pci.c b/hw/usb/hcd-xhci-pci.c
index 643d4643e4..529bad9366 100644
index 4423983308..43b4b71fdf 100644
--- a/hw/usb/hcd-xhci-pci.c
+++ b/hw/usb/hcd-xhci-pci.c
@@ -104,6 +104,33 @@ static int xhci_pci_vmstate_post_load(void *opaque, int version_id)
@ -636,11 +632,26 @@ index 08f70ce97c..1be7527c1b 100644
} XHCIPciState;
#endif
diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
index ffd119ebac..0e2be2219c 100644
--- a/hw/virtio/virtio-mem.c
+++ b/hw/virtio/virtio-mem.c
@@ -1694,8 +1694,9 @@ static Property virtio_mem_properties[] = {
#endif
DEFINE_PROP_BOOL(VIRTIO_MEM_EARLY_MIGRATION_PROP, VirtIOMEM,
early_migration, true),
+ /* RHEL: default-enable "dynamic-memslots" (jira RHEL-24045) */
DEFINE_PROP_BOOL(VIRTIO_MEM_DYNAMIC_MEMSLOTS_PROP, VirtIOMEM,
- dynamic_memslots, false),
+ dynamic_memslots, true),
DEFINE_PROP_END_OF_LIST(),
};
diff --git a/include/hw/boards.h b/include/hw/boards.h
index da85f86efb..4a21eddbf9 100644
index 8b8f6d5c00..0466f9d0f3 100644
--- a/include/hw/boards.h
+++ b/include/hw/boards.h
@@ -503,4 +503,44 @@ extern const size_t hw_compat_2_2_len;
@@ -512,4 +512,44 @@ extern const size_t hw_compat_2_2_len;
extern GlobalProperty hw_compat_2_1[];
extern const size_t hw_compat_2_1_len;
@ -686,26 +697,25 @@ index da85f86efb..4a21eddbf9 100644
+extern const char *rhel_old_machine_deprecation;
#endif
diff --git a/include/hw/firmware/smbios.h b/include/hw/firmware/smbios.h
index 7f3259a630..d24b3ccd32 100644
index 8d3fb2fb3b..d9d6d7a169 100644
--- a/include/hw/firmware/smbios.h
+++ b/include/hw/firmware/smbios.h
@@ -294,7 +294,10 @@ void smbios_entry_add(QemuOpts *opts, Error **errp);
@@ -332,7 +332,9 @@ void smbios_entry_add(QemuOpts *opts, Error **errp);
void smbios_set_cpuid(uint32_t version, uint32_t features);
void smbios_set_defaults(const char *manufacturer, const char *product,
const char *version, bool legacy_mode,
- bool uuid_encoded, SmbiosEntryPointType ep_type);
const char *version,
- bool uuid_encoded);
+ bool uuid_encoded,
+ const char *stream_product,
+ const char *stream_version,
+ SmbiosEntryPointType ep_type);
uint8_t *smbios_get_table_legacy(MachineState *ms, size_t *length);
+ const char *stream_version);
void smbios_set_default_processor_family(uint16_t processor_family);
uint8_t *smbios_get_table_legacy(size_t *length, Error **errp);
void smbios_get_tables(MachineState *ms,
const struct smbios_phys_mem_area *mem_array,
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index a10ceeabbf..037942d233 100644
index 27a68071d7..ebd8f973f2 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -113,6 +113,9 @@ struct PCMachineClass {
@@ -112,6 +112,9 @@ struct PCMachineClass {
bool smbios_legacy_mode;
bool smbios_uuid_encoded;
SmbiosEntryPointType default_smbios_ep_type;

@ -1,4 +1,4 @@
From 23f614ab0b79ec1c6f65a7f0d6993bfdfc53fd23 Mon Sep 17 00:00:00 2001
From cf398296f3fcee185a00f23de5deae57c97d648e Mon Sep 17 00:00:00 2001
From: Miroslav Rezanina <mrezanin@redhat.com>
Date: Fri, 19 Oct 2018 12:53:31 +0200
Subject: Add aarch64 machine types
@ -30,6 +30,9 @@ Rebase notes (8.0.0):
Rebase notes (8.1.0):
- Added setting default_nic
Rebase notes (9.0.0 rc0):
- call arm_virt_compat_set on rhel type class_init
Merged patches (6.2.0):
- 9a3d4fde0e hw/arm/virt: Remove 9.0 machine type
- f7d04d6695 hw: arm: virt: Add hw_compat_rhel_8_5 to 8.5 machine type
@ -64,34 +67,67 @@ Merged patches (8.1.0):
Merged patches (8.2.0):
- 4ee284aca9 Add machine types compat bits. (partial)
Merged patches (9.0.0 rc0):
- 117068376a hw/arm/virt: Fix compats
- 8bcccfabc4 hw/arm/virt: Add properties to disable high memory regions
- 0005a8b93a hw/arm/virt: deprecate virt-rhel9.{0,2}.0 machine types
---
hw/arm/virt.c | 250 +++++++++++++++++++++++++++++++++++++++++-
hw/arm/virt.c | 299 +++++++++++++++++++++++++++++++++++++++++-
include/hw/arm/virt.h | 8 ++
2 files changed, 257 insertions(+), 1 deletion(-)
2 files changed, 306 insertions(+), 1 deletion(-)
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 62f0f7d4d6..c541efee5e 100644
index 36e9b4b4e9..22bc345137 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -82,6 +82,7 @@
#include "hw/char/pl011.h"
#include "qemu/guest-random.h"
@@ -101,6 +101,7 @@ static void arm_virt_compat_set(MachineClass *mc)
arm_virt_compat_len);
}
+#if 0 /* Disabled for Red Hat Enterprise Linux */
#define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \
static void virt_##major##_##minor##_class_init(ObjectClass *oc, \
void *data) \
@@ -108,7 +109,48 @@
@@ -128,7 +129,63 @@ static void arm_virt_compat_set(MachineClass *mc)
DEFINE_VIRT_MACHINE_LATEST(major, minor, true)
#define DEFINE_VIRT_MACHINE(major, minor) \
DEFINE_VIRT_MACHINE_LATEST(major, minor, false)
+#endif /* disabled for RHEL */
+
+/*
+ * This variable is for changes to properties that are RHEL specific,
+ * different to the current upstream and to be applied to the latest
+ * machine type. They may be overridden by older machine compats.
+ *
+ * virtio-net-pci variant romfiles are not needed because edk2 fully
+ * supports PXE boot. Besides, virtio romfiles are not shipped on
+ * RHEL/aarch64.
+ */
+GlobalProperty arm_rhel_compat[] = {
+ {"virtio-net-pci", "romfile", "" },
+ {"virtio-net-pci-transitional", "romfile", "" },
+ {"virtio-net-pci-non-transitional", "romfile", "" },
+};
+const size_t arm_rhel_compat_len = G_N_ELEMENTS(arm_rhel_compat);
+/*
+ * This cannot be called from rhel_virt_class_init() because
+ * TYPE_RHEL_MACHINE is abstract and mc->compat_props is only allocated
+ * (g_ptr_array_new()) during non-abstract virt-rhelm.n.s class init.
+ */
+static void arm_rhel_compat_set(MachineClass *mc)
+{
+ compat_props_add(mc->compat_props, arm_rhel_compat,
+ arm_rhel_compat_len);
+}
+
+#define DEFINE_RHEL_MACHINE_LATEST(m, n, s, latest) \
+ static void rhel##m##n##s##_virt_class_init(ObjectClass *oc, \
+ void *data) \
+ { \
+ MachineClass *mc = MACHINE_CLASS(oc); \
+ arm_rhel_compat_set(mc); \
+ rhel##m##n##s##_virt_options(mc); \
+ mc->desc = "RHEL " # m "." # n "." # s " ARM Virtual Machine"; \
+ if (latest) { \
@ -114,23 +150,10 @@ index 62f0f7d4d6..c541efee5e 100644
+ DEFINE_RHEL_MACHINE_LATEST(major, minor, subminor, true)
+#define DEFINE_RHEL_MACHINE(major, minor, subminor) \
+ DEFINE_RHEL_MACHINE_LATEST(major, minor, subminor, false)
+
+/* This variable is for changes to properties that are RHEL specific,
+ * different to the current upstream and to be applied to the latest
+ * machine type.
+ */
+GlobalProperty arm_rhel_compat[] = {
+ {
+ .driver = "virtio-net-pci",
+ .property = "romfile",
+ .value = "",
+ },
+};
+const size_t arm_rhel_compat_len = G_N_ELEMENTS(arm_rhel_compat);
/* Number of external interrupt lines to configure the GIC with */
#define NUM_IRQS 256
@@ -2341,6 +2383,7 @@ static void machvirt_init(MachineState *machine)
@@ -2355,6 +2412,7 @@ static void machvirt_init(MachineState *machine)
qemu_add_machine_init_done_notifier(&vms->machine_done);
}
@ -138,7 +161,7 @@ index 62f0f7d4d6..c541efee5e 100644
static bool virt_get_secure(Object *obj, Error **errp)
{
VirtMachineState *vms = VIRT_MACHINE(obj);
@@ -2368,6 +2411,7 @@ static void virt_set_virt(Object *obj, bool value, Error **errp)
@@ -2382,6 +2440,7 @@ static void virt_set_virt(Object *obj, bool value, Error **errp)
vms->virt = value;
}
@ -146,7 +169,7 @@ index 62f0f7d4d6..c541efee5e 100644
static bool virt_get_highmem(Object *obj, Error **errp)
{
@@ -2383,6 +2427,7 @@ static void virt_set_highmem(Object *obj, bool value, Error **errp)
@@ -2397,6 +2456,7 @@ static void virt_set_highmem(Object *obj, bool value, Error **errp)
vms->highmem = value;
}
@ -154,16 +177,23 @@ index 62f0f7d4d6..c541efee5e 100644
static bool virt_get_compact_highmem(Object *obj, Error **errp)
{
VirtMachineState *vms = VIRT_MACHINE(obj);
@@ -2438,7 +2483,7 @@ static void virt_set_highmem_mmio(Object *obj, bool value, Error **errp)
@@ -2410,6 +2470,7 @@ static void virt_set_compact_highmem(Object *obj, bool value, Error **errp)
vms->highmem_mmio = value;
vms->highmem_compact = value;
}
-
+#endif /* disabled for RHEL */
static bool virt_get_highmem_redists(Object *obj, Error **errp)
{
@@ -2453,7 +2514,6 @@ static void virt_set_highmem_mmio(Object *obj, bool value, Error **errp)
vms->highmem_mmio = value;
}
-
static bool virt_get_its(Object *obj, Error **errp)
{
@@ -2454,6 +2499,7 @@ static void virt_set_its(Object *obj, bool value, Error **errp)
VirtMachineState *vms = VIRT_MACHINE(obj);
@@ -2468,6 +2528,7 @@ static void virt_set_its(Object *obj, bool value, Error **errp)
vms->its = value;
}
@ -171,7 +201,7 @@ index 62f0f7d4d6..c541efee5e 100644
static bool virt_get_dtb_randomness(Object *obj, Error **errp)
{
VirtMachineState *vms = VIRT_MACHINE(obj);
@@ -2467,6 +2513,7 @@ static void virt_set_dtb_randomness(Object *obj, bool value, Error **errp)
@@ -2481,6 +2542,7 @@ static void virt_set_dtb_randomness(Object *obj, bool value, Error **errp)
vms->dtb_randomness = value;
}
@ -179,7 +209,7 @@ index 62f0f7d4d6..c541efee5e 100644
static char *virt_get_oem_id(Object *obj, Error **errp)
{
@@ -2550,6 +2597,7 @@ static void virt_set_ras(Object *obj, bool value, Error **errp)
@@ -2564,6 +2626,7 @@ static void virt_set_ras(Object *obj, bool value, Error **errp)
vms->ras = value;
}
@ -187,7 +217,7 @@ index 62f0f7d4d6..c541efee5e 100644
static bool virt_get_mte(Object *obj, Error **errp)
{
VirtMachineState *vms = VIRT_MACHINE(obj);
@@ -2563,6 +2611,7 @@ static void virt_set_mte(Object *obj, bool value, Error **errp)
@@ -2577,6 +2640,7 @@ static void virt_set_mte(Object *obj, bool value, Error **errp)
vms->mte = value;
}
@ -195,7 +225,7 @@ index 62f0f7d4d6..c541efee5e 100644
static char *virt_get_gic_version(Object *obj, Error **errp)
{
@@ -2935,6 +2984,7 @@ static int virt_kvm_type(MachineState *ms, const char *type_str)
@@ -2949,6 +3013,7 @@ static int virt_kvm_type(MachineState *ms, const char *type_str)
return fixed_ipa ? 0 : requested_pa_size;
}
@ -203,7 +233,7 @@ index 62f0f7d4d6..c541efee5e 100644
static void virt_machine_class_init(ObjectClass *oc, void *data)
{
MachineClass *mc = MACHINE_CLASS(oc);
@@ -3405,3 +3455,201 @@ static void virt_machine_2_6_options(MachineClass *mc)
@@ -3463,3 +3528,235 @@ static void virt_machine_2_6_options(MachineClass *mc)
vmc->no_pmu = true;
}
DEFINE_VIRT_MACHINE(2, 6)
@ -213,6 +243,7 @@ index 62f0f7d4d6..c541efee5e 100644
+{
+ MachineClass *mc = MACHINE_CLASS(oc);
+ HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc);
+ arm_virt_compat_set(mc);
+
+ mc->family = "virt-rhel-Z";
+ mc->init = machvirt_init;
@ -258,6 +289,28 @@ index 62f0f7d4d6..c541efee5e 100644
+ "Set on/off to enable/disable using "
+ "physical address space above 32 bits");
+
+ object_class_property_add_bool(oc, "highmem-redists",
+ virt_get_highmem_redists,
+ virt_set_highmem_redists);
+ object_class_property_set_description(oc, "highmem-redists",
+ "Set on/off to enable/disable high "
+ "memory region for GICv3 or GICv4 "
+ "redistributor");
+
+ object_class_property_add_bool(oc, "highmem-ecam",
+ virt_get_highmem_ecam,
+ virt_set_highmem_ecam);
+ object_class_property_set_description(oc, "highmem-ecam",
+ "Set on/off to enable/disable high "
+ "memory region for PCI ECAM");
+
+ object_class_property_add_bool(oc, "highmem-mmio",
+ virt_get_highmem_mmio,
+ virt_set_highmem_mmio);
+ object_class_property_set_description(oc, "highmem-mmio",
+ "Set on/off to enable/disable high "
+ "memory region for PCI MMIO");
+
+ object_class_property_add_str(oc, "gic-version", virt_get_gic_version,
+ virt_set_gic_version);
+ object_class_property_set_description(oc, "gic-version",
@ -382,14 +435,24 @@ index 62f0f7d4d6..c541efee5e 100644
+}
+type_init(rhel_machine_init);
+
+static void rhel940_virt_options(MachineClass *mc)
+{
+}
+DEFINE_RHEL_MACHINE_AS_LATEST(9, 4, 0)
+
+static void rhel920_virt_options(MachineClass *mc)
+{
+ compat_props_add(mc->compat_props, arm_rhel_compat, arm_rhel_compat_len);
+ rhel940_virt_options(mc);
+
+ compat_props_add(mc->compat_props, hw_compat_rhel_9_4, hw_compat_rhel_9_4_len);
+ compat_props_add(mc->compat_props, hw_compat_rhel_9_3, hw_compat_rhel_9_3_len);
+ compat_props_add(mc->compat_props, hw_compat_rhel_9_2, hw_compat_rhel_9_2_len);
+
+ /* RHEL 9.4 is the first supported release */
+ mc->deprecation_reason =
+ "machine types for versions prior to 9.4 are deprecated";
+}
+DEFINE_RHEL_MACHINE_AS_LATEST(9, 2, 0)
+DEFINE_RHEL_MACHINE(9, 2, 0)
+
+static void rhel900_virt_options(MachineClass *mc)
+{
@ -398,6 +461,7 @@ index 62f0f7d4d6..c541efee5e 100644
+ rhel920_virt_options(mc);
+
+ compat_props_add(mc->compat_props, hw_compat_rhel_9_1, hw_compat_rhel_9_1_len);
+ compat_props_add(mc->compat_props, hw_compat_rhel_9_0, hw_compat_rhel_9_0_len);
+
+ /* Disable FEAT_LPA2 since old kernels (<= v5.12) don't boot with that feature */
+ vmc->no_tcg_lpa2 = true;
@ -406,10 +470,10 @@ index 62f0f7d4d6..c541efee5e 100644
+}
+DEFINE_RHEL_MACHINE(9, 0, 0)
diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
index f69239850e..7b8abe5645 100644
index bb486d36b1..237fc77bda 100644
--- a/include/hw/arm/virt.h
+++ b/include/hw/arm/virt.h
@@ -177,9 +177,17 @@ struct VirtMachineState {
@@ -179,9 +179,17 @@ struct VirtMachineState {
#define VIRT_ECAM_ID(high) (high ? VIRT_HIGH_PCIE_ECAM : VIRT_PCIE_ECAM)

@ -1,4 +1,4 @@
From d03cff85f5f1b69b1a66011ebaa974ece81d31bc Mon Sep 17 00:00:00 2001
From fb905dbe5b51ed899062ef99a2dd7f238d3e3384 Mon Sep 17 00:00:00 2001
From: Miroslav Rezanina <mrezanin@redhat.com>
Date: Fri, 19 Oct 2018 13:27:13 +0200
Subject: Add ppc64 machine types
@ -34,20 +34,20 @@ Merged patches (7.1.0):
8 files changed, 314 insertions(+), 1 deletion(-)
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index df09aa9d6a..ff459e1a46 100644
index e9bc97fee0..a258d81846 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1689,6 +1689,9 @@ static void spapr_machine_reset(MachineState *machine, ShutdownCause reason)
@@ -1718,6 +1718,9 @@ static void spapr_machine_reset(MachineState *machine, ShutdownCause reason)
pef_kvm_reset(machine->cgs, &error_fatal);
spapr_caps_apply(spapr);
spapr_nested_reset(spapr);
+ if (spapr->svm_allowed) {
+ kvmppc_svm_allow(&error_fatal);
+ }
first_ppc_cpu = POWERPC_CPU(first_cpu);
if (kvm_enabled() && kvmppc_has_cap_mmu_radix() &&
@@ -3397,6 +3400,20 @@ static void spapr_set_host_serial(Object *obj, const char *value, Error **errp)
@@ -3421,6 +3424,20 @@ static void spapr_set_host_serial(Object *obj, const char *value, Error **errp)
spapr->host_serial = g_strdup(value);
}
@ -68,7 +68,7 @@ index df09aa9d6a..ff459e1a46 100644
static void spapr_instance_init(Object *obj)
{
SpaprMachineState *spapr = SPAPR_MACHINE(obj);
@@ -3475,6 +3492,12 @@ static void spapr_instance_init(Object *obj)
@@ -3499,6 +3516,12 @@ static void spapr_instance_init(Object *obj)
spapr_get_host_serial, spapr_set_host_serial);
object_property_set_description(obj, "host-serial",
"Host serial number to advertise in guest device tree");
@ -81,7 +81,7 @@ index df09aa9d6a..ff459e1a46 100644
}
static void spapr_machine_finalizefn(Object *obj)
@@ -4734,6 +4757,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data)
@@ -4754,6 +4777,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data)
vmc->client_architecture_support = spapr_vof_client_architecture_support;
vmc->quiesce = spapr_vof_quiesce;
vmc->setprop = spapr_vof_setprop;
@ -89,15 +89,15 @@ index df09aa9d6a..ff459e1a46 100644
}
static const TypeInfo spapr_machine_info = {
@@ -4785,6 +4809,7 @@ static void spapr_machine_latest_class_options(MachineClass *mc)
@@ -4805,6 +4829,7 @@ static void spapr_machine_latest_class_options(MachineClass *mc)
} \
type_init(spapr_machine_register_##suffix)
+#if 0 /* Disabled for Red Hat Enterprise Linux */
/*
* pseries-8.2
* pseries-9.0
*/
@@ -4967,6 +4992,7 @@ static void spapr_machine_4_1_class_options(MachineClass *mc)
@@ -4998,6 +5023,7 @@ static void spapr_machine_4_1_class_options(MachineClass *mc)
}
DEFINE_SPAPR_MACHINE(4_1, "4.1", false);
@ -105,7 +105,7 @@ index df09aa9d6a..ff459e1a46 100644
/*
* pseries-4.0
@@ -4982,6 +5008,8 @@ static bool phb_placement_4_0(SpaprMachineState *spapr, uint32_t index,
@@ -5013,6 +5039,8 @@ static bool phb_placement_4_0(SpaprMachineState *spapr, uint32_t index,
}
return true;
}
@ -114,7 +114,7 @@ index df09aa9d6a..ff459e1a46 100644
static void spapr_machine_4_0_class_options(MachineClass *mc)
{
SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
@@ -5306,6 +5334,221 @@ static void spapr_machine_2_1_class_options(MachineClass *mc)
@@ -5338,6 +5366,221 @@ static void spapr_machine_2_1_class_options(MachineClass *mc)
compat_props_add(mc->compat_props, hw_compat_2_1, hw_compat_2_1_len);
}
DEFINE_SPAPR_MACHINE(2_1, "2.1", false);
@ -337,7 +337,7 @@ index df09aa9d6a..ff459e1a46 100644
static void spapr_machine_register_types(void)
{
diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c
index 33e0c8724c..9d01663f43 100644
index 3b0a47a28c..375e0c8e45 100644
--- a/hw/ppc/spapr_cpu_core.c
+++ b/hw/ppc/spapr_cpu_core.c
@@ -25,6 +25,7 @@
@ -348,7 +348,7 @@ index 33e0c8724c..9d01663f43 100644
static void spapr_reset_vcpu(PowerPCCPU *cpu)
{
@@ -261,6 +262,7 @@ static bool spapr_realize_vcpu(PowerPCCPU *cpu, SpaprMachineState *spapr,
@@ -264,6 +265,7 @@ static bool spapr_realize_vcpu(PowerPCCPU *cpu, SpaprMachineState *spapr,
{
CPUPPCState *env = &cpu->env;
CPUState *cs = CPU(cpu);
@ -356,7 +356,7 @@ index 33e0c8724c..9d01663f43 100644
if (!qdev_realize(DEVICE(cpu), NULL, errp)) {
return false;
@@ -277,6 +279,17 @@ static bool spapr_realize_vcpu(PowerPCCPU *cpu, SpaprMachineState *spapr,
@@ -280,6 +282,17 @@ static bool spapr_realize_vcpu(PowerPCCPU *cpu, SpaprMachineState *spapr,
/* Set time-base frequency to 512 MHz. vhyp must be set first. */
cpu_ppc_tb_init(env, SPAPR_TIMEBASE_FREQ);
@ -375,10 +375,10 @@ index 33e0c8724c..9d01663f43 100644
qdev_unrealize(DEVICE(cpu));
return false;
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index e91791a1a9..1951d8a2a0 100644
index 4aaf23d28f..3233c54d11 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -154,6 +154,7 @@ struct SpaprMachineClass {
@@ -157,6 +157,7 @@ struct SpaprMachineClass {
bool pre_5_2_numa_associativity;
bool pre_6_2_numa_affinity;
@ -386,7 +386,7 @@ index e91791a1a9..1951d8a2a0 100644
bool (*phb_placement)(SpaprMachineState *spapr, uint32_t index,
uint64_t *buid, hwaddr *pio,
hwaddr *mmio32, hwaddr *mmio64,
@@ -256,6 +257,9 @@ struct SpaprMachineState {
@@ -259,6 +260,9 @@ struct SpaprMachineState {
/* Set by -boot */
char *boot_device;
@ -422,10 +422,10 @@ index ebef2cccec..ff2c00c60e 100644
const CompatInfo *compat = compat_by_pvr(compat_pvr);
const CompatInfo *min = compat_by_pvr(min_compat_pvr);
diff --git a/target/ppc/cpu-models.c b/target/ppc/cpu-models.c
index 69fddb05bc..64a05aaef3 100644
index f77ebfcc81..18e9422006 100644
--- a/target/ppc/cpu-models.c
+++ b/target/ppc/cpu-models.c
@@ -748,6 +748,7 @@
@@ -744,6 +744,7 @@
/* PowerPC CPU aliases */
PowerPCCPUAlias ppc_cpu_aliases[] = {
@ -434,10 +434,10 @@ index 69fddb05bc..64a05aaef3 100644
{ "405cr", "405crc" },
{ "405gp", "405gpd" },
diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index f8101ffa29..e799a2bee6 100644
index 67e6b2effd..11187aeb93 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -1635,6 +1635,7 @@ static inline int cpu_mmu_index(CPUPPCState *env, bool ifetch)
@@ -1655,6 +1655,7 @@ static inline int ppc_env_mmu_index(CPUPPCState *env, bool ifetch)
/* Compatibility modes */
#if defined(TARGET_PPC64)
@ -446,7 +446,7 @@ index f8101ffa29..e799a2bee6 100644
uint32_t min_compat_pvr, uint32_t max_compat_pvr);
bool ppc_type_check_compat(const char *cputype, uint32_t compat_pvr,
diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
index 9b1abe2fc4..56f1c46e8e 100644
index 8231feb2d4..59f640cf7b 100644
--- a/target/ppc/kvm.c
+++ b/target/ppc/kvm.c
@@ -89,6 +89,7 @@ static int cap_large_decr;
@ -465,7 +465,7 @@ index 9b1abe2fc4..56f1c46e8e 100644
cap_large_decr = kvmppc_get_dec_bits();
cap_fwnmi = kvm_vm_check_extension(s, KVM_CAP_PPC_FWNMI);
/*
@@ -2579,6 +2581,16 @@ bool kvmppc_supports_ail_3(void)
@@ -2564,6 +2566,16 @@ bool kvmppc_supports_ail_3(void)
return cap_ail_mode_3;
}
@ -482,7 +482,7 @@ index 9b1abe2fc4..56f1c46e8e 100644
PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void)
{
uint32_t host_pvr = mfpvr();
@@ -2979,3 +2991,18 @@ bool kvm_arch_cpu_check_are_resettable(void)
@@ -2964,3 +2976,18 @@ bool kvm_arch_cpu_check_are_resettable(void)
void kvm_arch_accel_class_init(ObjectClass *oc)
{
}

@ -1,4 +1,4 @@
From 3623043d4a923bf9f541d439c76e7874cf0fa81d Mon Sep 17 00:00:00 2001
From 04178c77cfe188b4eed9c08a0bf66842e61fe5dc Mon Sep 17 00:00:00 2001
From: Miroslav Rezanina <mrezanin@redhat.com>
Date: Fri, 19 Oct 2018 13:47:32 +0200
Subject: Add s390x machine types
@ -49,18 +49,18 @@ Merged patches (8.2.0):
4 files changed, 174 insertions(+)
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index 7262725d2e..984891b82a 100644
index b1dcb3857f..ff753a29e0 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -855,6 +855,7 @@ bool css_migration_enabled(void)
@@ -859,6 +859,7 @@ bool css_migration_enabled(void)
} \
type_init(ccw_machine_register_##suffix)
+#if 0 /* Disabled for Red Hat Enterprise Linux */
static void ccw_machine_8_2_instance_options(MachineState *machine)
static void ccw_machine_9_0_instance_options(MachineState *machine)
{
}
@@ -1256,6 +1257,164 @@ static void ccw_machine_2_4_class_options(MachineClass *mc)
@@ -1272,6 +1273,164 @@ static void ccw_machine_2_4_class_options(MachineClass *mc)
compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
}
DEFINE_CCW_MACHINE(2_4, "2.4", false);
@ -226,7 +226,7 @@ index 7262725d2e..984891b82a 100644
static void ccw_machine_register_types(void)
{
diff --git a/target/s390x/cpu_models.c b/target/s390x/cpu_models.c
index a63d990e4e..198b81f2c0 100644
index 8ed3bb6a27..370b3b3065 100644
--- a/target/s390x/cpu_models.c
+++ b/target/s390x/cpu_models.c
@@ -46,6 +46,9 @@
@ -239,7 +239,7 @@ index a63d990e4e..198b81f2c0 100644
static S390CPUDef s390_cpu_defs[] = {
CPUDEF_INIT(0x2064, 7, 1, 38, 0x00000000U, "z900", "IBM zSeries 900 GA1"),
CPUDEF_INIT(0x2064, 7, 2, 38, 0x00000000U, "z900.2", "IBM zSeries 900 GA2"),
@@ -856,22 +859,30 @@ static void s390_host_cpu_model_class_init(ObjectClass *oc, void *data)
@@ -866,22 +869,30 @@ static void s390_host_cpu_model_class_init(ObjectClass *oc, void *data)
static void s390_base_cpu_model_class_init(ObjectClass *oc, void *data)
{
S390CPUClass *xcc = S390_CPU_CLASS(oc);
@ -284,10 +284,10 @@ index d7b8912989..1a806a97c4 100644
/* CPU model based on a CPU definition */
diff --git a/target/s390x/cpu_models_sysemu.c b/target/s390x/cpu_models_sysemu.c
index 87a4480c05..28c1b0486c 100644
index 0728bfcc20..ca2e5d91e2 100644
--- a/target/s390x/cpu_models_sysemu.c
+++ b/target/s390x/cpu_models_sysemu.c
@@ -60,6 +60,7 @@ static void create_cpu_model_list(ObjectClass *klass, void *opaque)
@@ -59,6 +59,7 @@ static void create_cpu_model_list(ObjectClass *klass, void *opaque)
CpuDefinitionInfo *info;
char *name = g_strdup(object_class_get_name(klass));
S390CPUClass *scc = S390_CPU_CLASS(klass);
@ -295,7 +295,7 @@ index 87a4480c05..28c1b0486c 100644
/* strip off the -s390x-cpu */
g_strrstr(name, "-" TYPE_S390_CPU)[0] = 0;
@@ -69,6 +70,7 @@ static void create_cpu_model_list(ObjectClass *klass, void *opaque)
@@ -68,6 +69,7 @@ static void create_cpu_model_list(ObjectClass *klass, void *opaque)
info->migration_safe = scc->is_migration_safe;
info->q_static = scc->is_static;
info->q_typename = g_strdup(object_class_get_name(klass));

@ -1,4 +1,4 @@
From b432505cb28bc3b9b0c1849210ac6c63bca3fe37 Mon Sep 17 00:00:00 2001
From 3c88acb005806ad2386ab6c94a8831151f624738 Mon Sep 17 00:00:00 2001
From: Miroslav Rezanina <mrezanin@redhat.com>
Date: Fri, 19 Oct 2018 13:10:31 +0200
Subject: Add x86_64 machine types
@ -57,23 +57,40 @@ Merged patches (8.1.0):
Merged patches (8.2.0):
- 4ee284aca9 Add machine types compat bits. (partial)
- 719e2ac147 Fix x86 machine type compatibility for qemu-kvm 8.1.0
Merged patches (9.0.0 rc0):
- 9149e2bc8f x86: rhel 9.2.0 machine type compat fix
---
hw/i386/fw_cfg.c | 2 +-
hw/i386/pc.c | 159 ++++++++++++++++++++-
hw/i386/pc_piix.c | 112 ++++++++++++++-
hw/i386/pc_q35.c | 285 ++++++++++++++++++++++++++++++++++++-
hw/i386/pc_piix.c | 109 ++++++++++++++
hw/i386/pc_q35.c | 285 +++++++++++++++++++++++++++++++++++++
include/hw/boards.h | 2 +
include/hw/i386/pc.h | 33 +++++
target/i386/cpu.c | 21 +++
target/i386/kvm/kvm-cpu.c | 1 +
target/i386/kvm/kvm.c | 4 +
tests/qtest/pvpanic-test.c | 5 +-
9 files changed, 615 insertions(+), 7 deletions(-)
10 files changed, 617 insertions(+), 4 deletions(-)
diff --git a/hw/i386/fw_cfg.c b/hw/i386/fw_cfg.c
index c7aa39a13e..283c3f4c16 100644
--- a/hw/i386/fw_cfg.c
+++ b/hw/i386/fw_cfg.c
@@ -63,7 +63,7 @@ void fw_cfg_build_smbios(PCMachineState *pcms, FWCfgState *fw_cfg,
if (pcmc->smbios_defaults) {
/* These values are guest ABI, do not change */
- smbios_set_defaults("QEMU", mc->desc, mc->name,
+ smbios_set_defaults("Red Hat", "KVM", mc->desc,
pcmc->smbios_uuid_encoded,
pcmc->smbios_stream_product, pcmc->smbios_stream_version);
}
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 29b9964733..a1faa9e92c 100644
index 5c21b0c4db..4a154c1a9a 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -323,6 +323,161 @@ GlobalProperty pc_compat_2_0[] = {
@@ -326,6 +326,161 @@ GlobalProperty pc_compat_2_0[] = {
};
const size_t pc_compat_2_0_len = G_N_ELEMENTS(pc_compat_2_0);
@ -235,15 +252,15 @@ index 29b9964733..a1faa9e92c 100644
GSIState *pc_gsi_create(qemu_irq **irqs, bool pci_enabled)
{
GSIState *s;
@@ -1826,6 +1981,7 @@ static void pc_machine_class_init(ObjectClass *oc, void *data)
pcmc->kvmclock_create_always = true;
@@ -1813,6 +1968,7 @@ static void pc_machine_class_init(ObjectClass *oc, void *data)
pcmc->resizable_acpi_blob = true;
x86mc->apic_xrupt_override = true;
assert(!mc->get_hotplug_handler);
+ mc->async_pf_vmexit_disable = false;
mc->get_hotplug_handler = pc_get_hotplug_handler;
mc->hotplug_allowed = pc_hotplug_allowed;
mc->cpu_index_to_instance_props = x86_cpu_index_to_props;
@@ -1836,7 +1992,8 @@ static void pc_machine_class_init(ObjectClass *oc, void *data)
@@ -1823,7 +1979,8 @@ static void pc_machine_class_init(ObjectClass *oc, void *data)
mc->has_hotpluggable_cpus = true;
mc->default_boot_order = "cad";
mc->block_default_type = IF_IDE;
@ -254,10 +271,10 @@ index 29b9964733..a1faa9e92c 100644
mc->wakeup = pc_machine_wakeup;
hc->pre_plug = pc_machine_device_pre_plug_cb;
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 2a9f465619..44038391fb 100644
index 18ba076609..a647262d63 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -53,6 +53,7 @@
@@ -52,6 +52,7 @@
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "sysemu/xen.h"
@ -265,18 +282,7 @@ index 2a9f465619..44038391fb 100644
#ifdef CONFIG_XEN
#include <xen/hvm/hvm_info_table.h>
#include "hw/xen/xen_pt.h"
@@ -235,8 +236,8 @@ static void pc_init1(MachineState *machine,
if (pcmc->smbios_defaults) {
MachineClass *mc = MACHINE_GET_CLASS(machine);
/* These values are guest ABI, do not change */
- smbios_set_defaults("QEMU", mc->desc,
- mc->name, pcmc->smbios_legacy_mode,
+ smbios_set_defaults("Red Hat", "KVM",
+ mc->desc, pcmc->smbios_legacy_mode,
pcmc->smbios_uuid_encoded,
pcmc->smbios_stream_product,
pcmc->smbios_stream_version,
@@ -453,6 +454,7 @@ static void pc_set_south_bridge(Object *obj, int value, Error **errp)
@@ -422,6 +423,7 @@ static void pc_set_south_bridge(Object *obj, int value, Error **errp)
* hw_compat_*, pc_compat_*, or * pc_*_machine_options().
*/
@ -284,7 +290,7 @@ index 2a9f465619..44038391fb 100644
static void pc_compat_2_3_fn(MachineState *machine)
{
X86MachineState *x86ms = X86_MACHINE(machine);
@@ -970,3 +972,109 @@ static void xenfv_3_1_machine_options(MachineClass *m)
@@ -951,3 +953,110 @@ static void xenfv_3_1_machine_options(MachineClass *m)
DEFINE_PC_MACHINE(xenfv, "xenfv-3.1", pc_xen_hvm_init,
xenfv_3_1_machine_options);
#endif
@ -314,8 +320,7 @@ index 2a9f465619..44038391fb 100644
+
+static void pc_init_rhel760(MachineState *machine)
+{
+ pc_init1(machine, TYPE_I440FX_PCI_HOST_BRIDGE, \
+ TYPE_I440FX_PCI_DEVICE);
+ pc_init1(machine, TYPE_I440FX_PCI_DEVICE);
+}
+
+static void pc_machine_rhel760_options(MachineClass *m)
@ -339,6 +344,8 @@ index 2a9f465619..44038391fb 100644
+ pcmc->enforce_amd_1tb_hole = false;
+ /* From pc_i440fx_8_0_machine_options() */
+ pcmc->default_smbios_ep_type = SMBIOS_ENTRY_POINT_TYPE_32;
+ /* From pc_i440fx_8_1_machine_options() */
+ pcmc->broken_32bit_mem_addr_check = true;
+ /* Introduced in QEMU 8.2 */
+ pcmc->default_south_bridge = TYPE_PIIX3_DEVICE;
+
@ -395,21 +402,10 @@ index 2a9f465619..44038391fb 100644
+DEFINE_PC_MACHINE(rhel760, "pc-i440fx-rhel7.6.0", pc_init_rhel760,
+ pc_machine_rhel760_options);
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 912cb0c0dc..6387df97c8 100644
index c7bc8a2041..e872dc7e46 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -203,8 +203,8 @@ static void pc_q35_init(MachineState *machine)
if (pcmc->smbios_defaults) {
/* These values are guest ABI, do not change */
- smbios_set_defaults("QEMU", mc->desc,
- mc->name, pcmc->smbios_legacy_mode,
+ smbios_set_defaults("Red Hat", "KVM",
+ mc->desc, pcmc->smbios_legacy_mode,
pcmc->smbios_uuid_encoded,
pcmc->smbios_stream_product,
pcmc->smbios_stream_version,
@@ -363,6 +363,7 @@ static void pc_q35_init(MachineState *machine)
@@ -341,6 +341,7 @@ static void pc_q35_init(MachineState *machine)
DEFINE_PC_MACHINE(suffix, name, pc_init_##suffix, optionfn)
@ -417,7 +413,7 @@ index 912cb0c0dc..6387df97c8 100644
static void pc_q35_machine_options(MachineClass *m)
{
PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
@@ -699,3 +700,283 @@ static void pc_q35_2_4_machine_options(MachineClass *m)
@@ -693,3 +694,287 @@ static void pc_q35_2_4_machine_options(MachineClass *m)
DEFINE_Q35_MACHINE(v2_4, "pc-q35-2.4", NULL,
pc_q35_2_4_machine_options);
@ -444,6 +440,8 @@ index 912cb0c0dc..6387df97c8 100644
+ m->alias = "q35";
+ m->max_cpus = 710;
+ compat_props_add(m->compat_props, pc_rhel_compat, pc_rhel_compat_len);
+ compat_props_add(m->compat_props,
+ pc_q35_compat_defaults, pc_q35_compat_defaults_len);
+}
+
+static void pc_q35_init_rhel940(MachineState *machine)
@ -480,6 +478,8 @@ index 912cb0c0dc..6387df97c8 100644
+
+ /* From pc_q35_8_0_machine_options() */
+ pcmc->default_smbios_ep_type = SMBIOS_ENTRY_POINT_TYPE_32;
+ /* From pc_q35_8_1_machine_options() */
+ pcmc->broken_32bit_mem_addr_check = true;
+
+ compat_props_add(m->compat_props, hw_compat_rhel_9_4,
+ hw_compat_rhel_9_4_len);
@ -702,10 +702,10 @@ index 912cb0c0dc..6387df97c8 100644
+DEFINE_PC_MACHINE(q35_rhel760, "pc-q35-rhel7.6.0", pc_q35_init_rhel760,
+ pc_q35_machine_rhel760_options);
diff --git a/include/hw/boards.h b/include/hw/boards.h
index 4a21eddbf9..4edfdb0ddb 100644
index 0466f9d0f3..46b8725c41 100644
--- a/include/hw/boards.h
+++ b/include/hw/boards.h
@@ -277,6 +277,8 @@ struct MachineClass {
@@ -283,6 +283,8 @@ struct MachineClass {
strList *allowed_dynamic_sysbus_devices;
bool auto_enable_numa_with_memhp;
bool auto_enable_numa_with_memdev;
@ -715,12 +715,12 @@ index 4a21eddbf9..4edfdb0ddb 100644
bool smbus_no_migration_support;
bool nvdimm_supported;
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 037942d233..37644ede7e 100644
index ebd8f973f2..a984c951ad 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -314,6 +314,39 @@ extern const size_t pc_compat_1_4_len;
int pc_machine_kvm_type(MachineState *machine, const char *vm_type);
@@ -291,6 +291,39 @@ extern const size_t pc_compat_2_1_len;
extern GlobalProperty pc_compat_2_0[];
extern const size_t pc_compat_2_0_len;
+extern GlobalProperty pc_rhel_compat[];
+extern const size_t pc_rhel_compat_len;
@ -759,7 +759,7 @@ index 037942d233..37644ede7e 100644
static void pc_machine_##suffix##_class_init(ObjectClass *oc, void *data) \
{ \
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index cd16cb893d..93203d9b91 100644
index 33760a2ee1..be7b0663cd 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -2190,9 +2190,13 @@ static const CPUCaches epyc_genoa_cache_info = {
@ -925,10 +925,10 @@ index 9c791b7b05..b91af5051f 100644
};
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 4ce80555b4..9d41edf01e 100644
index e68cbe9293..739f33db47 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -3711,6 +3711,7 @@ static int kvm_get_msrs(X86CPU *cpu)
@@ -3715,6 +3715,7 @@ static int kvm_get_msrs(X86CPU *cpu)
struct kvm_msr_entry *msrs = cpu->kvm_msr_buf->entries;
int ret, i;
uint64_t mtrr_top_bits;
@ -936,7 +936,7 @@ index 4ce80555b4..9d41edf01e 100644
kvm_msr_buf_reset(cpu);
@@ -4065,6 +4066,9 @@ static int kvm_get_msrs(X86CPU *cpu)
@@ -4069,6 +4070,9 @@ static int kvm_get_msrs(X86CPU *cpu)
break;
case MSR_KVM_ASYNC_PF_EN:
env->async_pf_en_msr = msrs[i].data;

@ -1,4 +1,4 @@
From 66a0510405e5142a1f9e38e0770aa0f10aed3e03 Mon Sep 17 00:00:00 2001
From 5768cf6811842e5c59da3b752f60659a9d6b5ba1 Mon Sep 17 00:00:00 2001
From: Miroslav Rezanina <mrezanin@redhat.com>
Date: Wed, 2 Sep 2020 09:39:41 +0200
Subject: Enable make check
@ -63,10 +63,10 @@ Merged patches (8.1.0):
13 files changed, 33 insertions(+), 30 deletions(-)
diff --git a/tests/avocado/replay_kernel.py b/tests/avocado/replay_kernel.py
index c37afa662c..61c95a2198 100644
index 10d99403a4..c3422ea1e4 100644
--- a/tests/avocado/replay_kernel.py
+++ b/tests/avocado/replay_kernel.py
@@ -153,7 +153,7 @@ def test_aarch64_virt(self):
@@ -166,7 +166,7 @@ def test_aarch64_virt(self):
"""
:avocado: tags=arch:aarch64
:avocado: tags=machine:virt
@ -76,7 +76,7 @@ index c37afa662c..61c95a2198 100644
kernel_url = ('https://archives.fedoraproject.org/pub/archive/fedora'
'/linux/releases/29/Everything/aarch64/os/images/pxeboot'
diff --git a/tests/avocado/reverse_debugging.py b/tests/avocado/reverse_debugging.py
index 4cce5a5598..e9248a04a2 100644
index 92855a02a5..87822074b6 100644
--- a/tests/avocado/reverse_debugging.py
+++ b/tests/avocado/reverse_debugging.py
@@ -230,7 +230,7 @@ def test_aarch64_virt(self):
@ -120,7 +120,7 @@ index 15fd87b2c1..f0d9d89c93 100644
kernel_path = self._grab_aarch64_kernel()
kernel_command_line = (self.KERNEL_COMMON_COMMAND_LINE +
diff --git a/tests/qemu-iotests/meson.build b/tests/qemu-iotests/meson.build
index 53847cb98f..a2abdb650e 100644
index fad340ad59..3c0d5241f6 100644
--- a/tests/qemu-iotests/meson.build
+++ b/tests/qemu-iotests/meson.build
@@ -51,21 +51,21 @@ foreach format, speed: qemu_iotests_formats
@ -163,7 +163,7 @@ index 53847cb98f..a2abdb650e 100644
+# endforeach
endforeach
diff --git a/tests/qemu-iotests/testenv.py b/tests/qemu-iotests/testenv.py
index 3ff38f2661..cab9a2bd6c 100644
index 588f30a4f1..3929a3634f 100644
--- a/tests/qemu-iotests/testenv.py
+++ b/tests/qemu-iotests/testenv.py
@@ -244,6 +244,9 @@ def __init__(self, source_dir: str, build_dir: str,
@ -216,7 +216,7 @@ index 663bb6c485..2efc43e3f7 100644
"-device intel-hda,id=" HDA_ID CODEC_DEVICES);
diff --git a/tests/qtest/libqos/meson.build b/tests/qtest/libqos/meson.build
index 90aae42a22..9bc4e41af0 100644
index 3aed6efcb8..119613237e 100644
--- a/tests/qtest/libqos/meson.build
+++ b/tests/qtest/libqos/meson.build
@@ -44,7 +44,7 @@ libqos_srcs = files(
@ -242,10 +242,10 @@ index 8ac95b89f7..cd2102555c 100644
qtest_outl(s, 0xcf8, 0x8000f840); /* PMBASE */
diff --git a/tests/qtest/meson.build b/tests/qtest/meson.build
index 47dabf91d0..0bdfa3a821 100644
index 36c5c13a7b..a2887d6057 100644
--- a/tests/qtest/meson.build
+++ b/tests/qtest/meson.build
@@ -97,7 +97,6 @@ qtests_i386 = \
@@ -101,7 +101,6 @@ qtests_i386 = \
'drive_del-test',
'tco-test',
'cpu-plug-test',
@ -254,7 +254,7 @@ index 47dabf91d0..0bdfa3a821 100644
'migration-test',
'test-x86-cpuid-compat',
diff --git a/tests/qtest/virtio-net-failover.c b/tests/qtest/virtio-net-failover.c
index 0d40bc1f2d..4c633c1584 100644
index 73dfabc272..a9dd304781 100644
--- a/tests/qtest/virtio-net-failover.c
+++ b/tests/qtest/virtio-net-failover.c
@@ -26,6 +26,7 @@

@ -1,4 +1,4 @@
From a27cfa0b407bd806ce389a7c69d0130bcfd35244 Mon Sep 17 00:00:00 2001
From e06a905d726fc20ea6bd95dff1bd0ffe97ebb202 Mon Sep 17 00:00:00 2001
From: Bandan Das <bsd@redhat.com>
Date: Tue, 3 Dec 2013 20:05:13 +0100
Subject: vfio: cap number of devices that can be assigned
@ -27,7 +27,7 @@ to 64 as some usecases require more than 32 devices.
Signed-off-by: Bandan Das <bsd@redhat.com>
Rebase changes (231025):
Rebase changes (8.2.0):
- Update to upstream changes
---
hw/vfio/pci.c | 31 ++++++++++++++++++++++++++++++-
@ -35,10 +35,10 @@ Rebase changes (231025):
2 files changed, 31 insertions(+), 1 deletion(-)
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index c62c02f7b6..ec98080f28 100644
index 64780d1b79..57ac63c10c 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -48,6 +48,9 @@
@@ -50,6 +50,9 @@
/* Protected by BQL */
static KVMRouteChange vfio_route_change;
@ -48,15 +48,14 @@ index c62c02f7b6..ec98080f28 100644
static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
static void vfio_msi_disable_common(VFIOPCIDevice *vdev);
@@ -3076,14 +3079,37 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
{
@@ -2946,13 +2949,36 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
ERRP_GUARD();
VFIOPCIDevice *vdev = VFIO_PCI(pdev);
VFIODevice *vbasedev = &vdev->vbasedev;
+ VFIODevice *vbasedev_iter;
+ VFIOGroup *group;
char *tmp, *subsys;
Error *err = NULL;
struct stat st;
- int i, ret;
+ int ret, i = 0;
bool is_mdev;
@ -84,10 +83,10 @@ index c62c02f7b6..ec98080f28 100644
+ return;
+ }
+
if (!vbasedev->sysfsdev) {
if (vbasedev->fd < 0 && !vbasedev->sysfsdev) {
if (!(~vdev->host.domain || ~vdev->host.bus ||
~vdev->host.slot || ~vdev->host.function)) {
@@ -3501,6 +3527,9 @@ static Property vfio_pci_dev_properties[] = {
@@ -3370,6 +3396,9 @@ static Property vfio_pci_dev_properties[] = {
DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),
DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice,
no_geforce_quirks, false),
@ -98,7 +97,7 @@ index c62c02f7b6..ec98080f28 100644
false),
DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd,
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index fba8737ab2..eb74d9de2d 100644
index 6e64a2654e..b7de39c010 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -142,6 +142,7 @@ struct VFIOPCIDevice {

@ -1,4 +1,4 @@
From 424f14d123fe1043518758605d94ed5ba50e52ad Mon Sep 17 00:00:00 2001
From b467dc6a24ef41fa574260429807711f6802a54d Mon Sep 17 00:00:00 2001
From: Eduardo Habkost <ehabkost@redhat.com>
Date: Wed, 4 Dec 2013 18:53:17 +0100
Subject: Add support statement to -help output
@ -21,10 +21,10 @@ Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
1 file changed, 9 insertions(+)
diff --git a/system/vl.c b/system/vl.c
index 2bcd9efb9a..93635ffc5b 100644
index c644222982..03c3b0aa94 100644
--- a/system/vl.c
+++ b/system/vl.c
@@ -870,9 +870,17 @@ static void version(void)
@@ -869,9 +869,17 @@ static void version(void)
QEMU_COPYRIGHT "\n");
}
@ -42,7 +42,7 @@ index 2bcd9efb9a..93635ffc5b 100644
printf("usage: %s [options] [disk_image]\n\n"
"'disk_image' is a raw hard disk image for IDE hard disk 0\n\n",
g_get_prgname());
@@ -898,6 +906,7 @@ static void help(int exitcode)
@@ -897,6 +905,7 @@ static void help(int exitcode)
"\n"
QEMU_HELP_BOTTOM "\n");

@ -1,4 +1,4 @@
From c683ff4a770b77dbe707413840918a46f67fa825 Mon Sep 17 00:00:00 2001
From 20cc3a6d9bce3e40d165f865b5e398c300cae7bf Mon Sep 17 00:00:00 2001
From: Miroslav Rezanina <mrezanin@redhat.com>
Date: Wed, 8 Jul 2020 08:35:50 +0200
Subject: Use qemu-kvm in documentation instead of qemu-system-<arch>
@ -36,10 +36,10 @@ index 52d6454b93..d74dbdeca9 100644
.. |I2C| replace:: I\ :sup:`2`\ C
.. |I2S| replace:: I\ :sup:`2`\ S
diff --git a/qemu-options.hx b/qemu-options.hx
index 42fd09e4de..557118cb1f 100644
index 8ce85d4559..4fc27ee2e2 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -3469,11 +3469,11 @@ SRST
@@ -3493,11 +3493,11 @@ SRST
::

@ -1,4 +1,4 @@
From 776bff1be5e98982a9bbc8345ff27274ff5b8c0f Mon Sep 17 00:00:00 2001
From 2f9fdd21ecf2810d0d83a8125ce0cc1e75dbb13a Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Fri, 20 Aug 2021 18:25:12 +0200
Subject: qcow2: Deprecation warning when opening v2 images rw
@ -44,7 +44,7 @@ Rebase notes (6.1.0):
2 files changed, 7 insertions(+)
diff --git a/block/qcow2.c b/block/qcow2.c
index 13e032bd5e..7968735346 100644
index 956128b409..0e8b2f7518 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1358,6 +1358,12 @@ qcow2_do_open(BlockDriverState *bs, QDict *options, int flags,

@ -0,0 +1,121 @@
From 59470e8ab849f22b407f55292e540e16a8cad01a Mon Sep 17 00:00:00 2001
From: Miroslav Rezanina <mrezanin@redhat.com>
Date: Wed, 20 Mar 2024 05:34:32 -0400
Subject: Add upstream compatibility bits
Adding new compats structure for changes introduced during rebase to QEMU 9.0.0.
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
---
Rebase notes (9.0.0 rc2):
- Add aw-bits setting for aarch64 compat record (overwritten for 9.4 and older)
---
hw/arm/virt.c | 3 +++
hw/core/machine.c | 10 ++++++++++
hw/i386/pc_piix.c | 3 ++-
hw/i386/pc_q35.c | 3 +++
hw/s390x/s390-virtio-ccw.c | 1 +
include/hw/boards.h | 3 +++
6 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 22bc345137..f1af9495c6 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -144,6 +144,8 @@ GlobalProperty arm_rhel_compat[] = {
{"virtio-net-pci", "romfile", "" },
{"virtio-net-pci-transitional", "romfile", "" },
{"virtio-net-pci-non-transitional", "romfile", "" },
+ /* arm_rhel_compat from arm_virt_compat, added for 9.0.0 rebase */
+ { TYPE_VIRTIO_IOMMU_PCI, "aw-bits", "48" },
};
const size_t arm_rhel_compat_len = G_N_ELEMENTS(arm_rhel_compat);
@@ -3728,6 +3730,7 @@ type_init(rhel_machine_init);
static void rhel940_virt_options(MachineClass *mc)
{
+ compat_props_add(mc->compat_props, hw_compat_rhel_9_5, hw_compat_rhel_9_5_len);
}
DEFINE_RHEL_MACHINE_AS_LATEST(9, 4, 0)
diff --git a/hw/core/machine.c b/hw/core/machine.c
index 695cb89a46..0f256d9633 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -302,6 +302,16 @@ const size_t hw_compat_2_1_len = G_N_ELEMENTS(hw_compat_2_1);
const char *rhel_old_machine_deprecation =
"machine types for previous major releases are deprecated";
+GlobalProperty hw_compat_rhel_9_5[] = {
+ /* hw_compat_rhel_9_5 from hw_compat_8_2 */
+ { "migration", "zero-page-detection", "legacy"},
+ /* hw_compat_rhel_9_5 from hw_compat_8_2 */
+ { TYPE_VIRTIO_IOMMU_PCI, "granule", "4k" },
+ /* hw_compat_rhel_9_5 from hw_compat_8_2 */
+ { TYPE_VIRTIO_IOMMU_PCI, "aw-bits", "64" },
+};
+const size_t hw_compat_rhel_9_5_len = G_N_ELEMENTS(hw_compat_rhel_9_5);
+
GlobalProperty hw_compat_rhel_9_4[] = {
/* hw_compat_rhel_9_4 from hw_compat_8_0 */
{ TYPE_VIRTIO_NET, "host_uso", "off"},
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index a647262d63..6b260682eb 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -1015,7 +1015,8 @@ static void pc_machine_rhel760_options(MachineClass *m)
object_class_property_set_description(oc, "x-south-bridge",
"Use a different south bridge than PIIX3");
-
+ compat_props_add(m->compat_props, hw_compat_rhel_9_5,
+ hw_compat_rhel_9_5_len);
compat_props_add(m->compat_props, hw_compat_rhel_9_4,
hw_compat_rhel_9_4_len);
compat_props_add(m->compat_props, hw_compat_rhel_9_3,
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index e872dc7e46..2b54944c0f 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -733,6 +733,9 @@ static void pc_q35_machine_rhel940_options(MachineClass *m)
m->desc = "RHEL-9.4.0 PC (Q35 + ICH9, 2009)";
pcmc->smbios_stream_product = "RHEL";
pcmc->smbios_stream_version = "9.4.0";
+
+ compat_props_add(m->compat_props, hw_compat_rhel_9_5,
+ hw_compat_rhel_9_5_len);
}
DEFINE_PC_MACHINE(q35_rhel940, "pc-q35-rhel9.4.0", pc_q35_init_rhel940,
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index ff753a29e0..9ad54682c6 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -1282,6 +1282,7 @@ static void ccw_machine_rhel940_instance_options(MachineState *machine)
static void ccw_machine_rhel940_class_options(MachineClass *mc)
{
+ compat_props_add(mc->compat_props, hw_compat_rhel_9_5, hw_compat_rhel_9_5_len);
}
DEFINE_CCW_MACHINE(rhel940, "rhel9.4.0", true);
diff --git a/include/hw/boards.h b/include/hw/boards.h
index 46b8725c41..cca62f906b 100644
--- a/include/hw/boards.h
+++ b/include/hw/boards.h
@@ -514,6 +514,9 @@ extern const size_t hw_compat_2_2_len;
extern GlobalProperty hw_compat_2_1[];
extern const size_t hw_compat_2_1_len;
+extern GlobalProperty hw_compat_rhel_9_5[];
+extern const size_t hw_compat_rhel_9_5_len;
+
extern GlobalProperty hw_compat_rhel_9_4[];
extern const size_t hw_compat_rhel_9_4_len;
--
2.39.3

@ -1,44 +0,0 @@
From 3b9b38339346ebfaf3e8ddf0822eba1cc9e78408 Mon Sep 17 00:00:00 2001
From: Miroslav Rezanina <mrezanin@redhat.com>
Date: Thu, 14 Dec 2023 04:42:01 -0500
Subject: Introduce RHEL 9.4.0 qemu-kvm machine type for aarch64
Jira: https://issues.redhat.com/browse/RHEL-17168
Adding new machine type to support enabling new features.
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
---
hw/arm/virt.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index c541efee5e..0b17c94ad7 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -3630,14 +3630,21 @@ static void rhel_machine_init(void)
}
type_init(rhel_machine_init);
+static void rhel940_virt_options(MachineClass *mc)
+{
+}
+DEFINE_RHEL_MACHINE_AS_LATEST(9, 4, 0)
+
static void rhel920_virt_options(MachineClass *mc)
{
+ rhel940_virt_options(mc);
+
compat_props_add(mc->compat_props, arm_rhel_compat, arm_rhel_compat_len);
compat_props_add(mc->compat_props, hw_compat_rhel_9_4, hw_compat_rhel_9_4_len);
compat_props_add(mc->compat_props, hw_compat_rhel_9_3, hw_compat_rhel_9_3_len);
compat_props_add(mc->compat_props, hw_compat_rhel_9_2, hw_compat_rhel_9_2_len);
}
-DEFINE_RHEL_MACHINE_AS_LATEST(9, 2, 0)
+DEFINE_RHEL_MACHINE(9, 2, 0)
static void rhel900_virt_options(MachineClass *mc)
{
--
2.39.3

@ -0,0 +1,30 @@
From ba574acacf679850e337ec2d5e7836b8277cf393 Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@redhat.com>
Date: Thu, 18 Apr 2024 15:04:28 +0200
Subject: x86: rhel 9.4.0 machine type compat fix
Fix up the compatibility for 9.4.0. Ensure that pc-q35-rhel9.4.0
still uses SMBIOS 3.X by default.
Signed-off-by: Sebastian Ott <sebott@redhat.com>
---
hw/i386/pc_q35.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 2b54944c0f..2f11f9af7d 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -734,6 +734,9 @@ static void pc_q35_machine_rhel940_options(MachineClass *m)
pcmc->smbios_stream_product = "RHEL";
pcmc->smbios_stream_version = "9.4.0";
+ /* From pc_q35_8_2_machine_options() - use SMBIOS 3.X by default */
+ pcmc->default_smbios_ep_type = SMBIOS_ENTRY_POINT_TYPE_64;
+
compat_props_add(m->compat_props, hw_compat_rhel_9_5,
hw_compat_rhel_9_5_len);
}
--
2.39.3

@ -1,37 +0,0 @@
From 363d6aedc82314a70bdfbe9fa23b7e8fdda50138 Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Thu, 11 Jan 2024 12:26:19 -0500
Subject: [PATCH 066/101] Compile IOMMUFD object on aarch64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [65/67] 9358030fdd499c5fe122dee3bb4f114966fac9c2 (eauger1/centos-qemu-kvm)
Upstream: RHEL only
Compiles the IOMMUFD object on aarch64 to be able to use
the IOMMUFD VFIO backend.
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
configs/devices/aarch64-softmmu/aarch64-rh-devices.mak | 1 +
1 file changed, 1 insertion(+)
diff --git a/configs/devices/aarch64-softmmu/aarch64-rh-devices.mak b/configs/devices/aarch64-softmmu/aarch64-rh-devices.mak
index aec1831199..b0191d3c69 100644
--- a/configs/devices/aarch64-softmmu/aarch64-rh-devices.mak
+++ b/configs/devices/aarch64-softmmu/aarch64-rh-devices.mak
@@ -39,3 +39,4 @@ CONFIG_PXB=y
CONFIG_VHOST_VSOCK=y
CONFIG_VHOST_USER_VSOCK=y
CONFIG_VHOST_USER_FS=y
+CONFIG_IOMMUFD=y
--
2.39.3

@ -1,37 +0,0 @@
From c1e9ddf8d0ea6d358fcaa5cacd3a91920f36e73b Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Thu, 11 Jan 2024 12:33:17 -0500
Subject: [PATCH 067/101] Compile IOMMUFD on s390x
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [66/67] d3004aafca2bb76d817ac99c3d65973b8fbd4557 (eauger1/centos-qemu-kvm)
Upstream: RHEL only
Compiles the IOMMUFD object on s390x to be able to use
the IOMMUFD VFIO backend.
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
configs/devices/s390x-softmmu/s390x-rh-devices.mak | 1 +
1 file changed, 1 insertion(+)
diff --git a/configs/devices/s390x-softmmu/s390x-rh-devices.mak b/configs/devices/s390x-softmmu/s390x-rh-devices.mak
index 69a799adbd..24cf6dbd03 100644
--- a/configs/devices/s390x-softmmu/s390x-rh-devices.mak
+++ b/configs/devices/s390x-softmmu/s390x-rh-devices.mak
@@ -16,3 +16,4 @@ CONFIG_WDT_DIAG288=y
CONFIG_VHOST_VSOCK=y
CONFIG_VHOST_USER_VSOCK=y
CONFIG_VHOST_USER_FS=y
+CONFIG_IOMMUFD=y
--
2.39.3

@ -1,37 +0,0 @@
From be2c3d9bbee1bdec061c901f507bc999fa40a53e Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Thu, 11 Jan 2024 12:34:44 -0500
Subject: [PATCH 068/101] Compile IOMMUFD on x86_64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [67/67] 411d48a5cc7ce1f05be793fd9a89c143ce34c91a (eauger1/centos-qemu-kvm)
Upstream: RHEL only
Compiles the IOMMUFD object on x86_64 to be able to use
the IOMMUFD VFIO backend.
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
configs/devices/x86_64-softmmu/x86_64-rh-devices.mak | 1 +
1 file changed, 1 insertion(+)
diff --git a/configs/devices/x86_64-softmmu/x86_64-rh-devices.mak b/configs/devices/x86_64-softmmu/x86_64-rh-devices.mak
index ce5be73633..ba41108e0c 100644
--- a/configs/devices/x86_64-softmmu/x86_64-rh-devices.mak
+++ b/configs/devices/x86_64-softmmu/x86_64-rh-devices.mak
@@ -108,3 +108,4 @@ CONFIG_SGX=y
CONFIG_VHOST_VSOCK=y
CONFIG_VHOST_USER_VSOCK=y
CONFIG_VHOST_USER_FS=y
+CONFIG_IOMMUFD=y
--
2.39.3

@ -1,128 +0,0 @@
From ef212eb3026a2460f6502a76afa3baedeb74d975 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= <marcandre.lureau@redhat.com>
Date: Tue, 6 Aug 2024 14:14:27 +0400
Subject: [PATCH 1/5] Fix scanout version with pc-q35-rhel9.4.0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Marc-André Lureau <marcandre.lureau@redhat.com>
RH-MergeRequest: 383: Fix scanout version with pc-q35-rhel9.4.0
RH-Jira: RHEL-53565
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [1/1] ed2860cfb933654b0046c1b59f78d2e50146bd6c
Resolves: https://issues.redhat.com/browse/RHEL-52940
Introduce hw_compat_rhel_9_4_extra so that pc-q35-rhel9.4.0 has
x-scanout-vmstate-version=1
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
---
hw/arm/virt.c | 1 +
hw/core/machine.c | 13 +++++++++++--
hw/i386/pc_piix.c | 2 ++
hw/i386/pc_q35.c | 3 +++
hw/s390x/s390-virtio-ccw.c | 1 +
include/hw/boards.h | 3 +++
6 files changed, 21 insertions(+), 2 deletions(-)
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index e4a66affcb..851eff6b28 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -3670,6 +3670,7 @@ type_init(rhel_machine_init);
static void rhel940_virt_options(MachineClass *mc)
{
+ compat_props_add(mc->compat_props, hw_compat_rhel_9_4_extra, hw_compat_rhel_9_4_extra_len);
}
DEFINE_RHEL_MACHINE_AS_LATEST(9, 4, 0)
diff --git a/hw/core/machine.c b/hw/core/machine.c
index c8c460c916..fe0a28ef3b 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -64,6 +64,17 @@ const size_t hw_compat_7_2_len = G_N_ELEMENTS(hw_compat_7_2);
const char *rhel_old_machine_deprecation =
"machine types for previous major releases are deprecated";
+/*
+ * rhel_9_4_extra is used for <= 9.4 machines.
+ *
+ * (for compatibility and historical reasons, rhel_9_4 is used for <= 9.2 too)
+ */
+GlobalProperty hw_compat_rhel_9_4_extra[] = {
+ /* hw_compat_rhel_9_4_extra from hw_compat_8_2 */
+ { "virtio-gpu-device", "x-scanout-vmstate-version", "1" },
+};
+const size_t hw_compat_rhel_9_4_extra_len = G_N_ELEMENTS(hw_compat_rhel_9_4_extra);
+
GlobalProperty hw_compat_rhel_9_4[] = {
/* hw_compat_rhel_9_4 from hw_compat_8_0 */
{ TYPE_VIRTIO_NET, "host_uso", "off"},
@@ -81,8 +92,6 @@ GlobalProperty hw_compat_rhel_9_4[] = {
{ "igb", "x-pcie-flr-init", "off" },
/* hw_compat_rhel_9_4 jira RHEL-24045 */
{ "virtio-mem", "dynamic-memslots", "off" },
- /* hw_compat_rhel_9_4 from hw_compat_8_1 */
- { "virtio-gpu-device", "x-scanout-vmstate-version", "1" },
};
const size_t hw_compat_rhel_9_4_len = G_N_ELEMENTS(hw_compat_rhel_9_4);
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 54d1c58bce..c846673d87 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -1031,6 +1031,8 @@ static void pc_machine_rhel760_options(MachineClass *m)
"Use a different south bridge than PIIX3");
+ compat_props_add(m->compat_props, hw_compat_rhel_9_4_extra,
+ hw_compat_rhel_9_4_extra_len);
compat_props_add(m->compat_props, hw_compat_rhel_9_4,
hw_compat_rhel_9_4_len);
compat_props_add(m->compat_props, hw_compat_rhel_9_3,
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index cd5fb7380e..02bc3d515f 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -731,6 +731,9 @@ static void pc_q35_machine_rhel940_options(MachineClass *m)
m->desc = "RHEL-9.4.0 PC (Q35 + ICH9, 2009)";
pcmc->smbios_stream_product = "RHEL";
pcmc->smbios_stream_version = "9.4.0";
+
+ compat_props_add(m->compat_props, hw_compat_rhel_9_4_extra,
+ hw_compat_rhel_9_4_extra_len);
}
DEFINE_PC_MACHINE(q35_rhel940, "pc-q35-rhel9.4.0", pc_q35_init_rhel940,
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index 24f4773179..cc6087c37a 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -1277,6 +1277,7 @@ static void ccw_machine_rhel940_instance_options(MachineState *machine)
static void ccw_machine_rhel940_class_options(MachineClass *mc)
{
+ compat_props_add(mc->compat_props, hw_compat_rhel_9_4_extra, hw_compat_rhel_9_4_extra_len);
}
DEFINE_CCW_MACHINE(rhel940, "rhel9.4.0", true);
diff --git a/include/hw/boards.h b/include/hw/boards.h
index 4edfdb0ddb..e6ae0e61cf 100644
--- a/include/hw/boards.h
+++ b/include/hw/boards.h
@@ -505,6 +505,9 @@ extern const size_t hw_compat_2_2_len;
extern GlobalProperty hw_compat_2_1[];
extern const size_t hw_compat_2_1_len;
+extern GlobalProperty hw_compat_rhel_9_4_extra[];
+extern const size_t hw_compat_rhel_9_4_extra_len;
+
extern GlobalProperty hw_compat_rhel_9_4[];
extern const size_t hw_compat_rhel_9_4_len;
--
2.39.3

@ -0,0 +1,139 @@
From 93ea86ac8849ad9ca365b1646313dde9a34ba59c Mon Sep 17 00:00:00 2001
From: Xiaoyao Li <xiaoyao.li@intel.com>
Date: Wed, 20 Mar 2024 03:39:03 -0500
Subject: [PATCH 031/100] HostMem: Add mechanism to opt in kvm guest memfd via
MachineState
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [31/91] 43ce32aef954479cdb736301d1adcb919602c321 (bonzini/rhel-qemu-kvm)
Add a new member "guest_memfd" to memory backends. When it's set
to true, it enables RAM_GUEST_MEMFD in ram_flags, thus private kvm
guest_memfd will be allocated during RAMBlock allocation.
Memory backend's @guest_memfd is wired with the @require_guest_memfd
field of MachineState. This avoids looking up the machine in physmem.c.
MachineState::require_guest_memfd is supposed to be set by any VM
that requires KVM guest memfd as private memory, e.g., a TDX VM.
Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Message-ID: <20240320083945.991426-8-michael.roth@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit 37662d85b0b7dded0ebdf6747bef6c3bb7ed6a0c)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
backends/hostmem-file.c | 1 +
backends/hostmem-memfd.c | 1 +
backends/hostmem-ram.c | 1 +
backends/hostmem.c | 1 +
hw/core/machine.c | 5 +++++
include/hw/boards.h | 2 ++
include/sysemu/hostmem.h | 1 +
7 files changed, 12 insertions(+)
diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c
index ac3e433cbd..3c69db7946 100644
--- a/backends/hostmem-file.c
+++ b/backends/hostmem-file.c
@@ -85,6 +85,7 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
ram_flags |= fb->readonly ? RAM_READONLY_FD : 0;
ram_flags |= fb->rom == ON_OFF_AUTO_ON ? RAM_READONLY : 0;
ram_flags |= backend->reserve ? 0 : RAM_NORESERVE;
+ ram_flags |= backend->guest_memfd ? RAM_GUEST_MEMFD : 0;
ram_flags |= fb->is_pmem ? RAM_PMEM : 0;
ram_flags |= RAM_NAMED_FILE;
return memory_region_init_ram_from_file(&backend->mr, OBJECT(backend), name,
diff --git a/backends/hostmem-memfd.c b/backends/hostmem-memfd.c
index 3923ea9364..745ead0034 100644
--- a/backends/hostmem-memfd.c
+++ b/backends/hostmem-memfd.c
@@ -55,6 +55,7 @@ memfd_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
name = host_memory_backend_get_name(backend);
ram_flags = backend->share ? RAM_SHARED : 0;
ram_flags |= backend->reserve ? 0 : RAM_NORESERVE;
+ ram_flags |= backend->guest_memfd ? RAM_GUEST_MEMFD : 0;
return memory_region_init_ram_from_fd(&backend->mr, OBJECT(backend), name,
backend->size, ram_flags, fd, 0, errp);
}
diff --git a/backends/hostmem-ram.c b/backends/hostmem-ram.c
index d121249f0f..f7d81af783 100644
--- a/backends/hostmem-ram.c
+++ b/backends/hostmem-ram.c
@@ -30,6 +30,7 @@ ram_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
name = host_memory_backend_get_name(backend);
ram_flags = backend->share ? RAM_SHARED : 0;
ram_flags |= backend->reserve ? 0 : RAM_NORESERVE;
+ ram_flags |= backend->guest_memfd ? RAM_GUEST_MEMFD : 0;
return memory_region_init_ram_flags_nomigrate(&backend->mr, OBJECT(backend),
name, backend->size,
ram_flags, errp);
diff --git a/backends/hostmem.c b/backends/hostmem.c
index 81a72ce40b..eb9682b4a8 100644
--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -277,6 +277,7 @@ static void host_memory_backend_init(Object *obj)
/* TODO: convert access to globals to compat properties */
backend->merge = machine_mem_merge(machine);
backend->dump = machine_dump_guest_core(machine);
+ backend->guest_memfd = machine_require_guest_memfd(machine);
backend->reserve = true;
backend->prealloc_threads = machine->smp.cpus;
}
diff --git a/hw/core/machine.c b/hw/core/machine.c
index 92609aae27..07b994e136 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -1480,6 +1480,11 @@ bool machine_mem_merge(MachineState *machine)
return machine->mem_merge;
}
+bool machine_require_guest_memfd(MachineState *machine)
+{
+ return machine->require_guest_memfd;
+}
+
static char *cpu_slot_to_string(const CPUArchId *cpu)
{
GString *s = g_string_new(NULL);
diff --git a/include/hw/boards.h b/include/hw/boards.h
index cca62f906b..815a1c4b26 100644
--- a/include/hw/boards.h
+++ b/include/hw/boards.h
@@ -36,6 +36,7 @@ bool machine_usb(MachineState *machine);
int machine_phandle_start(MachineState *machine);
bool machine_dump_guest_core(MachineState *machine);
bool machine_mem_merge(MachineState *machine);
+bool machine_require_guest_memfd(MachineState *machine);
HotpluggableCPUList *machine_query_hotpluggable_cpus(MachineState *machine);
void machine_set_cpu_numa_node(MachineState *machine,
const CpuInstanceProperties *props,
@@ -372,6 +373,7 @@ struct MachineState {
char *dt_compatible;
bool dump_guest_core;
bool mem_merge;
+ bool require_guest_memfd;
bool usb;
bool usb_disabled;
char *firmware;
diff --git a/include/sysemu/hostmem.h b/include/sysemu/hostmem.h
index 0e411aaa29..04b884bf42 100644
--- a/include/sysemu/hostmem.h
+++ b/include/sysemu/hostmem.h
@@ -74,6 +74,7 @@ struct HostMemoryBackend {
uint64_t size;
bool merge, dump, use_canonical_path;
bool prealloc, is_mapped, share, reserve;
+ bool guest_memfd;
uint32_t prealloc_threads;
ThreadContext *prealloc_context;
DECLARE_BITMAP(host_nodes, MAX_NODES + 1);
--
2.39.3

@ -1,155 +0,0 @@
From 5c639f8ce65183ce8e44ee8e0230e9d627a440d7 Mon Sep 17 00:00:00 2001
From: Igor Mammedov <imammedo@redhat.com>
Date: Wed, 21 Feb 2024 17:00:27 +0000
Subject: [PATCH 05/20] Implement SMBIOS type 9 v2.6
RH-Author: Igor Mammedov <imammedo@redhat.com>
RH-MergeRequest: 230: Workaround Windows failing to find 64bit SMBIOS entry point with SeaBIOS
RH-Jira: RHEL-21705
RH-Acked-by: MST <mst@redhat.com>
RH-Acked-by: Ani Sinha <None>
RH-Commit: [3/18] ead230527d93938907a561cf5b985ee4f54d82b1
JIRA: https://issues.redhat.com/browse/RHEL-21705
Author: Felix Wu <flwu@google.com>
Signed-off-by: Felix Wu <flwu@google.com>
Signed-off-by: Nabih Estefan <nabihestefan@google.com>
Message-Id: <20240221170027.1027325-3-nabihestefan@google.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
(cherry picked from commit 04f143d828845d0fd52dd4a52664d81a4f5431f7)
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
---
hw/smbios/smbios.c | 49 +++++++++++++++++++++++++++++++++---
include/hw/firmware/smbios.h | 4 +++
qemu-options.hx | 2 +-
3 files changed, 51 insertions(+), 4 deletions(-)
diff --git a/hw/smbios/smbios.c b/hw/smbios/smbios.c
index 4f5637d445..074705fa4c 100644
--- a/hw/smbios/smbios.c
+++ b/hw/smbios/smbios.c
@@ -124,7 +124,7 @@ static QTAILQ_HEAD(, type8_instance) type8 = QTAILQ_HEAD_INITIALIZER(type8);
/* type 9 instance for parsing */
struct type9_instance {
- const char *slot_designation;
+ const char *slot_designation, *pcidev;
uint8_t slot_type, slot_data_bus_width, current_usage, slot_length,
slot_characteristics1, slot_characteristics2;
uint16_t slot_id;
@@ -427,6 +427,11 @@ static const QemuOptDesc qemu_smbios_type9_opts[] = {
.type = QEMU_OPT_NUMBER,
.help = "slot characteristics2, see the spec",
},
+ {
+ .name = "pci_device",
+ .type = QEMU_OPT_STRING,
+ .help = "PCI device, if provided."
+ }
};
static const QemuOptDesc qemu_smbios_type11_opts[] = {
@@ -851,7 +856,7 @@ static void smbios_build_type_8_table(void)
}
}
-static void smbios_build_type_9_table(void)
+static void smbios_build_type_9_table(Error **errp)
{
unsigned instance = 0;
struct type9_instance *t9;
@@ -868,6 +873,43 @@ static void smbios_build_type_9_table(void)
t->slot_characteristics1 = t9->slot_characteristics1;
t->slot_characteristics2 = t9->slot_characteristics2;
+ if (t9->pcidev) {
+ PCIDevice *pdev = NULL;
+ int rc = pci_qdev_find_device(t9->pcidev, &pdev);
+ if (rc != 0) {
+ error_setg(errp,
+ "No PCI device %s for SMBIOS type 9 entry %s",
+ t9->pcidev, t9->slot_designation);
+ return;
+ }
+ /*
+ * We only handle the case were the device is attached to
+ * the PCI root bus. The general case is more complex as
+ * bridges are enumerated later and the table would need
+ * to be updated at this moment.
+ */
+ if (!pci_bus_is_root(pci_get_bus(pdev))) {
+ error_setg(errp,
+ "Cannot create type 9 entry for PCI device %s: "
+ "not attached to the root bus",
+ t9->pcidev);
+ return;
+ }
+ t->segment_group_number = cpu_to_le16(0);
+ t->bus_number = pci_dev_bus_num(pdev);
+ t->device_number = pdev->devfn;
+ } else {
+ /*
+ * Per SMBIOS spec, For slots that are not of the PCI, AGP, PCI-X,
+ * or PCI-Express type that do not have bus/device/function
+ * information, 0FFh should be populated in the fields of Segment
+ * Group Number, Bus Number, Device/Function Number.
+ */
+ t->segment_group_number = 0xff;
+ t->bus_number = 0xff;
+ t->device_number = 0xff;
+ }
+
SMBIOS_BUILD_TABLE_POST;
instance++;
}
@@ -1222,7 +1264,7 @@ void smbios_get_tables(MachineState *ms,
}
smbios_build_type_8_table();
- smbios_build_type_9_table();
+ smbios_build_type_9_table(errp);
smbios_build_type_11_table();
#define MAX_DIMM_SZ (16 * GiB)
@@ -1568,6 +1610,7 @@ void smbios_entry_add(QemuOpts *opts, Error **errp)
t->slot_id = qemu_opt_get_number(opts, "slot_id", 0);
t->slot_characteristics1 = qemu_opt_get_number(opts, "slot_characteristics1", 0);
t->slot_characteristics2 = qemu_opt_get_number(opts, "slot_characteristics2", 0);
+ save_opt(&t->pcidev, opts, "pcidev");
QTAILQ_INSERT_TAIL(&type9, t, next);
return;
}
diff --git a/include/hw/firmware/smbios.h b/include/hw/firmware/smbios.h
index 6bbd5a4c20..f8dd07fe4c 100644
--- a/include/hw/firmware/smbios.h
+++ b/include/hw/firmware/smbios.h
@@ -222,6 +222,10 @@ struct smbios_type_9 {
uint16_t slot_id;
uint8_t slot_characteristics1;
uint8_t slot_characteristics2;
+ /* SMBIOS spec v2.6+ */
+ uint16_t segment_group_number;
+ uint8_t bus_number;
+ uint8_t device_number;
} QEMU_PACKED;
/* SMBIOS type 11 - OEM strings */
diff --git a/qemu-options.hx b/qemu-options.hx
index 94cacc2c63..93364e1765 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -2710,7 +2710,7 @@ SRST
``-smbios type=4[,sock_pfx=str][,manufacturer=str][,version=str][,serial=str][,asset=str][,part=str][,processor-id=%d]``
Specify SMBIOS type 4 fields
-``-smbios type=9[,slot_designation=str][,slot_type=%d][,slot_data_bus_width=%d][,current_usage=%d][,slot_length=%d][,slot_id=%d][,slot_characteristics1=%d][,slot_characteristics12=%d]``
+``-smbios type=9[,slot_designation=str][,slot_type=%d][,slot_data_bus_width=%d][,current_usage=%d][,slot_length=%d][,slot_id=%d][,slot_characteristics1=%d][,slot_characteristics12=%d][,pci_device=str]``
Specify SMBIOS type 9 fields
``-smbios type=11[,value=str][,path=filename]``
--
2.39.3

@ -1,218 +0,0 @@
From 84fc607d678bd72397a41d706e91fa241fd97266 Mon Sep 17 00:00:00 2001
From: Igor Mammedov <imammedo@redhat.com>
Date: Wed, 21 Feb 2024 17:00:26 +0000
Subject: [PATCH 04/20] Implement base of SMBIOS type 9 descriptor.
RH-Author: Igor Mammedov <imammedo@redhat.com>
RH-MergeRequest: 230: Workaround Windows failing to find 64bit SMBIOS entry point with SeaBIOS
RH-Jira: RHEL-21705
RH-Acked-by: MST <mst@redhat.com>
RH-Acked-by: Ani Sinha <None>
RH-Commit: [2/18] 2678cc080bfbf3357fa2f94ceaf42fc61b690d32
JIRA: https://issues.redhat.com/browse/RHEL-21705
commit: 735eee07d1f963635d3c3bf9f5e4bf1bc000870e
Author: Felix Wu <flwu@google.com>
Version 2.1+.
Signed-off-by: Felix Wu <flwu@google.com>
Signed-off-by: Nabih Estefan <nabihestefan@google.com>
Message-Id: <20240221170027.1027325-2-nabihestefan@google.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
---
hw/smbios/smbios.c | 99 ++++++++++++++++++++++++++++++++++++
include/hw/firmware/smbios.h | 13 +++++
qemu-options.hx | 3 ++
3 files changed, 115 insertions(+)
diff --git a/hw/smbios/smbios.c b/hw/smbios/smbios.c
index 7bde23e59d..4f5637d445 100644
--- a/hw/smbios/smbios.c
+++ b/hw/smbios/smbios.c
@@ -122,6 +122,16 @@ struct type8_instance {
};
static QTAILQ_HEAD(, type8_instance) type8 = QTAILQ_HEAD_INITIALIZER(type8);
+/* type 9 instance for parsing */
+struct type9_instance {
+ const char *slot_designation;
+ uint8_t slot_type, slot_data_bus_width, current_usage, slot_length,
+ slot_characteristics1, slot_characteristics2;
+ uint16_t slot_id;
+ QTAILQ_ENTRY(type9_instance) next;
+};
+static QTAILQ_HEAD(, type9_instance) type9 = QTAILQ_HEAD_INITIALIZER(type9);
+
static struct {
size_t nvalues;
char **values;
@@ -371,6 +381,54 @@ static const QemuOptDesc qemu_smbios_type8_opts[] = {
},
};
+static const QemuOptDesc qemu_smbios_type9_opts[] = {
+ {
+ .name = "type",
+ .type = QEMU_OPT_NUMBER,
+ .help = "SMBIOS element type",
+ },
+ {
+ .name = "slot_designation",
+ .type = QEMU_OPT_STRING,
+ .help = "string number for reference designation",
+ },
+ {
+ .name = "slot_type",
+ .type = QEMU_OPT_NUMBER,
+ .help = "connector type",
+ },
+ {
+ .name = "slot_data_bus_width",
+ .type = QEMU_OPT_NUMBER,
+ .help = "port type",
+ },
+ {
+ .name = "current_usage",
+ .type = QEMU_OPT_NUMBER,
+ .help = "current usage",
+ },
+ {
+ .name = "slot_length",
+ .type = QEMU_OPT_NUMBER,
+ .help = "system slot length",
+ },
+ {
+ .name = "slot_id",
+ .type = QEMU_OPT_NUMBER,
+ .help = "system slot id",
+ },
+ {
+ .name = "slot_characteristics1",
+ .type = QEMU_OPT_NUMBER,
+ .help = "slot characteristics1, see the spec",
+ },
+ {
+ .name = "slot_characteristics2",
+ .type = QEMU_OPT_NUMBER,
+ .help = "slot characteristics2, see the spec",
+ },
+};
+
static const QemuOptDesc qemu_smbios_type11_opts[] = {
{
.name = "value",
@@ -594,6 +652,7 @@ bool smbios_skip_table(uint8_t type, bool required_table)
#define T2_BASE 0x200
#define T3_BASE 0x300
#define T4_BASE 0x400
+#define T9_BASE 0x900
#define T11_BASE 0xe00
#define T16_BASE 0x1000
@@ -792,6 +851,28 @@ static void smbios_build_type_8_table(void)
}
}
+static void smbios_build_type_9_table(void)
+{
+ unsigned instance = 0;
+ struct type9_instance *t9;
+
+ QTAILQ_FOREACH(t9, &type9, next) {
+ SMBIOS_BUILD_TABLE_PRE(9, T9_BASE + instance, true);
+
+ SMBIOS_TABLE_SET_STR(9, slot_designation, t9->slot_designation);
+ t->slot_type = t9->slot_type;
+ t->slot_data_bus_width = t9->slot_data_bus_width;
+ t->current_usage = t9->current_usage;
+ t->slot_length = t9->slot_length;
+ t->slot_id = t9->slot_id;
+ t->slot_characteristics1 = t9->slot_characteristics1;
+ t->slot_characteristics2 = t9->slot_characteristics2;
+
+ SMBIOS_BUILD_TABLE_POST;
+ instance++;
+ }
+}
+
static void smbios_build_type_11_table(void)
{
char count_str[128];
@@ -1141,6 +1222,7 @@ void smbios_get_tables(MachineState *ms,
}
smbios_build_type_8_table();
+ smbios_build_type_9_table();
smbios_build_type_11_table();
#define MAX_DIMM_SZ (16 * GiB)
@@ -1472,6 +1554,23 @@ void smbios_entry_add(QemuOpts *opts, Error **errp)
t8_i->port_type = qemu_opt_get_number(opts, "port_type", 0);
QTAILQ_INSERT_TAIL(&type8, t8_i, next);
return;
+ case 9: {
+ if (!qemu_opts_validate(opts, qemu_smbios_type9_opts, errp)) {
+ return;
+ }
+ struct type9_instance *t;
+ t = g_new0(struct type9_instance, 1);
+ save_opt(&t->slot_designation, opts, "slot_designation");
+ t->slot_type = qemu_opt_get_number(opts, "slot_type", 0);
+ t->slot_data_bus_width = qemu_opt_get_number(opts, "slot_data_bus_width", 0);
+ t->current_usage = qemu_opt_get_number(opts, "current_usage", 0);
+ t->slot_length = qemu_opt_get_number(opts, "slot_length", 0);
+ t->slot_id = qemu_opt_get_number(opts, "slot_id", 0);
+ t->slot_characteristics1 = qemu_opt_get_number(opts, "slot_characteristics1", 0);
+ t->slot_characteristics2 = qemu_opt_get_number(opts, "slot_characteristics2", 0);
+ QTAILQ_INSERT_TAIL(&type9, t, next);
+ return;
+ }
case 11:
if (!qemu_opts_validate(opts, qemu_smbios_type11_opts, errp)) {
return;
diff --git a/include/hw/firmware/smbios.h b/include/hw/firmware/smbios.h
index d24b3ccd32..6bbd5a4c20 100644
--- a/include/hw/firmware/smbios.h
+++ b/include/hw/firmware/smbios.h
@@ -211,6 +211,19 @@ struct smbios_type_8 {
uint8_t port_type;
} QEMU_PACKED;
+/* SMBIOS type 9 - System Slots (v2.1+) */
+struct smbios_type_9 {
+ struct smbios_structure_header header;
+ uint8_t slot_designation;
+ uint8_t slot_type;
+ uint8_t slot_data_bus_width;
+ uint8_t current_usage;
+ uint8_t slot_length;
+ uint16_t slot_id;
+ uint8_t slot_characteristics1;
+ uint8_t slot_characteristics2;
+} QEMU_PACKED;
+
/* SMBIOS type 11 - OEM strings */
struct smbios_type_11 {
struct smbios_structure_header header;
diff --git a/qemu-options.hx b/qemu-options.hx
index 0814f43066..94cacc2c63 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -2710,6 +2710,9 @@ SRST
``-smbios type=4[,sock_pfx=str][,manufacturer=str][,version=str][,serial=str][,asset=str][,part=str][,processor-id=%d]``
Specify SMBIOS type 4 fields
+``-smbios type=9[,slot_designation=str][,slot_type=%d][,slot_data_bus_width=%d][,current_usage=%d][,slot_length=%d][,slot_id=%d][,slot_characteristics1=%d][,slot_characteristics12=%d]``
+ Specify SMBIOS type 9 fields
+
``-smbios type=11[,value=str][,path=filename]``
Specify SMBIOS type 11 fields
--
2.39.3

@ -0,0 +1,203 @@
From c46ac3db0a4db60e667edeabc9ed451c6e8e0ccf Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 18 Mar 2024 14:41:33 -0400
Subject: [PATCH 020/100] KVM: remove kvm_arch_cpu_check_are_resettable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [20/91] d7745bd1a0ed1b215847f150f4a1bb2e912beabc (bonzini/rhel-qemu-kvm)
Board reset requires writing a fresh CPU state. As far as KVM is
concerned, the only thing that blocks reset is that CPU state is
encrypted; therefore, kvm_cpus_are_resettable() can simply check
if that is the case.
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit a99c0c66ebe7d8db3af6f16689ade9375247e43e)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
accel/kvm/kvm-accel-ops.c | 2 +-
accel/kvm/kvm-all.c | 5 -----
include/sysemu/kvm.h | 10 ----------
target/arm/kvm.c | 5 -----
target/i386/kvm/kvm.c | 5 -----
target/loongarch/kvm/kvm.c | 5 -----
target/mips/kvm.c | 5 -----
target/ppc/kvm.c | 5 -----
target/riscv/kvm/kvm-cpu.c | 5 -----
target/s390x/kvm/kvm.c | 5 -----
10 files changed, 1 insertion(+), 51 deletions(-)
diff --git a/accel/kvm/kvm-accel-ops.c b/accel/kvm/kvm-accel-ops.c
index b3c946dc4b..74e3c5785b 100644
--- a/accel/kvm/kvm-accel-ops.c
+++ b/accel/kvm/kvm-accel-ops.c
@@ -82,7 +82,7 @@ static bool kvm_vcpu_thread_is_idle(CPUState *cpu)
static bool kvm_cpus_are_resettable(void)
{
- return !kvm_enabled() || kvm_cpu_check_are_resettable();
+ return !kvm_enabled() || !kvm_state->guest_state_protected;
}
#ifdef KVM_CAP_SET_GUEST_DEBUG
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index ec0f6df7c5..b51e09a583 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -2696,11 +2696,6 @@ void kvm_flush_coalesced_mmio_buffer(void)
s->coalesced_flush_in_progress = false;
}
-bool kvm_cpu_check_are_resettable(void)
-{
- return kvm_arch_cpu_check_are_resettable();
-}
-
static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
{
if (!cpu->vcpu_dirty && !kvm_state->guest_state_protected) {
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index 302e8f6f1e..54f4d83a37 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -525,16 +525,6 @@ int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target);
/* Notify resamplefd for EOI of specific interrupts. */
void kvm_resample_fd_notify(int gsi);
-/**
- * kvm_cpu_check_are_resettable - return whether CPUs can be reset
- *
- * Returns: true: CPUs are resettable
- * false: CPUs are not resettable
- */
-bool kvm_cpu_check_are_resettable(void);
-
-bool kvm_arch_cpu_check_are_resettable(void);
-
bool kvm_dirty_ring_enabled(void);
uint32_t kvm_dirty_ring_size(void);
diff --git a/target/arm/kvm.c b/target/arm/kvm.c
index ab85d628a8..21ebbf3b8f 100644
--- a/target/arm/kvm.c
+++ b/target/arm/kvm.c
@@ -1598,11 +1598,6 @@ int kvm_arch_msi_data_to_gsi(uint32_t data)
return (data - 32) & 0xffff;
}
-bool kvm_arch_cpu_check_are_resettable(void)
-{
- return true;
-}
-
static void kvm_arch_get_eager_split_size(Object *obj, Visitor *v,
const char *name, void *opaque,
Error **errp)
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index e271652620..a12207a8ee 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -5623,11 +5623,6 @@ bool kvm_has_waitpkg(void)
return has_msr_umwait;
}
-bool kvm_arch_cpu_check_are_resettable(void)
-{
- return !sev_es_enabled();
-}
-
#define ARCH_REQ_XCOMP_GUEST_PERM 0x1025
void kvm_request_xsave_components(X86CPU *cpu, uint64_t mask)
diff --git a/target/loongarch/kvm/kvm.c b/target/loongarch/kvm/kvm.c
index d630cc39cb..8224d94333 100644
--- a/target/loongarch/kvm/kvm.c
+++ b/target/loongarch/kvm/kvm.c
@@ -733,11 +733,6 @@ bool kvm_arch_stop_on_emulation_error(CPUState *cs)
return true;
}
-bool kvm_arch_cpu_check_are_resettable(void)
-{
- return true;
-}
-
int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
{
int ret = 0;
diff --git a/target/mips/kvm.c b/target/mips/kvm.c
index 6c52e59f55..a631ab544f 100644
--- a/target/mips/kvm.c
+++ b/target/mips/kvm.c
@@ -1273,11 +1273,6 @@ int kvm_arch_get_default_type(MachineState *machine)
return -1;
}
-bool kvm_arch_cpu_check_are_resettable(void)
-{
- return true;
-}
-
void kvm_arch_accel_class_init(ObjectClass *oc)
{
}
diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
index 59f640cf7b..9d9d9f0d79 100644
--- a/target/ppc/kvm.c
+++ b/target/ppc/kvm.c
@@ -2968,11 +2968,6 @@ void kvmppc_set_reg_tb_offset(PowerPCCPU *cpu, int64_t tb_offset)
}
}
-bool kvm_arch_cpu_check_are_resettable(void)
-{
- return true;
-}
-
void kvm_arch_accel_class_init(ObjectClass *oc)
{
}
diff --git a/target/riscv/kvm/kvm-cpu.c b/target/riscv/kvm/kvm-cpu.c
index 6a6c6cae80..49d2f3ad58 100644
--- a/target/riscv/kvm/kvm-cpu.c
+++ b/target/riscv/kvm/kvm-cpu.c
@@ -1475,11 +1475,6 @@ void kvm_riscv_set_irq(RISCVCPU *cpu, int irq, int level)
}
}
-bool kvm_arch_cpu_check_are_resettable(void)
-{
- return true;
-}
-
static int aia_mode;
static const char *kvm_aia_mode_str(uint64_t mode)
diff --git a/target/s390x/kvm/kvm.c b/target/s390x/kvm/kvm.c
index 55fb4855b1..4db59658e1 100644
--- a/target/s390x/kvm/kvm.c
+++ b/target/s390x/kvm/kvm.c
@@ -2630,11 +2630,6 @@ void kvm_s390_stop_interrupt(S390CPU *cpu)
kvm_s390_vcpu_interrupt(cpu, &irq);
}
-bool kvm_arch_cpu_check_are_resettable(void)
-{
- return true;
-}
-
int kvm_s390_get_zpci_op(void)
{
return cap_zpci_op;
--
2.39.3

@ -0,0 +1,127 @@
From 50399796da938c4ea7c69058fde84695bce9d794 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 18 Mar 2024 14:41:10 -0400
Subject: [PATCH 019/100] KVM: track whether guest state is encrypted
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [19/91] 685b9c54d43d0043d15c33d13afc3a420cbe139b (bonzini/rhel-qemu-kvm)
So far, KVM has allowed KVM_GET/SET_* ioctls to execute even if the
guest state is encrypted, in which case they do nothing. For the new
API using VM types, instead, the ioctls will fail which is a safer and
more robust approach.
The new API will be the only one available for SEV-SNP and TDX, but it
is also usable for SEV and SEV-ES. In preparation for that, require
architecture-specific KVM code to communicate the point at which guest
state is protected (which must be after kvm_cpu_synchronize_post_init(),
though that might change in the future in order to support migration).
From that point, skip reading registers so that cpu->vcpu_dirty is
never true: if it ever becomes true, kvm_arch_put_registers() will
fail miserably.
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit 5c3131c392f84c660033d511ec39872d8beb4b1e)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
accel/kvm/kvm-all.c | 17 ++++++++++++++---
include/sysemu/kvm.h | 2 ++
include/sysemu/kvm_int.h | 1 +
target/i386/sev.c | 1 +
4 files changed, 18 insertions(+), 3 deletions(-)
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 931f74256e..ec0f6df7c5 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -2703,7 +2703,7 @@ bool kvm_cpu_check_are_resettable(void)
static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
{
- if (!cpu->vcpu_dirty) {
+ if (!cpu->vcpu_dirty && !kvm_state->guest_state_protected) {
int ret = kvm_arch_get_registers(cpu);
if (ret) {
error_report("Failed to get registers: %s", strerror(-ret));
@@ -2717,7 +2717,7 @@ static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
void kvm_cpu_synchronize_state(CPUState *cpu)
{
- if (!cpu->vcpu_dirty) {
+ if (!cpu->vcpu_dirty && !kvm_state->guest_state_protected) {
run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL);
}
}
@@ -2752,7 +2752,13 @@ static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
void kvm_cpu_synchronize_post_init(CPUState *cpu)
{
- run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
+ if (!kvm_state->guest_state_protected) {
+ /*
+ * This runs before the machine_init_done notifiers, and is the last
+ * opportunity to synchronize the state of confidential guests.
+ */
+ run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
+ }
}
static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg)
@@ -4099,3 +4105,8 @@ void query_stats_schemas_cb(StatsSchemaList **result, Error **errp)
query_stats_schema_vcpu(first_cpu, &stats_args);
}
}
+
+void kvm_mark_guest_state_protected(void)
+{
+ kvm_state->guest_state_protected = true;
+}
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index fad9a7e8ff..302e8f6f1e 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -539,6 +539,8 @@ bool kvm_dirty_ring_enabled(void);
uint32_t kvm_dirty_ring_size(void);
+void kvm_mark_guest_state_protected(void);
+
/**
* kvm_hwpoisoned_mem - indicate if there is any hwpoisoned page
* reported for the VM.
diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h
index 882e37e12c..3496be7997 100644
--- a/include/sysemu/kvm_int.h
+++ b/include/sysemu/kvm_int.h
@@ -87,6 +87,7 @@ struct KVMState
bool kernel_irqchip_required;
OnOffAuto kernel_irqchip_split;
bool sync_mmu;
+ bool guest_state_protected;
uint64_t manual_dirty_log_protect;
/* The man page (and posix) say ioctl numbers are signed int, but
* they're not. Linux, glibc and *BSD all treat ioctl numbers as
diff --git a/target/i386/sev.c b/target/i386/sev.c
index b8f79d34d1..c49a8fd55e 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -755,6 +755,7 @@ sev_launch_get_measure(Notifier *notifier, void *unused)
if (ret) {
exit(1);
}
+ kvm_mark_guest_state_protected();
}
/* query the measurement blob length */
--
2.39.3

@ -0,0 +1,329 @@
From f4b01d645926faab2cab86fadb7398c26d6b8285 Mon Sep 17 00:00:00 2001
From: Xiaoyao Li <xiaoyao.li@intel.com>
Date: Wed, 20 Mar 2024 03:39:02 -0500
Subject: [PATCH 028/100] RAMBlock: Add support of KVM private guest memfd
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [28/91] 95fdf196afcb67113834c20fa354ee1397411bfd (bonzini/rhel-qemu-kvm)
Add KVM guest_memfd support to RAMBlock so both normal hva based memory
and kvm guest memfd based private memory can be associated in one RAMBlock.
Introduce new flag RAM_GUEST_MEMFD. When it's set, it calls KVM ioctl to
create private guest_memfd during RAMBlock setup.
Allocating a new RAM_GUEST_MEMFD flag to instruct the setup of guest memfd
is more flexible and extensible than simply relying on the VM type because
in the future we may have the case that not all the memory of a VM needs
guest memfd. As a benefit, it also avoids getting MachineState in the memory
subsystem.
Note, RAM_GUEST_MEMFD is supposed to be set for memory backends of
confidential guests, such as TDX VM. How and when to set it for memory
backends will be implemented in the following patches.
Introduce memory_region_has_guest_memfd() to query if the MemoryRegion has
KVM guest_memfd allocated.
Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Message-ID: <20240320083945.991426-7-michael.roth@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit 15f7a80c49cb3637f62fa37fa4a17da913bd91ff)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
accel/kvm/kvm-all.c | 28 ++++++++++++++++++++++++++++
accel/stubs/kvm-stub.c | 5 +++++
include/exec/memory.h | 20 +++++++++++++++++---
include/exec/ram_addr.h | 2 +-
include/exec/ramblock.h | 1 +
include/sysemu/kvm.h | 2 ++
system/memory.c | 5 +++++
system/physmem.c | 34 +++++++++++++++++++++++++++++++---
8 files changed, 90 insertions(+), 7 deletions(-)
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 272e945f52..a7b9a127dd 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -92,6 +92,7 @@ static bool kvm_has_guest_debug;
static int kvm_sstep_flags;
static bool kvm_immediate_exit;
static uint64_t kvm_supported_memory_attributes;
+static bool kvm_guest_memfd_supported;
static hwaddr kvm_max_slot_size = ~0;
static const KVMCapabilityInfo kvm_required_capabilites[] = {
@@ -2419,6 +2420,11 @@ static int kvm_init(MachineState *ms)
}
kvm_supported_memory_attributes = kvm_check_extension(s, KVM_CAP_MEMORY_ATTRIBUTES);
+ kvm_guest_memfd_supported =
+ kvm_check_extension(s, KVM_CAP_GUEST_MEMFD) &&
+ kvm_check_extension(s, KVM_CAP_USER_MEMORY2) &&
+ (kvm_supported_memory_attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE);
+
kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT);
s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS);
@@ -4138,3 +4144,25 @@ void kvm_mark_guest_state_protected(void)
{
kvm_state->guest_state_protected = true;
}
+
+int kvm_create_guest_memfd(uint64_t size, uint64_t flags, Error **errp)
+{
+ int fd;
+ struct kvm_create_guest_memfd guest_memfd = {
+ .size = size,
+ .flags = flags,
+ };
+
+ if (!kvm_guest_memfd_supported) {
+ error_setg(errp, "KVM does not support guest_memfd");
+ return -1;
+ }
+
+ fd = kvm_vm_ioctl(kvm_state, KVM_CREATE_GUEST_MEMFD, &guest_memfd);
+ if (fd < 0) {
+ error_setg_errno(errp, errno, "Error creating KVM guest_memfd");
+ return -1;
+ }
+
+ return fd;
+}
diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c
index ca38172884..8e0eb22e61 100644
--- a/accel/stubs/kvm-stub.c
+++ b/accel/stubs/kvm-stub.c
@@ -129,3 +129,8 @@ bool kvm_hwpoisoned_mem(void)
{
return false;
}
+
+int kvm_create_guest_memfd(uint64_t size, uint64_t flags, Error **errp)
+{
+ return -ENOSYS;
+}
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 8626a355b3..679a847685 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -243,6 +243,9 @@ typedef struct IOMMUTLBEvent {
/* RAM FD is opened read-only */
#define RAM_READONLY_FD (1 << 11)
+/* RAM can be private that has kvm guest memfd backend */
+#define RAM_GUEST_MEMFD (1 << 12)
+
static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn,
IOMMUNotifierFlag flags,
hwaddr start, hwaddr end,
@@ -1307,7 +1310,8 @@ bool memory_region_init_ram_nomigrate(MemoryRegion *mr,
* @name: Region name, becomes part of RAMBlock name used in migration stream
* must be unique within any device
* @size: size of the region.
- * @ram_flags: RamBlock flags. Supported flags: RAM_SHARED, RAM_NORESERVE.
+ * @ram_flags: RamBlock flags. Supported flags: RAM_SHARED, RAM_NORESERVE,
+ * RAM_GUEST_MEMFD.
* @errp: pointer to Error*, to store an error if it happens.
*
* Note that this function does not do anything to cause the data in the
@@ -1369,7 +1373,7 @@ bool memory_region_init_resizeable_ram(MemoryRegion *mr,
* (getpagesize()) will be used.
* @ram_flags: RamBlock flags. Supported flags: RAM_SHARED, RAM_PMEM,
* RAM_NORESERVE, RAM_PROTECTED, RAM_NAMED_FILE, RAM_READONLY,
- * RAM_READONLY_FD
+ * RAM_READONLY_FD, RAM_GUEST_MEMFD
* @path: the path in which to allocate the RAM.
* @offset: offset within the file referenced by path
* @errp: pointer to Error*, to store an error if it happens.
@@ -1399,7 +1403,7 @@ bool memory_region_init_ram_from_file(MemoryRegion *mr,
* @size: size of the region.
* @ram_flags: RamBlock flags. Supported flags: RAM_SHARED, RAM_PMEM,
* RAM_NORESERVE, RAM_PROTECTED, RAM_NAMED_FILE, RAM_READONLY,
- * RAM_READONLY_FD
+ * RAM_READONLY_FD, RAM_GUEST_MEMFD
* @fd: the fd to mmap.
* @offset: offset within the file referenced by fd
* @errp: pointer to Error*, to store an error if it happens.
@@ -1722,6 +1726,16 @@ static inline bool memory_region_is_romd(MemoryRegion *mr)
*/
bool memory_region_is_protected(MemoryRegion *mr);
+/**
+ * memory_region_has_guest_memfd: check whether a memory region has guest_memfd
+ * associated
+ *
+ * Returns %true if a memory region's ram_block has valid guest_memfd assigned.
+ *
+ * @mr: the memory region being queried
+ */
+bool memory_region_has_guest_memfd(MemoryRegion *mr);
+
/**
* memory_region_get_iommu: check whether a memory region is an iommu
*
diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
index de45ba7bc9..07c8f86375 100644
--- a/include/exec/ram_addr.h
+++ b/include/exec/ram_addr.h
@@ -110,7 +110,7 @@ long qemu_maxrampagesize(void);
* @mr: the memory region where the ram block is
* @ram_flags: RamBlock flags. Supported flags: RAM_SHARED, RAM_PMEM,
* RAM_NORESERVE, RAM_PROTECTED, RAM_NAMED_FILE, RAM_READONLY,
- * RAM_READONLY_FD
+ * RAM_READONLY_FD, RAM_GUEST_MEMFD
* @mem_path or @fd: specify the backing file or device
* @offset: Offset into target file
* @errp: pointer to Error*, to store an error if it happens
diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h
index 848915ea5b..459c8917de 100644
--- a/include/exec/ramblock.h
+++ b/include/exec/ramblock.h
@@ -41,6 +41,7 @@ struct RAMBlock {
QLIST_HEAD(, RAMBlockNotifier) ramblock_notifiers;
int fd;
uint64_t fd_offset;
+ int guest_memfd;
size_t page_size;
/* dirty bitmap used during migration */
unsigned long *bmap;
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index f114ff6986..9e4ab7ae89 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -537,6 +537,8 @@ void kvm_mark_guest_state_protected(void);
*/
bool kvm_hwpoisoned_mem(void);
+int kvm_create_guest_memfd(uint64_t size, uint64_t flags, Error **errp);
+
int kvm_set_memory_attributes_private(hwaddr start, uint64_t size);
int kvm_set_memory_attributes_shared(hwaddr start, uint64_t size);
diff --git a/system/memory.c b/system/memory.c
index a229a79988..c756950c0c 100644
--- a/system/memory.c
+++ b/system/memory.c
@@ -1850,6 +1850,11 @@ bool memory_region_is_protected(MemoryRegion *mr)
return mr->ram && (mr->ram_block->flags & RAM_PROTECTED);
}
+bool memory_region_has_guest_memfd(MemoryRegion *mr)
+{
+ return mr->ram_block && mr->ram_block->guest_memfd >= 0;
+}
+
uint8_t memory_region_get_dirty_log_mask(MemoryRegion *mr)
{
uint8_t mask = mr->dirty_log_mask;
diff --git a/system/physmem.c b/system/physmem.c
index a4fe3d2bf8..f5dfa20e57 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -1808,6 +1808,7 @@ static void ram_block_add(RAMBlock *new_block, Error **errp)
const bool shared = qemu_ram_is_shared(new_block);
RAMBlock *block;
RAMBlock *last_block = NULL;
+ bool free_on_error = false;
ram_addr_t old_ram_size, new_ram_size;
Error *err = NULL;
@@ -1837,6 +1838,19 @@ static void ram_block_add(RAMBlock *new_block, Error **errp)
return;
}
memory_try_enable_merging(new_block->host, new_block->max_length);
+ free_on_error = true;
+ }
+ }
+
+ if (new_block->flags & RAM_GUEST_MEMFD) {
+ assert(kvm_enabled());
+ assert(new_block->guest_memfd < 0);
+
+ new_block->guest_memfd = kvm_create_guest_memfd(new_block->max_length,
+ 0, errp);
+ if (new_block->guest_memfd < 0) {
+ qemu_mutex_unlock_ramlist();
+ goto out_free;
}
}
@@ -1888,6 +1902,13 @@ static void ram_block_add(RAMBlock *new_block, Error **errp)
ram_block_notify_add(new_block->host, new_block->used_length,
new_block->max_length);
}
+ return;
+
+out_free:
+ if (free_on_error) {
+ qemu_anon_ram_free(new_block->host, new_block->max_length);
+ new_block->host = NULL;
+ }
}
#ifdef CONFIG_POSIX
@@ -1902,7 +1923,7 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
/* Just support these ram flags by now. */
assert((ram_flags & ~(RAM_SHARED | RAM_PMEM | RAM_NORESERVE |
RAM_PROTECTED | RAM_NAMED_FILE | RAM_READONLY |
- RAM_READONLY_FD)) == 0);
+ RAM_READONLY_FD | RAM_GUEST_MEMFD)) == 0);
if (xen_enabled()) {
error_setg(errp, "-mem-path not supported with Xen");
@@ -1939,6 +1960,7 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
new_block->used_length = size;
new_block->max_length = size;
new_block->flags = ram_flags;
+ new_block->guest_memfd = -1;
new_block->host = file_ram_alloc(new_block, size, fd, !file_size, offset,
errp);
if (!new_block->host) {
@@ -2018,7 +2040,7 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
int align;
assert((ram_flags & ~(RAM_SHARED | RAM_RESIZEABLE | RAM_PREALLOC |
- RAM_NORESERVE)) == 0);
+ RAM_NORESERVE | RAM_GUEST_MEMFD)) == 0);
assert(!host ^ (ram_flags & RAM_PREALLOC));
align = qemu_real_host_page_size();
@@ -2033,6 +2055,7 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
new_block->max_length = max_size;
assert(max_size >= size);
new_block->fd = -1;
+ new_block->guest_memfd = -1;
new_block->page_size = qemu_real_host_page_size();
new_block->host = host;
new_block->flags = ram_flags;
@@ -2055,7 +2078,7 @@ RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
RAMBlock *qemu_ram_alloc(ram_addr_t size, uint32_t ram_flags,
MemoryRegion *mr, Error **errp)
{
- assert((ram_flags & ~(RAM_SHARED | RAM_NORESERVE)) == 0);
+ assert((ram_flags & ~(RAM_SHARED | RAM_NORESERVE | RAM_GUEST_MEMFD)) == 0);
return qemu_ram_alloc_internal(size, size, NULL, NULL, ram_flags, mr, errp);
}
@@ -2083,6 +2106,11 @@ static void reclaim_ramblock(RAMBlock *block)
} else {
qemu_anon_ram_free(block->host, block->max_length);
}
+
+ if (block->guest_memfd >= 0) {
+ close(block->guest_memfd);
+ }
+
g_free(block);
}
--
2.39.3

@ -0,0 +1,82 @@
From bd289293604d6f33e9fb89196f0b19117ce81f89 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 20 Mar 2024 17:45:29 +0100
Subject: [PATCH 032/100] RAMBlock: make guest_memfd require uncoordinated
discard
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [32/91] 0c005849026c334737b88cbd20a0ac237dfca37e (bonzini/rhel-qemu-kvm)
Some subsystems like VFIO might disable ram block discard, but guest_memfd
uses discard operations to implement conversions between private and
shared memory. Because of this, sequences like the following can result
in stale IOMMU mappings:
1. allocate shared page
2. convert page shared->private
3. discard shared page
4. convert page private->shared
5. allocate shared page
6. issue DMA operations against that shared page
This is not a use-after-free, because after step 3 VFIO is still pinning
the page. However, DMA operations in step 6 will hit the old mapping
that was allocated in step 1.
Address this by taking ram_block_discard_is_enabled() into account when
deciding whether or not to discard pages.
Since kvm_convert_memory()/guest_memfd doesn't implement a
RamDiscardManager handler to convey and replay discard operations,
this is a case of uncoordinated discard, which is blocked/released
by ram_block_discard_require(). Interestingly, this function had
no use so far.
Alternative approaches would be to block discard of shared pages, but
this would cause guests to consume twice the memory if they use VFIO;
or to implement a RamDiscardManager and only block uncoordinated
discard, i.e. use ram_block_coordinated_discard_require().
[Commit message mostly by Michael Roth <michael.roth@amd.com>]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit 852f0048f3ea9f14de18eb279a99fccb6d250e8f)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
system/physmem.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/system/physmem.c b/system/physmem.c
index f5dfa20e57..5ebcf5be11 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -1846,6 +1846,13 @@ static void ram_block_add(RAMBlock *new_block, Error **errp)
assert(kvm_enabled());
assert(new_block->guest_memfd < 0);
+ if (ram_block_discard_require(true) < 0) {
+ error_setg_errno(errp, errno,
+ "cannot set up private guest memory: discard currently blocked");
+ error_append_hint(errp, "Are you using assigned devices?\n");
+ goto out_free;
+ }
+
new_block->guest_memfd = kvm_create_guest_memfd(new_block->max_length,
0, errp);
if (new_block->guest_memfd < 0) {
@@ -2109,6 +2116,7 @@ static void reclaim_ramblock(RAMBlock *block)
if (block->guest_memfd >= 0) {
close(block->guest_memfd);
+ ram_block_discard_require(false);
}
g_free(block);
--
2.39.3

@ -1,60 +0,0 @@
From 2e0e4355b2d4edb66b7d8c198339e17940abd682 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Mon, 18 Mar 2024 13:03:19 +0000
Subject: [PATCH 2/3] Revert "chardev/char-socket: Fix TLS io channels sending
too much data to the backend"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Daniel P. Berrangé <berrange@redhat.com>
RH-MergeRequest: 233: Fix handling of TLS sessions in chardevs
RH-Jira: RHEL-24614
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
RH-Commit: [2/3] 1cb3d72b86ced0f70a09dfa0d325ae8a85db1b2b (berrange/centos-src-qemu)
This commit results in unexpected termination of the TLS connection.
When 'fd_can_read' returns 0, the code goes on to pass a zero length
buffer to qio_channel_read. The TLS impl calls into gnutls_recv()
with this zero length buffer, at which point GNUTLS returns an error
GNUTLS_E_INVALID_REQUEST. This is treated as fatal by QEMU's TLS code
resulting in the connection being torn down by the chardev.
Simply skipping the qio_channel_read when the buffer length is zero
is also not satisfactory, as it results in a high CPU burn busy loop
massively slowing QEMU's functionality.
The proper solution is to avoid tcp_chr_read being called at all
unless the frontend is able to accept more data. This will be done
in a followup commit.
This reverts commit 462945cd22d2bcd233401ed3aa167d83a8e35b05
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
(cherry picked from commit e8ee827ffdb86ebbd5f5213a1f78123c25a90864)
---
chardev/char-socket.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/chardev/char-socket.c b/chardev/char-socket.c
index f48d341ebc..51d0943fce 100644
--- a/chardev/char-socket.c
+++ b/chardev/char-socket.c
@@ -492,9 +492,9 @@ static gboolean tcp_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque)
s->max_size <= 0) {
return TRUE;
}
- len = tcp_chr_read_poll(opaque);
- if (len > sizeof(buf)) {
- len = sizeof(buf);
+ len = sizeof(buf);
+ if (len > s->max_size) {
+ len = s->max_size;
}
size = tcp_chr_recv(chr, (void *)buf, len);
if (size == 0 || (size == -1 && errno != EAGAIN)) {
--
2.39.3

@ -1,216 +0,0 @@
From ab5a33d57b48e35388928e388bb6e6479bc77651 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Mon, 18 Mar 2024 17:08:30 +0000
Subject: [PATCH 3/3] Revert "chardev: use a child source for qio input source"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Daniel P. Berrangé <berrange@redhat.com>
RH-MergeRequest: 233: Fix handling of TLS sessions in chardevs
RH-Jira: RHEL-24614
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
RH-Commit: [3/3] b58e6c19c2b11d5d28db31cf1386226fb01d195e (berrange/centos-src-qemu)
This reverts commit a7077b8e354d90fec26c2921aa2dea85b90dff90,
and adds comments to explain why child sources cannot be used.
When a GSource is added as a child of another GSource, if its
'prepare' function indicates readiness, then the parent's
'prepare' function will never be run. The io_watch_poll_prepare
absolutely *must* be run on every iteration of the main loop,
to ensure that the chardev backend doesn't feed data to the
frontend that it is unable to consume.
At the time a7077b8e354d90fec26c2921aa2dea85b90dff90 was made,
all the child GSource impls were relying on poll'ing an FD,
so their 'prepare' functions would never indicate readiness
ahead of poll() being invoked. So the buggy behaviour was
not noticed and lay dormant.
Relatively recently the QIOChannelTLS impl introduced a
level 2 child GSource, which checks with GNUTLS whether it
has cached any data that was decoded but not yet consumed:
commit ffda5db65aef42266a5053a4be34515106c4c7ee
Author: Antoine Damhet <antoine.damhet@shadow.tech>
Date: Tue Nov 15 15:23:29 2022 +0100
io/channel-tls: fix handling of bigger read buffers
Since the TLS backend can read more data from the underlying QIOChannel
we introduce a minimal child GSource to notify if we still have more
data available to be read.
Signed-off-by: Antoine Damhet <antoine.damhet@shadow.tech>
Signed-off-by: Charles Frey <charles.frey@shadow.tech>
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
With this, it is now quite common for the 'prepare' function
on a QIOChannelTLS GSource to indicate immediate readiness,
bypassing the parent GSource 'prepare' function. IOW, the
critical 'io_watch_poll_prepare' is being skipped on some
iterations of the main loop. As a result chardev frontend
asserts are now being triggered as they are fed data they
are not ready to consume.
A reproducer is as follows:
* In terminal 1 run a GNUTLS *echo* server
$ gnutls-serv --echo \
--x509cafile ca-cert.pem \
--x509keyfile server-key.pem \
--x509certfile server-cert.pem \
-p 9000
* In terminal 2 run a QEMU guest
$ qemu-system-s390x \
-nodefaults \
-display none \
-object tls-creds-x509,id=tls0,dir=$PWD,endpoint=client \
-chardev socket,id=con0,host=localhost,port=9000,tls-creds=tls0 \
-device sclpconsole,chardev=con0 \
-hda Fedora-Cloud-Base-39-1.5.s390x.qcow2
After the previous patch revert, but before this patch revert,
this scenario will crash:
qemu-system-s390x: ../hw/char/sclpconsole.c:73: chr_read: Assertion
`size <= SIZE_BUFFER_VT220 - scon->iov_data_len' failed.
This assert indicates that 'tcp_chr_read' was called without
'tcp_chr_read_poll' having first been checked for ability to
receive more data.
QEMU's use of a 'prepare' function to create/delete another
GSource is rather a hack and not normally the kind of thing that
is expected to be done by a GSource. There is no mechanism to
force GLib to always run the 'prepare' function of a parent
GSource. The best option is to simply not use the child source
concept, and go back to the functional approach previously
relied on.
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Tested-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
(cherry picked from commit 038b4217884c6f297278bb1ec6f0463c6c8221de)
---
chardev/char-io.c | 56 ++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 51 insertions(+), 5 deletions(-)
diff --git a/chardev/char-io.c b/chardev/char-io.c
index 4451128cba..dab77b112e 100644
--- a/chardev/char-io.c
+++ b/chardev/char-io.c
@@ -33,6 +33,7 @@ typedef struct IOWatchPoll {
IOCanReadHandler *fd_can_read;
GSourceFunc fd_read;
void *opaque;
+ GMainContext *context;
} IOWatchPoll;
static IOWatchPoll *io_watch_poll_from_source(GSource *source)
@@ -50,28 +51,59 @@ static gboolean io_watch_poll_prepare(GSource *source,
return FALSE;
}
+ /*
+ * We do not register the QIOChannel watch as a child GSource.
+ * The 'prepare' function on the parent GSource will be
+ * skipped if a child GSource's 'prepare' function indicates
+ * readiness. We need this prepare function be guaranteed
+ * to run on *every* iteration of the main loop, because
+ * it is critical to ensure we remove the QIOChannel watch
+ * if 'fd_can_read' indicates the frontend cannot receive
+ * more data.
+ */
if (now_active) {
iwp->src = qio_channel_create_watch(
iwp->ioc, G_IO_IN | G_IO_ERR | G_IO_HUP | G_IO_NVAL);
g_source_set_callback(iwp->src, iwp->fd_read, iwp->opaque, NULL);
- g_source_add_child_source(source, iwp->src);
- g_source_unref(iwp->src);
+ g_source_attach(iwp->src, iwp->context);
} else {
- g_source_remove_child_source(source, iwp->src);
+ g_source_destroy(iwp->src);
+ g_source_unref(iwp->src);
iwp->src = NULL;
}
return FALSE;
}
+static gboolean io_watch_poll_check(GSource *source)
+{
+ return FALSE;
+}
+
static gboolean io_watch_poll_dispatch(GSource *source, GSourceFunc callback,
gpointer user_data)
{
- return G_SOURCE_CONTINUE;
+ abort();
+}
+
+static void io_watch_poll_finalize(GSource *source)
+{
+ /*
+ * Due to a glib bug, removing the last reference to a source
+ * inside a finalize callback causes recursive locking (and a
+ * deadlock). This is not a problem inside other callbacks,
+ * including dispatch callbacks, so we call io_remove_watch_poll
+ * to remove this source. At this point, iwp->src must
+ * be NULL, or we would leak it.
+ */
+ IOWatchPoll *iwp = io_watch_poll_from_source(source);
+ assert(iwp->src == NULL);
}
static GSourceFuncs io_watch_poll_funcs = {
.prepare = io_watch_poll_prepare,
+ .check = io_watch_poll_check,
.dispatch = io_watch_poll_dispatch,
+ .finalize = io_watch_poll_finalize,
};
GSource *io_add_watch_poll(Chardev *chr,
@@ -91,6 +123,7 @@ GSource *io_add_watch_poll(Chardev *chr,
iwp->ioc = ioc;
iwp->fd_read = (GSourceFunc) fd_read;
iwp->src = NULL;
+ iwp->context = context;
name = g_strdup_printf("chardev-iowatch-%s", chr->label);
g_source_set_name((GSource *)iwp, name);
@@ -101,10 +134,23 @@ GSource *io_add_watch_poll(Chardev *chr,
return (GSource *)iwp;
}
+static void io_remove_watch_poll(GSource *source)
+{
+ IOWatchPoll *iwp;
+
+ iwp = io_watch_poll_from_source(source);
+ if (iwp->src) {
+ g_source_destroy(iwp->src);
+ g_source_unref(iwp->src);
+ iwp->src = NULL;
+ }
+ g_source_destroy(&iwp->parent);
+}
+
void remove_fd_in_watch(Chardev *chr)
{
if (chr->gsource) {
- g_source_destroy(chr->gsource);
+ io_remove_watch_poll(chr->gsource);
chr->gsource = NULL;
}
}
--
2.39.3

@ -0,0 +1,67 @@
From d4e6f7105b00ba2536d5d733b7c03116f28ce116 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Mon, 6 May 2024 15:06:21 -0400
Subject: [PATCH 2/5] Revert "monitor: use aio_co_reschedule_self()"
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 248: Revert "monitor: use aio_co_reschedule_self()"
RH-Jira: RHEL-34618 RHEL-38697
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [1/2] b6a2ebd4a69dbcd2bd56c61e7c747f8f8f42337e (kmwolf/centos-qemu-kvm)
Commit 1f25c172f837 ("monitor: use aio_co_reschedule_self()") was a code
cleanup that uses aio_co_reschedule_self() instead of open coding
coroutine rescheduling.
Bug RHEL-34618 was reported and Kevin Wolf <kwolf@redhat.com> identified
the root cause. I missed that aio_co_reschedule_self() ->
qemu_get_current_aio_context() only knows about
qemu_aio_context/IOThread AioContexts and not about iohandler_ctx. It
does not function correctly when going back from the iohandler_ctx to
qemu_aio_context.
Go back to open coding the AioContext transitions to avoid this bug.
This reverts commit 1f25c172f83704e350c0829438d832384084a74d.
Cc: qemu-stable@nongnu.org
Buglink: https://issues.redhat.com/browse/RHEL-34618
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20240506190622.56095-2-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
(cherry picked from commit 719c6819ed9a9838520fa732f9861918dc693bda)
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
qapi/qmp-dispatch.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/qapi/qmp-dispatch.c b/qapi/qmp-dispatch.c
index f3488afeef..176b549473 100644
--- a/qapi/qmp-dispatch.c
+++ b/qapi/qmp-dispatch.c
@@ -212,7 +212,8 @@ QDict *coroutine_mixed_fn qmp_dispatch(const QmpCommandList *cmds, QObject *requ
* executing the command handler so that it can make progress if it
* involves an AIO_WAIT_WHILE().
*/
- aio_co_reschedule_self(qemu_get_aio_context());
+ aio_co_schedule(qemu_get_aio_context(), qemu_coroutine_self());
+ qemu_coroutine_yield();
}
monitor_set_cur(qemu_coroutine_self(), cur_mon);
@@ -226,7 +227,9 @@ QDict *coroutine_mixed_fn qmp_dispatch(const QmpCommandList *cmds, QObject *requ
* Move back to iohandler_ctx so that nested event loops for
* qemu_aio_context don't start new monitor commands.
*/
- aio_co_reschedule_self(iohandler_get_aio_context());
+ aio_co_schedule(iohandler_get_aio_context(),
+ qemu_coroutine_self());
+ qemu_coroutine_yield();
}
} else {
/*
--
2.39.3

@ -0,0 +1,38 @@
From bcbc897cb19b3a6523de611f48f6bac6cea16c97 Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@redhat.com>
Date: Thu, 2 May 2024 13:17:03 +0200
Subject: [PATCH 2/2] Revert "x86: rhel 9.4.0 machine type compat fix"
RH-Author: Sebastian Ott <sebott@redhat.com>
RH-MergeRequest: 237: Revert "x86: rhel 9.4.0 machine type compat fix"
RH-Jira: RHEL-30362
RH-Acked-by: Ani Sinha <anisinha@redhat.com>
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [1/1] 858ec153e65e96c39ca4db17ed93fd58c77dc2eb (seott1/cos-qemu-kvm)
This reverts commit c46e44f0f4e861fe412ce679b0b0204881c1c2f5.
pc-q35-rhel9.4.0 and newer should stay with SMBIOS_ENTRY_POINT_TYPE_AUTO.
Signed-off-by: Sebastian Ott <sebott@redhat.com>
---
hw/i386/pc_q35.c | 3 ---
1 file changed, 3 deletions(-)
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 2f11f9af7d..2b54944c0f 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -734,9 +734,6 @@ static void pc_q35_machine_rhel940_options(MachineClass *m)
pcmc->smbios_stream_product = "RHEL";
pcmc->smbios_stream_version = "9.4.0";
- /* From pc_q35_8_2_machine_options() - use SMBIOS 3.X by default */
- pcmc->default_smbios_ep_type = SMBIOS_ENTRY_POINT_TYPE_64;
-
compat_props_add(m->compat_props, hw_compat_rhel_9_5,
hw_compat_rhel_9_5_len);
}
--
2.39.3

@ -1,60 +0,0 @@
From 6b5cfed21e20b372090046a934387255ff4bda58 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Tue, 5 Dec 2023 13:20:01 -0500
Subject: [PATCH 084/101] aio: make aio_context_acquire()/aio_context_release()
a no-op
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [15/26] 723dcada900aaf08862e8221921be22506b561a8 (kmwolf/centos-qemu-kvm)
aio_context_acquire()/aio_context_release() has been replaced by
fine-grained locking to protect state shared by multiple threads. The
AioContext lock still plays the role of balancing locking in
AIO_WAIT_WHILE() and many functions in QEMU either require that the
AioContext lock is held or not held for this reason. In other words, the
AioContext lock is purely there for consistency with itself and serves
no real purpose anymore.
Stop actually acquiring/releasing the lock in
aio_context_acquire()/aio_context_release() so that subsequent patches
can remove callers across the codebase incrementally.
I have performed "make check" and qemu-iotests stress tests across
x86-64, ppc64le, and aarch64 to confirm that there are no failures as a
result of eliminating the lock.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Acked-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20231205182011.1976568-5-stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
util/async.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/util/async.c b/util/async.c
index 8f90ddc304..04ee83d220 100644
--- a/util/async.c
+++ b/util/async.c
@@ -725,12 +725,12 @@ void aio_context_unref(AioContext *ctx)
void aio_context_acquire(AioContext *ctx)
{
- qemu_rec_mutex_lock(&ctx->lock);
+ /* TODO remove this function */
}
void aio_context_release(AioContext *ctx)
{
- qemu_rec_mutex_unlock(&ctx->lock);
+ /* TODO remove this function */
}
QEMU_DEFINE_STATIC_CO_TLS(AioContext *, my_aiocontext)
--
2.39.3

@ -1,102 +0,0 @@
From 14913d8970090c8914dc19dad14f3b9f91985ec3 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Tue, 5 Dec 2023 13:20:07 -0500
Subject: [PATCH 090/101] aio: remove
aio_context_acquire()/aio_context_release() API
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [21/26] 4b6d4afcac79d3248a6722b063b5fc777dc418df (kmwolf/centos-qemu-kvm)
Delete these functions because nothing calls these functions anymore.
I introduced these APIs in commit 98563fc3ec44 ("aio: add
aio_context_acquire() and aio_context_release()") in 2014. It's with a
sigh of relief that I delete these APIs almost 10 years later.
Thanks to Paolo Bonzini's vision for multi-queue QEMU, we got an
understanding of where the code needed to go in order to remove the
limitations of the original dataplane and the IOThread/AioContext
approach that followed it.
Emanuele Giuseppe Esposito had the splendid determination to convert
large parts of the codebase so that they no longer needed the AioContext
lock. This was a painstaking process, both in the actual code changes
required and the iterations of code review that Emanuele eked out of
Kevin and me over many months.
Kevin Wolf tackled multitudes of graph locking conversions to protect
in-flight I/O from run-time changes to the block graph as well as the
clang Thread Safety Analysis annotations that allow the compiler to
check whether the graph lock is being used correctly.
And me, well, I'm just here to add some pizzazz to the QEMU multi-queue
block layer :). Thank you to everyone who helped with this effort,
including Eric Blake, code reviewer extraordinaire, and others who I've
forgotten to mention.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-ID: <20231205182011.1976568-11-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
include/block/aio.h | 17 -----------------
util/async.c | 10 ----------
2 files changed, 27 deletions(-)
diff --git a/include/block/aio.h b/include/block/aio.h
index f08b358077..af05512a7d 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -278,23 +278,6 @@ void aio_context_ref(AioContext *ctx);
*/
void aio_context_unref(AioContext *ctx);
-/* Take ownership of the AioContext. If the AioContext will be shared between
- * threads, and a thread does not want to be interrupted, it will have to
- * take ownership around calls to aio_poll(). Otherwise, aio_poll()
- * automatically takes care of calling aio_context_acquire and
- * aio_context_release.
- *
- * Note that this is separate from bdrv_drained_begin/bdrv_drained_end. A
- * thread still has to call those to avoid being interrupted by the guest.
- *
- * Bottom halves, timers and callbacks can be created or removed without
- * acquiring the AioContext.
- */
-void aio_context_acquire(AioContext *ctx);
-
-/* Relinquish ownership of the AioContext. */
-void aio_context_release(AioContext *ctx);
-
/**
* aio_bh_schedule_oneshot_full: Allocate a new bottom half structure that will
* run only once and as soon as possible.
diff --git a/util/async.c b/util/async.c
index dfd44ef612..460529057c 100644
--- a/util/async.c
+++ b/util/async.c
@@ -719,16 +719,6 @@ void aio_context_unref(AioContext *ctx)
g_source_unref(&ctx->source);
}
-void aio_context_acquire(AioContext *ctx)
-{
- /* TODO remove this function */
-}
-
-void aio_context_release(AioContext *ctx)
-{
- /* TODO remove this function */
-}
-
QEMU_DEFINE_STATIC_CO_TLS(AioContext *, my_aiocontext)
AioContext *qemu_get_current_aio_context(void)
--
2.39.3

@ -1,81 +0,0 @@
From e1e2f3972065c4b5d6fcf37e0e1c4fb92a0d5260 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Tue, 5 Dec 2023 13:20:06 -0500
Subject: [PATCH 089/101] aio-wait: draw equivalence between AIO_WAIT_WHILE()
and AIO_WAIT_WHILE_UNLOCKED()
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [20/26] 20e49777869714c99769263103f1b0c2c370cfcd (kmwolf/centos-qemu-kvm)
Now that the AioContext lock no longer exists, AIO_WAIT_WHILE() and
AIO_WAIT_WHILE_UNLOCKED() are equivalent.
A future patch will get rid of AIO_WAIT_WHILE_UNLOCKED().
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-ID: <20231205182011.1976568-10-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
include/block/aio-wait.h | 16 ++++------------
1 file changed, 4 insertions(+), 12 deletions(-)
diff --git a/include/block/aio-wait.h b/include/block/aio-wait.h
index 5449b6d742..157f105916 100644
--- a/include/block/aio-wait.h
+++ b/include/block/aio-wait.h
@@ -63,9 +63,6 @@ extern AioWait global_aio_wait;
* @ctx: the aio context, or NULL if multiple aio contexts (for which the
* caller does not hold a lock) are involved in the polling condition.
* @cond: wait while this conditional expression is true
- * @unlock: whether to unlock and then lock again @ctx. This applies
- * only when waiting for another AioContext from the main loop.
- * Otherwise it's ignored.
*
* Wait while a condition is true. Use this to implement synchronous
* operations that require event loop activity.
@@ -78,7 +75,7 @@ extern AioWait global_aio_wait;
* wait on conditions between two IOThreads since that could lead to deadlock,
* go via the main loop instead.
*/
-#define AIO_WAIT_WHILE_INTERNAL(ctx, cond, unlock) ({ \
+#define AIO_WAIT_WHILE_INTERNAL(ctx, cond) ({ \
bool waited_ = false; \
AioWait *wait_ = &global_aio_wait; \
AioContext *ctx_ = (ctx); \
@@ -95,13 +92,7 @@ extern AioWait global_aio_wait;
assert(qemu_get_current_aio_context() == \
qemu_get_aio_context()); \
while ((cond)) { \
- if (unlock && ctx_) { \
- aio_context_release(ctx_); \
- } \
aio_poll(qemu_get_aio_context(), true); \
- if (unlock && ctx_) { \
- aio_context_acquire(ctx_); \
- } \
waited_ = true; \
} \
} \
@@ -109,10 +100,11 @@ extern AioWait global_aio_wait;
waited_; })
#define AIO_WAIT_WHILE(ctx, cond) \
- AIO_WAIT_WHILE_INTERNAL(ctx, cond, true)
+ AIO_WAIT_WHILE_INTERNAL(ctx, cond)
+/* TODO replace this with AIO_WAIT_WHILE() in a future patch */
#define AIO_WAIT_WHILE_UNLOCKED(ctx, cond) \
- AIO_WAIT_WHILE_INTERNAL(ctx, cond, false)
+ AIO_WAIT_WHILE_INTERNAL(ctx, cond)
/**
* aio_wait_kick:
--
2.39.3

@ -0,0 +1,64 @@
From 0e3934e89ad1dda21681f64ff38da69b07d1b531 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Mon, 6 May 2024 15:06:22 -0400
Subject: [PATCH 3/5] aio: warn about iohandler_ctx special casing
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 248: Revert "monitor: use aio_co_reschedule_self()"
RH-Jira: RHEL-34618 RHEL-38697
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [2/2] cc316d70b2c187ee0412d6560ca1a03e381a69c1 (kmwolf/centos-qemu-kvm)
The main loop has two AioContexts: qemu_aio_context and iohandler_ctx.
The main loop runs them both, but nested aio_poll() calls on
qemu_aio_context exclude iohandler_ctx.
Which one should qemu_get_current_aio_context() return when called from
the main loop? Document that it's always qemu_aio_context.
This has subtle effects on functions that use
qemu_get_current_aio_context(). For example, aio_co_reschedule_self()
does not work when moving from iohandler_ctx to qemu_aio_context because
qemu_get_current_aio_context() does not differentiate these two
AioContexts.
Document this in order to reduce the chance of future bugs.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20240506190622.56095-3-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
(cherry picked from commit e669e800fc9ef8806af5c5578249ab758a4f8a5a)
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
include/block/aio.h | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/include/block/aio.h b/include/block/aio.h
index 8378553eb9..4ee81936ed 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -629,6 +629,9 @@ void aio_co_schedule(AioContext *ctx, Coroutine *co);
*
* Move the currently running coroutine to new_ctx. If the coroutine is already
* running in new_ctx, do nothing.
+ *
+ * Note that this function cannot reschedule from iohandler_ctx to
+ * qemu_aio_context.
*/
void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx);
@@ -661,6 +664,9 @@ void aio_co_enter(AioContext *ctx, Coroutine *co);
* If called from an IOThread this will be the IOThread's AioContext. If
* called from the main thread or with the "big QEMU lock" taken it
* will be the main loop AioContext.
+ *
+ * Note that the return value is never the main loop's iohandler_ctx and the
+ * return value is the main loop AioContext instead.
*/
AioContext *qemu_get_current_aio_context(void);
--
2.39.3

@ -1,476 +0,0 @@
From 0d8255c98b3ef6f603ff0279592d3e91de26de0e Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Tue, 21 Nov 2023 16:44:00 +0800
Subject: [PATCH 021/101] backends/iommufd: Introduce the iommufd object
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [20/67] 8a56344ab4a2126f248bfa492ccddd19265f39be (eauger1/centos-qemu-kvm)
Introduce an iommufd object which allows the interaction
with the host /dev/iommu device.
The /dev/iommu device may already have been opened outside of QEMU,
in which case the fd can be passed directly along with the
iommufd object.
This allows the iommufd object to be shared across several
subsystems (VFIO, VDPA, ...). For example, libvirt would open
the /dev/iommu once.
If no fd is passed along with the iommufd object, the /dev/iommu
is opened by the qemu code.
Suggested-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 6e6d8ac62b5b38dc9d4b69ffdf073f0a0b43b7be)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
MAINTAINERS | 8 ++
backends/Kconfig | 4 +
backends/iommufd.c | 245 +++++++++++++++++++++++++++++++++++++++
backends/meson.build | 1 +
backends/trace-events | 10 ++
include/sysemu/iommufd.h | 38 ++++++
qapi/qom.json | 19 +++
qemu-options.hx | 12 ++
8 files changed, 337 insertions(+)
create mode 100644 backends/iommufd.c
create mode 100644 include/sysemu/iommufd.h
diff --git a/MAINTAINERS b/MAINTAINERS
index 695e0bd34f..a5a446914a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2167,6 +2167,14 @@ F: hw/vfio/ap.c
F: docs/system/s390x/vfio-ap.rst
L: qemu-s390x@nongnu.org
+iommufd
+M: Yi Liu <yi.l.liu@intel.com>
+M: Eric Auger <eric.auger@redhat.com>
+M: Zhenzhong Duan <zhenzhong.duan@intel.com>
+S: Supported
+F: backends/iommufd.c
+F: include/sysemu/iommufd.h
+
vhost
M: Michael S. Tsirkin <mst@redhat.com>
S: Supported
diff --git a/backends/Kconfig b/backends/Kconfig
index f35abc1609..2cb23f62fa 100644
--- a/backends/Kconfig
+++ b/backends/Kconfig
@@ -1 +1,5 @@
source tpm/Kconfig
+
+config IOMMUFD
+ bool
+ depends on VFIO
diff --git a/backends/iommufd.c b/backends/iommufd.c
new file mode 100644
index 0000000000..ba58a0eb0d
--- /dev/null
+++ b/backends/iommufd.c
@@ -0,0 +1,245 @@
+/*
+ * iommufd container backend
+ *
+ * Copyright (C) 2023 Intel Corporation.
+ * Copyright Red Hat, Inc. 2023
+ *
+ * Authors: Yi Liu <yi.l.liu@intel.com>
+ * Eric Auger <eric.auger@redhat.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "sysemu/iommufd.h"
+#include "qapi/error.h"
+#include "qapi/qmp/qerror.h"
+#include "qemu/module.h"
+#include "qom/object_interfaces.h"
+#include "qemu/error-report.h"
+#include "monitor/monitor.h"
+#include "trace.h"
+#include <sys/ioctl.h>
+#include <linux/iommufd.h>
+
+static void iommufd_backend_init(Object *obj)
+{
+ IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
+
+ be->fd = -1;
+ be->users = 0;
+ be->owned = true;
+ qemu_mutex_init(&be->lock);
+}
+
+static void iommufd_backend_finalize(Object *obj)
+{
+ IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
+
+ if (be->owned) {
+ close(be->fd);
+ be->fd = -1;
+ }
+}
+
+static void iommufd_backend_set_fd(Object *obj, const char *str, Error **errp)
+{
+ IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
+ int fd = -1;
+
+ fd = monitor_fd_param(monitor_cur(), str, errp);
+ if (fd == -1) {
+ error_prepend(errp, "Could not parse remote object fd %s:", str);
+ return;
+ }
+ qemu_mutex_lock(&be->lock);
+ be->fd = fd;
+ be->owned = false;
+ qemu_mutex_unlock(&be->lock);
+ trace_iommu_backend_set_fd(be->fd);
+}
+
+static bool iommufd_backend_can_be_deleted(UserCreatable *uc)
+{
+ IOMMUFDBackend *be = IOMMUFD_BACKEND(uc);
+
+ return !be->users;
+}
+
+static void iommufd_backend_class_init(ObjectClass *oc, void *data)
+{
+ UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc);
+
+ ucc->can_be_deleted = iommufd_backend_can_be_deleted;
+
+ object_class_property_add_str(oc, "fd", NULL, iommufd_backend_set_fd);
+}
+
+int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp)
+{
+ int fd, ret = 0;
+
+ qemu_mutex_lock(&be->lock);
+ if (be->users == UINT32_MAX) {
+ error_setg(errp, "too many connections");
+ ret = -E2BIG;
+ goto out;
+ }
+ if (be->owned && !be->users) {
+ fd = qemu_open_old("/dev/iommu", O_RDWR);
+ if (fd < 0) {
+ error_setg_errno(errp, errno, "/dev/iommu opening failed");
+ ret = fd;
+ goto out;
+ }
+ be->fd = fd;
+ }
+ be->users++;
+out:
+ trace_iommufd_backend_connect(be->fd, be->owned,
+ be->users, ret);
+ qemu_mutex_unlock(&be->lock);
+ return ret;
+}
+
+void iommufd_backend_disconnect(IOMMUFDBackend *be)
+{
+ qemu_mutex_lock(&be->lock);
+ if (!be->users) {
+ goto out;
+ }
+ be->users--;
+ if (!be->users && be->owned) {
+ close(be->fd);
+ be->fd = -1;
+ }
+out:
+ trace_iommufd_backend_disconnect(be->fd, be->users);
+ qemu_mutex_unlock(&be->lock);
+}
+
+int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
+ Error **errp)
+{
+ int ret, fd = be->fd;
+ struct iommu_ioas_alloc alloc_data = {
+ .size = sizeof(alloc_data),
+ .flags = 0,
+ };
+
+ ret = ioctl(fd, IOMMU_IOAS_ALLOC, &alloc_data);
+ if (ret) {
+ error_setg_errno(errp, errno, "Failed to allocate ioas");
+ return ret;
+ }
+
+ *ioas_id = alloc_data.out_ioas_id;
+ trace_iommufd_backend_alloc_ioas(fd, *ioas_id, ret);
+
+ return ret;
+}
+
+void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id)
+{
+ int ret, fd = be->fd;
+ struct iommu_destroy des = {
+ .size = sizeof(des),
+ .id = id,
+ };
+
+ ret = ioctl(fd, IOMMU_DESTROY, &des);
+ trace_iommufd_backend_free_id(fd, id, ret);
+ if (ret) {
+ error_report("Failed to free id: %u %m", id);
+ }
+}
+
+int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova,
+ ram_addr_t size, void *vaddr, bool readonly)
+{
+ int ret, fd = be->fd;
+ struct iommu_ioas_map map = {
+ .size = sizeof(map),
+ .flags = IOMMU_IOAS_MAP_READABLE |
+ IOMMU_IOAS_MAP_FIXED_IOVA,
+ .ioas_id = ioas_id,
+ .__reserved = 0,
+ .user_va = (uintptr_t)vaddr,
+ .iova = iova,
+ .length = size,
+ };
+
+ if (!readonly) {
+ map.flags |= IOMMU_IOAS_MAP_WRITEABLE;
+ }
+
+ ret = ioctl(fd, IOMMU_IOAS_MAP, &map);
+ trace_iommufd_backend_map_dma(fd, ioas_id, iova, size,
+ vaddr, readonly, ret);
+ if (ret) {
+ ret = -errno;
+
+ /* TODO: Mapping hardware PCI BAR regions is not supported for now. */
+ if (errno == EFAULT) {
+ warn_report("IOMMU_IOAS_MAP failed: %m, PCI BAR?");
+ } else {
+ error_report("IOMMU_IOAS_MAP failed: %m");
+ }
+ }
+ return ret;
+}
+
+int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
+ hwaddr iova, ram_addr_t size)
+{
+ int ret, fd = be->fd;
+ struct iommu_ioas_unmap unmap = {
+ .size = sizeof(unmap),
+ .ioas_id = ioas_id,
+ .iova = iova,
+ .length = size,
+ };
+
+ ret = ioctl(fd, IOMMU_IOAS_UNMAP, &unmap);
+ /*
+ * IOMMUFD treats a mapping as a kind of object, so unmapping a
+ * nonexistent mapping is treated as deleting a nonexistent
+ * object and returns ENOENT. This differs from the legacy
+ * backend, which allows it. A vIOMMU may trigger many
+ * redundant unmaps; to avoid flooding the log, treat them
+ * as success for IOMMUFD, just like the legacy backend.
+ */
+ if (ret && errno == ENOENT) {
+ trace_iommufd_backend_unmap_dma_non_exist(fd, ioas_id, iova, size, ret);
+ ret = 0;
+ } else {
+ trace_iommufd_backend_unmap_dma(fd, ioas_id, iova, size, ret);
+ }
+
+ if (ret) {
+ ret = -errno;
+ error_report("IOMMU_IOAS_UNMAP failed: %m");
+ }
+ return ret;
+}
+
+static const TypeInfo iommufd_backend_info = {
+ .name = TYPE_IOMMUFD_BACKEND,
+ .parent = TYPE_OBJECT,
+ .instance_size = sizeof(IOMMUFDBackend),
+ .instance_init = iommufd_backend_init,
+ .instance_finalize = iommufd_backend_finalize,
+ .class_size = sizeof(IOMMUFDBackendClass),
+ .class_init = iommufd_backend_class_init,
+ .interfaces = (InterfaceInfo[]) {
+ { TYPE_USER_CREATABLE },
+ { }
+ }
+};
+
+static void register_types(void)
+{
+ type_register_static(&iommufd_backend_info);
+}
+
+type_init(register_types);
diff --git a/backends/meson.build b/backends/meson.build
index 914c7c4afb..9a5cea480d 100644
--- a/backends/meson.build
+++ b/backends/meson.build
@@ -20,6 +20,7 @@ if have_vhost_user
system_ss.add(when: 'CONFIG_VIRTIO', if_true: files('vhost-user.c'))
endif
system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost.c'))
+system_ss.add(when: 'CONFIG_IOMMUFD', if_true: files('iommufd.c'))
if have_vhost_user_crypto
system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost-user.c'))
endif
diff --git a/backends/trace-events b/backends/trace-events
index 652eb76a57..d45c6e31a6 100644
--- a/backends/trace-events
+++ b/backends/trace-events
@@ -5,3 +5,13 @@ dbus_vmstate_pre_save(void)
dbus_vmstate_post_load(int version_id) "version_id: %d"
dbus_vmstate_loading(const char *id) "id: %s"
dbus_vmstate_saving(const char *id) "id: %s"
+
+# iommufd.c
+iommufd_backend_connect(int fd, bool owned, uint32_t users, int ret) "fd=%d owned=%d users=%d (%d)"
+iommufd_backend_disconnect(int fd, uint32_t users) "fd=%d users=%d"
+iommu_backend_set_fd(int fd) "pre-opened /dev/iommu fd=%d"
+iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, void *vaddr, bool readonly, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" addr=%p readonly=%d (%d)"
+iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
+iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
+iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d ioas=%d (%d)"
+iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)"
diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
new file mode 100644
index 0000000000..9c5524b0ed
--- /dev/null
+++ b/include/sysemu/iommufd.h
@@ -0,0 +1,38 @@
+#ifndef SYSEMU_IOMMUFD_H
+#define SYSEMU_IOMMUFD_H
+
+#include "qom/object.h"
+#include "qemu/thread.h"
+#include "exec/hwaddr.h"
+#include "exec/cpu-common.h"
+
+#define TYPE_IOMMUFD_BACKEND "iommufd"
+OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, IOMMUFD_BACKEND)
+
+struct IOMMUFDBackendClass {
+ ObjectClass parent_class;
+};
+
+struct IOMMUFDBackend {
+ Object parent;
+
+ /*< protected >*/
+ int fd; /* /dev/iommu file descriptor */
+ bool owned; /* is the /dev/iommu opened internally */
+ QemuMutex lock;
+ uint32_t users;
+
+ /*< public >*/
+};
+
+int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp);
+void iommufd_backend_disconnect(IOMMUFDBackend *be);
+
+int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
+ Error **errp);
+void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id);
+int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova,
+ ram_addr_t size, void *vaddr, bool readonly);
+int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
+ hwaddr iova, ram_addr_t size);
+#endif
diff --git a/qapi/qom.json b/qapi/qom.json
index c53ef978ff..95516ba325 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -794,6 +794,23 @@
{ 'struct': 'VfioUserServerProperties',
'data': { 'socket': 'SocketAddress', 'device': 'str' } }
+##
+# @IOMMUFDProperties:
+#
+# Properties for iommufd objects.
+#
+# @fd: file descriptor name previously passed via 'getfd' command,
+# which represents a pre-opened /dev/iommu. This allows the
+# iommufd object to be shared across several subsystems
+# (VFIO, VDPA, ...), and the file descriptor to be shared
+# with other processes, e.g. DPDK. (default: QEMU opens
+# /dev/iommu by itself)
+#
+# Since: 9.0
+##
+{ 'struct': 'IOMMUFDProperties',
+ 'data': { '*fd': 'str' } }
+
##
# @RngProperties:
#
@@ -934,6 +951,7 @@
'input-barrier',
{ 'name': 'input-linux',
'if': 'CONFIG_LINUX' },
+ 'iommufd',
'iothread',
'main-loop',
{ 'name': 'memory-backend-epc',
@@ -1003,6 +1021,7 @@
'input-barrier': 'InputBarrierProperties',
'input-linux': { 'type': 'InputLinuxProperties',
'if': 'CONFIG_LINUX' },
+ 'iommufd': 'IOMMUFDProperties',
'iothread': 'IothreadProperties',
'main-loop': 'MainLoopProperties',
'memory-backend-epc': { 'type': 'MemoryBackendEpcProperties',
diff --git a/qemu-options.hx b/qemu-options.hx
index 557118cb1f..0814f43066 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -5224,6 +5224,18 @@ SRST
The ``share`` boolean option is on by default with memfd.
+ ``-object iommufd,id=id[,fd=fd]``
+ Creates an iommufd backend which allows control of DMA mapping
+ through the ``/dev/iommu`` device.
+
+ The ``id`` parameter is a unique ID which frontends (such as
+ vfio-pci or vdpa) will use to connect with the iommufd backend.
+
+ The ``fd`` parameter is an optional pre-opened file descriptor
+ resulting from ``/dev/iommu`` opening. Usually the iommufd is shared
+ across all subsystems, bringing the benefit of centralized
+ reference counting.
+
``-object rng-builtin,id=id``
Creates a random number generator backend which obtains entropy
from QEMU builtin functions. The ``id`` parameter is a unique ID
--
2.39.3

@ -1,47 +0,0 @@
From da9a24793e876f6f2727d57f939d882be26a47b8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
Date: Fri, 22 Dec 2023 08:55:23 +0100
Subject: [PATCH 064/101] backends/iommufd: Remove check on number of backend
users
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [63/67] ac4d4589d1f2de5ac3f0adfd8d1f27dbf6bbfdee (eauger1/centos-qemu-kvm)
QOM already has a ref count on objects and it will assert much
earlier, when INT_MAX is reached.
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit c2ab3a6f7411c895e538e8350fee8948ac07c1a0)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
backends/iommufd.c | 5 -----
1 file changed, 5 deletions(-)
diff --git a/backends/iommufd.c b/backends/iommufd.c
index ba58a0eb0d..393c0d9a37 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -80,11 +80,6 @@ int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp)
int fd, ret = 0;
qemu_mutex_lock(&be->lock);
- if (be->users == UINT32_MAX) {
- error_setg(errp, "too many connections");
- ret = -E2BIG;
- goto out;
- }
if (be->owned && !be->users) {
fd = qemu_open_old("/dev/iommu", O_RDWR);
if (fd < 0) {
--
2.39.3

@ -1,112 +0,0 @@
From 92aff3cc1a412de01e9563802fa48848eae5283f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
Date: Thu, 21 Dec 2023 16:58:41 +0100
Subject: [PATCH 065/101] backends/iommufd: Remove mutex
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [64/67] 65518432b18f18ceadafe1b0698cdaa962e84f61 (eauger1/centos-qemu-kvm)
Coverity reports a concurrent data access violation because be->users
is being accessed in iommufd_backend_can_be_deleted() without holding
the mutex.
However, these routines are called from the QEMU main thread when a
device is created. In this case, the code paths should be protected by
the BQL lock and it should be safe to drop the IOMMUFD backend mutex.
Simply remove it.
Fixes: CID 1531550
Fixes: CID 1531549
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 19368b1905b4b917e915526fcbd5bfa3f7439451)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
backends/iommufd.c | 7 -------
include/sysemu/iommufd.h | 2 --
2 files changed, 9 deletions(-)
diff --git a/backends/iommufd.c b/backends/iommufd.c
index 393c0d9a37..1ef683c7b0 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -29,7 +29,6 @@ static void iommufd_backend_init(Object *obj)
be->fd = -1;
be->users = 0;
be->owned = true;
- qemu_mutex_init(&be->lock);
}
static void iommufd_backend_finalize(Object *obj)
@@ -52,10 +51,8 @@ static void iommufd_backend_set_fd(Object *obj, const char *str, Error **errp)
error_prepend(errp, "Could not parse remote object fd %s:", str);
return;
}
- qemu_mutex_lock(&be->lock);
be->fd = fd;
be->owned = false;
- qemu_mutex_unlock(&be->lock);
trace_iommu_backend_set_fd(be->fd);
}
@@ -79,7 +76,6 @@ int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp)
{
int fd, ret = 0;
- qemu_mutex_lock(&be->lock);
if (be->owned && !be->users) {
fd = qemu_open_old("/dev/iommu", O_RDWR);
if (fd < 0) {
@@ -93,13 +89,11 @@ int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp)
out:
trace_iommufd_backend_connect(be->fd, be->owned,
be->users, ret);
- qemu_mutex_unlock(&be->lock);
return ret;
}
void iommufd_backend_disconnect(IOMMUFDBackend *be)
{
- qemu_mutex_lock(&be->lock);
if (!be->users) {
goto out;
}
@@ -110,7 +104,6 @@ void iommufd_backend_disconnect(IOMMUFDBackend *be)
}
out:
trace_iommufd_backend_disconnect(be->fd, be->users);
- qemu_mutex_unlock(&be->lock);
}
int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
index 9c5524b0ed..9af27ebd6c 100644
--- a/include/sysemu/iommufd.h
+++ b/include/sysemu/iommufd.h
@@ -2,7 +2,6 @@
#define SYSEMU_IOMMUFD_H
#include "qom/object.h"
-#include "qemu/thread.h"
#include "exec/hwaddr.h"
#include "exec/cpu-common.h"
@@ -19,7 +18,6 @@ struct IOMMUFDBackend {
/*< protected >*/
int fd; /* /dev/iommu file descriptor */
bool owned; /* is the /dev/iommu opened internally */
- QemuMutex lock;
uint32_t users;
/*< public >*/
--
2.39.3

@ -1,60 +1,52 @@
From 5c35b7d631e9cdf75512b9e1a0b5d48e8fd768d9 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jmaloy@redhat.com>
Date: Wed, 5 Jun 2024 19:56:51 -0400
From 2ee645a339e9ef9cd92620a8b784d18d512326be Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Thu, 25 Apr 2024 14:56:02 +0200
Subject: [PATCH 4/4] block: Parse filenames only when explicitly requested
RH-Author: Jon Maloy <jmaloy@redhat.com>
RH-MergeRequest: 2: EMBARGOED CVE-2024-4467 for rhel-9.4.z (PRDSC)
RH-Jira: https://issues.redhat.com/browse/RHEL-35610
RH-Author: Hana Czenczek <hczenczek@redhat.com>
RH-MergeRequest: 1: CVE 2024-4467 (PRDSC)
RH-Jira: RHEL-35611
RH-CVE: CVE-2024-4467
RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Acked-by: Hanna Czenczek <hczenczek@redhat.com>
RH-Commit: [4/4] 6f71e6a07bd5a9f8352db920f498f5fa5a2cdbfb
commit f44c2941d4419e60f16dea3e9adca164e75aa78d (origin/cve-2024-4467-hreitz-rhel-9.5.0)
Author: Kevin Wolf <kwolf@redhat.com>
Date: Thu Apr 25 14:56:02 2024 +0200
block: Parse filenames only when explicitly requested
When handling image filenames from legacy options such as -drive or from
tools, these filenames are parsed for protocol prefixes, including for
the json:{} pseudo-protocol.
This behaviour is intended for filenames that come directly from the
command line and for backing files, which may come from the image file
itself. Higher level management tools generally take care to verify that
untrusted images don't contain a bad (or any) backing file reference;
'qemu-img info' is a suitable tool for this.
However, for other files that can be referenced in images, such as
qcow2 data files or VMDK extents, the string from the image file is
usually not verified by management tools - and 'qemu-img info' wouldn't
be suitable because in contrast to backing files, it already opens these
other referenced files. So here the string should be interpreted as a
literal local filename. More complex configurations need to be specified
explicitly on the command line or in QMP.
This patch changes bdrv_open_inherit() so that it only parses filenames
if a new parameter parse_filename is true. It is set for the top level
in bdrv_open(), for the file child and for the backing file child. All
other callers pass false and disable filename parsing this way.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Hanna Czenczek <hreitz@redhat.com>
Upstream: N/A, embargoed
Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
RH-Acked-by: Eric Blake <eblake@redhat.com>
RH-Commit: [4/4] f44c2941d4419e60f16dea3e9adca164e75aa78d
When handling image filenames from legacy options such as -drive or from
tools, these filenames are parsed for protocol prefixes, including for
the json:{} pseudo-protocol.
This behaviour is intended for filenames that come directly from the
command line and for backing files, which may come from the image file
itself. Higher level management tools generally take care to verify that
untrusted images don't contain a bad (or any) backing file reference;
'qemu-img info' is a suitable tool for this.
However, for other files that can be referenced in images, such as
qcow2 data files or VMDK extents, the string from the image file is
usually not verified by management tools - and 'qemu-img info' wouldn't
be suitable because in contrast to backing files, it already opens these
other referenced files. So here the string should be interpreted as a
literal local filename. More complex configurations need to be specified
explicitly on the command line or in QMP.
This patch changes bdrv_open_inherit() so that it only parses filenames
if a new parameter parse_filename is true. It is set for the top level
in bdrv_open(), for the file child and for the backing file child. All
other callers pass false and disable filename parsing this way.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Hanna Czenczek <hreitz@redhat.com>
Upstream: N/A, embargoed
Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
---
block.c | 90 ++++++++++++++++++++++++++++++++++++---------------------
1 file changed, 57 insertions(+), 33 deletions(-)
diff --git a/block.c b/block.c
index a097772238..8b6aa4a65c 100644
index 468cf5e67d..50bdd197b7 100644
--- a/block.c
+++ b/block.c
@@ -86,6 +86,7 @@ static BlockDriverState *bdrv_open_inherit(const char *filename,
@ -65,7 +57,7 @@ index a097772238..8b6aa4a65c 100644
Error **errp);
static bool bdrv_recurse_has_child(BlockDriverState *bs,
@@ -2035,7 +2036,8 @@ static void parse_json_protocol(QDict *options, const char **pfilename,
@@ -2058,7 +2059,8 @@ static void parse_json_protocol(QDict *options, const char **pfilename,
* block driver has been specified explicitly.
*/
static int bdrv_fill_options(QDict **options, const char *filename,
@ -75,7 +67,7 @@ index a097772238..8b6aa4a65c 100644
{
const char *drvname;
bool protocol = *flags & BDRV_O_PROTOCOL;
@@ -2077,7 +2079,7 @@ static int bdrv_fill_options(QDict **options, const char *filename,
@@ -2100,7 +2102,7 @@ static int bdrv_fill_options(QDict **options, const char *filename,
if (protocol && filename) {
if (!qdict_haskey(*options, "filename")) {
qdict_put_str(*options, "filename", filename);
@ -84,7 +76,7 @@ index a097772238..8b6aa4a65c 100644
} else {
error_setg(errp, "Can't specify 'file' and 'filename' options at "
"the same time");
@@ -3639,7 +3641,8 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
@@ -3663,7 +3665,8 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
}
backing_hd = bdrv_open_inherit(backing_filename, reference, options, 0, bs,
@ -94,7 +86,7 @@ index a097772238..8b6aa4a65c 100644
if (!backing_hd) {
bs->open_flags |= BDRV_O_NO_BACKING;
error_prepend(errp, "Could not open backing file: ");
@@ -3673,7 +3676,8 @@ free_exit:
@@ -3697,7 +3700,8 @@ free_exit:
static BlockDriverState *
bdrv_open_child_bs(const char *filename, QDict *options, const char *bdref_key,
BlockDriverState *parent, const BdrvChildClass *child_class,
@ -104,7 +96,7 @@ index a097772238..8b6aa4a65c 100644
{
BlockDriverState *bs = NULL;
QDict *image_options;
@@ -3704,7 +3708,8 @@ bdrv_open_child_bs(const char *filename, QDict *options, const char *bdref_key,
@@ -3728,7 +3732,8 @@ bdrv_open_child_bs(const char *filename, QDict *options, const char *bdref_key,
}
bs = bdrv_open_inherit(filename, reference, image_options, 0,
@ -114,7 +106,7 @@ index a097772238..8b6aa4a65c 100644
if (!bs) {
goto done;
}
@@ -3714,6 +3719,33 @@ done:
@@ -3738,6 +3743,33 @@ done:
return bs;
}
@ -148,7 +140,7 @@ index a097772238..8b6aa4a65c 100644
/*
* Opens a disk image whose options are given as BlockdevRef in another block
* device's options.
@@ -3737,27 +3769,15 @@ BdrvChild *bdrv_open_child(const char *filename,
@@ -3761,27 +3793,15 @@ BdrvChild *bdrv_open_child(const char *filename,
BdrvChildRole child_role,
bool allow_none, Error **errp)
{
@ -182,7 +174,7 @@ index a097772238..8b6aa4a65c 100644
*
* @parent can move to a different AioContext in this function.
*/
@@ -3772,8 +3792,8 @@ int bdrv_open_file_child(const char *filename,
@@ -3796,8 +3816,8 @@ int bdrv_open_file_child(const char *filename,
role = parent->drv->is_filter ?
(BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY) : BDRV_CHILD_IMAGE;
@ -193,7 +185,7 @@ index a097772238..8b6aa4a65c 100644
{
return -EINVAL;
}
@@ -3818,7 +3838,8 @@ BlockDriverState *bdrv_open_blockdev_ref(BlockdevRef *ref, Error **errp)
@@ -3842,7 +3862,8 @@ BlockDriverState *bdrv_open_blockdev_ref(BlockdevRef *ref, Error **errp)
}
@ -203,7 +195,7 @@ index a097772238..8b6aa4a65c 100644
obj = NULL;
qobject_unref(obj);
visit_free(v);
@@ -3907,7 +3928,7 @@ static BlockDriverState * no_coroutine_fn
@@ -3932,7 +3953,7 @@ static BlockDriverState * no_coroutine_fn
bdrv_open_inherit(const char *filename, const char *reference, QDict *options,
int flags, BlockDriverState *parent,
const BdrvChildClass *child_class, BdrvChildRole child_role,
@ -212,7 +204,7 @@ index a097772238..8b6aa4a65c 100644
{
int ret;
BlockBackend *file = NULL;
@@ -3955,9 +3976,11 @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options,
@@ -3980,9 +4001,11 @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options,
}
/* json: syntax counts as explicit options, as if in the QDict */
@ -227,7 +219,7 @@ index a097772238..8b6aa4a65c 100644
}
bs->explicit_options = qdict_clone_shallow(options);
@@ -3982,7 +4005,8 @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options,
@@ -4007,7 +4030,8 @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options,
parent->open_flags, parent->options);
}
@ -237,7 +229,7 @@ index a097772238..8b6aa4a65c 100644
if (ret < 0) {
goto fail;
}
@@ -4051,7 +4075,7 @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options,
@@ -4076,7 +4100,7 @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options,
file_bs = bdrv_open_child_bs(filename, options, "file", bs,
&child_of_bds, BDRV_CHILD_IMAGE,
@ -246,7 +238,7 @@ index a097772238..8b6aa4a65c 100644
if (local_err) {
goto fail;
}
@@ -4200,7 +4224,7 @@ BlockDriverState *bdrv_open(const char *filename, const char *reference,
@@ -4225,7 +4249,7 @@ BlockDriverState *bdrv_open(const char *filename, const char *reference,
GLOBAL_STATE_CODE();
return bdrv_open_inherit(filename, reference, options, flags, NULL,

@ -1,104 +0,0 @@
From afa842e9fdf6e1d6e5d5785679a22779632142bd Mon Sep 17 00:00:00 2001
From: Hanna Czenczek <hreitz@redhat.com>
Date: Fri, 2 Feb 2024 15:47:54 +0100
Subject: [PATCH 03/22] block-backend: Allow concurrent context changes
RH-Author: Hanna Czenczek <hreitz@redhat.com>
RH-MergeRequest: 222: Allow concurrent BlockBackend context changes
RH-Jira: RHEL-24593
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
RH-Commit: [1/2] 9e1b535f60f7afa94a0817dc3e71136e41631c71 (hreitz/qemu-kvm-c-9-s)
Since AioContext locks have been removed, a BlockBackend's AioContext
may really change at any time (only exception is that it is often
confined to a drained section, as noted in this patch). Therefore,
blk_get_aio_context() cannot rely on its root node's context always
matching that of the BlockBackend.
In practice, whether they match does not matter anymore anyway: Requests
can be sent to BDSs from any context, so anyone who requests the BB's
context should have no reason to require the root node to have the same
context. Therefore, we can and should remove the assertion to that
effect.
In addition, because the context can be set and queried from different
threads concurrently, it has to be accessed with atomic operations.
Buglink: https://issues.redhat.com/browse/RHEL-19381
Suggested-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
Message-ID: <20240202144755.671354-2-hreitz@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
(cherry picked from commit ad893672027ffe26db498947d70cde6d4f58a111)
---
block/block-backend.c | 22 +++++++++++-----------
1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/block/block-backend.c b/block/block-backend.c
index 209eb07528..9c4de79e6b 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -44,7 +44,7 @@ struct BlockBackend {
char *name;
int refcnt;
BdrvChild *root;
- AioContext *ctx;
+ AioContext *ctx; /* access with atomic operations only */
DriveInfo *legacy_dinfo; /* null unless created by drive_new() */
QTAILQ_ENTRY(BlockBackend) link; /* for block_backends */
QTAILQ_ENTRY(BlockBackend) monitor_link; /* for monitor_block_backends */
@@ -2414,22 +2414,22 @@ void blk_op_unblock_all(BlockBackend *blk, Error *reason)
}
}
+/**
+ * Return BB's current AioContext. Note that this context may change
+ * concurrently at any time, with one exception: If the BB has a root node
+ * attached, its context will only change through bdrv_try_change_aio_context(),
+ * which creates a drained section. Therefore, incrementing such a BB's
+ * in-flight counter will prevent its context from changing.
+ */
AioContext *blk_get_aio_context(BlockBackend *blk)
{
- BlockDriverState *bs;
IO_CODE();
if (!blk) {
return qemu_get_aio_context();
}
- bs = blk_bs(blk);
- if (bs) {
- AioContext *ctx = bdrv_get_aio_context(blk_bs(blk));
- assert(ctx == blk->ctx);
- }
-
- return blk->ctx;
+ return qatomic_read(&blk->ctx);
}
int blk_set_aio_context(BlockBackend *blk, AioContext *new_context,
@@ -2442,7 +2442,7 @@ int blk_set_aio_context(BlockBackend *blk, AioContext *new_context,
GLOBAL_STATE_CODE();
if (!bs) {
- blk->ctx = new_context;
+ qatomic_set(&blk->ctx, new_context);
return 0;
}
@@ -2471,7 +2471,7 @@ static void blk_root_set_aio_ctx_commit(void *opaque)
AioContext *new_context = s->new_ctx;
ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
- blk->ctx = new_context;
+ qatomic_set(&blk->ctx, new_context);
if (tgm->throttle_state) {
throttle_group_detach_aio_context(tgm);
throttle_group_attach_aio_context(tgm, new_context);
--
2.39.3

@ -1,69 +0,0 @@
From b1a68aebadecd7d339cf5eaffeda15099c998528 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Tue, 12 Sep 2023 19:10:37 -0400
Subject: [PATCH 095/101] block-coroutine-wrapper: use
qemu_get_current_aio_context()
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [26/26] cde767bcdc626e90721792e3889952057a548ac5 (kmwolf/centos-qemu-kvm)
Use qemu_get_current_aio_context() in mixed wrappers and coroutine
wrappers so that code runs in the caller's AioContext instead of moving
to the BlockDriverState's AioContext. This change is necessary for the
multi-queue block layer where any thread can call into the block layer.
Most wrappers are IO_CODE where it's safe to use the current AioContext
nowadays. BlockDrivers and the core block layer use their own locks and
no longer depend on the AioContext lock for thread-safety.
The bdrv_create() wrapper invokes GLOBAL_STATE code. Using the current
AioContext is safe because this code is only called with the BQL held
from the main loop thread.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20230912231037.826804-6-stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
scripts/block-coroutine-wrapper.py | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/scripts/block-coroutine-wrapper.py b/scripts/block-coroutine-wrapper.py
index c9c09fcacd..dbbde99e39 100644
--- a/scripts/block-coroutine-wrapper.py
+++ b/scripts/block-coroutine-wrapper.py
@@ -92,8 +92,6 @@ def __init__(self, wrapper_type: str, return_type: str, name: str,
f"{self.name}")
self.target_name = f'{subsystem}_{subname}'
- self.ctx = self.gen_ctx()
-
self.get_result = 's->ret = '
self.ret = 'return s.ret;'
self.co_ret = 'return '
@@ -167,7 +165,7 @@ def create_mixed_wrapper(func: FuncDecl) -> str:
{func.co_ret}{name}({ func.gen_list('{name}') });
}} else {{
{struct_name} s = {{
- .poll_state.ctx = {func.ctx},
+ .poll_state.ctx = qemu_get_current_aio_context(),
.poll_state.in_progress = true,
{ func.gen_block(' .{name} = {name},') }
@@ -191,7 +189,7 @@ def create_co_wrapper(func: FuncDecl) -> str:
{func.return_type} {func.name}({ func.gen_list('{decl}') })
{{
{struct_name} s = {{
- .poll_state.ctx = {func.ctx},
+ .poll_state.ctx = qemu_get_current_aio_context(),
.poll_state.in_progress = true,
{ func.gen_block(' .{name} = {name},') }
--
2.39.3

@ -0,0 +1,330 @@
From a67edfb4b591acdffc5b4987601a30224376996f Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Mon, 27 May 2024 11:58:50 -0400
Subject: [PATCH 4/5] block/crypto: create ciphers on demand
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 251: block/crypto: create ciphers on demand
RH-Jira: RHEL-36159
RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [1/2] 22a4c87fef774cad98a6f5a79f27df50a208013d (stefanha/centos-stream-qemu-kvm)
Ciphers are pre-allocated by qcrypto_block_init_cipher() depending on
the given number of threads. The -device
virtio-blk-pci,iothread-vq-mapping= feature allows users to assign
multiple IOThreads to a virtio-blk device, but the association between
the virtio-blk device and the block driver happens after the block
driver is already open.
When the number of threads given to qcrypto_block_init_cipher() is
smaller than the actual number of threads at runtime, the
block->n_free_ciphers > 0 assertion in qcrypto_block_pop_cipher() can
fail.
Get rid of qcrypto_block_init_cipher()'s n_threads argument and allocate
ciphers on demand.
Reported-by: Qing Wang <qinwang@redhat.com>
Buglink: https://issues.redhat.com/browse/RHEL-36159
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20240527155851.892885-2-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Acked-by: Daniel P. Berrangé <berrange@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
(cherry picked from commit af206c284e4c1b17cdfb0f17e898b288c0fc1751)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
crypto/block-luks.c | 3 +-
crypto/block-qcow.c | 2 +-
crypto/block.c | 111 ++++++++++++++++++++++++++------------------
crypto/blockpriv.h | 12 +++--
4 files changed, 78 insertions(+), 50 deletions(-)
diff --git a/crypto/block-luks.c b/crypto/block-luks.c
index 3ee928fb5a..3357852c0a 100644
--- a/crypto/block-luks.c
+++ b/crypto/block-luks.c
@@ -1262,7 +1262,6 @@ qcrypto_block_luks_open(QCryptoBlock *block,
luks->cipher_mode,
masterkey,
luks->header.master_key_len,
- n_threads,
errp) < 0) {
goto fail;
}
@@ -1456,7 +1455,7 @@ qcrypto_block_luks_create(QCryptoBlock *block,
/* Setup the block device payload encryption objects */
if (qcrypto_block_init_cipher(block, luks_opts.cipher_alg,
luks_opts.cipher_mode, masterkey,
- luks->header.master_key_len, 1, errp) < 0) {
+ luks->header.master_key_len, errp) < 0) {
goto error;
}
diff --git a/crypto/block-qcow.c b/crypto/block-qcow.c
index 4d7cf36a8f..02305058e3 100644
--- a/crypto/block-qcow.c
+++ b/crypto/block-qcow.c
@@ -75,7 +75,7 @@ qcrypto_block_qcow_init(QCryptoBlock *block,
ret = qcrypto_block_init_cipher(block, QCRYPTO_CIPHER_ALG_AES_128,
QCRYPTO_CIPHER_MODE_CBC,
keybuf, G_N_ELEMENTS(keybuf),
- n_threads, errp);
+ errp);
if (ret < 0) {
ret = -ENOTSUP;
goto fail;
diff --git a/crypto/block.c b/crypto/block.c
index 506ea1d1a3..ba6d1cebc7 100644
--- a/crypto/block.c
+++ b/crypto/block.c
@@ -20,6 +20,7 @@
#include "qemu/osdep.h"
#include "qapi/error.h"
+#include "qemu/lockable.h"
#include "blockpriv.h"
#include "block-qcow.h"
#include "block-luks.h"
@@ -57,6 +58,8 @@ QCryptoBlock *qcrypto_block_open(QCryptoBlockOpenOptions *options,
{
QCryptoBlock *block = g_new0(QCryptoBlock, 1);
+ qemu_mutex_init(&block->mutex);
+
block->format = options->format;
if (options->format >= G_N_ELEMENTS(qcrypto_block_drivers) ||
@@ -76,8 +79,6 @@ QCryptoBlock *qcrypto_block_open(QCryptoBlockOpenOptions *options,
return NULL;
}
- qemu_mutex_init(&block->mutex);
-
return block;
}
@@ -92,6 +93,8 @@ QCryptoBlock *qcrypto_block_create(QCryptoBlockCreateOptions *options,
{
QCryptoBlock *block = g_new0(QCryptoBlock, 1);
+ qemu_mutex_init(&block->mutex);
+
block->format = options->format;
if (options->format >= G_N_ELEMENTS(qcrypto_block_drivers) ||
@@ -111,8 +114,6 @@ QCryptoBlock *qcrypto_block_create(QCryptoBlockCreateOptions *options,
return NULL;
}
- qemu_mutex_init(&block->mutex);
-
return block;
}
@@ -227,37 +228,42 @@ QCryptoCipher *qcrypto_block_get_cipher(QCryptoBlock *block)
* This function is used only in test with one thread (it's safe to skip
* pop/push interface), so it's enough to assert it here:
*/
- assert(block->n_ciphers <= 1);
- return block->ciphers ? block->ciphers[0] : NULL;
+ assert(block->max_free_ciphers <= 1);
+ return block->free_ciphers ? block->free_ciphers[0] : NULL;
}
-static QCryptoCipher *qcrypto_block_pop_cipher(QCryptoBlock *block)
+static QCryptoCipher *qcrypto_block_pop_cipher(QCryptoBlock *block,
+ Error **errp)
{
- QCryptoCipher *cipher;
-
- qemu_mutex_lock(&block->mutex);
-
- assert(block->n_free_ciphers > 0);
- block->n_free_ciphers--;
- cipher = block->ciphers[block->n_free_ciphers];
-
- qemu_mutex_unlock(&block->mutex);
+ /* Usually there is a free cipher available */
+ WITH_QEMU_LOCK_GUARD(&block->mutex) {
+ if (block->n_free_ciphers > 0) {
+ block->n_free_ciphers--;
+ return block->free_ciphers[block->n_free_ciphers];
+ }
+ }
- return cipher;
+ /* Otherwise allocate a new cipher */
+ return qcrypto_cipher_new(block->alg, block->mode, block->key,
+ block->nkey, errp);
}
static void qcrypto_block_push_cipher(QCryptoBlock *block,
QCryptoCipher *cipher)
{
- qemu_mutex_lock(&block->mutex);
+ QEMU_LOCK_GUARD(&block->mutex);
- assert(block->n_free_ciphers < block->n_ciphers);
- block->ciphers[block->n_free_ciphers] = cipher;
- block->n_free_ciphers++;
+ if (block->n_free_ciphers == block->max_free_ciphers) {
+ block->max_free_ciphers++;
+ block->free_ciphers = g_renew(QCryptoCipher *,
+ block->free_ciphers,
+ block->max_free_ciphers);
+ }
- qemu_mutex_unlock(&block->mutex);
+ block->free_ciphers[block->n_free_ciphers] = cipher;
+ block->n_free_ciphers++;
}
@@ -265,24 +271,31 @@ int qcrypto_block_init_cipher(QCryptoBlock *block,
QCryptoCipherAlgorithm alg,
QCryptoCipherMode mode,
const uint8_t *key, size_t nkey,
- size_t n_threads, Error **errp)
+ Error **errp)
{
- size_t i;
+ QCryptoCipher *cipher;
- assert(!block->ciphers && !block->n_ciphers && !block->n_free_ciphers);
+ assert(!block->free_ciphers && !block->max_free_ciphers &&
+ !block->n_free_ciphers);
- block->ciphers = g_new0(QCryptoCipher *, n_threads);
+ /* Stash away cipher parameters for qcrypto_block_pop_cipher() */
+ block->alg = alg;
+ block->mode = mode;
+ block->key = g_memdup2(key, nkey);
+ block->nkey = nkey;
- for (i = 0; i < n_threads; i++) {
- block->ciphers[i] = qcrypto_cipher_new(alg, mode, key, nkey, errp);
- if (!block->ciphers[i]) {
- qcrypto_block_free_cipher(block);
- return -1;
- }
- block->n_ciphers++;
- block->n_free_ciphers++;
+ /*
+ * Create a new cipher to validate the parameters now. This reduces the
+ * chance of cipher creation failing at I/O time.
+ */
+ cipher = qcrypto_block_pop_cipher(block, errp);
+ if (!cipher) {
+ g_free(block->key);
+ block->key = NULL;
+ return -1;
}
+ qcrypto_block_push_cipher(block, cipher);
return 0;
}
@@ -291,19 +304,23 @@ void qcrypto_block_free_cipher(QCryptoBlock *block)
{
size_t i;
- if (!block->ciphers) {
+ g_free(block->key);
+ block->key = NULL;
+
+ if (!block->free_ciphers) {
return;
}
- assert(block->n_ciphers == block->n_free_ciphers);
+ /* All popped ciphers were eventually pushed back */
+ assert(block->n_free_ciphers == block->max_free_ciphers);
- for (i = 0; i < block->n_ciphers; i++) {
- qcrypto_cipher_free(block->ciphers[i]);
+ for (i = 0; i < block->max_free_ciphers; i++) {
+ qcrypto_cipher_free(block->free_ciphers[i]);
}
- g_free(block->ciphers);
- block->ciphers = NULL;
- block->n_ciphers = block->n_free_ciphers = 0;
+ g_free(block->free_ciphers);
+ block->free_ciphers = NULL;
+ block->max_free_ciphers = block->n_free_ciphers = 0;
}
QCryptoIVGen *qcrypto_block_get_ivgen(QCryptoBlock *block)
@@ -311,7 +328,7 @@ QCryptoIVGen *qcrypto_block_get_ivgen(QCryptoBlock *block)
/* ivgen should be accessed under mutex. However, this function is used only
* in test with one thread, so it's enough to assert it here:
*/
- assert(block->n_ciphers <= 1);
+ assert(block->max_free_ciphers <= 1);
return block->ivgen;
}
@@ -446,7 +463,10 @@ int qcrypto_block_decrypt_helper(QCryptoBlock *block,
Error **errp)
{
int ret;
- QCryptoCipher *cipher = qcrypto_block_pop_cipher(block);
+ QCryptoCipher *cipher = qcrypto_block_pop_cipher(block, errp);
+ if (!cipher) {
+ return -1;
+ }
ret = do_qcrypto_block_cipher_encdec(cipher, block->niv, block->ivgen,
&block->mutex, sectorsize, offset, buf,
@@ -465,7 +485,10 @@ int qcrypto_block_encrypt_helper(QCryptoBlock *block,
Error **errp)
{
int ret;
- QCryptoCipher *cipher = qcrypto_block_pop_cipher(block);
+ QCryptoCipher *cipher = qcrypto_block_pop_cipher(block, errp);
+ if (!cipher) {
+ return -1;
+ }
ret = do_qcrypto_block_cipher_encdec(cipher, block->niv, block->ivgen,
&block->mutex, sectorsize, offset, buf,
diff --git a/crypto/blockpriv.h b/crypto/blockpriv.h
index 836f3b4726..4bf6043d5d 100644
--- a/crypto/blockpriv.h
+++ b/crypto/blockpriv.h
@@ -32,8 +32,14 @@ struct QCryptoBlock {
const QCryptoBlockDriver *driver;
void *opaque;
- QCryptoCipher **ciphers;
- size_t n_ciphers;
+ /* Cipher parameters */
+ QCryptoCipherAlgorithm alg;
+ QCryptoCipherMode mode;
+ uint8_t *key;
+ size_t nkey;
+
+ QCryptoCipher **free_ciphers;
+ size_t max_free_ciphers;
size_t n_free_ciphers;
QCryptoIVGen *ivgen;
QemuMutex mutex;
@@ -130,7 +136,7 @@ int qcrypto_block_init_cipher(QCryptoBlock *block,
QCryptoCipherAlgorithm alg,
QCryptoCipherMode mode,
const uint8_t *key, size_t nkey,
- size_t n_threads, Error **errp);
+ Error **errp);
void qcrypto_block_free_cipher(QCryptoBlock *block);
--
2.39.3

@ -1,217 +0,0 @@
From 25cce5df341861e8ba8ec57722558e2dee3ce56a Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Thu, 14 Sep 2023 10:00:58 -0400
Subject: [PATCH 073/101] block/file-posix: set up Linux AIO and io_uring in
the current thread
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [4/26] 74c7daf805daefe706378308c3afeb28d861164b (kmwolf/centos-qemu-kvm)
The file-posix block driver currently only sets up Linux AIO and
io_uring in the BDS's AioContext. In the multi-queue block layer we must
be able to submit I/O requests in AioContexts that do not have Linux AIO
and io_uring set up yet since any thread can call into the block driver.
Set up Linux AIO and io_uring for the current AioContext during request
submission. We lose the ability to return an error from
.bdrv_file_open() when Linux AIO and io_uring setup fails (e.g. due to
resource limits). Instead the user only gets warnings and we fall back
to aio=threads. This is still better than a fatal error after startup.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20230914140101.1065008-2-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
block/file-posix.c | 103 ++++++++++++++++++++++-----------------------
1 file changed, 51 insertions(+), 52 deletions(-)
diff --git a/block/file-posix.c b/block/file-posix.c
index b862406c71..35684f7e21 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -712,17 +712,11 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
#ifdef CONFIG_LINUX_AIO
/* Currently Linux does AIO only for files opened with O_DIRECT */
- if (s->use_linux_aio) {
- if (!(s->open_flags & O_DIRECT)) {
- error_setg(errp, "aio=native was specified, but it requires "
- "cache.direct=on, which was not specified.");
- ret = -EINVAL;
- goto fail;
- }
- if (!aio_setup_linux_aio(bdrv_get_aio_context(bs), errp)) {
- error_prepend(errp, "Unable to use native AIO: ");
- goto fail;
- }
+ if (s->use_linux_aio && !(s->open_flags & O_DIRECT)) {
+ error_setg(errp, "aio=native was specified, but it requires "
+ "cache.direct=on, which was not specified.");
+ ret = -EINVAL;
+ goto fail;
}
#else
if (s->use_linux_aio) {
@@ -733,14 +727,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
}
#endif /* !defined(CONFIG_LINUX_AIO) */
-#ifdef CONFIG_LINUX_IO_URING
- if (s->use_linux_io_uring) {
- if (!aio_setup_linux_io_uring(bdrv_get_aio_context(bs), errp)) {
- error_prepend(errp, "Unable to use io_uring: ");
- goto fail;
- }
- }
-#else
+#ifndef CONFIG_LINUX_IO_URING
if (s->use_linux_io_uring) {
error_setg(errp, "aio=io_uring was specified, but is not supported "
"in this build.");
@@ -2444,6 +2431,48 @@ static bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
return true;
}
+#ifdef CONFIG_LINUX_IO_URING
+static inline bool raw_check_linux_io_uring(BDRVRawState *s)
+{
+ Error *local_err = NULL;
+ AioContext *ctx;
+
+ if (!s->use_linux_io_uring) {
+ return false;
+ }
+
+ ctx = qemu_get_current_aio_context();
+ if (unlikely(!aio_setup_linux_io_uring(ctx, &local_err))) {
+ error_reportf_err(local_err, "Unable to use linux io_uring, "
+ "falling back to thread pool: ");
+ s->use_linux_io_uring = false;
+ return false;
+ }
+ return true;
+}
+#endif
+
+#ifdef CONFIG_LINUX_AIO
+static inline bool raw_check_linux_aio(BDRVRawState *s)
+{
+ Error *local_err = NULL;
+ AioContext *ctx;
+
+ if (!s->use_linux_aio) {
+ return false;
+ }
+
+ ctx = qemu_get_current_aio_context();
+ if (unlikely(!aio_setup_linux_aio(ctx, &local_err))) {
+ error_reportf_err(local_err, "Unable to use Linux AIO, "
+ "falling back to thread pool: ");
+ s->use_linux_aio = false;
+ return false;
+ }
+ return true;
+}
+#endif
+
static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
uint64_t bytes, QEMUIOVector *qiov, int type)
{
@@ -2474,13 +2503,13 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) {
type |= QEMU_AIO_MISALIGNED;
#ifdef CONFIG_LINUX_IO_URING
- } else if (s->use_linux_io_uring) {
+ } else if (raw_check_linux_io_uring(s)) {
assert(qiov->size == bytes);
ret = luring_co_submit(bs, s->fd, offset, qiov, type);
goto out;
#endif
#ifdef CONFIG_LINUX_AIO
- } else if (s->use_linux_aio) {
+ } else if (raw_check_linux_aio(s)) {
assert(qiov->size == bytes);
ret = laio_co_submit(s->fd, offset, qiov, type,
s->aio_max_batch);
@@ -2567,39 +2596,13 @@ static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
};
#ifdef CONFIG_LINUX_IO_URING
- if (s->use_linux_io_uring) {
+ if (raw_check_linux_io_uring(s)) {
return luring_co_submit(bs, s->fd, 0, NULL, QEMU_AIO_FLUSH);
}
#endif
return raw_thread_pool_submit(handle_aiocb_flush, &acb);
}
-static void raw_aio_attach_aio_context(BlockDriverState *bs,
- AioContext *new_context)
-{
- BDRVRawState __attribute__((unused)) *s = bs->opaque;
-#ifdef CONFIG_LINUX_AIO
- if (s->use_linux_aio) {
- Error *local_err = NULL;
- if (!aio_setup_linux_aio(new_context, &local_err)) {
- error_reportf_err(local_err, "Unable to use native AIO, "
- "falling back to thread pool: ");
- s->use_linux_aio = false;
- }
- }
-#endif
-#ifdef CONFIG_LINUX_IO_URING
- if (s->use_linux_io_uring) {
- Error *local_err = NULL;
- if (!aio_setup_linux_io_uring(new_context, &local_err)) {
- error_reportf_err(local_err, "Unable to use linux io_uring, "
- "falling back to thread pool: ");
- s->use_linux_io_uring = false;
- }
- }
-#endif
-}
-
static void raw_close(BlockDriverState *bs)
{
BDRVRawState *s = bs->opaque;
@@ -3896,7 +3899,6 @@ BlockDriver bdrv_file = {
.bdrv_co_copy_range_from = raw_co_copy_range_from,
.bdrv_co_copy_range_to = raw_co_copy_range_to,
.bdrv_refresh_limits = raw_refresh_limits,
- .bdrv_attach_aio_context = raw_aio_attach_aio_context,
.bdrv_co_truncate = raw_co_truncate,
.bdrv_co_getlength = raw_co_getlength,
@@ -4266,7 +4268,6 @@ static BlockDriver bdrv_host_device = {
.bdrv_co_copy_range_from = raw_co_copy_range_from,
.bdrv_co_copy_range_to = raw_co_copy_range_to,
.bdrv_refresh_limits = raw_refresh_limits,
- .bdrv_attach_aio_context = raw_aio_attach_aio_context,
.bdrv_co_truncate = raw_co_truncate,
.bdrv_co_getlength = raw_co_getlength,
@@ -4402,7 +4403,6 @@ static BlockDriver bdrv_host_cdrom = {
.bdrv_co_pwritev = raw_co_pwritev,
.bdrv_co_flush_to_disk = raw_co_flush_to_disk,
.bdrv_refresh_limits = cdrom_refresh_limits,
- .bdrv_attach_aio_context = raw_aio_attach_aio_context,
.bdrv_co_truncate = raw_co_truncate,
.bdrv_co_getlength = raw_co_getlength,
@@ -4528,7 +4528,6 @@ static BlockDriver bdrv_host_cdrom = {
.bdrv_co_pwritev = raw_co_pwritev,
.bdrv_co_flush_to_disk = raw_co_flush_to_disk,
.bdrv_refresh_limits = cdrom_refresh_limits,
- .bdrv_attach_aio_context = raw_aio_attach_aio_context,
.bdrv_co_truncate = raw_co_truncate,
.bdrv_co_getlength = raw_co_getlength,
--
2.39.3

File diff suppressed because it is too large Load Diff

@ -1,97 +0,0 @@
From d0514c7d5d6cc1aa140119c95d5ea2c1591b01e9 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Tue, 5 Dec 2023 13:20:04 -0500
Subject: [PATCH 087/101] block: remove bdrv_co_lock()
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [18/26] a303f861ea5e84d8e89fd51e530fd0cb2da17b89 (kmwolf/centos-qemu-kvm)
The bdrv_co_lock() and bdrv_co_unlock() functions are already no-ops.
Remove them.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20231205182011.1976568-8-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
block.c | 10 ----------
blockdev.c | 5 -----
include/block/block-global-state.h | 14 --------------
3 files changed, 29 deletions(-)
diff --git a/block.c b/block.c
index 91ace5d2d5..434b7f4d72 100644
--- a/block.c
+++ b/block.c
@@ -7431,16 +7431,6 @@ void coroutine_fn bdrv_co_leave(BlockDriverState *bs, AioContext *old_ctx)
bdrv_dec_in_flight(bs);
}
-void coroutine_fn bdrv_co_lock(BlockDriverState *bs)
-{
- /* TODO removed in next patch */
-}
-
-void coroutine_fn bdrv_co_unlock(BlockDriverState *bs)
-{
- /* TODO removed in next patch */
-}
-
static void bdrv_do_remove_aio_context_notifier(BdrvAioNotifier *ban)
{
GLOBAL_STATE_CODE();
diff --git a/blockdev.c b/blockdev.c
index 5d8b3a23eb..3a5e7222ec 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -2264,18 +2264,13 @@ void coroutine_fn qmp_block_resize(const char *device, const char *node_name,
return;
}
- bdrv_co_lock(bs);
bdrv_drained_begin(bs);
- bdrv_co_unlock(bs);
old_ctx = bdrv_co_enter(bs);
blk_co_truncate(blk, size, false, PREALLOC_MODE_OFF, 0, errp);
bdrv_co_leave(bs, old_ctx);
- bdrv_co_lock(bs);
bdrv_drained_end(bs);
- bdrv_co_unlock(bs);
-
blk_co_unref(blk);
}
diff --git a/include/block/block-global-state.h b/include/block/block-global-state.h
index 0327f1c605..4ec0b217f0 100644
--- a/include/block/block-global-state.h
+++ b/include/block/block-global-state.h
@@ -267,20 +267,6 @@ int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag);
int bdrv_debug_resume(BlockDriverState *bs, const char *tag);
bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag);
-/**
- * Locks the AioContext of @bs if it's not the current AioContext. This avoids
- * double locking which could lead to deadlocks: This is a coroutine_fn, so we
- * know we already own the lock of the current AioContext.
- *
- * May only be called in the main thread.
- */
-void coroutine_fn bdrv_co_lock(BlockDriverState *bs);
-
-/**
- * Unlocks the AioContext of @bs if it's not the current AioContext.
- */
-void coroutine_fn bdrv_co_unlock(BlockDriverState *bs);
-
bool bdrv_child_change_aio_context(BdrvChild *c, AioContext *ctx,
GHashTable *visited, Transaction *tran,
Error **errp);
--
2.39.3

@ -1,411 +0,0 @@
From dc4eb64185957a01948217814478abc450ce5f26 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Tue, 5 Dec 2023 13:20:11 -0500
Subject: [PATCH 094/101] block: remove outdated AioContext locking comments
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [25/26] 395e18fb40d28d4bc961acee1a00da7f60748076 (kmwolf/centos-qemu-kvm)
The AioContext lock no longer exists.
There is one noteworthy change:
- * More specifically, these functions use BDRV_POLL_WHILE(bs), which
- * requires the caller to be either in the main thread and hold
- * the BlockdriverState (bs) AioContext lock, or directly in the
- * home thread that runs the bs AioContext. Calling them from
- * another thread in another AioContext would cause deadlocks.
+ * More specifically, these functions use BDRV_POLL_WHILE(bs), which requires
+ * the caller to be either in the main thread or directly in the home thread
+ * that runs the bs AioContext. Calling them from another thread in another
+ * AioContext would cause deadlocks.
I am not sure whether deadlocks are still possible. Maybe they have just
moved to the fine-grained locks that have replaced the AioContext lock.
Since I am not sure if the deadlocks are gone, I have kept the substance
unchanged and just removed mention of the AioContext lock.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-ID: <20231205182011.1976568-15-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
block.c | 73 ++++++----------------------
block/block-backend.c | 8 ---
block/export/vhost-user-blk-server.c | 4 --
include/block/block-common.h | 3 --
include/block/block-io.h | 9 ++--
include/block/block_int-common.h | 2 -
tests/qemu-iotests/202 | 2 +-
tests/qemu-iotests/203 | 3 +-
8 files changed, 22 insertions(+), 82 deletions(-)
diff --git a/block.c b/block.c
index 434b7f4d72..a097772238 100644
--- a/block.c
+++ b/block.c
@@ -1616,11 +1616,6 @@ out:
g_free(gen_node_name);
}
-/*
- * The caller must always hold @bs AioContext lock, because this function calls
- * bdrv_refresh_total_sectors() which polls when called from non-coroutine
- * context.
- */
static int no_coroutine_fn GRAPH_UNLOCKED
bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, const char *node_name,
QDict *options, int open_flags, Error **errp)
@@ -2901,7 +2896,7 @@ uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm)
* Replaces the node that a BdrvChild points to without updating permissions.
*
* If @new_bs is non-NULL, the parent of @child must already be drained through
- * @child and the caller must hold the AioContext lock for @new_bs.
+ * @child.
*/
static void GRAPH_WRLOCK
bdrv_replace_child_noperm(BdrvChild *child, BlockDriverState *new_bs)
@@ -3041,9 +3036,8 @@ static TransactionActionDrv bdrv_attach_child_common_drv = {
*
* Returns new created child.
*
- * The caller must hold the AioContext lock for @child_bs. Both @parent_bs and
- * @child_bs can move to a different AioContext in this function. Callers must
- * make sure that their AioContext locking is still correct after this.
+ * Both @parent_bs and @child_bs can move to a different AioContext in this
+ * function.
*/
static BdrvChild * GRAPH_WRLOCK
bdrv_attach_child_common(BlockDriverState *child_bs,
@@ -3142,9 +3136,8 @@ bdrv_attach_child_common(BlockDriverState *child_bs,
/*
* Function doesn't update permissions, caller is responsible for this.
*
- * The caller must hold the AioContext lock for @child_bs. Both @parent_bs and
- * @child_bs can move to a different AioContext in this function. Callers must
- * make sure that their AioContext locking is still correct after this.
+ * Both @parent_bs and @child_bs can move to a different AioContext in this
+ * function.
*
* After calling this function, the transaction @tran may only be completed
* while holding a writer lock for the graph.
@@ -3184,9 +3177,6 @@ bdrv_attach_child_noperm(BlockDriverState *parent_bs,
*
* On failure NULL is returned, errp is set and the reference to
* child_bs is also dropped.
- *
- * The caller must hold the AioContext lock @child_bs, but not that of @ctx
- * (unless @child_bs is already in @ctx).
*/
BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
const char *child_name,
@@ -3226,9 +3216,6 @@ out:
*
* On failure NULL is returned, errp is set and the reference to
* child_bs is also dropped.
- *
- * If @parent_bs and @child_bs are in different AioContexts, the caller must
- * hold the AioContext lock for @child_bs, but not for @parent_bs.
*/
BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
BlockDriverState *child_bs,
@@ -3418,9 +3405,8 @@ static BdrvChildRole bdrv_backing_role(BlockDriverState *bs)
*
* Function doesn't update permissions, caller is responsible for this.
*
- * The caller must hold the AioContext lock for @child_bs. Both @parent_bs and
- * @child_bs can move to a different AioContext in this function. Callers must
- * make sure that their AioContext locking is still correct after this.
+ * Both @parent_bs and @child_bs can move to a different AioContext in this
+ * function.
*
* After calling this function, the transaction @tran may only be completed
* while holding a writer lock for the graph.
@@ -3513,9 +3499,8 @@ out:
}
/*
- * The caller must hold the AioContext lock for @backing_hd. Both @bs and
- * @backing_hd can move to a different AioContext in this function. Callers must
- * make sure that their AioContext locking is still correct after this.
+ * Both @bs and @backing_hd can move to a different AioContext in this
+ * function.
*
* If a backing child is already present (i.e. we're detaching a node), that
* child node must be drained.
@@ -3574,8 +3559,6 @@ int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd,
* itself, all options starting with "${bdref_key}." are considered part of the
* BlockdevRef.
*
- * The caller must hold the main AioContext lock.
- *
* TODO Can this be unified with bdrv_open_image()?
*/
int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
@@ -3745,9 +3728,7 @@ done:
*
* The BlockdevRef will be removed from the options QDict.
*
- * The caller must hold the lock of the main AioContext and no other AioContext.
- * @parent can move to a different AioContext in this function. Callers must
- * make sure that their AioContext locking is still correct after this.
+ * @parent can move to a different AioContext in this function.
*/
BdrvChild *bdrv_open_child(const char *filename,
QDict *options, const char *bdref_key,
@@ -3778,9 +3759,7 @@ BdrvChild *bdrv_open_child(const char *filename,
/*
* Wrapper on bdrv_open_child() for most popular case: open primary child of bs.
*
- * The caller must hold the lock of the main AioContext and no other AioContext.
- * @parent can move to a different AioContext in this function. Callers must
- * make sure that their AioContext locking is still correct after this.
+ * @parent can move to a different AioContext in this function.
*/
int bdrv_open_file_child(const char *filename,
QDict *options, const char *bdref_key,
@@ -3923,8 +3902,6 @@ out:
* The reference parameter may be used to specify an existing block device which
* should be opened. If specified, neither options nor a filename may be given,
* nor can an existing BDS be reused (that is, *pbs has to be NULL).
- *
- * The caller must always hold the main AioContext lock.
*/
static BlockDriverState * no_coroutine_fn
bdrv_open_inherit(const char *filename, const char *reference, QDict *options,
@@ -4217,7 +4194,6 @@ close_and_fail:
return NULL;
}
-/* The caller must always hold the main AioContext lock. */
BlockDriverState *bdrv_open(const char *filename, const char *reference,
QDict *options, int flags, Error **errp)
{
@@ -4665,10 +4641,7 @@ int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only,
*
* Return 0 on success, otherwise return < 0 and set @errp.
*
- * The caller must hold the AioContext lock of @reopen_state->bs.
* @reopen_state->bs can move to a different AioContext in this function.
- * Callers must make sure that their AioContext locking is still correct after
- * this.
*/
static int GRAPH_UNLOCKED
bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state,
@@ -4801,8 +4774,6 @@ out_rdlock:
* It is the responsibility of the caller to then call the abort() or
* commit() for any other BDS that have been left in a prepare() state
*
- * The caller must hold the AioContext lock of @reopen_state->bs.
- *
* After calling this function, the transaction @change_child_tran may only be
* completed while holding a writer lock for the graph.
*/
@@ -5437,8 +5408,6 @@ int bdrv_drop_filter(BlockDriverState *bs, Error **errp)
* child.
*
* This function does not create any image files.
- *
- * The caller must hold the AioContext lock for @bs_top.
*/
int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
Error **errp)
@@ -5545,9 +5514,8 @@ static void bdrv_delete(BlockDriverState *bs)
* after the call (even on failure), so if the caller intends to reuse the
* dictionary, it needs to use qobject_ref() before calling bdrv_open.
*
- * The caller holds the AioContext lock for @bs. It must make sure that @bs
- * stays in the same AioContext, i.e. @options must not refer to nodes in a
- * different AioContext.
+ * The caller must make sure that @bs stays in the same AioContext, i.e.
+ * @options must not refer to nodes in a different AioContext.
*/
BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *options,
int flags, Error **errp)
@@ -7565,10 +7533,6 @@ static TransactionActionDrv set_aio_context = {
*
* Must be called from the main AioContext.
*
- * The caller must own the AioContext lock for the old AioContext of bs, but it
- * must not own the AioContext lock for new_context (unless new_context is the
- * same as the current context of bs).
- *
* @visited will accumulate all visited BdrvChild objects. The caller is
* responsible for freeing the list afterwards.
*/
@@ -7621,13 +7585,6 @@ static bool bdrv_change_aio_context(BlockDriverState *bs, AioContext *ctx,
*
* If ignore_child is not NULL, that child (and its subgraph) will not
* be touched.
- *
- * This function still requires the caller to take the bs current
- * AioContext lock, otherwise draining will fail since AIO_WAIT_WHILE
- * assumes the lock is always held if bs is in another AioContext.
- * For the same reason, it temporarily also holds the new AioContext, since
- * bdrv_drained_end calls BDRV_POLL_WHILE that assumes the lock is taken too.
- * Therefore the new AioContext lock must not be taken by the caller.
*/
int bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx,
BdrvChild *ignore_child, Error **errp)
@@ -7653,8 +7610,8 @@ int bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx,
/*
* Linear phase: go through all callbacks collected in the transaction.
- * Run all callbacks collected in the recursion to switch all nodes
- * AioContext lock (transaction commit), or undo all changes done in the
+ * Run all callbacks collected in the recursion to switch every node's
+ * AioContext (transaction commit), or undo all changes done in the
* recursion (transaction abort).
*/
diff --git a/block/block-backend.c b/block/block-backend.c
index f412bed274..209eb07528 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -390,8 +390,6 @@ BlockBackend *blk_new(AioContext *ctx, uint64_t perm, uint64_t shared_perm)
* Both sets of permissions can be changed later using blk_set_perm().
*
* Return the new BlockBackend on success, null on failure.
- *
- * Callers must hold the AioContext lock of @bs.
*/
BlockBackend *blk_new_with_bs(BlockDriverState *bs, uint64_t perm,
uint64_t shared_perm, Error **errp)
@@ -416,8 +414,6 @@ BlockBackend *blk_new_with_bs(BlockDriverState *bs, uint64_t perm,
* Just as with bdrv_open(), after having called this function the reference to
* @options belongs to the block layer (even on failure).
*
- * Called without holding an AioContext lock.
- *
* TODO: Remove @filename and @flags; it should be possible to specify a whole
* BDS tree just by specifying the @options QDict (or @reference,
* alternatively). At the time of adding this function, this is not possible,
@@ -872,8 +868,6 @@ BlockBackend *blk_by_public(BlockBackendPublic *public)
/*
* Disassociates the currently associated BlockDriverState from @blk.
- *
- * The caller must hold the AioContext lock for the BlockBackend.
*/
void blk_remove_bs(BlockBackend *blk)
{
@@ -915,8 +909,6 @@ void blk_remove_bs(BlockBackend *blk)
/*
* Associates a new BlockDriverState with @blk.
- *
- * Callers must hold the AioContext lock of @bs.
*/
int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp)
{
diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c
index 16f48388d3..50c358e8cd 100644
--- a/block/export/vhost-user-blk-server.c
+++ b/block/export/vhost-user-blk-server.c
@@ -278,7 +278,6 @@ static void vu_blk_exp_resize(void *opaque)
vu_config_change_msg(&vexp->vu_server.vu_dev);
}
-/* Called with vexp->export.ctx acquired */
static void vu_blk_drained_begin(void *opaque)
{
VuBlkExport *vexp = opaque;
@@ -287,7 +286,6 @@ static void vu_blk_drained_begin(void *opaque)
vhost_user_server_detach_aio_context(&vexp->vu_server);
}
-/* Called with vexp->export.blk AioContext acquired */
static void vu_blk_drained_end(void *opaque)
{
VuBlkExport *vexp = opaque;
@@ -300,8 +298,6 @@ static void vu_blk_drained_end(void *opaque)
* Ensures that bdrv_drained_begin() waits until in-flight requests complete
* and the server->co_trip coroutine has terminated. It will be restarted in
* vhost_user_server_attach_aio_context().
- *
- * Called with vexp->export.ctx acquired.
*/
static bool vu_blk_drained_poll(void *opaque)
{
diff --git a/include/block/block-common.h b/include/block/block-common.h
index d7599564db..a846023a09 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -70,9 +70,6 @@
* automatically takes the graph rdlock when calling the wrapped function. In
* the same way, no_co_wrapper_bdrv_wrlock functions automatically take the
* graph wrlock.
- *
- * If the first parameter of the function is a BlockDriverState, BdrvChild or
- * BlockBackend pointer, the AioContext lock for it is taken in the wrapper.
*/
#define no_co_wrapper
#define no_co_wrapper_bdrv_rdlock
diff --git a/include/block/block-io.h b/include/block/block-io.h
index 8eb39a858b..b49e0537dd 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -332,11 +332,10 @@ bdrv_co_copy_range(BdrvChild *src, int64_t src_offset,
* "I/O or GS" API functions. These functions can run without
* the BQL, but only in one specific iothread/main loop.
*
- * More specifically, these functions use BDRV_POLL_WHILE(bs), which
- * requires the caller to be either in the main thread and hold
- * the BlockdriverState (bs) AioContext lock, or directly in the
- * home thread that runs the bs AioContext. Calling them from
- * another thread in another AioContext would cause deadlocks.
+ * More specifically, these functions use BDRV_POLL_WHILE(bs), which requires
+ * the caller to be either in the main thread or directly in the home thread
+ * that runs the bs AioContext. Calling them from another thread in another
+ * AioContext would cause deadlocks.
*
* Therefore, these functions are not proper I/O, because they
* can't run in *any* iothreads, but only in a specific one.
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 4e31d161c5..151279d481 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -1192,8 +1192,6 @@ struct BlockDriverState {
/* The error object in use for blocking operations on backing_hd */
Error *backing_blocker;
- /* Protected by AioContext lock */
-
/*
* If we are reading a disk image, give its size in sectors.
* Generally read-only; it is written to by load_snapshot and
diff --git a/tests/qemu-iotests/202 b/tests/qemu-iotests/202
index b784dcd791..13304242e5 100755
--- a/tests/qemu-iotests/202
+++ b/tests/qemu-iotests/202
@@ -21,7 +21,7 @@
# Check that QMP 'transaction' blockdev-snapshot-sync with multiple drives on a
# single IOThread completes successfully. This particular command triggered a
# hang due to recursive AioContext locking and BDRV_POLL_WHILE(). Protect
-# against regressions.
+# against regressions even though the AioContext lock no longer exists.
import iotests
diff --git a/tests/qemu-iotests/203 b/tests/qemu-iotests/203
index ab80fd0e44..1ba878522b 100755
--- a/tests/qemu-iotests/203
+++ b/tests/qemu-iotests/203
@@ -21,7 +21,8 @@
# Check that QMP 'migrate' with multiple drives on a single IOThread completes
# successfully. This particular command triggered a hang in the source QEMU
# process due to recursive AioContext locking in bdrv_invalidate_all() and
-# BDRV_POLL_WHILE().
+# BDRV_POLL_WHILE(). Protect against regressions even though the AioContext
+# lock no longer exists.
import iotests
--
2.39.3

@ -1,105 +0,0 @@
From 95b2ffc5f01dc4309c2e747ed883d22cd1d26347 Mon Sep 17 00:00:00 2001
From: Thomas Huth <thuth@redhat.com>
Date: Sat, 2 Mar 2024 17:00:23 +0100
Subject: [PATCH 2/2] chardev/char-socket: Fix TLS io channels sending too much
data to the backend
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Thomas Huth <thuth@redhat.com>
RH-MergeRequest: 227: Fix TLS io channels sending too much data to the backend
RH-Jira: RHEL-24614
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Daniel P. Berrangé <berrange@redhat.com>
RH-Commit: [1/1] fce871914e0ce52e16a6edae0e007513f9fec1ae (thuth/qemu-kvm-cs9)
JIRA: https://issues.redhat.com/browse/RHEL-24614
commit 462945cd22d2bcd233401ed3aa167d83a8e35b05
Author: Thomas Huth <thuth@redhat.com>
Date: Thu Feb 29 11:43:37 2024 +0100
chardev/char-socket: Fix TLS io channels sending too much data to the backend
Commit ffda5db65a ("io/channel-tls: fix handling of bigger read buffers")
changed the behavior of the TLS io channels to schedule a second reading
attempt if there is still incoming data pending. This caused a regression
with backends like the sclpconsole that check in their read function that
the sender does not try to write more bytes to it than the device can
currently handle.
The problem can be reproduced like this:
1) In one terminal, do this:
mkdir qemu-pki
cd qemu-pki
openssl genrsa 2048 > ca-key.pem
openssl req -new -x509 -nodes -days 365000 -key ca-key.pem -out ca-cert.pem
# enter some dummy value for the cert
openssl genrsa 2048 > server-key.pem
openssl req -new -x509 -nodes -days 365000 -key server-key.pem \
-out server-cert.pem
# enter some other dummy values for the cert
gnutls-serv --echo --x509cafile ca-cert.pem --x509keyfile server-key.pem \
--x509certfile server-cert.pem -p 8338
2) In another terminal, do this:
wget https://download.fedoraproject.org/pub/fedora-secondary/releases/39/Cloud/s390x/images/Fedora-Cloud-Base-39-1.5.s390x.qcow2
qemu-system-s390x -nographic -nodefaults \
-hda Fedora-Cloud-Base-39-1.5.s390x.qcow2 \
-object tls-creds-x509,id=tls0,endpoint=client,verify-peer=false,dir=$PWD/qemu-pki \
-chardev socket,id=tls_chardev,host=localhost,port=8338,tls-creds=tls0 \
-device sclpconsole,chardev=tls_chardev,id=tls_serial
QEMU then aborts after a second or two with:
qemu-system-s390x: ../hw/char/sclpconsole.c:73: chr_read: Assertion
`size <= SIZE_BUFFER_VT220 - scon->iov_data_len' failed.
Aborted (core dumped)
It looks like the second read does not trigger the chr_can_read() function
to be called before the second read, which should normally always be done
before sending bytes to a character device to see how much it can handle,
so the s->max_size in tcp_chr_read() still contains the old value from the
previous read. Let's make sure that we use the up-to-date value by calling
tcp_chr_read_poll() again here.
Fixes: ffda5db65a ("io/channel-tls: fix handling of bigger read buffers")
Buglink: https://issues.redhat.com/browse/RHEL-24614
Reviewed-by: "Daniel P. Berrangé" <berrange@redhat.com>
Message-ID: <20240229104339.42574-1-thuth@redhat.com>
Reviewed-by: Antoine Damhet <antoine.damhet@blade-group.com>
Tested-by: Antoine Damhet <antoine.damhet@blade-group.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
---
chardev/char-socket.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/chardev/char-socket.c b/chardev/char-socket.c
index 73947da188..034840593d 100644
--- a/chardev/char-socket.c
+++ b/chardev/char-socket.c
@@ -492,9 +492,9 @@ static gboolean tcp_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque)
s->max_size <= 0) {
return TRUE;
}
- len = sizeof(buf);
- if (len > s->max_size) {
- len = s->max_size;
+ len = tcp_chr_read_poll(opaque);
+ if (len > sizeof(buf)) {
+ len = sizeof(buf);
}
size = tcp_chr_recv(chr, (void *)buf, len);
if (size == 0 || (size == -1 && errno != EAGAIN)) {
--
2.39.3

@ -1,78 +0,0 @@
From 4d4102f6e2f9afd6182888787ae8b570347df87d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Mon, 18 Mar 2024 18:06:59 +0000
Subject: [PATCH 1/3] chardev: lower priority of the HUP GSource in socket
chardev
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Daniel P. Berrangé <berrange@redhat.com>
RH-MergeRequest: 233: Fix handling of TLS sessions in chardevs
RH-Jira: RHEL-24614
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
RH-Commit: [1/3] 842f54349191b0206e68f35a7a80155f5a584942 (berrange/centos-src-qemu)
The socket chardev often has 2 GSource objects registered against the
same FD. One is registered all the time and is just intended to handle
POLLHUP events, while the other gets registered & unregistered on the
fly as the frontend is ready to receive more data or not.
It is very common for poll() to signal a POLLHUP event at the same time
as there is pending incoming data from the disconnected client. It is
therefore essential to process incoming data prior to processing HUP.
The problem with having 2 GSource on the same FD is that there is no
guaranteed ordering of execution between them, so the chardev code may
process HUP first and thus discard data.
This failure scenario is non-deterministic but can be seen fairly
reliably by reverting a7077b8e354d90fec26c2921aa2dea85b90dff90, and
then running 'tests/unit/test-char', which will sometimes fail with
missing data.
Ideally QEMU would only have 1 GSource, but that's a complex code
refactoring job. The next best solution is to try to ensure ordering
between the 2 GSource objects. This can be achieved by lowering the
priority of the HUP GSource, so that it is never dispatched if the
main GSource is also ready to dispatch. Counter-intuitively, lowering
the priority of a GSource is done by raising its priority number.
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
(cherry picked from commit 8bd8b04adc9f18904f323dff085f8b4ec77915c6)
---
chardev/char-socket.c | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
diff --git a/chardev/char-socket.c b/chardev/char-socket.c
index 034840593d..f48d341ebc 100644
--- a/chardev/char-socket.c
+++ b/chardev/char-socket.c
@@ -597,6 +597,22 @@ static void update_ioc_handlers(SocketChardev *s)
remove_hup_source(s);
s->hup_source = qio_channel_create_watch(s->ioc, G_IO_HUP);
+ /*
+ * poll() is liable to return POLLHUP even when there is
+ * still incoming data available to read on the FD. If
+ * we have the hup_source at the same priority as the
+ * main io_add_watch_poll GSource, then we might end up
+ * processing the POLLHUP event first, closing the FD,
+ * and as a result silently discard data we should have
+ * read.
+ *
+ * By setting the hup_source to G_PRIORITY_DEFAULT + 1,
+ * we ensure that io_add_watch_poll GSource will always
+ * be dispatched first, thus guaranteeing we will be
+ * able to process all incoming data before closing the
+ * FD
+ */
+ g_source_set_priority(s->hup_source, G_PRIORITY_DEFAULT + 1);
g_source_set_callback(s->hup_source, (GSourceFunc)tcp_chr_hup,
chr, NULL);
g_source_attach(s->hup_source, chr->gcontext);
--
2.39.3

@ -0,0 +1,90 @@
From 0f0a3a860a07addea21a0282556a5022b9cb8b2c Mon Sep 17 00:00:00 2001
From: Xiaoyao Li <xiaoyao.li@intel.com>
Date: Thu, 29 Feb 2024 01:00:35 -0500
Subject: [PATCH 011/100] confidential guest support: Add kvm_init() and
kvm_reset() in class
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [11/91] 21d2178178bf181a8e4d0b051f64bd983f0d0cf1 (bonzini/rhel-qemu-kvm)
Confidential VMs on different architectures all have the same need to
do their specific initialization (and maybe reset) work with KVM.
Currently each of them exposes an individual *_kvm_init() function and
lets machine code or KVM code call it.
To facilitate the introduction of confidential guest technology from
different x86 vendors, add two virtual functions, kvm_init() and kvm_reset()
in ConfidentialGuestSupportClass, and expose two helper functions for
invoking them.
Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
Message-Id: <20240229060038.606591-1-xiaoyao.li@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit 41a605944e3fecae43ca18ded95ec31f28e0c7fe)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
include/exec/confidential-guest-support.h | 34 ++++++++++++++++++++++-
1 file changed, 33 insertions(+), 1 deletion(-)
diff --git a/include/exec/confidential-guest-support.h b/include/exec/confidential-guest-support.h
index ba2dd4b5df..e5b188cffb 100644
--- a/include/exec/confidential-guest-support.h
+++ b/include/exec/confidential-guest-support.h
@@ -23,7 +23,10 @@
#include "qom/object.h"
#define TYPE_CONFIDENTIAL_GUEST_SUPPORT "confidential-guest-support"
-OBJECT_DECLARE_SIMPLE_TYPE(ConfidentialGuestSupport, CONFIDENTIAL_GUEST_SUPPORT)
+OBJECT_DECLARE_TYPE(ConfidentialGuestSupport,
+ ConfidentialGuestSupportClass,
+ CONFIDENTIAL_GUEST_SUPPORT)
+
struct ConfidentialGuestSupport {
Object parent;
@@ -55,8 +58,37 @@ struct ConfidentialGuestSupport {
typedef struct ConfidentialGuestSupportClass {
ObjectClass parent;
+
+ int (*kvm_init)(ConfidentialGuestSupport *cgs, Error **errp);
+ int (*kvm_reset)(ConfidentialGuestSupport *cgs, Error **errp);
} ConfidentialGuestSupportClass;
+static inline int confidential_guest_kvm_init(ConfidentialGuestSupport *cgs,
+ Error **errp)
+{
+ ConfidentialGuestSupportClass *klass;
+
+ klass = CONFIDENTIAL_GUEST_SUPPORT_GET_CLASS(cgs);
+ if (klass->kvm_init) {
+ return klass->kvm_init(cgs, errp);
+ }
+
+ return 0;
+}
+
+static inline int confidential_guest_kvm_reset(ConfidentialGuestSupport *cgs,
+ Error **errp)
+{
+ ConfidentialGuestSupportClass *klass;
+
+ klass = CONFIDENTIAL_GUEST_SUPPORT_GET_CLASS(cgs);
+ if (klass->kvm_reset) {
+ return klass->kvm_reset(cgs, errp);
+ }
+
+ return 0;
+}
+
#endif /* !CONFIG_USER_ONLY */
#endif /* QEMU_CONFIDENTIAL_GUEST_SUPPORT_H */
--
2.39.3

@ -1,412 +0,0 @@
From e99c56752a1c4021a93c92b7be78856ebefaa1b3 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Mon, 18 Mar 2024 14:34:29 -0400
Subject: [PATCH 1/2] coroutine: cap per-thread local pool size
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 234: coroutine: cap per-thread local pool size
RH-Jira: RHEL-28947
RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
RH-Acked-by: Hanna Czenczek <hreitz@redhat.com>
RH-Commit: [1/2] 5971de1c1e238457925bfb9c4bfc932de857b28d (stefanha/centos-stream-qemu-kvm)
The coroutine pool implementation can hit the Linux vm.max_map_count
limit, causing QEMU to abort with "failed to allocate memory for stack"
or "failed to set up stack guard page" during coroutine creation.
This happens because per-thread pools can grow to tens of thousands of
coroutines. Each coroutine causes 2 virtual memory areas to be created.
Eventually vm.max_map_count is reached and memory-related syscalls fail.
The per-thread pool sizes are non-uniform and depend on past coroutine
usage in each thread, so it's possible for one thread to have a large
pool while another thread's pool is empty.
Switch to a new coroutine pool implementation with a global pool that
grows to a maximum number of coroutines and per-thread local pools that
are capped at a small hardcoded number of coroutines.
This approach does not leave large numbers of coroutines pooled in a
thread that may not use them again. In order to perform well it
amortizes the cost of global pool accesses by working in batches of
coroutines instead of individual coroutines.
The global pool is a list. Threads donate batches of coroutines to it when
they have too many and take batches from it when they have too few:
.-----------------------------------.
| Batch 1 | Batch 2 | Batch 3 | ... | global_pool
`-----------------------------------'
Each thread has up to 2 batches of coroutines:
.-------------------.
| Batch 1 | Batch 2 | per-thread local_pool (maximum 2 batches)
`-------------------'
The goal of this change is to reduce the excessive number of pooled
coroutines that cause QEMU to abort when vm.max_map_count is reached
without losing the performance of an adequately sized coroutine pool.
Here are virtio-blk disk I/O benchmark results:
RW BLKSIZE IODEPTH OLD NEW CHANGE
randread 4k 1 113725 117451 +3.3%
randread 4k 8 192968 198510 +2.9%
randread 4k 16 207138 209429 +1.1%
randread 4k 32 212399 215145 +1.3%
randread 4k 64 218319 221277 +1.4%
randread 128k 1 17587 17535 -0.3%
randread 128k 8 17614 17616 +0.0%
randread 128k 16 17608 17609 +0.0%
randread 128k 32 17552 17553 +0.0%
randread 128k 64 17484 17484 +0.0%
See files/{fio.sh,test.xml.j2} for the benchmark configuration:
https://gitlab.com/stefanha/virt-playbooks/-/tree/coroutine-pool-fix-sizing
Buglink: https://issues.redhat.com/browse/RHEL-28947
Reported-by: Sanjay Rao <srao@redhat.com>
Reported-by: Boaz Ben Shabat <bbenshab@redhat.com>
Reported-by: Joe Mario <jmario@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20240318183429.1039340-1-stefanha@redhat.com>
(cherry picked from commit 86a637e48104ae74d8be53bed6441ce32be33433)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
util/qemu-coroutine.c | 282 +++++++++++++++++++++++++++++++++---------
1 file changed, 223 insertions(+), 59 deletions(-)
diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c
index 5fd2dbaf8b..2790959eaf 100644
--- a/util/qemu-coroutine.c
+++ b/util/qemu-coroutine.c
@@ -18,39 +18,200 @@
#include "qemu/atomic.h"
#include "qemu/coroutine_int.h"
#include "qemu/coroutine-tls.h"
+#include "qemu/cutils.h"
#include "block/aio.h"
-/**
- * The minimal batch size is always 64, coroutines from the release_pool are
- * reused as soon as there are 64 coroutines in it. The maximum pool size starts
- * with 64 and is increased on demand so that coroutines are not deleted even if
- * they are not immediately reused.
- */
enum {
- POOL_MIN_BATCH_SIZE = 64,
- POOL_INITIAL_MAX_SIZE = 64,
+ COROUTINE_POOL_BATCH_MAX_SIZE = 128,
};
-/** Free list to speed up creation */
-static QSLIST_HEAD(, Coroutine) release_pool = QSLIST_HEAD_INITIALIZER(pool);
-static unsigned int pool_max_size = POOL_INITIAL_MAX_SIZE;
-static unsigned int release_pool_size;
+/*
+ * Coroutine creation and deletion is expensive so a pool of unused coroutines
+ * is kept as a cache. When the pool has coroutines available, they are
+ * recycled instead of creating new ones from scratch. Coroutines are added to
+ * the pool upon termination.
+ *
+ * The pool is global but each thread maintains a small local pool to avoid
+ * global pool contention. Threads fetch and return batches of coroutines from
+ * the global pool to maintain their local pool. The local pool holds up to two
+ * batches whereas the maximum size of the global pool is controlled by the
+ * qemu_coroutine_inc_pool_size() API.
+ *
+ * .-----------------------------------.
+ * | Batch 1 | Batch 2 | Batch 3 | ... | global_pool
+ * `-----------------------------------'
+ *
+ * .-------------------.
+ * | Batch 1 | Batch 2 | per-thread local_pool (maximum 2 batches)
+ * `-------------------'
+ */
+typedef struct CoroutinePoolBatch {
+ /* Batches are kept in a list */
+ QSLIST_ENTRY(CoroutinePoolBatch) next;
+
+ /* This batch holds up to @COROUTINE_POOL_BATCH_MAX_SIZE coroutines */
+ QSLIST_HEAD(, Coroutine) list;
+ unsigned int size;
+} CoroutinePoolBatch;
+
+typedef QSLIST_HEAD(, CoroutinePoolBatch) CoroutinePool;
+
+/* Host operating system limit on number of pooled coroutines */
+static unsigned int global_pool_hard_max_size;
+
+static QemuMutex global_pool_lock; /* protects the following variables */
+static CoroutinePool global_pool = QSLIST_HEAD_INITIALIZER(global_pool);
+static unsigned int global_pool_size;
+static unsigned int global_pool_max_size = COROUTINE_POOL_BATCH_MAX_SIZE;
+
+QEMU_DEFINE_STATIC_CO_TLS(CoroutinePool, local_pool);
+QEMU_DEFINE_STATIC_CO_TLS(Notifier, local_pool_cleanup_notifier);
-typedef QSLIST_HEAD(, Coroutine) CoroutineQSList;
-QEMU_DEFINE_STATIC_CO_TLS(CoroutineQSList, alloc_pool);
-QEMU_DEFINE_STATIC_CO_TLS(unsigned int, alloc_pool_size);
-QEMU_DEFINE_STATIC_CO_TLS(Notifier, coroutine_pool_cleanup_notifier);
+static CoroutinePoolBatch *coroutine_pool_batch_new(void)
+{
+ CoroutinePoolBatch *batch = g_new(CoroutinePoolBatch, 1);
+
+ QSLIST_INIT(&batch->list);
+ batch->size = 0;
+ return batch;
+}
-static void coroutine_pool_cleanup(Notifier *n, void *value)
+static void coroutine_pool_batch_delete(CoroutinePoolBatch *batch)
{
Coroutine *co;
Coroutine *tmp;
- CoroutineQSList *alloc_pool = get_ptr_alloc_pool();
- QSLIST_FOREACH_SAFE(co, alloc_pool, pool_next, tmp) {
- QSLIST_REMOVE_HEAD(alloc_pool, pool_next);
+ QSLIST_FOREACH_SAFE(co, &batch->list, pool_next, tmp) {
+ QSLIST_REMOVE_HEAD(&batch->list, pool_next);
qemu_coroutine_delete(co);
}
+ g_free(batch);
+}
+
+static void local_pool_cleanup(Notifier *n, void *value)
+{
+ CoroutinePool *local_pool = get_ptr_local_pool();
+ CoroutinePoolBatch *batch;
+ CoroutinePoolBatch *tmp;
+
+ QSLIST_FOREACH_SAFE(batch, local_pool, next, tmp) {
+ QSLIST_REMOVE_HEAD(local_pool, next);
+ coroutine_pool_batch_delete(batch);
+ }
+}
+
+/* Ensure the atexit notifier is registered */
+static void local_pool_cleanup_init_once(void)
+{
+ Notifier *notifier = get_ptr_local_pool_cleanup_notifier();
+ if (!notifier->notify) {
+ notifier->notify = local_pool_cleanup;
+ qemu_thread_atexit_add(notifier);
+ }
+}
+
+/* Helper to get the next unused coroutine from the local pool */
+static Coroutine *coroutine_pool_get_local(void)
+{
+ CoroutinePool *local_pool = get_ptr_local_pool();
+ CoroutinePoolBatch *batch = QSLIST_FIRST(local_pool);
+ Coroutine *co;
+
+ if (unlikely(!batch)) {
+ return NULL;
+ }
+
+ co = QSLIST_FIRST(&batch->list);
+ QSLIST_REMOVE_HEAD(&batch->list, pool_next);
+ batch->size--;
+
+ if (batch->size == 0) {
+ QSLIST_REMOVE_HEAD(local_pool, next);
+ coroutine_pool_batch_delete(batch);
+ }
+ return co;
+}
+
+/* Get the next batch from the global pool */
+static void coroutine_pool_refill_local(void)
+{
+ CoroutinePool *local_pool = get_ptr_local_pool();
+ CoroutinePoolBatch *batch;
+
+ WITH_QEMU_LOCK_GUARD(&global_pool_lock) {
+ batch = QSLIST_FIRST(&global_pool);
+
+ if (batch) {
+ QSLIST_REMOVE_HEAD(&global_pool, next);
+ global_pool_size -= batch->size;
+ }
+ }
+
+ if (batch) {
+ QSLIST_INSERT_HEAD(local_pool, batch, next);
+ local_pool_cleanup_init_once();
+ }
+}
+
+/* Add a batch of coroutines to the global pool */
+static void coroutine_pool_put_global(CoroutinePoolBatch *batch)
+{
+ WITH_QEMU_LOCK_GUARD(&global_pool_lock) {
+ unsigned int max = MIN(global_pool_max_size,
+ global_pool_hard_max_size);
+
+ if (global_pool_size < max) {
+ QSLIST_INSERT_HEAD(&global_pool, batch, next);
+
+ /* Overshooting the max pool size is allowed */
+ global_pool_size += batch->size;
+ return;
+ }
+ }
+
+ /* The global pool was full, so throw away this batch */
+ coroutine_pool_batch_delete(batch);
+}
+
+/* Get the next unused coroutine from the pool or return NULL */
+static Coroutine *coroutine_pool_get(void)
+{
+ Coroutine *co;
+
+ co = coroutine_pool_get_local();
+ if (!co) {
+ coroutine_pool_refill_local();
+ co = coroutine_pool_get_local();
+ }
+ return co;
+}
+
+static void coroutine_pool_put(Coroutine *co)
+{
+ CoroutinePool *local_pool = get_ptr_local_pool();
+ CoroutinePoolBatch *batch = QSLIST_FIRST(local_pool);
+
+ if (unlikely(!batch)) {
+ batch = coroutine_pool_batch_new();
+ QSLIST_INSERT_HEAD(local_pool, batch, next);
+ local_pool_cleanup_init_once();
+ }
+
+ if (unlikely(batch->size >= COROUTINE_POOL_BATCH_MAX_SIZE)) {
+ CoroutinePoolBatch *next = QSLIST_NEXT(batch, next);
+
+ /* Is the local pool full? */
+ if (next) {
+ QSLIST_REMOVE_HEAD(local_pool, next);
+ coroutine_pool_put_global(batch);
+ }
+
+ batch = coroutine_pool_batch_new();
+ QSLIST_INSERT_HEAD(local_pool, batch, next);
+ }
+
+ QSLIST_INSERT_HEAD(&batch->list, co, pool_next);
+ batch->size++;
}
Coroutine *qemu_coroutine_create(CoroutineEntry *entry, void *opaque)
@@ -58,31 +219,7 @@ Coroutine *qemu_coroutine_create(CoroutineEntry *entry, void *opaque)
Coroutine *co = NULL;
if (IS_ENABLED(CONFIG_COROUTINE_POOL)) {
- CoroutineQSList *alloc_pool = get_ptr_alloc_pool();
-
- co = QSLIST_FIRST(alloc_pool);
- if (!co) {
- if (release_pool_size > POOL_MIN_BATCH_SIZE) {
- /* Slow path; a good place to register the destructor, too. */
- Notifier *notifier = get_ptr_coroutine_pool_cleanup_notifier();
- if (!notifier->notify) {
- notifier->notify = coroutine_pool_cleanup;
- qemu_thread_atexit_add(notifier);
- }
-
- /* This is not exact; there could be a little skew between
- * release_pool_size and the actual size of release_pool. But
- * it is just a heuristic, it does not need to be perfect.
- */
- set_alloc_pool_size(qatomic_xchg(&release_pool_size, 0));
- QSLIST_MOVE_ATOMIC(alloc_pool, &release_pool);
- co = QSLIST_FIRST(alloc_pool);
- }
- }
- if (co) {
- QSLIST_REMOVE_HEAD(alloc_pool, pool_next);
- set_alloc_pool_size(get_alloc_pool_size() - 1);
- }
+ co = coroutine_pool_get();
}
if (!co) {
@@ -100,19 +237,10 @@ static void coroutine_delete(Coroutine *co)
co->caller = NULL;
if (IS_ENABLED(CONFIG_COROUTINE_POOL)) {
- if (release_pool_size < qatomic_read(&pool_max_size) * 2) {
- QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next);
- qatomic_inc(&release_pool_size);
- return;
- }
- if (get_alloc_pool_size() < qatomic_read(&pool_max_size)) {
- QSLIST_INSERT_HEAD(get_ptr_alloc_pool(), co, pool_next);
- set_alloc_pool_size(get_alloc_pool_size() + 1);
- return;
- }
+ coroutine_pool_put(co);
+ } else {
+ qemu_coroutine_delete(co);
}
-
- qemu_coroutine_delete(co);
}
void qemu_aio_coroutine_enter(AioContext *ctx, Coroutine *co)
@@ -223,10 +351,46 @@ AioContext *qemu_coroutine_get_aio_context(Coroutine *co)
void qemu_coroutine_inc_pool_size(unsigned int additional_pool_size)
{
- qatomic_add(&pool_max_size, additional_pool_size);
+ QEMU_LOCK_GUARD(&global_pool_lock);
+ global_pool_max_size += additional_pool_size;
}
void qemu_coroutine_dec_pool_size(unsigned int removing_pool_size)
{
- qatomic_sub(&pool_max_size, removing_pool_size);
+ QEMU_LOCK_GUARD(&global_pool_lock);
+ global_pool_max_size -= removing_pool_size;
+}
+
+static unsigned int get_global_pool_hard_max_size(void)
+{
+#ifdef __linux__
+ g_autofree char *contents = NULL;
+ int max_map_count;
+
+ /*
+ * Linux processes can have up to max_map_count virtual memory areas
+ * (VMAs). mmap(2), mprotect(2), etc fail with ENOMEM beyond this limit. We
+ * must limit the coroutine pool to a safe size to avoid running out of
+ * VMAs.
+ */
+ if (g_file_get_contents("/proc/sys/vm/max_map_count", &contents, NULL,
+ NULL) &&
+ qemu_strtoi(contents, NULL, 10, &max_map_count) == 0) {
+ /*
+ * This is a conservative upper bound that avoids exceeding
+ * max_map_count. Leave half for non-coroutine users like library
+ * dependencies, vhost-user, etc. Each coroutine takes up 2 VMAs so
+ * halve the amount again.
+ */
+ return max_map_count / 4;
+ }
+#endif
+
+ return UINT_MAX;
+}
+
+static void __attribute__((constructor)) qemu_coroutine_init(void)
+{
+ qemu_mutex_init(&global_pool_lock);
+ global_pool_hard_max_size = get_global_pool_hard_max_size();
}
--
2.39.3

@ -1,61 +0,0 @@
From 0aa65dc3acba481f7064df936ab49e3bceb1d5bd Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Wed, 20 Mar 2024 14:12:32 -0400
Subject: [PATCH 2/2] coroutine: reserve 5,000 mappings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 234: coroutine: cap per-thread local pool size
RH-Jira: RHEL-28947
RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
RH-Acked-by: Hanna Czenczek <hreitz@redhat.com>
RH-Commit: [2/2] 78560c2b947471111cc16c313d6f38db42860a1c (stefanha/centos-stream-qemu-kvm)
Daniel P. Berrangé <berrange@redhat.com> pointed out that the coroutine
pool size heuristic is very conservative. Instead of halving
max_map_count, he suggested reserving 5,000 mappings for non-coroutine
users based on observations of guests he has access to.
Fixes: 86a637e48104 ("coroutine: cap per-thread local pool size")
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
Message-id: 20240320181232.1464819-1-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit 9352f80cd926fe2dde7c89b93ee33bb0356ff40e)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
util/qemu-coroutine.c | 15 ++++++++++-----
1 file changed, 10 insertions(+), 5 deletions(-)
diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c
index 2790959eaf..eb4eebefdf 100644
--- a/util/qemu-coroutine.c
+++ b/util/qemu-coroutine.c
@@ -377,12 +377,17 @@ static unsigned int get_global_pool_hard_max_size(void)
NULL) &&
qemu_strtoi(contents, NULL, 10, &max_map_count) == 0) {
/*
- * This is a conservative upper bound that avoids exceeding
- * max_map_count. Leave half for non-coroutine users like library
- * dependencies, vhost-user, etc. Each coroutine takes up 2 VMAs so
- * halve the amount again.
+ * This is an upper bound that avoids exceeding max_map_count. Leave a
+ * fixed amount for non-coroutine users like library dependencies,
+ * vhost-user, etc. Each coroutine takes up 2 VMAs so halve the
+ * remaining amount.
*/
- return max_map_count / 4;
+ if (max_map_count > 5000) {
+ return (max_map_count - 5000) / 2;
+ } else {
+ /* Disable the global pool but threads still have local pools */
+ return 0;
+ }
}
#endif
--
2.39.3

@ -0,0 +1,228 @@
From 117486e0820f135f191e19f8ebb8838a98b121c6 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Mon, 27 May 2024 11:58:51 -0400
Subject: [PATCH 5/5] crypto/block: drop qcrypto_block_open() n_threads
argument
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 251: block/crypto: create ciphers on demand
RH-Jira: RHEL-36159
RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [2/2] 68290935b174b1f2b76aa857a926da9011e54abe (stefanha/centos-stream-qemu-kvm)
The n_threads argument is no longer used since the previous commit.
Remove it.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20240527155851.892885-3-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Acked-by: Daniel P. Berrangé <berrange@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
(cherry picked from commit 3ab0f063e58ed9224237d69c4211ca83335164c4)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
block/crypto.c | 1 -
block/qcow.c | 2 +-
block/qcow2.c | 5 ++---
crypto/block-luks.c | 1 -
crypto/block-qcow.c | 6 ++----
crypto/block.c | 3 +--
crypto/blockpriv.h | 1 -
include/crypto/block.h | 2 --
tests/unit/test-crypto-block.c | 4 ----
9 files changed, 6 insertions(+), 19 deletions(-)
diff --git a/block/crypto.c b/block/crypto.c
index 21eed909c1..4eed3ffa6a 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -363,7 +363,6 @@ static int block_crypto_open_generic(QCryptoBlockFormat format,
block_crypto_read_func,
bs,
cflags,
- 1,
errp);
if (!crypto->block) {
diff --git a/block/qcow.c b/block/qcow.c
index ca8e1d5ec8..c2f89db055 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -211,7 +211,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
cflags |= QCRYPTO_BLOCK_OPEN_NO_IO;
}
s->crypto = qcrypto_block_open(crypto_opts, "encrypt.",
- NULL, NULL, cflags, 1, errp);
+ NULL, NULL, cflags, errp);
if (!s->crypto) {
ret = -EINVAL;
goto fail;
diff --git a/block/qcow2.c b/block/qcow2.c
index 0e8b2f7518..0ebd455dc8 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -321,7 +321,7 @@ qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
}
s->crypto = qcrypto_block_open(s->crypto_opts, "encrypt.",
qcow2_crypto_hdr_read_func,
- bs, cflags, QCOW2_MAX_THREADS, errp);
+ bs, cflags, errp);
if (!s->crypto) {
return -EINVAL;
}
@@ -1707,8 +1707,7 @@ qcow2_do_open(BlockDriverState *bs, QDict *options, int flags,
cflags |= QCRYPTO_BLOCK_OPEN_NO_IO;
}
s->crypto = qcrypto_block_open(s->crypto_opts, "encrypt.",
- NULL, NULL, cflags,
- QCOW2_MAX_THREADS, errp);
+ NULL, NULL, cflags, errp);
if (!s->crypto) {
ret = -EINVAL;
goto fail;
diff --git a/crypto/block-luks.c b/crypto/block-luks.c
index 3357852c0a..5b777c15d3 100644
--- a/crypto/block-luks.c
+++ b/crypto/block-luks.c
@@ -1189,7 +1189,6 @@ qcrypto_block_luks_open(QCryptoBlock *block,
QCryptoBlockReadFunc readfunc,
void *opaque,
unsigned int flags,
- size_t n_threads,
Error **errp)
{
QCryptoBlockLUKS *luks = NULL;
diff --git a/crypto/block-qcow.c b/crypto/block-qcow.c
index 02305058e3..42e9556e42 100644
--- a/crypto/block-qcow.c
+++ b/crypto/block-qcow.c
@@ -44,7 +44,6 @@ qcrypto_block_qcow_has_format(const uint8_t *buf G_GNUC_UNUSED,
static int
qcrypto_block_qcow_init(QCryptoBlock *block,
const char *keysecret,
- size_t n_threads,
Error **errp)
{
char *password;
@@ -100,7 +99,6 @@ qcrypto_block_qcow_open(QCryptoBlock *block,
QCryptoBlockReadFunc readfunc G_GNUC_UNUSED,
void *opaque G_GNUC_UNUSED,
unsigned int flags,
- size_t n_threads,
Error **errp)
{
if (flags & QCRYPTO_BLOCK_OPEN_NO_IO) {
@@ -115,7 +113,7 @@ qcrypto_block_qcow_open(QCryptoBlock *block,
return -1;
}
return qcrypto_block_qcow_init(block, options->u.qcow.key_secret,
- n_threads, errp);
+ errp);
}
}
@@ -135,7 +133,7 @@ qcrypto_block_qcow_create(QCryptoBlock *block,
return -1;
}
/* QCow2 has no special header, since everything is hardwired */
- return qcrypto_block_qcow_init(block, options->u.qcow.key_secret, 1, errp);
+ return qcrypto_block_qcow_init(block, options->u.qcow.key_secret, errp);
}
diff --git a/crypto/block.c b/crypto/block.c
index ba6d1cebc7..3bcc4270c3 100644
--- a/crypto/block.c
+++ b/crypto/block.c
@@ -53,7 +53,6 @@ QCryptoBlock *qcrypto_block_open(QCryptoBlockOpenOptions *options,
QCryptoBlockReadFunc readfunc,
void *opaque,
unsigned int flags,
- size_t n_threads,
Error **errp)
{
QCryptoBlock *block = g_new0(QCryptoBlock, 1);
@@ -73,7 +72,7 @@ QCryptoBlock *qcrypto_block_open(QCryptoBlockOpenOptions *options,
block->driver = qcrypto_block_drivers[options->format];
if (block->driver->open(block, options, optprefix,
- readfunc, opaque, flags, n_threads, errp) < 0)
+ readfunc, opaque, flags, errp) < 0)
{
g_free(block);
return NULL;
diff --git a/crypto/blockpriv.h b/crypto/blockpriv.h
index 4bf6043d5d..b8f77cb5eb 100644
--- a/crypto/blockpriv.h
+++ b/crypto/blockpriv.h
@@ -59,7 +59,6 @@ struct QCryptoBlockDriver {
QCryptoBlockReadFunc readfunc,
void *opaque,
unsigned int flags,
- size_t n_threads,
Error **errp);
int (*create)(QCryptoBlock *block,
diff --git a/include/crypto/block.h b/include/crypto/block.h
index 92e823c9f2..5b5d039800 100644
--- a/include/crypto/block.h
+++ b/include/crypto/block.h
@@ -76,7 +76,6 @@ typedef enum {
* @readfunc: callback for reading data from the volume
* @opaque: data to pass to @readfunc
* @flags: bitmask of QCryptoBlockOpenFlags values
- * @n_threads: allow concurrent I/O from up to @n_threads threads
* @errp: pointer to a NULL-initialized error object
*
* Create a new block encryption object for an existing
@@ -113,7 +112,6 @@ QCryptoBlock *qcrypto_block_open(QCryptoBlockOpenOptions *options,
QCryptoBlockReadFunc readfunc,
void *opaque,
unsigned int flags,
- size_t n_threads,
Error **errp);
typedef enum {
diff --git a/tests/unit/test-crypto-block.c b/tests/unit/test-crypto-block.c
index 6cfc817a92..42cfab6067 100644
--- a/tests/unit/test-crypto-block.c
+++ b/tests/unit/test-crypto-block.c
@@ -303,7 +303,6 @@ static void test_block(gconstpointer opaque)
test_block_read_func,
&header,
0,
- 1,
NULL);
g_assert(blk == NULL);
@@ -312,7 +311,6 @@ static void test_block(gconstpointer opaque)
test_block_read_func,
&header,
QCRYPTO_BLOCK_OPEN_NO_IO,
- 1,
&error_abort);
g_assert(qcrypto_block_get_cipher(blk) == NULL);
@@ -327,7 +325,6 @@ static void test_block(gconstpointer opaque)
test_block_read_func,
&header,
0,
- 1,
&error_abort);
g_assert(blk);
@@ -384,7 +381,6 @@ test_luks_bad_header(gconstpointer data)
test_block_read_func,
&buf,
0,
- 1,
&err);
g_assert(!blk);
g_assert(err);
--
2.39.3

@ -1,75 +0,0 @@
From ac9dc8ea241ef6d3a0447d696620d4d4053b71bf Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Mon, 4 Dec 2023 11:42:59 -0500
Subject: [PATCH 080/101] dma-helpers: don't lock AioContext in dma_blk_cb()
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [11/26] a8580463ba6aee4ca248c0b947b9e72bd9e87aab (kmwolf/centos-qemu-kvm)
Commit abfcd2760b3e ("dma-helpers: prevent dma_blk_cb() vs
dma_aio_cancel() race") acquired the AioContext lock inside dma_blk_cb()
to avoid a race with scsi_device_purge_requests() running in the main
loop thread.
The SCSI code no longer calls dma_aio_cancel() from the main loop thread
while I/O is running in the IOThread AioContext. Therefore it is no
longer necessary to take this lock to protect DMAAIOCB fields. The
->cb() function also does not require the lock because blk_aio_*() and
friends do not need the AioContext lock.
Both hw/ide/core.c and hw/ide/macio.c also call dma_blk_io() but don't
rely on it taking the AioContext lock, so this change is safe.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20231204164259.1515217-5-stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
system/dma-helpers.c | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)
diff --git a/system/dma-helpers.c b/system/dma-helpers.c
index 36211acc7e..528117f256 100644
--- a/system/dma-helpers.c
+++ b/system/dma-helpers.c
@@ -119,13 +119,12 @@ static void dma_blk_cb(void *opaque, int ret)
trace_dma_blk_cb(dbs, ret);
- aio_context_acquire(ctx);
dbs->acb = NULL;
dbs->offset += dbs->iov.size;
if (dbs->sg_cur_index == dbs->sg->nsg || ret < 0) {
dma_complete(dbs, ret);
- goto out;
+ return;
}
dma_blk_unmap(dbs);
@@ -168,7 +167,7 @@ static void dma_blk_cb(void *opaque, int ret)
trace_dma_map_wait(dbs);
dbs->bh = aio_bh_new(ctx, reschedule_dma, dbs);
cpu_register_map_client(dbs->bh);
- goto out;
+ return;
}
if (!QEMU_IS_ALIGNED(dbs->iov.size, dbs->align)) {
@@ -179,8 +178,6 @@ static void dma_blk_cb(void *opaque, int ret)
dbs->acb = dbs->io_func(dbs->offset, &dbs->iov,
dma_blk_cb, dbs, dbs->io_func_opaque);
assert(dbs->acb);
-out:
- aio_context_release(ctx);
}
static void dma_aio_cancel(BlockAIOCB *acb)
--
2.39.3

@ -1,228 +0,0 @@
From 71aa0219f7c84cbf175eb2a091d48d5fd5daa40b Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Tue, 21 Nov 2023 16:44:26 +0800
Subject: [PATCH 047/101] docs/devel: Add VFIO iommufd backend documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [46/67] 6cf49d00e87788f894d690a985bb6798eae24505 (eauger1/centos-qemu-kvm)
Suggested-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 98dad2b01931f6064c6c4b48ca3c2a1d9f542cd8)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
MAINTAINERS | 1 +
docs/devel/index-internals.rst | 1 +
docs/devel/vfio-iommufd.rst | 166 +++++++++++++++++++++++++++++++++
3 files changed, 168 insertions(+)
create mode 100644 docs/devel/vfio-iommufd.rst
diff --git a/MAINTAINERS b/MAINTAINERS
index ca70bb4e64..0ddb20a35f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2176,6 +2176,7 @@ F: backends/iommufd.c
F: include/sysemu/iommufd.h
F: include/qemu/chardev_open.h
F: util/chardev_open.c
+F: docs/devel/vfio-iommufd.rst
vhost
M: Michael S. Tsirkin <mst@redhat.com>
diff --git a/docs/devel/index-internals.rst b/docs/devel/index-internals.rst
index 6f81df92bc..3def4a138b 100644
--- a/docs/devel/index-internals.rst
+++ b/docs/devel/index-internals.rst
@@ -18,5 +18,6 @@ Details about QEMU's various subsystems including how to add features to them.
s390-dasd-ipl
tracing
vfio-migration
+ vfio-iommufd
writing-monitor-commands
virtio-backends
diff --git a/docs/devel/vfio-iommufd.rst b/docs/devel/vfio-iommufd.rst
new file mode 100644
index 0000000000..3d1c11f175
--- /dev/null
+++ b/docs/devel/vfio-iommufd.rst
@@ -0,0 +1,166 @@
+===============================
+IOMMUFD BACKEND usage with VFIO
+===============================
+
+(Same meaning for backend/container/BE)
+
+With the introduction of iommufd, the Linux kernel provides a generic
+interface for user space drivers to propagate their DMA mappings to kernel
+for assigned devices. While the legacy kernel interface is group-centric,
+the new iommufd interface is device-centric, relying on device fd and iommufd.
+
+To support both interfaces in the QEMU VFIO device, introduce a base container
+to abstract the common part of the VFIO legacy and iommufd containers, so that
+the generic VFIO code can use either container.
+
+The base container implements generic functions such as memory_listener and
+address space management whereas the derived container implements callbacks
+specific to either legacy or iommufd. Each container has its own way to set up
+the secure context and DMA management interface. The diagram below shows what
+it looks like with both containers.
+
+::
+
+ VFIO AddressSpace/Memory
+ +-------+ +----------+ +-----+ +-----+
+ | pci | | platform | | ap | | ccw |
+ +---+---+ +----+-----+ +--+--+ +--+--+ +----------------------+
+ | | | | | AddressSpace |
+ | | | | +------------+---------+
+ +---V-----------V-----------V--------V----+ /
+ | VFIOAddressSpace | <------------+
+ | | | MemoryListener
+ | VFIOContainerBase list |
+ +-------+----------------------------+----+
+ | |
+ | |
+ +-------V------+ +--------V----------+
+ | iommufd | | vfio legacy |
+ | container | | container |
+ +-------+------+ +--------+----------+
+ | |
+ | /dev/iommu | /dev/vfio/vfio
+ | /dev/vfio/devices/vfioX | /dev/vfio/$group_id
+ Userspace | |
+ ============+============================+===========================
+ Kernel | device fd |
+ +---------------+ | group/container fd
+ | (BIND_IOMMUFD | | (SET_CONTAINER/SET_IOMMU)
+ | ATTACH_IOAS) | | device fd
+ | | |
+ | +-------V------------V-----------------+
+ iommufd | | vfio |
+ (map/unmap | +---------+--------------------+-------+
+ ioas_copy) | | | map/unmap
+ | | |
+ +------V------+ +-----V------+ +------V--------+
+ | iommfd core | | device | | vfio iommu |
+ +-------------+ +------------+ +---------------+
+
+* Secure Context setup
+
+ - iommufd BE: uses device fd and iommufd to setup secure context
+ (bind_iommufd, attach_ioas)
+ - vfio legacy BE: uses group fd and container fd to setup secure context
+ (set_container, set_iommu)
+
+* Device access
+
+ - iommufd BE: device fd is opened through ``/dev/vfio/devices/vfioX``
+ - vfio legacy BE: device fd is retrieved from group fd ioctl
+
+* DMA Mapping flow
+
+ 1. VFIOAddressSpace receives MemoryRegion add/del via MemoryListener
+ 2. VFIO populates DMA map/unmap via the container BEs
+ * iommufd BE: uses iommufd
+ * vfio legacy BE: uses container fd
+
+Example configuration
+=====================
+
+Step 1: configure the host device
+---------------------------------
+
+It's exactly the same as for a VFIO device with the legacy VFIO container.
+
+Step 2: configure QEMU
+----------------------
+
+Interactions with the ``/dev/iommu`` are abstracted by a new iommufd
+object (compiled in with the ``CONFIG_IOMMUFD`` option).
+
+Any QEMU device (e.g. VFIO device) wishing to use ``/dev/iommu`` must
+be linked with an iommufd object. It gets a new optional property
+named iommufd which allows to pass an iommufd object. Take ``vfio-pci``
+device for example:
+
+.. code-block:: bash
+
+ -object iommufd,id=iommufd0
+ -device vfio-pci,host=0000:02:00.0,iommufd=iommufd0
+
+Note that ``/dev/iommu`` and the VFIO cdev can be opened externally by a
+management layer. In that case the fd is passed to QEMU; the ``fd`` property
+accepts either a string naming the fd or a number, for example:
+
+.. code-block:: bash
+
+ -object iommufd,id=iommufd0,fd=22
+ -device vfio-pci,iommufd=iommufd0,fd=23
+
+If the ``fd`` property is not passed, the fd is opened by QEMU.
+
+If no ``iommufd`` object is passed to the ``vfio-pci`` device, iommufd
+is not used and the user gets the behavior based on the legacy VFIO
+container:
+
+.. code-block:: bash
+
+ -device vfio-pci,host=0000:02:00.0
+
+Supported platform
+==================
+
+Supports x86, ARM and s390x currently.
+
+Caveats
+=======
+
+Dirty page sync
+---------------
+
+Dirty page sync with the iommufd backend is not supported yet, so live
+migration is disabled by default. It can be force-enabled as shown below,
+although this is inefficient.
+
+.. code-block:: bash
+
+ -object iommufd,id=iommufd0
+ -device vfio-pci,host=0000:02:00.0,iommufd=iommufd0,enable-migration=on
+
+P2P DMA
+-------
+
+PCI p2p DMA is unsupported as IOMMUFD doesn't support mapping hardware PCI BAR
+regions yet. The warning below appears for assigned PCI devices; it's not a bug.
+
+.. code-block:: none
+
+ qemu-system-x86_64: warning: IOMMU_IOAS_MAP failed: Bad address, PCI BAR?
+ qemu-system-x86_64: vfio_container_dma_map(0x560cb6cb1620, 0xe000000021000, 0x3000, 0x7f32ed55c000) = -14 (Bad address)
+
+FD passing with mdev
+--------------------
+
+The ``vfio-pci`` device checks the sysfsdev property to decide whether the
+backend is an mdev. If FD passing is used, there is no way to know that, and
+the mdev is treated like a real PCI device. The error below is reported if the
+user tries to enable RAM discarding for the mdev.
+
+.. code-block:: none
+
+ qemu-system-x86_64: -device vfio-pci,iommufd=iommufd0,x-balloon-allowed=on,fd=9: vfio VFIO_FD9: x-balloon-allowed only potentially compatible with mdev devices
+
+``vfio-ap`` and ``vfio-ccw`` devices don't have the same issue, as their
+backend devices are always mdev and RAM discarding is force-enabled.
--
2.39.3

@ -1,98 +0,0 @@
From fc69df3a70bed5722643cc16828ca20beae3a20d Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Tue, 5 Dec 2023 13:20:08 -0500
Subject: [PATCH 091/101] docs: remove AioContext lock from IOThread docs
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [22/26] ab89cda483e74ded983d26e1c6e50217405e0a55 (kmwolf/centos-qemu-kvm)
Encourage the use of locking primitives and stop mentioning the
AioContext lock since it is being removed.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-ID: <20231205182011.1976568-12-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
docs/devel/multiple-iothreads.txt | 47 +++++++++++--------------------
1 file changed, 16 insertions(+), 31 deletions(-)
diff --git a/docs/devel/multiple-iothreads.txt b/docs/devel/multiple-iothreads.txt
index a3e949f6b3..4865196bde 100644
--- a/docs/devel/multiple-iothreads.txt
+++ b/docs/devel/multiple-iothreads.txt
@@ -88,27 +88,18 @@ loop, depending on which AioContext instance the caller passes in.
How to synchronize with an IOThread
-----------------------------------
-AioContext is not thread-safe so some rules must be followed when using file
-descriptors, event notifiers, timers, or BHs across threads:
+Variables that can be accessed by multiple threads require some form of
+synchronization such as qemu_mutex_lock(), rcu_read_lock(), etc.
-1. AioContext functions can always be called safely. They handle their
-own locking internally.
-
-2. Other threads wishing to access the AioContext must use
-aio_context_acquire()/aio_context_release() for mutual exclusion. Once the
-context is acquired no other thread can access it or run event loop iterations
-in this AioContext.
-
-Legacy code sometimes nests aio_context_acquire()/aio_context_release() calls.
-Do not use nesting anymore, it is incompatible with the BDRV_POLL_WHILE() macro
-used in the block layer and can lead to hangs.
-
-There is currently no lock ordering rule if a thread needs to acquire multiple
-AioContexts simultaneously. Therefore, it is only safe for code holding the
-QEMU global mutex to acquire other AioContexts.
+AioContext functions like aio_set_fd_handler(), aio_set_event_notifier(),
+aio_bh_new(), and aio_timer_new() are thread-safe. They can be used to trigger
+activity in an IOThread.
Side note: the best way to schedule a function call across threads is to call
-aio_bh_schedule_oneshot(). No acquire/release or locking is needed.
+aio_bh_schedule_oneshot().
+
+The main loop thread can wait synchronously for a condition using
+AIO_WAIT_WHILE().
AioContext and the block layer
------------------------------
@@ -124,22 +115,16 @@ Block layer code must therefore expect to run in an IOThread and avoid using
old APIs that implicitly use the main loop. See the "How to program for
IOThreads" above for information on how to do that.
-If main loop code such as a QMP function wishes to access a BlockDriverState
-it must first call aio_context_acquire(bdrv_get_aio_context(bs)) to ensure
-that callbacks in the IOThread do not run in parallel.
-
Code running in the monitor typically needs to ensure that past
requests from the guest are completed. When a block device is running
in an IOThread, the IOThread can also process requests from the guest
(via ioeventfd). To achieve both objects, wrap the code between
bdrv_drained_begin() and bdrv_drained_end(), thus creating a "drained
-section". The functions must be called between aio_context_acquire()
-and aio_context_release(). You can freely release and re-acquire the
-AioContext within a drained section.
-
-Long-running jobs (usually in the form of coroutines) are best scheduled in
-the BlockDriverState's AioContext to avoid the need to acquire/release around
-each bdrv_*() call. The functions bdrv_add/remove_aio_context_notifier,
-or alternatively blk_add/remove_aio_context_notifier if you use BlockBackends,
-can be used to get a notification whenever bdrv_try_change_aio_context() moves a
+section".
+
+Long-running jobs (usually in the form of coroutines) are often scheduled in
+the BlockDriverState's AioContext. The functions
+bdrv_add/remove_aio_context_notifier, or alternatively
+blk_add/remove_aio_context_notifier if you use BlockBackends, can be used to
+get a notification whenever bdrv_try_change_aio_context() moves a
BlockDriverState to a different AioContext.
--
2.39.3

File diff suppressed because it is too large Load Diff

@ -1,94 +0,0 @@
From a5b4eec5f456b1ca3fe753e1d76f96cf3f8914ef Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 17 Jan 2024 14:55:53 +0100
Subject: [PATCH 01/22] hv-balloon: use get_min_alignment() to express 32 GiB
alignment
RH-Author: David Hildenbrand <david@redhat.com>
RH-MergeRequest: 221: memory-device: reintroduce memory region size check
RH-Jira: RHEL-20341
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Igor Mammedov <imammedo@redhat.com>
RH-Commit: [1/2] cbe092fe549552928270892253b31cd8fe199825
https://issues.redhat.com/browse/RHEL-20341
Let's implement the get_min_alignment() callback for memory devices, and
copy for the device memory region the alignment of the host memory
region. This mimics what virtio-mem does, and allows for re-introducing
proper alignment checks for the memory region size (where we don't care
about additional device requirements) in memory device core.
Message-ID: <20240117135554.787344-2-david@redhat.com>
Reviewed-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
(cherry picked from commit f77c5f38f49c71bc14cf1019ac92b0b95f572414)
Signed-off-by: David Hildenbrand <david@redhat.com>
---
hw/hyperv/hv-balloon.c | 37 +++++++++++++++++++++----------------
1 file changed, 21 insertions(+), 16 deletions(-)
diff --git a/hw/hyperv/hv-balloon.c b/hw/hyperv/hv-balloon.c
index 66f297c1d7..0829c495b0 100644
--- a/hw/hyperv/hv-balloon.c
+++ b/hw/hyperv/hv-balloon.c
@@ -1476,22 +1476,7 @@ static void hv_balloon_ensure_mr(HvBalloon *balloon)
balloon->mr = g_new0(MemoryRegion, 1);
memory_region_init(balloon->mr, OBJECT(balloon), TYPE_HV_BALLOON,
memory_region_size(hostmem_mr));
-
- /*
- * The VM can indicate an alignment up to 32 GiB. Memory device core can
- * usually only handle/guarantee 1 GiB alignment. The user will have to
- * specify a larger maxmem eventually.
- *
- * The memory device core will warn the user in case maxmem might have to be
- * increased and will fail plugging the device if there is not sufficient
- * space after alignment.
- *
- * TODO: we could do the alignment ourselves in a slightly bigger region.
- * But this feels better, although the warning might be annoying. Maybe
- * we can optimize that in the future (e.g., with such a device on the
- * cmdline place/size the device memory region differently.
- */
- balloon->mr->align = MAX(32 * GiB, memory_region_get_alignment(hostmem_mr));
+ balloon->mr->align = memory_region_get_alignment(hostmem_mr);
}
static void hv_balloon_free_mr(HvBalloon *balloon)
@@ -1653,6 +1638,25 @@ static MemoryRegion *hv_balloon_md_get_memory_region(MemoryDeviceState *md,
return balloon->mr;
}
+static uint64_t hv_balloon_md_get_min_alignment(const MemoryDeviceState *md)
+{
+ /*
+ * The VM can indicate an alignment up to 32 GiB. Memory device core can
+ * usually only handle/guarantee 1 GiB alignment. The user will have to
+ * specify a larger maxmem eventually.
+ *
+ * The memory device core will warn the user in case maxmem might have to be
+ * increased and will fail plugging the device if there is not sufficient
+ * space after alignment.
+ *
+ * TODO: we could do the alignment ourselves in a slightly bigger region.
+ * But this feels better, although the warning might be annoying. Maybe
+ * we can optimize that in the future (e.g., with such a device on the
+ * cmdline place/size the device memory region differently).
+ */
+ return 32 * GiB;
+}
+
static void hv_balloon_md_fill_device_info(const MemoryDeviceState *md,
MemoryDeviceInfo *info)
{
@@ -1765,5 +1769,6 @@ static void hv_balloon_class_init(ObjectClass *klass, void *data)
mdc->get_memory_region = hv_balloon_md_get_memory_region;
mdc->decide_memslots = hv_balloon_decide_memslots;
mdc->get_memslots = hv_balloon_get_memslots;
+ mdc->get_min_alignment = hv_balloon_md_get_min_alignment;
mdc->fill_device_info = hv_balloon_md_fill_device_info;
}
--
2.39.3

@ -1,42 +0,0 @@
From ceaee9c4372bbdc4196cb6808515047388f7aa26 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
Date: Tue, 21 Nov 2023 16:44:18 +0800
Subject: [PATCH 039/101] hw/arm: Activate IOMMUFD for virt machines
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [38/67] 0a059ae661616e95eb8455e17f35774495cae8e7 (eauger1/centos-qemu-kvm)
Signed-off-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 0970238343af45a8b547695bfc22f18d4eb7da7e)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/arm/Kconfig | 1 +
1 file changed, 1 insertion(+)
diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig
index 3ada335a24..660f49db49 100644
--- a/hw/arm/Kconfig
+++ b/hw/arm/Kconfig
@@ -8,6 +8,7 @@ config ARM_VIRT
imply TPM_TIS_SYSBUS
imply TPM_TIS_I2C
imply NVDIMM
+ imply IOMMUFD
select ARM_GIC
select ACPI
select ARM_SMMUV3
--
2.39.3

@ -1,88 +0,0 @@
From e670722b9a6460d41497688d820d5a9a9b51d8e9 Mon Sep 17 00:00:00 2001
From: Gavin Shan <gshan@redhat.com>
Date: Tue, 9 Jan 2024 11:36:42 +1000
Subject: [PATCH 001/101] hw/arm/virt: Add properties to disable high memory
regions
RH-Author: Gavin Shan <gshan@redhat.com>
RH-MergeRequest: 210: hw/arm/virt: Add properties to disable high memory regions
RH-Jira: RHEL-19738
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Eric Auger <eric.auger@redhat.com>
RH-Commit: [1/1] 4097ba5133a67126e30b84202cb40df4e019c5f4
Upstream: RHEL-only
Brew: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=57927352
There are 3 high memory regions for GICv3 or GICv4 redistributor, PCI
ECAM and PCI MMIO. Each of them has a property introduced by upstream
commit 6a48c64eec ("hw/arm/virt: Add properties to disable high memory
regions") so that the corresponding high memory region can be disabled.
Note that another property ("compact-highmem") was introduced by
upstream commit f40408a9fe ("hw/arm/virt: Add 'compact-highmem' property")
so that the compact high memory region layout can be disabled during
assignment, for compatibility with the old machine types. However, we don't
have this compatibility issue since the compact high memory region layout is
always kept disabled until the RHEL9.2.0 machine type and onwards.
Expose those 3 properties: "highmem-redists", "highmem-ecam" and
"highmem-mmio". The property "compact-highmem" is kept as hidden.
Signed-off-by: Gavin Shan <gshan@redhat.com>
---
hw/arm/virt.c | 24 +++++++++++++++++++++++-
1 file changed, 23 insertions(+), 1 deletion(-)
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 5cab00b4cd..60f117f0d2 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -2456,6 +2456,7 @@ static void virt_set_compact_highmem(Object *obj, bool value, Error **errp)
vms->highmem_compact = value;
}
+#endif /* disabled for RHEL */
static bool virt_get_highmem_redists(Object *obj, Error **errp)
{
@@ -2498,7 +2499,6 @@ static void virt_set_highmem_mmio(Object *obj, bool value, Error **errp)
vms->highmem_mmio = value;
}
-#endif /* disabled for RHEL */
static bool virt_get_its(Object *obj, Error **errp)
{
@@ -3521,6 +3521,28 @@ static void rhel_machine_class_init(ObjectClass *oc, void *data)
"Set on/off to enable/disable using "
"physical address space above 32 bits");
+ object_class_property_add_bool(oc, "highmem-redists",
+ virt_get_highmem_redists,
+ virt_set_highmem_redists);
+ object_class_property_set_description(oc, "highmem-redists",
+ "Set on/off to enable/disable high "
+ "memory region for GICv3 or GICv4 "
+ "redistributor");
+
+ object_class_property_add_bool(oc, "highmem-ecam",
+ virt_get_highmem_ecam,
+ virt_set_highmem_ecam);
+ object_class_property_set_description(oc, "highmem-ecam",
+ "Set on/off to enable/disable high "
+ "memory region for PCI ECAM");
+
+ object_class_property_add_bool(oc, "highmem-mmio",
+ virt_get_highmem_mmio,
+ virt_set_highmem_mmio);
+ object_class_property_set_description(oc, "highmem-mmio",
+ "Set on/off to enable/disable high "
+ "memory region for PCI MMIO");
+
object_class_property_add_str(oc, "gic-version", virt_get_gic_version,
virt_set_gic_version);
object_class_property_set_description(oc, "gic-version",
--
2.39.3

@ -0,0 +1,120 @@
From 41c4083269ec772b406c6c57b496ca2011f928c7 Mon Sep 17 00:00:00 2001
From: Zhenyu Zhang <zhenyzha@redhat.com>
Date: Tue, 9 Jul 2024 23:08:59 -0400
Subject: [PATCH 2/2] hw/arm/virt: Avoid unexpected warning from Linux guest on
host with Fujitsu CPUs
RH-Author: zhenyzha <None>
RH-MergeRequest: 256: hw/arm/virt: Avoid unexpected warning from Linux guest on host with Fujitsu CPUs
RH-Jira: RHEL-39936
RH-Acked-by: Gavin Shan <gshan@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
RH-Commit: [1/1] fdf156fd05b219a06e2e2ca409fff0f728c1e2cf (zhenyzha/qemu-kvm)
JIRA: https://issues.redhat.com/browse/RHEL-39936
Multiple warning messages and corresponding backtraces are observed when Linux
guest is booted on the host with Fujitsu CPUs. One of them is shown as below.
[ 0.032443] ------------[ cut here ]------------
[ 0.032446] uart-pl011 9000000.pl011: ARCH_DMA_MINALIGN smaller than
CTR_EL0.CWG (128 < 256)
[ 0.032454] WARNING: CPU: 0 PID: 1 at arch/arm64/mm/dma-mapping.c:54
arch_setup_dma_ops+0xbc/0xcc
[ 0.032470] Modules linked in:
[ 0.032475] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.14.0-452.el9.aarch64
[ 0.032481] Hardware name: linux,dummy-virt (DT)
[ 0.032484] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[ 0.032490] pc : arch_setup_dma_ops+0xbc/0xcc
[ 0.032496] lr : arch_setup_dma_ops+0xbc/0xcc
[ 0.032501] sp : ffff80008003b860
[ 0.032503] x29: ffff80008003b860 x28: 0000000000000000 x27: ffffaae4b949049c
[ 0.032510] x26: 0000000000000000 x25: 0000000000000000 x24: 0000000000000000
[ 0.032517] x23: 0000000000000100 x22: 0000000000000000 x21: 0000000000000000
[ 0.032523] x20: 0000000100000000 x19: ffff2f06c02ea400 x18: ffffffffffffffff
[ 0.032529] x17: 00000000208a5f76 x16: 000000006589dbcb x15: ffffaae4ba071c89
[ 0.032535] x14: 0000000000000000 x13: ffffaae4ba071c84 x12: 455f525443206e61
[ 0.032541] x11: 68742072656c6c61 x10: 0000000000000029 x9 : ffffaae4b7d21da4
[ 0.032547] x8 : 0000000000000029 x7 : 4c414e494d5f414d x6 : 0000000000000029
[ 0.032553] x5 : 000000000000000f x4 : ffffaae4b9617a00 x3 : 0000000000000001
[ 0.032558] x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff2f06c029be40
[ 0.032564] Call trace:
[ 0.032566] arch_setup_dma_ops+0xbc/0xcc
[ 0.032572] of_dma_configure_id+0x138/0x300
[ 0.032591] amba_dma_configure+0x34/0xc0
[ 0.032600] really_probe+0x78/0x3dc
[ 0.032614] __driver_probe_device+0x108/0x160
[ 0.032619] driver_probe_device+0x44/0x114
[ 0.032624] __device_attach_driver+0xb8/0x14c
[ 0.032629] bus_for_each_drv+0x88/0xe4
[ 0.032634] __device_attach+0xb0/0x1e0
[ 0.032638] device_initial_probe+0x18/0x20
[ 0.032643] bus_probe_device+0xa8/0xb0
[ 0.032648] device_add+0x4b4/0x6c0
[ 0.032652] amba_device_try_add.part.0+0x48/0x360
[ 0.032657] amba_device_add+0x104/0x144
[ 0.032662] of_amba_device_create.isra.0+0x100/0x1c4
[ 0.032666] of_platform_bus_create+0x294/0x35c
[ 0.032669] of_platform_populate+0x5c/0x150
[ 0.032672] of_platform_default_populate_init+0xd0/0xec
[ 0.032697] do_one_initcall+0x4c/0x2e0
[ 0.032701] do_initcalls+0x100/0x13c
[ 0.032707] kernel_init_freeable+0x1c8/0x21c
[ 0.032712] kernel_init+0x28/0x140
[ 0.032731] ret_from_fork+0x10/0x20
[ 0.032735] ---[ end trace 0000000000000000 ]---
In Linux, a check is applied to every device which is exposed through
device-tree node. The warning message is raised when the device isn't
DMA coherent and the cache line size is larger than ARCH_DMA_MINALIGN
(128 bytes). The cache line size is sourced from CTR_EL0[CWG], which corresponds
to 256 bytes on the guest CPUs. The DMA coherent capability is claimed
through 'dma-coherent' in their device-tree nodes or parent nodes.
This happens even when the device doesn't implement or use DMA at all,
for legacy reasons.
Fix the issue by adding 'dma-coherent' property to the device-tree root
node, meaning all devices are capable of DMA coherent by default.
This both suppresses the spurious kernel warnings and also guards
against possible future QEMU bugs where we add a DMA-capable device
and forget to mark it as dma-coherent.
Signed-off-by: Zhenyu Zhang <zhenyzha@redhat.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Donald Dutile <ddutile@redhat.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Message-id: 20240612020506.307793-1-zhenyzha@redhat.com
[PMM: tweaked commit message]
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
(cherry picked from commit dda533087ad5559674ff486e7031c88dc01e0abd)
Signed-off-by: Zhenyu Zhang <zhenyzha@redhat.com>
---
hw/arm/virt.c | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 3f0496cdb9..6ece67f11d 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -330,6 +330,17 @@ static void create_fdt(VirtMachineState *vms)
qemu_fdt_setprop_cell(fdt, "/", "#size-cells", 0x2);
qemu_fdt_setprop_string(fdt, "/", "model", "linux,dummy-virt");
+ /*
+ * For QEMU, all DMA is coherent. Advertising this in the root node
+ * has two benefits:
+ *
+ * - It avoids potential bugs where we forget to mark a DMA
+ * capable device as being dma-coherent
+ * - It avoids spurious warnings from the Linux kernel about
+ * devices which can't do DMA at all
+ */
+ qemu_fdt_setprop(fdt, "/", "dma-coherent", NULL, 0);
+
/* /chosen must exist for load_dtb to fill in necessary properties later */
qemu_fdt_add_subnode(fdt, "/chosen");
if (vms->dtb_randomness) {
--
2.39.3

@ -1,132 +0,0 @@
From 3f58194f8642a71c47d91d3c00a34faf44ea2c11 Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Wed, 3 Jan 2024 05:57:38 -0500
Subject: [PATCH] hw/arm/virt: Fix compats
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 209: hw/arm/virt: Fix compats
RH-Jira: RHEL-17168
RH-Acked-by: Gavin Shan <gshan@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [1/1] bcdf6493bbd6d7b52b0b88ff44441d22aeddfde2 (eauger1/centos-qemu-kvm)
arm_rhel_compat is not added for virt-rhel9.4.0 machine causing
the efi-virtio.rom to be looked for when instantiating a virtio-net-pci
device and it won't be found since not shipped on ARM. This is a
regression compared to 9.2.
Actually we do not need any rom file for any virtio-net-pci variant
because edk2 already brings the functionality. So for 9.4 onwards, we
want to set romfiles to "" for all of them.
However at the moment we apply arm_rhel_compat from the latest
rhel*_virt_options(). This is not aligned with the generic compat
usage which sets compats for a given machine type to accommodate for
changes that occurred after its advent. Here we are somehow abusing
the compat infra to set general driver options that should apply for
all machines. On top of that this is really error prone and we have
forgotten to add arm_rhel_compat several times in the past.
So let's introduce set_arm_rhel_compat() being called before any
*virt_options in the non abstract machine class. That way the setting
will apply to any machine type without any need to add it in any
future machine types.
For < 9.4 machines we don't really care keeping non void romfiles
for transitional and non transitional devices because anyway this was
not working. So let's keep things simple and apply the new defaults for
all RHEL9 machine types.
Finally, to follow the generic pattern we should set hw_compat_rhel_9_0
in 9.0 machine as it is done on x86 or ccw. This has no consequence on
aarch64 because it only contains x86 stuff but that helps understanding
the consistency.
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/arm/virt.c | 43 +++++++++++++++++++++++++++++--------------
1 file changed, 29 insertions(+), 14 deletions(-)
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 0b17c94ad7..5cab00b4cd 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -111,11 +111,39 @@
DEFINE_VIRT_MACHINE_LATEST(major, minor, false)
#endif /* disabled for RHEL */
+/*
+ * This variable is for changes to properties that are RHEL specific,
+ * different to the current upstream and to be applied to the latest
+ * machine type. They may be overriden by older machine compats.
+ *
+ * virtio-net-pci variant romfiles are not needed because edk2 does
+ * fully support the pxe boot. Besides virtio romfiles are not shipped
+ * on rhel/aarch64.
+ */
+GlobalProperty arm_rhel_compat[] = {
+ {"virtio-net-pci", "romfile", "" },
+ {"virtio-net-pci-transitional", "romfile", "" },
+ {"virtio-net-pci-non-transitional", "romfile", "" },
+};
+const size_t arm_rhel_compat_len = G_N_ELEMENTS(arm_rhel_compat);
+
+/*
+ * This cannot be called from the rhel_virt_class_init() because
+ * TYPE_RHEL_MACHINE is abstract and mc->compat_props g_ptr_array_new()
+ * only is called on virt-rhelm.n.s non abstract class init.
+ */
+static void arm_rhel_compat_set(MachineClass *mc)
+{
+ compat_props_add(mc->compat_props, arm_rhel_compat,
+ arm_rhel_compat_len);
+}
+
#define DEFINE_RHEL_MACHINE_LATEST(m, n, s, latest) \
static void rhel##m##n##s##_virt_class_init(ObjectClass *oc, \
void *data) \
{ \
MachineClass *mc = MACHINE_CLASS(oc); \
+ arm_rhel_compat_set(mc); \
rhel##m##n##s##_virt_options(mc); \
mc->desc = "RHEL " # m "." # n "." # s " ARM Virtual Machine"; \
if (latest) { \
@@ -139,19 +167,6 @@
#define DEFINE_RHEL_MACHINE(major, minor, subminor) \
DEFINE_RHEL_MACHINE_LATEST(major, minor, subminor, false)
-/* This variable is for changes to properties that are RHEL specific,
- * different to the current upstream and to be applied to the latest
- * machine type.
- */
-GlobalProperty arm_rhel_compat[] = {
- {
- .driver = "virtio-net-pci",
- .property = "romfile",
- .value = "",
- },
-};
-const size_t arm_rhel_compat_len = G_N_ELEMENTS(arm_rhel_compat);
-
/* Number of external interrupt lines to configure the GIC with */
#define NUM_IRQS 256
@@ -3639,7 +3654,6 @@ static void rhel920_virt_options(MachineClass *mc)
{
rhel940_virt_options(mc);
- compat_props_add(mc->compat_props, arm_rhel_compat, arm_rhel_compat_len);
compat_props_add(mc->compat_props, hw_compat_rhel_9_4, hw_compat_rhel_9_4_len);
compat_props_add(mc->compat_props, hw_compat_rhel_9_3, hw_compat_rhel_9_3_len);
compat_props_add(mc->compat_props, hw_compat_rhel_9_2, hw_compat_rhel_9_2_len);
@@ -3653,6 +3667,7 @@ static void rhel900_virt_options(MachineClass *mc)
rhel920_virt_options(mc);
compat_props_add(mc->compat_props, hw_compat_rhel_9_1, hw_compat_rhel_9_1_len);
+ compat_props_add(mc->compat_props, hw_compat_rhel_9_0, hw_compat_rhel_9_0_len);
/* Disable FEAT_LPA2 since old kernels (<= v5.12) don't boot with that feature */
vmc->no_tcg_lpa2 = true;
--
2.39.3

@ -0,0 +1,59 @@
From e3360c415f7de923d27c3167260a93cb679afabe Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Mon, 6 May 2024 15:09:43 +0200
Subject: [PATCH 1/2] hw/arm/virt: Fix spurious call to arm_virt_compat_set()
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 238: hw/arm/virt: Fix spurious call to arm_virt_compat_set()
RH-Jira: RHEL-34945
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
RH-Acked-by: Gavin Shan <gshan@redhat.com>
RH-Commit: [1/1] a858a3e1dff12b28e14f7e4bd2b896a9f06eacbb (eauger1/centos-qemu-kvm)
JIRA: https://issues.redhat.com/browse/RHEL-34945
Status: RHEL-only
Downstream, we apply arm_rhel_compat in place of arm_virt_compat.
This is done through arm_rhel_compat_set() transparently called in
DEFINE_RHEL_MACHINE_LATEST(). So there is no need to call
arm_virt_compat_set() in rhel_machine_class_init(). Besides
this triggers a "GLib: g_ptr_array_add: assertion 'rarray' failed"
warning.
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/arm/virt.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index f1af9495c6..3f0496cdb9 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -85,6 +85,7 @@
#include "hw/char/pl011.h"
#include "qemu/guest-random.h"
+#if 0 /* Disabled for Red Hat Enterprise Linux */
static GlobalProperty arm_virt_compat[] = {
{ TYPE_VIRTIO_IOMMU_PCI, "aw-bits", "48" },
};
@@ -101,7 +102,6 @@ static void arm_virt_compat_set(MachineClass *mc)
arm_virt_compat_len);
}
-#if 0 /* Disabled for Red Hat Enterprise Linux */
#define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \
static void virt_##major##_##minor##_class_init(ObjectClass *oc, \
void *data) \
@@ -3536,7 +3536,6 @@ static void rhel_machine_class_init(ObjectClass *oc, void *data)
{
MachineClass *mc = MACHINE_CLASS(oc);
HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc);
- arm_virt_compat_set(mc);
mc->family = "virt-rhel-Z";
mc->init = machvirt_init;
--
2.39.3

@ -1,41 +0,0 @@
From 4c1d07995a7afb6fae68a7e7a8b6b6c94fa0a7bb Mon Sep 17 00:00:00 2001
From: Cornelia Huck <cohuck@redhat.com>
Date: Mon, 12 Feb 2024 10:37:54 +0100
Subject: [PATCH 5/6] hw/arm/virt: deprecate virt-rhel9.{0,2}.0 machine types
RH-Author: Cornelia Huck <cohuck@redhat.com>
RH-MergeRequest: 225: hw/arm/virt: deprecate virt-rhel9.{0,2}.0 machine types
RH-Jira: RHEL-24988
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Acked-by: Eric Auger <eric.auger@redhat.com>
RH-Commit: [1/1] f15579db44808fa8a2d7bc01b3915aa59c064411 (cohuck/qemu-kvm-c9s)
Jira: https://issues.redhat.com/browse/RHEL-24988
Upstream: RHEL only
We do not plan to support any machine types prior to 9.4.0; leave them
in, but mark as deprecated.
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
---
hw/arm/virt.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 60f117f0d2..943c563391 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -3679,6 +3679,10 @@ static void rhel920_virt_options(MachineClass *mc)
compat_props_add(mc->compat_props, hw_compat_rhel_9_4, hw_compat_rhel_9_4_len);
compat_props_add(mc->compat_props, hw_compat_rhel_9_3, hw_compat_rhel_9_3_len);
compat_props_add(mc->compat_props, hw_compat_rhel_9_2, hw_compat_rhel_9_2_len);
+
+ /* RHEL 9.4 is the first supported release */
+ mc->deprecation_reason =
+ "machine types for versions prior to 9.4 are deprecated";
}
DEFINE_RHEL_MACHINE(9, 2, 0)
--
2.39.3

@ -1,41 +0,0 @@
From 7a6be312c11911bdd2ce82566be22a3e014947c2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
Date: Tue, 21 Nov 2023 16:44:20 +0800
Subject: [PATCH 041/101] hw/i386: Activate IOMMUFD for q35 machines
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [40/67] b15764ab24fd57389a8d219736613484acd7d29e (eauger1/centos-qemu-kvm)
Signed-off-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 64ad06f6eba66c514477f490bcba409439a480d8)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/i386/Kconfig | 1 +
1 file changed, 1 insertion(+)
diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig
index 55850791df..a1846be6f7 100644
--- a/hw/i386/Kconfig
+++ b/hw/i386/Kconfig
@@ -95,6 +95,7 @@ config Q35
imply E1000E_PCI_EXPRESS
imply VMPORT
imply VMMOUSE
+ imply IOMMUFD
select PC_PCI
select PC_ACPI
select PCI_EXPRESS_Q35
--
2.39.3

@ -0,0 +1,73 @@
From e74980be81d641736ea9d44d0fe9af02af63a220 Mon Sep 17 00:00:00 2001
From: Michael Roth <michael.roth@amd.com>
Date: Thu, 30 May 2024 06:16:40 -0500
Subject: [PATCH 083/100] hw/i386: Add support for loading BIOS using
guest_memfd
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [83/91] 7b77d212ef7d83b66ad9d8348179ee84e64fb911 (bonzini/rhel-qemu-kvm)
When guest_memfd is enabled, the BIOS is generally part of the initial
encrypted guest image and will be accessed as private guest memory. Add
the necessary changes to set up the associated RAM region with a
guest_memfd backend to allow for this.
Current support centers around using -bios to load the BIOS data.
Support for loading the BIOS via pflash requires additional enablement
since those interfaces rely on the use of ROM memory regions which make
use of the KVM_MEM_READONLY memslot flag, which is not supported for
guest_memfd-backed memslots.
Signed-off-by: Michael Roth <michael.roth@amd.com>
Signed-off-by: Pankaj Gupta <pankaj.gupta@amd.com>
Message-ID: <20240530111643.1091816-29-pankaj.gupta@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit fc7a69e177e4ba26d11fcf47b853f85115b35a11)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
hw/i386/x86-common.c | 17 ++++++++++++-----
1 file changed, 12 insertions(+), 5 deletions(-)
diff --git a/hw/i386/x86-common.c b/hw/i386/x86-common.c
index 35fe6eabea..6cbb76c25c 100644
--- a/hw/i386/x86-common.c
+++ b/hw/i386/x86-common.c
@@ -969,8 +969,13 @@ void x86_bios_rom_init(X86MachineState *x86ms, const char *default_firmware,
(bios_size % 65536) != 0) {
goto bios_error;
}
- memory_region_init_ram(&x86ms->bios, NULL, "pc.bios", bios_size,
- &error_fatal);
+ if (machine_require_guest_memfd(MACHINE(x86ms))) {
+ memory_region_init_ram_guest_memfd(&x86ms->bios, NULL, "pc.bios",
+ bios_size, &error_fatal);
+ } else {
+ memory_region_init_ram(&x86ms->bios, NULL, "pc.bios",
+ bios_size, &error_fatal);
+ }
if (sev_enabled()) {
/*
* The concept of a "reset" simply doesn't exist for
@@ -991,9 +996,11 @@ void x86_bios_rom_init(X86MachineState *x86ms, const char *default_firmware,
}
g_free(filename);
- /* map the last 128KB of the BIOS in ISA space */
- x86_isa_bios_init(&x86ms->isa_bios, rom_memory, &x86ms->bios,
- !isapc_ram_fw);
+ if (!machine_require_guest_memfd(MACHINE(x86ms))) {
+ /* map the last 128KB of the BIOS in ISA space */
+ x86_isa_bios_init(&x86ms->isa_bios, rom_memory, &x86ms->bios,
+ !isapc_ram_fw);
+ }
/* map all the bios at the top of memory */
memory_region_add_subregion(rom_memory,
--
2.39.3

@ -0,0 +1,106 @@
From c1e615d6b8f609b72a94ffe6d31a9848a41744ef Mon Sep 17 00:00:00 2001
From: Bernhard Beschow <shentey@gmail.com>
Date: Tue, 30 Apr 2024 17:06:39 +0200
Subject: [PATCH 038/100] hw/i386: Have x86_bios_rom_init() take
X86MachineState rather than MachineState
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [38/91] 59f388b1dffc5d0aa2f0fff768194d755bc3efbb (bonzini/rhel-qemu-kvm)
The function creates and leaks two MemoryRegion objects regarding the BIOS which
will be moved into X86MachineState in the next steps to avoid the leakage.
Signed-off-by: Bernhard Beschow <shentey@gmail.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Message-ID: <20240430150643.111976-3-shentey@gmail.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
(cherry picked from commit 848351840148f8c3b53ddf6210194506547d3ffd)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
hw/i386/microvm.c | 2 +-
hw/i386/pc_sysfw.c | 4 ++--
hw/i386/x86.c | 4 ++--
include/hw/i386/x86.h | 2 +-
4 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/hw/i386/microvm.c b/hw/i386/microvm.c
index 61a772dfe6..fec63cacfa 100644
--- a/hw/i386/microvm.c
+++ b/hw/i386/microvm.c
@@ -278,7 +278,7 @@ static void microvm_devices_init(MicrovmMachineState *mms)
default_firmware = x86_machine_is_acpi_enabled(x86ms)
? MICROVM_BIOS_FILENAME
: MICROVM_QBOOT_FILENAME;
- x86_bios_rom_init(MACHINE(mms), default_firmware, get_system_memory(), true);
+ x86_bios_rom_init(x86ms, default_firmware, get_system_memory(), true);
}
static void microvm_memory_init(MicrovmMachineState *mms)
diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c
index 3efabbbab2..ef7dea9798 100644
--- a/hw/i386/pc_sysfw.c
+++ b/hw/i386/pc_sysfw.c
@@ -206,7 +206,7 @@ void pc_system_firmware_init(PCMachineState *pcms,
BlockBackend *pflash_blk[ARRAY_SIZE(pcms->flash)];
if (!pcmc->pci_enabled) {
- x86_bios_rom_init(MACHINE(pcms), "bios.bin", rom_memory, true);
+ x86_bios_rom_init(X86_MACHINE(pcms), "bios.bin", rom_memory, true);
return;
}
@@ -227,7 +227,7 @@ void pc_system_firmware_init(PCMachineState *pcms,
if (!pflash_blk[0]) {
/* Machine property pflash0 not set, use ROM mode */
- x86_bios_rom_init(MACHINE(pcms), "bios.bin", rom_memory, false);
+ x86_bios_rom_init(X86_MACHINE(pcms), "bios.bin", rom_memory, false);
} else {
if (kvm_enabled() && !kvm_readonly_mem_enabled()) {
/*
diff --git a/hw/i386/x86.c b/hw/i386/x86.c
index 2a4f3ee285..6d3c72f124 100644
--- a/hw/i386/x86.c
+++ b/hw/i386/x86.c
@@ -1128,7 +1128,7 @@ void x86_load_linux(X86MachineState *x86ms,
nb_option_roms++;
}
-void x86_bios_rom_init(MachineState *ms, const char *default_firmware,
+void x86_bios_rom_init(X86MachineState *x86ms, const char *default_firmware,
MemoryRegion *rom_memory, bool isapc_ram_fw)
{
const char *bios_name;
@@ -1138,7 +1138,7 @@ void x86_bios_rom_init(MachineState *ms, const char *default_firmware,
ssize_t ret;
/* BIOS load */
- bios_name = ms->firmware ?: default_firmware;
+ bios_name = MACHINE(x86ms)->firmware ?: default_firmware;
filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name);
if (filename) {
bios_size = get_image_size(filename);
diff --git a/include/hw/i386/x86.h b/include/hw/i386/x86.h
index 4dc30dcb4d..cb07618d19 100644
--- a/include/hw/i386/x86.h
+++ b/include/hw/i386/x86.h
@@ -116,7 +116,7 @@ void x86_cpu_unplug_request_cb(HotplugHandler *hotplug_dev,
void x86_cpu_unplug_cb(HotplugHandler *hotplug_dev,
DeviceState *dev, Error **errp);
-void x86_bios_rom_init(MachineState *ms, const char *default_firmware,
+void x86_bios_rom_init(X86MachineState *x86ms, const char *default_firmware,
MemoryRegion *rom_memory, bool isapc_ram_fw);
void x86_load_linux(X86MachineState *x86ms,
--
2.39.3

@ -0,0 +1,51 @@
From 7bb1f124413891bc5d2187f12cd19da6e794904b Mon Sep 17 00:00:00 2001
From: Xiaoyao Li <xiaoyao.li@intel.com>
Date: Wed, 3 Apr 2024 10:59:53 -0400
Subject: [PATCH 010/100] hw/i386/acpi: Set PCAT_COMPAT bit only when pic is
not disabled
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [10/91] 62110e4bf52cb3e106c8d2a902bbd31548beba00 (bonzini/rhel-qemu-kvm)
A value 1 of PCAT_COMPAT (bit 0) of MADT.Flags indicates that the system
also has a PC-AT-compatible dual-8259 setup, i.e., the PIC. When PIC
is not enabled (pic=off) for x86 machine, the PCAT_COMPAT bit needs to
be cleared. The PIC probe should then print:
[ 0.155970] Using NULL legacy PIC
However, no such log is printed in the guest kernel unless PCAT_COMPAT is
cleared.
Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
Message-ID: <20240403145953.3082491-1-xiaoyao.li@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit 292dd287e78e0cbafde9d1522c729349d132d844)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
hw/i386/acpi-common.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/hw/i386/acpi-common.c b/hw/i386/acpi-common.c
index 20f19269da..0cc2919bb8 100644
--- a/hw/i386/acpi-common.c
+++ b/hw/i386/acpi-common.c
@@ -107,7 +107,9 @@ void acpi_build_madt(GArray *table_data, BIOSLinker *linker,
acpi_table_begin(&table, table_data);
/* Local APIC Address */
build_append_int_noprefix(table_data, APIC_DEFAULT_ADDRESS, 4);
- build_append_int_noprefix(table_data, 1 /* PCAT_COMPAT */, 4); /* Flags */
+ /* Flags. bit 0: PCAT_COMPAT */
+ build_append_int_noprefix(table_data,
+ x86ms->pic != ON_OFF_AUTO_OFF ? 1 : 0 , 4);
for (i = 0; i < apic_ids->len; i++) {
pc_madt_cpu_entry(i, apic_ids, table_data, false);
--
2.39.3

@ -1,186 +0,0 @@
From ea2e2368dcf4140be47288472f2c2a094358e0c7 Mon Sep 17 00:00:00 2001
From: Igor Mammedov <imammedo@redhat.com>
Date: Thu, 8 Feb 2024 23:03:45 +0100
Subject: [PATCH 03/20] hw/i386/pc: Defer smbios_set_defaults() to machine_done
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Igor Mammedov <imammedo@redhat.com>
RH-MergeRequest: 230: Workaround Windows failing to find 64bit SMBIOS entry point with SeaBIOS
RH-Jira: RHEL-21705
RH-Acked-by: MST <mst@redhat.com>
RH-Acked-by: Ani Sinha <None>
RH-Commit: [1/18] 9d4c1d1a910fec7d310429d6fc0b10c798932db7
JIRA: https://issues.redhat.com/browse/RHEL-21705
commit: a0204a5ed091dfe79aced7ec8f3ce1931fd25816
Author: Bernhard Beschow <shentey@gmail.com>
Handling most of smbios data generation in the machine_done notifier is similar
to how the ARM virt machine handles it which also calls smbios_set_defaults()
there. The result is that all pc machines are freed from explicitly worrying
about smbios setup.
Signed-off-by: Bernhard Beschow <shentey@gmail.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Message-ID: <20240208220349.4948-6-shentey@gmail.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Conflicts: hw/i386/pc_q35.c, hw/i386/pc_piix.c
due to missing 4d3457fef9 (hw/i386/pc: Merge pc_guest_info_init() into pc_machine_initfn())
and different signature of smbios_set_defaults() downstream
Fixup: hw/i386/fw_cfg.c to account for downstream changes smbios_set_defaults()
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
---
hw/i386/fw_cfg.c | 14 +++++++++++++-
hw/i386/fw_cfg.h | 3 ++-
hw/i386/pc.c | 2 +-
hw/i386/pc_piix.c | 12 ------------
hw/i386/pc_q35.c | 11 -----------
include/hw/i386/pc.h | 1 -
6 files changed, 16 insertions(+), 27 deletions(-)
diff --git a/hw/i386/fw_cfg.c b/hw/i386/fw_cfg.c
index 7362daa45a..6a5466faf0 100644
--- a/hw/i386/fw_cfg.c
+++ b/hw/i386/fw_cfg.c
@@ -48,15 +48,27 @@ const char *fw_cfg_arch_key_name(uint16_t key)
return NULL;
}
-void fw_cfg_build_smbios(MachineState *ms, FWCfgState *fw_cfg)
+void fw_cfg_build_smbios(PCMachineState *pcms, FWCfgState *fw_cfg)
{
#ifdef CONFIG_SMBIOS
uint8_t *smbios_tables, *smbios_anchor;
size_t smbios_tables_len, smbios_anchor_len;
struct smbios_phys_mem_area *mem_array;
unsigned i, array_count;
+ MachineState *ms = MACHINE(pcms);
+ PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
+ MachineClass *mc = MACHINE_GET_CLASS(pcms);
X86CPU *cpu = X86_CPU(ms->possible_cpus->cpus[0].cpu);
+ if (pcmc->smbios_defaults) {
+ /* These values are guest ABI, do not change */
+ smbios_set_defaults("QEMU", mc->desc, mc->name,
+ pcmc->smbios_legacy_mode, pcmc->smbios_uuid_encoded,
+ pcmc->smbios_stream_product,
+ pcmc->smbios_stream_version,
+ pcms->smbios_entry_point_type);
+ }
+
/* tell smbios about cpuid version and features */
smbios_set_cpuid(cpu->env.cpuid_version, cpu->env.features[FEAT_1_EDX]);
diff --git a/hw/i386/fw_cfg.h b/hw/i386/fw_cfg.h
index 86ca7c1c0c..1e1de6b4a3 100644
--- a/hw/i386/fw_cfg.h
+++ b/hw/i386/fw_cfg.h
@@ -10,6 +10,7 @@
#define HW_I386_FW_CFG_H
#include "hw/boards.h"
+#include "hw/i386/pc.h"
#include "hw/nvram/fw_cfg.h"
#define FW_CFG_IO_BASE 0x510
@@ -22,7 +23,7 @@
FWCfgState *fw_cfg_arch_create(MachineState *ms,
uint16_t boot_cpus,
uint16_t apic_id_limit);
-void fw_cfg_build_smbios(MachineState *ms, FWCfgState *fw_cfg);
+void fw_cfg_build_smbios(PCMachineState *ms, FWCfgState *fw_cfg);
void fw_cfg_build_feature_control(MachineState *ms, FWCfgState *fw_cfg);
void fw_cfg_add_acpi_dsdt(Aml *scope, FWCfgState *fw_cfg);
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index a1faa9e92c..16de2a59e8 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -847,7 +847,7 @@ void pc_machine_done(Notifier *notifier, void *data)
acpi_setup();
if (x86ms->fw_cfg) {
- fw_cfg_build_smbios(MACHINE(pcms), x86ms->fw_cfg);
+ fw_cfg_build_smbios(pcms, x86ms->fw_cfg);
fw_cfg_build_feature_control(MACHINE(pcms), x86ms->fw_cfg);
/* update FW_CFG_NB_CPUS to account for -device added CPUs */
fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus);
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 09d02cc91f..7344b35cf1 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -36,7 +36,6 @@
#include "hw/rtc/mc146818rtc.h"
#include "hw/southbridge/piix.h"
#include "hw/display/ramfb.h"
-#include "hw/firmware/smbios.h"
#include "hw/pci/pci.h"
#include "hw/pci/pci_ids.h"
#include "hw/usb.h"
@@ -233,17 +232,6 @@ static void pc_init1(MachineState *machine,
pc_guest_info_init(pcms);
- if (pcmc->smbios_defaults) {
- MachineClass *mc = MACHINE_GET_CLASS(machine);
- /* These values are guest ABI, do not change */
- smbios_set_defaults("Red Hat", "KVM",
- mc->desc, pcmc->smbios_legacy_mode,
- pcmc->smbios_uuid_encoded,
- pcmc->smbios_stream_product,
- pcmc->smbios_stream_version,
- pcms->smbios_entry_point_type);
- }
-
/* allocate ram and load rom/bios */
if (!xen_enabled()) {
pc_memory_init(pcms, system_memory, rom_memory, hole64_size);
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index c6967e1846..9a22ff5dd6 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -45,7 +45,6 @@
#include "hw/i386/amd_iommu.h"
#include "hw/i386/intel_iommu.h"
#include "hw/display/ramfb.h"
-#include "hw/firmware/smbios.h"
#include "hw/ide/pci.h"
#include "hw/ide/ahci.h"
#include "hw/intc/ioapic.h"
@@ -201,16 +200,6 @@ static void pc_q35_init(MachineState *machine)
pc_guest_info_init(pcms);
- if (pcmc->smbios_defaults) {
- /* These values are guest ABI, do not change */
- smbios_set_defaults("Red Hat", "KVM",
- mc->desc, pcmc->smbios_legacy_mode,
- pcmc->smbios_uuid_encoded,
- pcmc->smbios_stream_product,
- pcmc->smbios_stream_version,
- pcms->smbios_entry_point_type);
- }
-
/* create pci host bus */
phb = OBJECT(qdev_new(TYPE_Q35_HOST_DEVICE));
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 37644ede7e..c286c10bc3 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -12,7 +12,6 @@
#include "hw/hotplug.h"
#include "qom/object.h"
#include "hw/i386/sgx-epc.h"
-#include "hw/firmware/smbios.h"
#include "hw/cxl/cxl.h"
#define HPET_INTCAP "hpet-intcap"
--
2.39.3

@ -0,0 +1,164 @@
From fd6de3c5e97bdf13a39342fc71815a20c66867ae Mon Sep 17 00:00:00 2001
From: Bernhard Beschow <shentey@gmail.com>
Date: Wed, 8 May 2024 19:55:07 +0200
Subject: [PATCH 043/100] hw/i386/pc_sysfw: Alias rather than copy isa-bios
region
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [43/91] f64dab2a091838a10a9b94e3d09ea11432b0809f (bonzini/rhel-qemu-kvm)
In the -bios case the "isa-bios" memory region is an alias to the BIOS mapped
to the top of the 4G memory boundary. Do the same in the -pflash case, but only
for new machine versions for migration compatibility. This establishes common
behavior and makes pflash commands work in the "isa-bios" region which some
real-world legacy BIOSes rely on.
Note that in the sev_enabled() case, the "isa-bios" memory region in the -pflash
case will now also point to encrypted memory, just like it already does in the
-bios case.
Running `info mtree` before and after this commit with
`qemu-system-x86_64 -S -drive \
if=pflash,format=raw,readonly=on,file=/usr/share/qemu/bios-256k.bin` and then running
`diff -u before.mtree after.mtree` results in the following changes in the
memory tree:
| --- before.mtree
| +++ after.mtree
| @@ -71,7 +71,7 @@
| 0000000000000000-ffffffffffffffff (prio -1, i/o): pci
| 00000000000a0000-00000000000bffff (prio 1, i/o): vga-lowmem
| 00000000000c0000-00000000000dffff (prio 1, rom): pc.rom
| - 00000000000e0000-00000000000fffff (prio 1, rom): isa-bios
| + 00000000000e0000-00000000000fffff (prio 1, romd): alias isa-bios @system.flash0 0000000000020000-000000000003ffff
| 00000000000a0000-00000000000bffff (prio 1, i/o): alias smram-region @pci 00000000000a0000-00000000000bffff
| 00000000000c0000-00000000000c3fff (prio 1, i/o): alias pam-pci @pci 00000000000c0000-00000000000c3fff
| 00000000000c4000-00000000000c7fff (prio 1, i/o): alias pam-pci @pci 00000000000c4000-00000000000c7fff
| @@ -108,7 +108,7 @@
| 0000000000000000-ffffffffffffffff (prio -1, i/o): pci
| 00000000000a0000-00000000000bffff (prio 1, i/o): vga-lowmem
| 00000000000c0000-00000000000dffff (prio 1, rom): pc.rom
| - 00000000000e0000-00000000000fffff (prio 1, rom): isa-bios
| + 00000000000e0000-00000000000fffff (prio 1, romd): alias isa-bios @system.flash0 0000000000020000-000000000003ffff
| 00000000000a0000-00000000000bffff (prio 1, i/o): alias smram-region @pci 00000000000a0000-00000000000bffff
| 00000000000c0000-00000000000c3fff (prio 1, i/o): alias pam-pci @pci 00000000000c0000-00000000000c3fff
| 00000000000c4000-00000000000c7fff (prio 1, i/o): alias pam-pci @pci 00000000000c4000-00000000000c7fff
| @@ -131,11 +131,14 @@
| memory-region: pc.ram
| 0000000000000000-0000000007ffffff (prio 0, ram): pc.ram
|
| +memory-region: system.flash0
| + 00000000fffc0000-00000000ffffffff (prio 0, romd): system.flash0
| +
| memory-region: pci
| 0000000000000000-ffffffffffffffff (prio -1, i/o): pci
| 00000000000a0000-00000000000bffff (prio 1, i/o): vga-lowmem
| 00000000000c0000-00000000000dffff (prio 1, rom): pc.rom
| - 00000000000e0000-00000000000fffff (prio 1, rom): isa-bios
| + 00000000000e0000-00000000000fffff (prio 1, romd): alias isa-bios @system.flash0 0000000000020000-000000000003ffff
|
| memory-region: smram
| 00000000000a0000-00000000000bffff (prio 0, ram): alias smram-low @pc.ram 00000000000a0000-00000000000bffff
Note that in both cases the "system" memory region contains the entry
00000000fffc0000-00000000ffffffff (prio 0, romd): system.flash0
but the "system.flash0" memory region only appears standalone when "isa-bios" is
an alias.
Signed-off-by: Bernhard Beschow <shentey@gmail.com>
Message-ID: <20240508175507.22270-7-shentey@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit a44ea3fa7f2aa1d809fdca1b84a52695b53d8ad0)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
hw/i386/pc.c | 1 +
hw/i386/pc_piix.c | 1 +
hw/i386/pc_q35.c | 1 +
hw/i386/pc_sysfw.c | 8 +++++++-
include/hw/i386/pc.h | 1 +
5 files changed, 11 insertions(+), 1 deletion(-)
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 1a34bc4522..660a59c63b 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1967,6 +1967,7 @@ static void pc_machine_class_init(ObjectClass *oc, void *data)
pcmc->has_reserved_memory = true;
pcmc->enforce_aligned_dimm = true;
pcmc->enforce_amd_1tb_hole = true;
+ pcmc->isa_bios_alias = true;
/* BIOS ACPI tables: 128K. Other BIOS datastructures: less than 4K reported
* to be used at the moment, 32K should be enough for a while. */
pcmc->acpi_data_size = 0x20000 + 0x8000;
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index bef3e8b73e..dbb7f2ed17 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -975,6 +975,7 @@ static void pc_machine_rhel7_options(MachineClass *m)
m->alias = "pc";
m->is_default = 1;
m->smp_props.prefer_sockets = true;
+ pcmc->isa_bios_alias = false;
}
static void pc_init_rhel760(MachineState *machine)
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index dedc86eec9..f9900ad798 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -735,6 +735,7 @@ static void pc_q35_machine_rhel940_options(MachineClass *m)
m->desc = "RHEL-9.4.0 PC (Q35 + ICH9, 2009)";
pcmc->smbios_stream_product = "RHEL";
pcmc->smbios_stream_version = "9.4.0";
+ pcmc->isa_bios_alias = false;
compat_props_add(m->compat_props, pc_rhel_9_5_compat,
pc_rhel_9_5_compat_len);
diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c
index 82d37cb376..ac88ad4eb9 100644
--- a/hw/i386/pc_sysfw.c
+++ b/hw/i386/pc_sysfw.c
@@ -135,6 +135,7 @@ static void pc_system_flash_map(PCMachineState *pcms,
MemoryRegion *rom_memory)
{
X86MachineState *x86ms = X86_MACHINE(pcms);
+ PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
hwaddr total_size = 0;
int i;
BlockBackend *blk;
@@ -184,7 +185,12 @@ static void pc_system_flash_map(PCMachineState *pcms,
if (i == 0) {
flash_mem = pflash_cfi01_get_memory(system_flash);
- pc_isa_bios_init(&x86ms->isa_bios, rom_memory, flash_mem);
+ if (pcmc->isa_bios_alias) {
+ x86_isa_bios_init(&x86ms->isa_bios, rom_memory, flash_mem,
+ true);
+ } else {
+ pc_isa_bios_init(&x86ms->isa_bios, rom_memory, flash_mem);
+ }
/* Encrypt the pflash boot ROM */
if (sev_enabled()) {
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 467e7fb52f..3f53ec73ac 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -122,6 +122,7 @@ struct PCMachineClass {
bool enforce_aligned_dimm;
bool broken_reserved_end;
bool enforce_amd_1tb_hole;
+ bool isa_bios_alias;
/* generate legacy CPU hotplug AML */
bool legacy_cpu_hotplug;
--
2.39.3

@ -0,0 +1,53 @@
From 9bf1d368c4b53139db39649833d475e097fc98d1 Mon Sep 17 00:00:00 2001
From: Bernhard Beschow <shentey@gmail.com>
Date: Mon, 22 Apr 2024 22:06:22 +0200
Subject: [PATCH 039/100] hw/i386/pc_sysfw: Remove unused parameter from
pc_isa_bios_init()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [39/91] c0019dc2706a8e3f40486fd4a4c0dd1fbe23237b (bonzini/rhel-qemu-kvm)
Signed-off-by: Bernhard Beschow <shentey@gmail.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Message-ID: <20240422200625.2768-2-shentey@gmail.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
(cherry picked from commit f4b63768b91811cdcf1fb7b270587123251dfea5)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
hw/i386/pc_sysfw.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c
index ef7dea9798..59c7a81692 100644
--- a/hw/i386/pc_sysfw.c
+++ b/hw/i386/pc_sysfw.c
@@ -41,8 +41,7 @@
#define FLASH_SECTOR_SIZE 4096
static void pc_isa_bios_init(MemoryRegion *rom_memory,
- MemoryRegion *flash_mem,
- int ram_size)
+ MemoryRegion *flash_mem)
{
int isa_bios_size;
MemoryRegion *isa_bios;
@@ -186,7 +185,7 @@ static void pc_system_flash_map(PCMachineState *pcms,
if (i == 0) {
flash_mem = pflash_cfi01_get_memory(system_flash);
- pc_isa_bios_init(rom_memory, flash_mem, size);
+ pc_isa_bios_init(rom_memory, flash_mem);
/* Encrypt the pflash boot ROM */
if (sev_enabled()) {
--
2.39.3

@ -0,0 +1,158 @@
From e6472ff46cbed97c2a238a8ef7d321351931333a Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Thu, 30 May 2024 06:16:30 -0500
Subject: [PATCH 070/100] hw/i386/sev: Add function to get SEV metadata from
OVMF header
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [70/91] ba818dade96119c8a51ca1fb222f4f69e2752396 (bonzini/rhel-qemu-kvm)
A recent version of OVMF expanded the reset vector GUID list to add
SEV-specific metadata GUID. The SEV metadata describes the reserved
memory regions such as the secrets and CPUID page used during the SEV-SNP
guest launch.
The pc_system_get_ovmf_sev_metadata_ptr() is used to retrieve the SEV
metadata pointer from the OVMF GUID list.
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Signed-off-by: Michael Roth <michael.roth@amd.com>
Signed-off-by: Pankaj Gupta <pankaj.gupta@amd.com>
Message-ID: <20240530111643.1091816-19-pankaj.gupta@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit f3c30c575d34122573b7370a7da5ca3a27dde481)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
hw/i386/pc_sysfw.c | 4 ++++
include/hw/i386/pc.h | 26 ++++++++++++++++++++++++++
target/i386/sev-sysemu-stub.c | 4 ++++
target/i386/sev.c | 32 ++++++++++++++++++++++++++++++++
target/i386/sev.h | 2 ++
5 files changed, 68 insertions(+)
diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c
index ac88ad4eb9..9b8671c441 100644
--- a/hw/i386/pc_sysfw.c
+++ b/hw/i386/pc_sysfw.c
@@ -260,6 +260,10 @@ void x86_firmware_configure(void *ptr, int size)
pc_system_parse_ovmf_flash(ptr, size);
if (sev_enabled()) {
+
+ /* Copy the SEV metadata table (if it exists) */
+ pc_system_parse_sev_metadata(ptr, size);
+
ret = sev_es_save_reset_vector(ptr, size);
if (ret) {
error_report("failed to locate and/or save reset vector");
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 3f53ec73ac..94b49310f5 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -167,6 +167,32 @@ void pc_acpi_smi_interrupt(void *opaque, int irq, int level);
#define PCI_HOST_ABOVE_4G_MEM_SIZE "above-4g-mem-size"
#define PCI_HOST_PROP_SMM_RANGES "smm-ranges"
+typedef enum {
+ SEV_DESC_TYPE_UNDEF,
+ /* The section contains the region that must be validated by the VMM. */
+ SEV_DESC_TYPE_SNP_SEC_MEM,
+ /* The section contains the SNP secrets page */
+ SEV_DESC_TYPE_SNP_SECRETS,
+ /* The section contains address that can be used as a CPUID page */
+ SEV_DESC_TYPE_CPUID,
+
+} ovmf_sev_metadata_desc_type;
+
+typedef struct __attribute__((__packed__)) OvmfSevMetadataDesc {
+ uint32_t base;
+ uint32_t len;
+ ovmf_sev_metadata_desc_type type;
+} OvmfSevMetadataDesc;
+
+typedef struct __attribute__((__packed__)) OvmfSevMetadata {
+ uint8_t signature[4];
+ uint32_t len;
+ uint32_t version;
+ uint32_t num_desc;
+ OvmfSevMetadataDesc descs[];
+} OvmfSevMetadata;
+
+OvmfSevMetadata *pc_system_get_ovmf_sev_metadata_ptr(void);
void pc_pci_as_mapping_init(MemoryRegion *system_memory,
MemoryRegion *pci_address_space);
diff --git a/target/i386/sev-sysemu-stub.c b/target/i386/sev-sysemu-stub.c
index 96e1c15cc3..fc1c57c411 100644
--- a/target/i386/sev-sysemu-stub.c
+++ b/target/i386/sev-sysemu-stub.c
@@ -67,3 +67,7 @@ void hmp_info_sev(Monitor *mon, const QDict *qdict)
{
monitor_printf(mon, "SEV is not available in this QEMU\n");
}
+
+void pc_system_parse_sev_metadata(uint8_t *flash_ptr, size_t flash_size)
+{
+}
diff --git a/target/i386/sev.c b/target/i386/sev.c
index e84e4395a5..17281bb2c7 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -597,6 +597,38 @@ SevCapability *qmp_query_sev_capabilities(Error **errp)
return sev_get_capabilities(errp);
}
+static OvmfSevMetadata *ovmf_sev_metadata_table;
+
+#define OVMF_SEV_META_DATA_GUID "dc886566-984a-4798-A75e-5585a7bf67cc"
+typedef struct __attribute__((__packed__)) OvmfSevMetadataOffset {
+ uint32_t offset;
+} OvmfSevMetadataOffset;
+
+OvmfSevMetadata *pc_system_get_ovmf_sev_metadata_ptr(void)
+{
+ return ovmf_sev_metadata_table;
+}
+
+void pc_system_parse_sev_metadata(uint8_t *flash_ptr, size_t flash_size)
+{
+ OvmfSevMetadata *metadata;
+ OvmfSevMetadataOffset *data;
+
+ if (!pc_system_ovmf_table_find(OVMF_SEV_META_DATA_GUID, (uint8_t **)&data,
+ NULL)) {
+ return;
+ }
+
+ metadata = (OvmfSevMetadata *)(flash_ptr + flash_size - data->offset);
+ if (memcmp(metadata->signature, "ASEV", 4) != 0 ||
+ metadata->len < sizeof(OvmfSevMetadata) ||
+ metadata->len > flash_size - data->offset) {
+ return;
+ }
+
+ ovmf_sev_metadata_table = g_memdup2(metadata, metadata->len);
+}
+
static SevAttestationReport *sev_get_attestation_report(const char *mnonce,
Error **errp)
{
diff --git a/target/i386/sev.h b/target/i386/sev.h
index 5dc4767b1e..cc12824dd6 100644
--- a/target/i386/sev.h
+++ b/target/i386/sev.h
@@ -66,4 +66,6 @@ int sev_inject_launch_secret(const char *hdr, const char *secret,
int sev_es_save_reset_vector(void *flash_ptr, uint64_t flash_size);
void sev_es_set_reset_vector(CPUState *cpu);
+void pc_system_parse_sev_metadata(uint8_t *flash_ptr, size_t flash_size);
+
#endif
--
2.39.3

@ -0,0 +1,165 @@
From 226cf6c3d3e2fd1a35422043dbe0b73d1216df83 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Thu, 30 May 2024 06:16:36 -0500
Subject: [PATCH 073/100] hw/i386/sev: Add support to encrypt BIOS when SEV-SNP
is enabled
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [73/91] 844afd322c12c3e8992cf6ec692c94e70747bd0c (bonzini/rhel-qemu-kvm)
As with SEV, an SNP guest requires that the BIOS be part of the initial
encrypted/measured guest payload. Extend sev_encrypt_flash() to handle
the SNP case and plumb through the GPA of the BIOS location since this
is needed for SNP.
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Signed-off-by: Michael Roth <michael.roth@amd.com>
Signed-off-by: Pankaj Gupta <pankaj.gupta@amd.com>
Message-ID: <20240530111643.1091816-25-pankaj.gupta@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit 77d1abd91e5352ad30ae2f83790f95fa6a3c0b6b)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
hw/i386/pc_sysfw.c | 12 +++++++-----
hw/i386/x86-common.c | 2 +-
include/hw/i386/x86.h | 2 +-
target/i386/sev-sysemu-stub.c | 2 +-
target/i386/sev.c | 5 +++--
target/i386/sev.h | 2 +-
6 files changed, 14 insertions(+), 11 deletions(-)
diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c
index 9b8671c441..7cdbafc8d2 100644
--- a/hw/i386/pc_sysfw.c
+++ b/hw/i386/pc_sysfw.c
@@ -148,6 +148,8 @@ static void pc_system_flash_map(PCMachineState *pcms,
assert(PC_MACHINE_GET_CLASS(pcms)->pci_enabled);
for (i = 0; i < ARRAY_SIZE(pcms->flash); i++) {
+ hwaddr gpa;
+
system_flash = pcms->flash[i];
blk = pflash_cfi01_get_blk(system_flash);
if (!blk) {
@@ -177,11 +179,11 @@ static void pc_system_flash_map(PCMachineState *pcms,
}
total_size += size;
+ gpa = 0x100000000ULL - total_size; /* where the flash is mapped */
qdev_prop_set_uint32(DEVICE(system_flash), "num-blocks",
size / FLASH_SECTOR_SIZE);
sysbus_realize_and_unref(SYS_BUS_DEVICE(system_flash), &error_fatal);
- sysbus_mmio_map(SYS_BUS_DEVICE(system_flash), 0,
- 0x100000000ULL - total_size);
+ sysbus_mmio_map(SYS_BUS_DEVICE(system_flash), 0, gpa);
if (i == 0) {
flash_mem = pflash_cfi01_get_memory(system_flash);
@@ -196,7 +198,7 @@ static void pc_system_flash_map(PCMachineState *pcms,
if (sev_enabled()) {
flash_ptr = memory_region_get_ram_ptr(flash_mem);
flash_size = memory_region_size(flash_mem);
- x86_firmware_configure(flash_ptr, flash_size);
+ x86_firmware_configure(gpa, flash_ptr, flash_size);
}
}
}
@@ -249,7 +251,7 @@ void pc_system_firmware_init(PCMachineState *pcms,
pc_system_flash_cleanup_unused(pcms);
}
-void x86_firmware_configure(void *ptr, int size)
+void x86_firmware_configure(hwaddr gpa, void *ptr, int size)
{
int ret;
@@ -270,6 +272,6 @@ void x86_firmware_configure(void *ptr, int size)
exit(1);
}
- sev_encrypt_flash(ptr, size, &error_fatal);
+ sev_encrypt_flash(gpa, ptr, size, &error_fatal);
}
}
diff --git a/hw/i386/x86-common.c b/hw/i386/x86-common.c
index 67b03c913a..35fe6eabea 100644
--- a/hw/i386/x86-common.c
+++ b/hw/i386/x86-common.c
@@ -981,7 +981,7 @@ void x86_bios_rom_init(X86MachineState *x86ms, const char *default_firmware,
*/
void *ptr = memory_region_get_ram_ptr(&x86ms->bios);
load_image_size(filename, ptr, bios_size);
- x86_firmware_configure(ptr, bios_size);
+ x86_firmware_configure(0x100000000ULL - bios_size, ptr, bios_size);
} else {
memory_region_set_readonly(&x86ms->bios, !isapc_ram_fw);
ret = rom_add_file_fixed(bios_name, (uint32_t)(-bios_size), -1);
diff --git a/include/hw/i386/x86.h b/include/hw/i386/x86.h
index b006f16b8d..d43cb3908e 100644
--- a/include/hw/i386/x86.h
+++ b/include/hw/i386/x86.h
@@ -154,6 +154,6 @@ void ioapic_init_gsi(GSIState *gsi_state, Object *parent);
DeviceState *ioapic_init_secondary(GSIState *gsi_state);
/* pc_sysfw.c */
-void x86_firmware_configure(void *ptr, int size);
+void x86_firmware_configure(hwaddr gpa, void *ptr, int size);
#endif
diff --git a/target/i386/sev-sysemu-stub.c b/target/i386/sev-sysemu-stub.c
index fc1c57c411..d5bf886e79 100644
--- a/target/i386/sev-sysemu-stub.c
+++ b/target/i386/sev-sysemu-stub.c
@@ -42,7 +42,7 @@ void qmp_sev_inject_launch_secret(const char *packet_header, const char *secret,
error_setg(errp, "SEV is not available in this QEMU");
}
-int sev_encrypt_flash(uint8_t *ptr, uint64_t len, Error **errp)
+int sev_encrypt_flash(hwaddr gpa, uint8_t *ptr, uint64_t len, Error **errp)
{
g_assert_not_reached();
}
diff --git a/target/i386/sev.c b/target/i386/sev.c
index 06401f0526..7b5c4b4874 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -1484,7 +1484,7 @@ static int sev_snp_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
}
int
-sev_encrypt_flash(uint8_t *ptr, uint64_t len, Error **errp)
+sev_encrypt_flash(hwaddr gpa, uint8_t *ptr, uint64_t len, Error **errp)
{
SevCommonState *sev_common = SEV_COMMON(MACHINE(qdev_get_machine())->cgs);
@@ -1841,7 +1841,8 @@ bool sev_add_kernel_loader_hashes(SevKernelLoaderContext *ctx, Error **errp)
/* zero the excess data so the measurement can be reliably calculated */
memset(padded_ht->padding, 0, sizeof(padded_ht->padding));
- if (sev_encrypt_flash((uint8_t *)padded_ht, sizeof(*padded_ht), errp) < 0) {
+ if (sev_encrypt_flash(area->base, (uint8_t *)padded_ht,
+ sizeof(*padded_ht), errp) < 0) {
ret = false;
}
diff --git a/target/i386/sev.h b/target/i386/sev.h
index cc12824dd6..858005a119 100644
--- a/target/i386/sev.h
+++ b/target/i386/sev.h
@@ -59,7 +59,7 @@ uint32_t sev_get_cbit_position(void);
uint32_t sev_get_reduced_phys_bits(void);
bool sev_add_kernel_loader_hashes(SevKernelLoaderContext *ctx, Error **errp);
-int sev_encrypt_flash(uint8_t *ptr, uint64_t len, Error **errp);
+int sev_encrypt_flash(hwaddr gpa, uint8_t *ptr, uint64_t len, Error **errp);
int sev_inject_launch_secret(const char *hdr, const char *secret,
uint64_t gpa, Error **errp);
--
2.39.3

@ -0,0 +1,123 @@
From a20b2e3e52b9589ac1abc8b9b818d526c86368cf Mon Sep 17 00:00:00 2001
From: Michael Roth <michael.roth@amd.com>
Date: Thu, 30 May 2024 06:16:39 -0500
Subject: [PATCH 082/100] hw/i386/sev: Use guest_memfd for legacy ROMs
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [82/91] a591e85e00c353009803b143c80852b8c9b1f15e (bonzini/rhel-qemu-kvm)
Current SNP guest kernels will attempt to access these regions with
the C-bit set, so guest_memfd is needed to handle that. Otherwise,
kvm_convert_memory() will fail when the guest kernel tries to access it
and QEMU attempts to call KVM_SET_MEMORY_ATTRIBUTES to set these ranges
to private.
Whether guests should actually try to access ROM regions in this way (or
need to deal with legacy ROM regions at all), is a separate issue to be
addressed on kernel side, but current SNP guest kernels will exhibit
this behavior and so this handling is needed to allow QEMU to continue
running existing SNP guest kernels.
Signed-off-by: Michael Roth <michael.roth@amd.com>
[pankaj: Added sev_snp_enabled() check]
Signed-off-by: Pankaj Gupta <pankaj.gupta@amd.com>
Message-ID: <20240530111643.1091816-28-pankaj.gupta@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit 413a67450750e0459efeffc3db3ba9759c3e381c)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
hw/i386/pc.c | 14 ++++++++++----
hw/i386/pc_sysfw.c | 19 +++++++++++++------
2 files changed, 23 insertions(+), 10 deletions(-)
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 0aca0cc79e..b25d075b59 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -62,6 +62,7 @@
#include "hw/mem/memory-device.h"
#include "e820_memory_layout.h"
#include "trace.h"
+#include "sev.h"
#include CONFIG_DEVICES
#ifdef CONFIG_XEN_EMU
@@ -1173,10 +1174,15 @@ void pc_memory_init(PCMachineState *pcms,
pc_system_firmware_init(pcms, rom_memory);
option_rom_mr = g_malloc(sizeof(*option_rom_mr));
- memory_region_init_ram(option_rom_mr, NULL, "pc.rom", PC_ROM_SIZE,
- &error_fatal);
- if (pcmc->pci_enabled) {
- memory_region_set_readonly(option_rom_mr, true);
+ if (machine_require_guest_memfd(machine)) {
+ memory_region_init_ram_guest_memfd(option_rom_mr, NULL, "pc.rom",
+ PC_ROM_SIZE, &error_fatal);
+ } else {
+ memory_region_init_ram(option_rom_mr, NULL, "pc.rom", PC_ROM_SIZE,
+ &error_fatal);
+ if (pcmc->pci_enabled) {
+ memory_region_set_readonly(option_rom_mr, true);
+ }
}
memory_region_add_subregion_overlap(rom_memory,
PC_ROM_MIN_VGA,
diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c
index 7cdbafc8d2..ef80281d28 100644
--- a/hw/i386/pc_sysfw.c
+++ b/hw/i386/pc_sysfw.c
@@ -40,8 +40,8 @@
#define FLASH_SECTOR_SIZE 4096
-static void pc_isa_bios_init(MemoryRegion *isa_bios, MemoryRegion *rom_memory,
- MemoryRegion *flash_mem)
+static void pc_isa_bios_init(PCMachineState *pcms, MemoryRegion *isa_bios,
+ MemoryRegion *rom_memory, MemoryRegion *flash_mem)
{
int isa_bios_size;
uint64_t flash_size;
@@ -51,8 +51,13 @@ static void pc_isa_bios_init(MemoryRegion *isa_bios, MemoryRegion *rom_memory,
/* map the last 128KB of the BIOS in ISA space */
isa_bios_size = MIN(flash_size, 128 * KiB);
- memory_region_init_ram(isa_bios, NULL, "isa-bios", isa_bios_size,
- &error_fatal);
+ if (machine_require_guest_memfd(MACHINE(pcms))) {
+ memory_region_init_ram_guest_memfd(isa_bios, NULL, "isa-bios",
+ isa_bios_size, &error_fatal);
+ } else {
+ memory_region_init_ram(isa_bios, NULL, "isa-bios", isa_bios_size,
+ &error_fatal);
+ }
memory_region_add_subregion_overlap(rom_memory,
0x100000 - isa_bios_size,
isa_bios,
@@ -65,7 +70,9 @@ static void pc_isa_bios_init(MemoryRegion *isa_bios, MemoryRegion *rom_memory,
((uint8_t*)flash_ptr) + (flash_size - isa_bios_size),
isa_bios_size);
- memory_region_set_readonly(isa_bios, true);
+ if (!machine_require_guest_memfd(current_machine)) {
+ memory_region_set_readonly(isa_bios, true);
+ }
}
static PFlashCFI01 *pc_pflash_create(PCMachineState *pcms,
@@ -191,7 +198,7 @@ static void pc_system_flash_map(PCMachineState *pcms,
x86_isa_bios_init(&x86ms->isa_bios, rom_memory, flash_mem,
true);
} else {
- pc_isa_bios_init(&x86ms->isa_bios, rom_memory, flash_mem);
+ pc_isa_bios_init(pcms, &x86ms->isa_bios, rom_memory, flash_mem);
}
/* Encrypt the pflash boot ROM */
--
2.39.3

@ -0,0 +1,58 @@
From 4331180aa09e44550ff8de781c618bae5e99bb70 Mon Sep 17 00:00:00 2001
From: Michael Roth <michael.roth@amd.com>
Date: Tue, 9 Apr 2024 18:07:43 -0500
Subject: [PATCH 025/100] hw/i386/sev: Use legacy SEV VM types for older
machine types
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [25/91] 8c73cd312736ccb0818b4d3216fd13712f21f3c9 (bonzini/rhel-qemu-kvm)
Newer 9.1 machine types will default to using the KVM_SEV_INIT2 API for
creating SEV/SEV-ES guests going forward. However, this API results in guest
measurement changes which are generally not expected for users of these
older guest types and can cause disruption if they switch to a newer
QEMU/kernel version. Avoid this by continuing to use the older
KVM_SEV_INIT/KVM_SEV_ES_INIT APIs for older machine types.
Signed-off-by: Michael Roth <michael.roth@amd.com>
Message-ID: <20240409230743.962513-4-michael.roth@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit ea7fbd37537b3a598335c21ccb2ea674630fc810)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
hw/i386/pc.c | 1 +
target/i386/sev.c | 1 +
2 files changed, 2 insertions(+)
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index b9fde3cec1..1a34bc4522 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -351,6 +351,7 @@ const size_t pc_rhel_compat_len = G_N_ELEMENTS(pc_rhel_compat);
GlobalProperty pc_rhel_9_5_compat[] = {
/* pc_rhel_9_5_compat from pc_compat_pc_9_0 (backported from 9.1) */
{ TYPE_X86_CPU, "guest-phys-bits", "0" },
+ { "sev-guest", "legacy-vm-type", "true" },
};
const size_t pc_rhel_9_5_compat_len = G_N_ELEMENTS(pc_rhel_9_5_compat);
diff --git a/target/i386/sev.c b/target/i386/sev.c
index f4ee317cb0..d30b68c11e 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -1417,6 +1417,7 @@ sev_guest_instance_init(Object *obj)
object_property_add_uint32_ptr(obj, "reduced-phys-bits",
&sev->reduced_phys_bits,
OBJ_PROP_FLAG_READWRITE);
+ object_apply_compat_props(obj);
}
/* sev guest info */
--
2.39.3

File diff suppressed because it is too large Load Diff

@ -0,0 +1,133 @@
From ebf08d2a822576acfa60fbd5f552d26de1e4c4be Mon Sep 17 00:00:00 2001
From: Bernhard Beschow <shentey@gmail.com>
Date: Wed, 8 May 2024 19:55:04 +0200
Subject: [PATCH 040/100] hw/i386/x86: Don't leak "isa-bios" memory regions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [40/91] bb595357c6cc2d5a80bf3873853c69553c5feee5 (bonzini/rhel-qemu-kvm)
Fix the leaks in x86_bios_rom_init() and pc_isa_bios_init() by adding an
"isa_bios" attribute to X86MachineState.
Suggested-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Bernhard Beschow <shentey@gmail.com>
Message-ID: <20240508175507.22270-4-shentey@gmail.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
(cherry picked from commit 32d3ee87a17fc91e981a23dba94855bff89f5920)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
hw/i386/pc_sysfw.c | 7 +++----
hw/i386/x86.c | 9 ++++-----
include/hw/i386/x86.h | 7 +++++++
3 files changed, 14 insertions(+), 9 deletions(-)
diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c
index 59c7a81692..82d37cb376 100644
--- a/hw/i386/pc_sysfw.c
+++ b/hw/i386/pc_sysfw.c
@@ -40,11 +40,10 @@
#define FLASH_SECTOR_SIZE 4096
-static void pc_isa_bios_init(MemoryRegion *rom_memory,
+static void pc_isa_bios_init(MemoryRegion *isa_bios, MemoryRegion *rom_memory,
MemoryRegion *flash_mem)
{
int isa_bios_size;
- MemoryRegion *isa_bios;
uint64_t flash_size;
void *flash_ptr, *isa_bios_ptr;
@@ -52,7 +51,6 @@ static void pc_isa_bios_init(MemoryRegion *rom_memory,
/* map the last 128KB of the BIOS in ISA space */
isa_bios_size = MIN(flash_size, 128 * KiB);
- isa_bios = g_malloc(sizeof(*isa_bios));
memory_region_init_ram(isa_bios, NULL, "isa-bios", isa_bios_size,
&error_fatal);
memory_region_add_subregion_overlap(rom_memory,
@@ -136,6 +134,7 @@ void pc_system_flash_cleanup_unused(PCMachineState *pcms)
static void pc_system_flash_map(PCMachineState *pcms,
MemoryRegion *rom_memory)
{
+ X86MachineState *x86ms = X86_MACHINE(pcms);
hwaddr total_size = 0;
int i;
BlockBackend *blk;
@@ -185,7 +184,7 @@ static void pc_system_flash_map(PCMachineState *pcms,
if (i == 0) {
flash_mem = pflash_cfi01_get_memory(system_flash);
- pc_isa_bios_init(rom_memory, flash_mem);
+ pc_isa_bios_init(&x86ms->isa_bios, rom_memory, flash_mem);
/* Encrypt the pflash boot ROM */
if (sev_enabled()) {
diff --git a/hw/i386/x86.c b/hw/i386/x86.c
index 6d3c72f124..457e8a34a5 100644
--- a/hw/i386/x86.c
+++ b/hw/i386/x86.c
@@ -1133,7 +1133,7 @@ void x86_bios_rom_init(X86MachineState *x86ms, const char *default_firmware,
{
const char *bios_name;
char *filename;
- MemoryRegion *bios, *isa_bios;
+ MemoryRegion *bios;
int bios_size, isa_bios_size;
ssize_t ret;
@@ -1173,14 +1173,13 @@ void x86_bios_rom_init(X86MachineState *x86ms, const char *default_firmware,
/* map the last 128KB of the BIOS in ISA space */
isa_bios_size = MIN(bios_size, 128 * KiB);
- isa_bios = g_malloc(sizeof(*isa_bios));
- memory_region_init_alias(isa_bios, NULL, "isa-bios", bios,
+ memory_region_init_alias(&x86ms->isa_bios, NULL, "isa-bios", bios,
bios_size - isa_bios_size, isa_bios_size);
memory_region_add_subregion_overlap(rom_memory,
0x100000 - isa_bios_size,
- isa_bios,
+ &x86ms->isa_bios,
1);
- memory_region_set_readonly(isa_bios, !isapc_ram_fw);
+ memory_region_set_readonly(&x86ms->isa_bios, !isapc_ram_fw);
/* map all the bios at the top of memory */
memory_region_add_subregion(rom_memory,
diff --git a/include/hw/i386/x86.h b/include/hw/i386/x86.h
index cb07618d19..a07de79167 100644
--- a/include/hw/i386/x86.h
+++ b/include/hw/i386/x86.h
@@ -18,6 +18,7 @@
#define HW_I386_X86_H
#include "exec/hwaddr.h"
+#include "exec/memory.h"
#include "hw/boards.h"
#include "hw/intc/ioapic.h"
@@ -52,6 +53,12 @@ struct X86MachineState {
GMappedFile *initrd_mapped_file;
HotplugHandler *acpi_dev;
+ /*
+ * Map the upper 128 KiB of the BIOS just underneath the 1 MiB address
+ * boundary.
+ */
+ MemoryRegion isa_bios;
+
/* RAM information (sizes, addresses, configuration): */
ram_addr_t below_4g_mem_size, above_4g_mem_size;
--
2.39.3

@ -0,0 +1,105 @@
From e1f2265b5f6bf5b63bf3808bb540888f3cf8badb Mon Sep 17 00:00:00 2001
From: Bernhard Beschow <shentey@gmail.com>
Date: Wed, 8 May 2024 19:55:05 +0200
Subject: [PATCH 041/100] hw/i386/x86: Don't leak "pc.bios" memory region
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [41/91] a9cd61d8d240134c09c46e244efb89217cadf60c (bonzini/rhel-qemu-kvm)
Fix the leak in x86_bios_rom_init() by adding a "bios" attribute to
X86MachineState. Note that it is only used in the -bios case.
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Bernhard Beschow <shentey@gmail.com>
Message-ID: <20240508175507.22270-5-shentey@gmail.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
(cherry picked from commit 865d95321ffc8d9941e33000b10140550f094556)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
hw/i386/x86.c | 13 ++++++-------
include/hw/i386/x86.h | 6 ++++++
2 files changed, 12 insertions(+), 7 deletions(-)
diff --git a/hw/i386/x86.c b/hw/i386/x86.c
index 457e8a34a5..29167de97d 100644
--- a/hw/i386/x86.c
+++ b/hw/i386/x86.c
@@ -1133,7 +1133,6 @@ void x86_bios_rom_init(X86MachineState *x86ms, const char *default_firmware,
{
const char *bios_name;
char *filename;
- MemoryRegion *bios;
int bios_size, isa_bios_size;
ssize_t ret;
@@ -1149,8 +1148,8 @@ void x86_bios_rom_init(X86MachineState *x86ms, const char *default_firmware,
(bios_size % 65536) != 0) {
goto bios_error;
}
- bios = g_malloc(sizeof(*bios));
- memory_region_init_ram(bios, NULL, "pc.bios", bios_size, &error_fatal);
+ memory_region_init_ram(&x86ms->bios, NULL, "pc.bios", bios_size,
+ &error_fatal);
if (sev_enabled()) {
/*
* The concept of a "reset" simply doesn't exist for
@@ -1159,11 +1158,11 @@ void x86_bios_rom_init(X86MachineState *x86ms, const char *default_firmware,
* the firmware as rom to properly re-initialize on reset.
* Just go for a straight file load instead.
*/
- void *ptr = memory_region_get_ram_ptr(bios);
+ void *ptr = memory_region_get_ram_ptr(&x86ms->bios);
load_image_size(filename, ptr, bios_size);
x86_firmware_configure(ptr, bios_size);
} else {
- memory_region_set_readonly(bios, !isapc_ram_fw);
+ memory_region_set_readonly(&x86ms->bios, !isapc_ram_fw);
ret = rom_add_file_fixed(bios_name, (uint32_t)(-bios_size), -1);
if (ret != 0) {
goto bios_error;
@@ -1173,7 +1172,7 @@ void x86_bios_rom_init(X86MachineState *x86ms, const char *default_firmware,
/* map the last 128KB of the BIOS in ISA space */
isa_bios_size = MIN(bios_size, 128 * KiB);
- memory_region_init_alias(&x86ms->isa_bios, NULL, "isa-bios", bios,
+ memory_region_init_alias(&x86ms->isa_bios, NULL, "isa-bios", &x86ms->bios,
bios_size - isa_bios_size, isa_bios_size);
memory_region_add_subregion_overlap(rom_memory,
0x100000 - isa_bios_size,
@@ -1184,7 +1183,7 @@ void x86_bios_rom_init(X86MachineState *x86ms, const char *default_firmware,
/* map all the bios at the top of memory */
memory_region_add_subregion(rom_memory,
(uint32_t)(-bios_size),
- bios);
+ &x86ms->bios);
return;
bios_error:
diff --git a/include/hw/i386/x86.h b/include/hw/i386/x86.h
index a07de79167..55c6809ae0 100644
--- a/include/hw/i386/x86.h
+++ b/include/hw/i386/x86.h
@@ -53,6 +53,12 @@ struct X86MachineState {
GMappedFile *initrd_mapped_file;
HotplugHandler *acpi_dev;
+ /*
+ * Map the whole BIOS just underneath the 4 GiB address boundary. Only used
+ * in the ROM (-bios) case.
+ */
+ MemoryRegion bios;
+
/*
* Map the upper 128 KiB of the BIOS just underneath the 1 MiB address
* boundary.
--
2.39.3

@ -0,0 +1,69 @@
From b9d0c78f04160fbc1eee6cfd94b17f1133a35d83 Mon Sep 17 00:00:00 2001
From: Bernhard Beschow <shentey@gmail.com>
Date: Tue, 30 Apr 2024 17:06:38 +0200
Subject: [PATCH 037/100] hw/i386/x86: Eliminate two if statements in
x86_bios_rom_init()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [37/91] 1ef6a13214e85f6ef773f5c894c720f20330912b (bonzini/rhel-qemu-kvm)
Given that memory_region_set_readonly() is a no-op when the readonlyness is
already as requested it is possible to simplify the pattern
if (condition) {
foo(true);
}
to
foo(condition);
which is shorter and makes the invariant of the code easier to see.
Signed-off-by: Bernhard Beschow <shentey@gmail.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Message-ID: <20240430150643.111976-2-shentey@gmail.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
(cherry picked from commit 014dbdac8798799d081abc9dff3e4876ca54f49e)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
hw/i386/x86.c | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
diff --git a/hw/i386/x86.c b/hw/i386/x86.c
index 3d5b51e92d..2a4f3ee285 100644
--- a/hw/i386/x86.c
+++ b/hw/i386/x86.c
@@ -1163,9 +1163,7 @@ void x86_bios_rom_init(MachineState *ms, const char *default_firmware,
load_image_size(filename, ptr, bios_size);
x86_firmware_configure(ptr, bios_size);
} else {
- if (!isapc_ram_fw) {
- memory_region_set_readonly(bios, true);
- }
+ memory_region_set_readonly(bios, !isapc_ram_fw);
ret = rom_add_file_fixed(bios_name, (uint32_t)(-bios_size), -1);
if (ret != 0) {
goto bios_error;
@@ -1182,9 +1180,7 @@ void x86_bios_rom_init(MachineState *ms, const char *default_firmware,
0x100000 - isa_bios_size,
isa_bios,
1);
- if (!isapc_ram_fw) {
- memory_region_set_readonly(isa_bios, true);
- }
+ memory_region_set_readonly(isa_bios, !isapc_ram_fw);
/* map all the bios at the top of memory */
memory_region_add_subregion(rom_memory,
--
2.39.3

@ -0,0 +1,98 @@
From 1baf67564d4227d6ba98923217a15814c438c32b Mon Sep 17 00:00:00 2001
From: Bernhard Beschow <shentey@gmail.com>
Date: Wed, 8 May 2024 19:55:06 +0200
Subject: [PATCH 042/100] hw/i386/x86: Extract x86_isa_bios_init() from
x86_bios_rom_init()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [42/91] 1db417a5995480924f7fd0661a306f2d2bfa0a77 (bonzini/rhel-qemu-kvm)
The function is inspired by pc_isa_bios_init() and should eventually replace it.
Using x86_isa_bios_init() rather than pc_isa_bios_init() fixes pflash commands
to work in the isa-bios region.
While at it, replace the magic number 0x100000 with 1 * MiB to increase readability.
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Bernhard Beschow <shentey@gmail.com>
Message-ID: <20240508175507.22270-6-shentey@gmail.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
(cherry picked from commit 5c5ffec12c30d2017cbdee6798f54d8fad3f9656)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
hw/i386/x86.c | 25 ++++++++++++++++---------
include/hw/i386/x86.h | 2 ++
2 files changed, 18 insertions(+), 9 deletions(-)
diff --git a/hw/i386/x86.c b/hw/i386/x86.c
index 29167de97d..c61f4ebfa6 100644
--- a/hw/i386/x86.c
+++ b/hw/i386/x86.c
@@ -1128,12 +1128,25 @@ void x86_load_linux(X86MachineState *x86ms,
nb_option_roms++;
}
+void x86_isa_bios_init(MemoryRegion *isa_bios, MemoryRegion *isa_memory,
+ MemoryRegion *bios, bool read_only)
+{
+ uint64_t bios_size = memory_region_size(bios);
+ uint64_t isa_bios_size = MIN(bios_size, 128 * KiB);
+
+ memory_region_init_alias(isa_bios, NULL, "isa-bios", bios,
+ bios_size - isa_bios_size, isa_bios_size);
+ memory_region_add_subregion_overlap(isa_memory, 1 * MiB - isa_bios_size,
+ isa_bios, 1);
+ memory_region_set_readonly(isa_bios, read_only);
+}
+
void x86_bios_rom_init(X86MachineState *x86ms, const char *default_firmware,
MemoryRegion *rom_memory, bool isapc_ram_fw)
{
const char *bios_name;
char *filename;
- int bios_size, isa_bios_size;
+ int bios_size;
ssize_t ret;
/* BIOS load */
@@ -1171,14 +1184,8 @@ void x86_bios_rom_init(X86MachineState *x86ms, const char *default_firmware,
g_free(filename);
/* map the last 128KB of the BIOS in ISA space */
- isa_bios_size = MIN(bios_size, 128 * KiB);
- memory_region_init_alias(&x86ms->isa_bios, NULL, "isa-bios", &x86ms->bios,
- bios_size - isa_bios_size, isa_bios_size);
- memory_region_add_subregion_overlap(rom_memory,
- 0x100000 - isa_bios_size,
- &x86ms->isa_bios,
- 1);
- memory_region_set_readonly(&x86ms->isa_bios, !isapc_ram_fw);
+ x86_isa_bios_init(&x86ms->isa_bios, rom_memory, &x86ms->bios,
+ !isapc_ram_fw);
/* map all the bios at the top of memory */
memory_region_add_subregion(rom_memory,
diff --git a/include/hw/i386/x86.h b/include/hw/i386/x86.h
index 55c6809ae0..d7b7d3f3ce 100644
--- a/include/hw/i386/x86.h
+++ b/include/hw/i386/x86.h
@@ -129,6 +129,8 @@ void x86_cpu_unplug_request_cb(HotplugHandler *hotplug_dev,
void x86_cpu_unplug_cb(HotplugHandler *hotplug_dev,
DeviceState *dev, Error **errp);
+void x86_isa_bios_init(MemoryRegion *isa_bios, MemoryRegion *isa_memory,
+ MemoryRegion *bios, bool read_only);
void x86_bios_rom_init(X86MachineState *x86ms, const char *default_firmware,
MemoryRegion *rom_memory, bool isapc_ram_fw);
--
2.39.3

@ -1,116 +0,0 @@
From 84f378c41832602dcf9bad6167b1f532c7c53e37 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
Date: Tue, 21 Nov 2023 15:03:55 +0100
Subject: [PATCH 048/101] hw/ppc/Kconfig: Imply VFIO_PCI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [47/67] c1a40cdab9bf62b16cb428d57a20b3e0eaa6de38 (eauger1/centos-qemu-kvm)
When the legacy and iommufd backends were introduced, a set of common
vfio-pci routines were exported in pci.c for both backends to use :
vfio_pci_pre_reset
vfio_pci_get_pci_hot_reset_info
vfio_pci_host_match
vfio_pci_post_reset
This introduced a build failure on PPC when --without-default-devices
is used because VFIO is always selected in ppc/Kconfig but VFIO_PCI is
not.
Use an 'imply VFIO_PCI' in ppc/Kconfig and bypass compilation of the
VFIO EEH hooks routines defined in hw/ppc/spapr_pci_vfio.c with
CONFIG_VFIO_PCI.
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 4278df9d1d2383b738338c857406357660f11e42)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/ppc/Kconfig | 2 +-
hw/ppc/spapr_pci_vfio.c | 36 ++++++++++++++++++++++++++++++++++++
2 files changed, 37 insertions(+), 1 deletion(-)
diff --git a/hw/ppc/Kconfig b/hw/ppc/Kconfig
index 56f0475a8e..44263a58c4 100644
--- a/hw/ppc/Kconfig
+++ b/hw/ppc/Kconfig
@@ -3,11 +3,11 @@ config PSERIES
imply PCI_DEVICES
imply TEST_DEVICES
imply VIRTIO_VGA
+ imply VFIO_PCI if LINUX # needed by spapr_pci_vfio.c
select NVDIMM
select DIMM
select PCI
select SPAPR_VSCSI
- select VFIO if LINUX # needed by spapr_pci_vfio.c
select XICS
select XIVE
select MSI_NONBROKEN
diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c
index d1d07bec46..76b2a3487b 100644
--- a/hw/ppc/spapr_pci_vfio.c
+++ b/hw/ppc/spapr_pci_vfio.c
@@ -26,10 +26,12 @@
#include "hw/pci/pci_device.h"
#include "hw/vfio/vfio-common.h"
#include "qemu/error-report.h"
+#include CONFIG_DEVICES /* CONFIG_VFIO_PCI */
/*
* Interfaces for IBM EEH (Enhanced Error Handling)
*/
+#ifdef CONFIG_VFIO_PCI
static bool vfio_eeh_container_ok(VFIOContainer *container)
{
/*
@@ -314,3 +316,37 @@ int spapr_phb_vfio_eeh_configure(SpaprPhbState *sphb)
return RTAS_OUT_SUCCESS;
}
+
+#else
+
+bool spapr_phb_eeh_available(SpaprPhbState *sphb)
+{
+ return false;
+}
+
+void spapr_phb_vfio_reset(DeviceState *qdev)
+{
+}
+
+int spapr_phb_vfio_eeh_set_option(SpaprPhbState *sphb,
+ unsigned int addr, int option)
+{
+ return RTAS_OUT_NOT_SUPPORTED;
+}
+
+int spapr_phb_vfio_eeh_get_state(SpaprPhbState *sphb, int *state)
+{
+ return RTAS_OUT_NOT_SUPPORTED;
+}
+
+int spapr_phb_vfio_eeh_reset(SpaprPhbState *sphb, int option)
+{
+ return RTAS_OUT_NOT_SUPPORTED;
+}
+
+int spapr_phb_vfio_eeh_configure(SpaprPhbState *sphb)
+{
+ return RTAS_OUT_NOT_SUPPORTED;
+}
+
+#endif /* CONFIG_VFIO_PCI */
--
2.39.3

@ -1,73 +0,0 @@
From 8f27893a37e55a31180bb66cd9eae7199911881b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Volker=20R=C3=BCmelin?= <vr_qemu@t-online.de>
Date: Fri, 29 Dec 2023 21:38:54 +0100
Subject: [PATCH 060/101] hw/vfio: fix iteration over global VFIODevice list
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [59/67] f926e1233c8c5ad418e8794b1a103371c9dc5eb0 (eauger1/centos-qemu-kvm)
Commit 3d779abafe ("vfio/common: Introduce a global VFIODevice list")
introduced a global VFIODevice list, but forgot to update the list
element field name when iterating over the new list. Change the code
to use the correct list element field.
Fixes: 3d779abafe ("vfio/common: Introduce a global VFIODevice list")
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2061
Signed-off-by: Volker Rümelin <vr_qemu@t-online.de>
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
(cherry picked from commit 9353b6da430f90e47f352dbf6dc31120c8914da6)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/common.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 0d4d8b8416..0b3352f2a9 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -73,7 +73,7 @@ bool vfio_mig_active(void)
return false;
}
- QLIST_FOREACH(vbasedev, &vfio_device_list, next) {
+ QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
if (vbasedev->migration_blocker) {
return false;
}
@@ -94,7 +94,7 @@ static bool vfio_multiple_devices_migration_is_supported(void)
unsigned int device_num = 0;
bool all_support_p2p = true;
- QLIST_FOREACH(vbasedev, &vfio_device_list, next) {
+ QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
if (vbasedev->migration) {
device_num++;
@@ -1366,13 +1366,13 @@ void vfio_reset_handler(void *opaque)
{
VFIODevice *vbasedev;
- QLIST_FOREACH(vbasedev, &vfio_device_list, next) {
+ QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
if (vbasedev->dev->realized) {
vbasedev->ops->vfio_compute_needs_reset(vbasedev);
}
}
- QLIST_FOREACH(vbasedev, &vfio_device_list, next) {
+ QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
if (vbasedev->dev->realized && vbasedev->needs_reset) {
vbasedev->ops->vfio_hot_reset_multi(vbasedev);
}
--
2.39.3

@ -0,0 +1,108 @@
From c554f8768a18ceba173aedbd582c1cae43a41e2c Mon Sep 17 00:00:00 2001
From: Thomas Huth <thuth@redhat.com>
Date: Tue, 18 Jun 2024 14:19:58 +0200
Subject: [PATCH 1/2] hw/virtio: Fix the de-initialization of vhost-user
devices
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Thomas Huth <thuth@redhat.com>
RH-MergeRequest: 255: hw/virtio: Fix the de-initialization of vhost-user devices
RH-Jira: RHEL-40708
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [1/1] c7815a249ec135993f45934cab1c1f2c038b80ea (thuth/qemu-kvm-cs9)
JIRA: https://issues.redhat.com/browse/RHEL-40708
The unrealize functions of the various vhost-user devices are
calling the corresponding vhost_*_set_status() functions with a
status of 0 to shut down the device correctly.
Now these vhost_*_set_status() functions all follow this scheme:
bool should_start = virtio_device_should_start(vdev, status);
if (vhost_dev_is_started(&vvc->vhost_dev) == should_start) {
return;
}
if (should_start) {
/* ... do the initialization stuff ... */
} else {
/* ... do the cleanup stuff ... */
}
The problem here is virtio_device_should_start(vdev, 0) currently
always returns "true" since it internally only looks at vdev->started
instead of looking at the "status" parameter. Thus once the device
got started once, virtio_device_should_start() always returns true
and thus the vhost_*_set_status() functions return early, without
ever doing any clean-up when being called with status == 0. This
causes e.g. problems when trying to hot-plug and hot-unplug a vhost-user
device multiple times since the de-initialization step is
completely skipped during the unplug operation.
This bug has been introduced in commit 9f6bcfd99f ("hw/virtio: move
vm_running check to virtio_device_started") which replaced
should_start = status & VIRTIO_CONFIG_S_DRIVER_OK;
with
should_start = virtio_device_started(vdev, status);
which later got replaced by virtio_device_should_start(). This blocked
the possibility to set should_start to false in case the status flag
VIRTIO_CONFIG_S_DRIVER_OK was not set.
Fix it by adjusting the virtio_device_should_start() function to
only consider the status flag instead of vdev->started. Since this
function is only used in the various vhost_*_set_status() functions
for exactly the same purpose, it should be fine to fix it in this
central place there without any risk to change the behavior of other
code.
Fixes: 9f6bcfd99f ("hw/virtio: move vm_running check to virtio_device_started")
Buglink: https://issues.redhat.com/browse/RHEL-40708
Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-Id: <20240618121958.88673-1-thuth@redhat.com>
Reviewed-by: Manos Pitsidianakis <manos.pitsidianakis@linaro.org>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
(cherry picked from commit d72479b11797c28893e1e3fc565497a9cae5ca16)
Signed-off-by: Thomas Huth <thuth@redhat.com>
---
include/hw/virtio/virtio.h | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index 7d5ffdc145..2eafad17b8 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -470,9 +470,9 @@ static inline bool virtio_device_started(VirtIODevice *vdev, uint8_t status)
* @vdev - the VirtIO device
* @status - the devices status bits
*
- * This is similar to virtio_device_started() but also encapsulates a
- * check on the VM status which would prevent a device starting
- * anyway.
+ * This is similar to virtio_device_started() but ignores vdev->started
+ * and also encapsulates a check on the VM status which would prevent a
+ * device from starting anyway.
*/
static inline bool virtio_device_should_start(VirtIODevice *vdev, uint8_t status)
{
@@ -480,7 +480,7 @@ static inline bool virtio_device_should_start(VirtIODevice *vdev, uint8_t status
return false;
}
- return virtio_device_started(vdev, status);
+ return status & VIRTIO_CONFIG_S_DRIVER_OK;
}
static inline void virtio_set_started(VirtIODevice *vdev, bool started)
--
2.39.3

@ -0,0 +1,68 @@
From f572a40924c7138072e387111d0f092185972477 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 9 May 2024 19:00:39 +0200
Subject: [PATCH 044/100] i386: correctly select code in hw/i386 that depends
on other components
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [44/91] 1327a5eb2b91edacf56cc4e93255cad456abbbeb (bonzini/rhel-qemu-kvm)
fw_cfg.c and vapic.c are currently included unconditionally but
depend on other components. vapic.c depends on the local APIC,
while fw_cfg.c includes a piece of AML builder code that depends
on CONFIG_ACPI.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Zhao Liu <zhao1.liu@intel.com>
Message-ID: <20240509170044.190795-9-pbonzini@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit 7974e51342775c87f6e759a8c525db1045ddfa24)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
hw/i386/fw_cfg.c | 2 ++
hw/i386/meson.build | 2 +-
2 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/hw/i386/fw_cfg.c b/hw/i386/fw_cfg.c
index 283c3f4c16..7f97d40616 100644
--- a/hw/i386/fw_cfg.c
+++ b/hw/i386/fw_cfg.c
@@ -204,6 +204,7 @@ void fw_cfg_build_feature_control(MachineState *ms, FWCfgState *fw_cfg)
fw_cfg_add_file(fw_cfg, "etc/msr_feature_control", val, sizeof(*val));
}
+#ifdef CONFIG_ACPI
void fw_cfg_add_acpi_dsdt(Aml *scope, FWCfgState *fw_cfg)
{
/*
@@ -230,3 +231,4 @@ void fw_cfg_add_acpi_dsdt(Aml *scope, FWCfgState *fw_cfg)
aml_append(dev, aml_name_decl("_CRS", crs));
aml_append(scope, dev);
}
+#endif
diff --git a/hw/i386/meson.build b/hw/i386/meson.build
index d8b70ef3e9..d9da676038 100644
--- a/hw/i386/meson.build
+++ b/hw/i386/meson.build
@@ -1,12 +1,12 @@
i386_ss = ss.source_set()
i386_ss.add(files(
'fw_cfg.c',
- 'vapic.c',
'e820_memory_layout.c',
'multiboot.c',
'x86.c',
))
+i386_ss.add(when: 'CONFIG_APIC', if_true: files('vapic.c'))
i386_ss.add(when: 'CONFIG_X86_IOMMU', if_true: files('x86-iommu.c'),
if_false: files('x86-iommu-stub.c'))
i386_ss.add(when: 'CONFIG_AMD_IOMMU', if_true: files('amd_iommu.c'),
--
2.39.3

@ -0,0 +1,40 @@
From 127f3c60668e1bd08ec00856a317cb841adf0440 Mon Sep 17 00:00:00 2001
From: Michael Roth <michael.roth@amd.com>
Date: Thu, 30 May 2024 06:16:23 -0500
Subject: [PATCH 063/100] i386/cpu: Set SEV-SNP CPUID bit when SNP enabled
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [63/91] 0f834a6897c5cdc0e29a5b1862e621f8ce309657 (bonzini/rhel-qemu-kvm)
SNP guests will rely on this bit to determine certain feature support.
Signed-off-by: Michael Roth <michael.roth@amd.com>
Signed-off-by: Pankaj Gupta <pankaj.gupta@amd.com>
Message-ID: <20240530111643.1091816-12-pankaj.gupta@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit 7831221941cccbde922412c1550ed8b4bce7c361)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
target/i386/cpu.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 489c853b42..13737cd703 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -6822,6 +6822,7 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count,
if (sev_enabled()) {
*eax = 0x2;
*eax |= sev_es_enabled() ? 0x8 : 0;
+ *eax |= sev_snp_enabled() ? 0x10 : 0;
*ebx = sev_get_cbit_position() & 0x3f; /* EBX[5:0] */
*ebx |= (sev_get_reduced_phys_bits() & 0x3f) << 6; /* EBX[11:6] */
}
--
2.39.3

@ -0,0 +1,145 @@
From 14aa42bbacde75b2ce9a59d1267f73d613026461 Mon Sep 17 00:00:00 2001
From: Michael Roth <michael.roth@amd.com>
Date: Thu, 30 May 2024 06:16:42 -0500
Subject: [PATCH 076/100] i386/kvm: Add KVM_EXIT_HYPERCALL handling for
KVM_HC_MAP_GPA_RANGE
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [76/91] 3e1201c330dc826af1ec4650974d47053270eb16 (bonzini/rhel-qemu-kvm)
KVM_HC_MAP_GPA_RANGE will be used to send requests to userspace for
private/shared memory attribute updates requested by the guest.
Implement handling for that use-case along with some basic
infrastructure for enabling specific hypercall events.
Signed-off-by: Michael Roth <michael.roth@amd.com>
Signed-off-by: Pankaj Gupta <pankaj.gupta@amd.com>
Message-ID: <20240530111643.1091816-31-pankaj.gupta@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit 47e76d03b155e43beca550251a6eb7ea926c059f)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
target/i386/kvm/kvm.c | 55 ++++++++++++++++++++++++++++++++++++
target/i386/kvm/kvm_i386.h | 1 +
target/i386/kvm/trace-events | 1 +
3 files changed, 57 insertions(+)
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 75e75d9772..2935e3931a 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -21,6 +21,7 @@
#include <sys/syscall.h>
#include <linux/kvm.h>
+#include <linux/kvm_para.h>
#include "standard-headers/asm-x86/kvm_para.h"
#include "hw/xen/interface/arch-x86/cpuid.h"
@@ -208,6 +209,13 @@ int kvm_get_vm_type(MachineState *ms)
return kvm_type;
}
+bool kvm_enable_hypercall(uint64_t enable_mask)
+{
+ KVMState *s = KVM_STATE(current_accel());
+
+ return !kvm_vm_enable_cap(s, KVM_CAP_EXIT_HYPERCALL, 0, enable_mask);
+}
+
bool kvm_has_smm(void)
{
return kvm_vm_check_extension(kvm_state, KVM_CAP_X86_SMM);
@@ -5325,6 +5333,50 @@ static bool host_supports_vmx(void)
return ecx & CPUID_EXT_VMX;
}
+/*
+ * Currently the handling here only supports use of KVM_HC_MAP_GPA_RANGE
+ * to service guest-initiated memory attribute update requests so that
+ * KVM_SET_MEMORY_ATTRIBUTES can update whether or not a page should be
+ * backed by the private memory pool provided by guest_memfd, and as such
+ * is only applicable to guest_memfd-backed guests (e.g. SNP/TDX).
+ *
+ * Other use-cases for KVM_HC_MAP_GPA_RANGE, such as for SEV live
+ * migration, are not implemented here currently.
+ *
+ * For the guest_memfd use-case, these exits will generally be synthesized
+ * by KVM based on platform-specific hypercalls, like GHCB requests in the
+ * case of SEV-SNP, and not issued directly within the guest through the
+ * KVM_HC_MAP_GPA_RANGE hypercall. So in this case, KVM_HC_MAP_GPA_RANGE is
+ * not actually advertised to guests via the KVM CPUID feature bit, as
+ * opposed to SEV live migration where it would be. Since it is unlikely the
+ * SEV live migration use-case would be useful for guest-memfd backed guests,
+ * because private/shared page tracking is already provided through other
+ * means, these 2 use-cases should be treated as being mutually-exclusive.
+ */
+static int kvm_handle_hc_map_gpa_range(struct kvm_run *run)
+{
+ uint64_t gpa, size, attributes;
+
+ if (!machine_require_guest_memfd(current_machine))
+ return -EINVAL;
+
+ gpa = run->hypercall.args[0];
+ size = run->hypercall.args[1] * TARGET_PAGE_SIZE;
+ attributes = run->hypercall.args[2];
+
+ trace_kvm_hc_map_gpa_range(gpa, size, attributes, run->hypercall.flags);
+
+ return kvm_convert_memory(gpa, size, attributes & KVM_MAP_GPA_RANGE_ENCRYPTED);
+}
+
+static int kvm_handle_hypercall(struct kvm_run *run)
+{
+ if (run->hypercall.nr == KVM_HC_MAP_GPA_RANGE)
+ return kvm_handle_hc_map_gpa_range(run);
+
+ return -EINVAL;
+}
+
#define VMX_INVALID_GUEST_STATE 0x80000021
int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
@@ -5420,6 +5472,9 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
ret = kvm_xen_handle_exit(cpu, &run->xen);
break;
#endif
+ case KVM_EXIT_HYPERCALL:
+ ret = kvm_handle_hypercall(run);
+ break;
default:
fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
ret = -1;
diff --git a/target/i386/kvm/kvm_i386.h b/target/i386/kvm/kvm_i386.h
index 6b44844d95..34fc60774b 100644
--- a/target/i386/kvm/kvm_i386.h
+++ b/target/i386/kvm/kvm_i386.h
@@ -33,6 +33,7 @@
bool kvm_has_smm(void);
bool kvm_enable_x2apic(void);
bool kvm_hv_vpindex_settable(void);
+bool kvm_enable_hypercall(uint64_t enable_mask);
bool kvm_enable_sgx_provisioning(KVMState *s);
bool kvm_hyperv_expand_features(X86CPU *cpu, Error **errp);
diff --git a/target/i386/kvm/trace-events b/target/i386/kvm/trace-events
index b365a8e8e2..74a6234ff7 100644
--- a/target/i386/kvm/trace-events
+++ b/target/i386/kvm/trace-events
@@ -5,6 +5,7 @@ kvm_x86_fixup_msi_error(uint32_t gsi) "VT-d failed to remap interrupt for GSI %"
kvm_x86_add_msi_route(int virq) "Adding route entry for virq %d"
kvm_x86_remove_msi_route(int virq) "Removing route entry for virq %d"
kvm_x86_update_msi_routes(int num) "Updated %d MSI routes"
+kvm_hc_map_gpa_range(uint64_t gpa, uint64_t size, uint64_t attributes, uint64_t flags) "gpa 0x%" PRIx64 " size 0x%" PRIx64 " attributes 0x%" PRIx64 " flags 0x%" PRIx64
# xen-emu.c
kvm_xen_hypercall(int cpu, uint8_t cpl, uint64_t input, uint64_t a0, uint64_t a1, uint64_t a2, uint64_t ret) "xen_hypercall: cpu %d cpl %d input %" PRIu64 " a0 0x%" PRIx64 " a1 0x%" PRIx64 " a2 0x%" PRIx64" ret 0x%" PRIx64
--
2.39.3

@ -0,0 +1,536 @@
From 5ead79f45e8e90b7a04586c89e70cb9d0b66b730 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Thu, 29 Feb 2024 01:36:43 -0500
Subject: [PATCH 004/100] i386/kvm: Move architectural CPUID leaf generation to
separate helper
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [4/91] 06ecdbcf05ad3d658273980b114f02477d0b0475 (bonzini/rhel-qemu-kvm)
Move the architectural (for lack of a better term) CPUID leaf generation
to a separate helper so that the generation code can be reused by TDX,
which needs to generate a canonical VM-scoped configuration.
For now this is just a cleanup, so keep the function static.
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
Message-ID: <20240229063726.610065-23-xiaoyao.li@intel.com>
Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit a5acf4f26c208a05d05ef1bde65553ce2ab5e5d0)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
target/i386/kvm/kvm.c | 417 +++++++++++++++++++++---------------------
1 file changed, 211 insertions(+), 206 deletions(-)
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 739f33db47..5f30b649a0 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -1706,195 +1706,22 @@ static void kvm_init_nested_state(CPUX86State *env)
}
}
-int kvm_arch_init_vcpu(CPUState *cs)
+static uint32_t kvm_x86_build_cpuid(CPUX86State *env,
+ struct kvm_cpuid_entry2 *entries,
+ uint32_t cpuid_i)
{
- struct {
- struct kvm_cpuid2 cpuid;
- struct kvm_cpuid_entry2 entries[KVM_MAX_CPUID_ENTRIES];
- } cpuid_data;
- /*
- * The kernel defines these structs with padding fields so there
- * should be no extra padding in our cpuid_data struct.
- */
- QEMU_BUILD_BUG_ON(sizeof(cpuid_data) !=
- sizeof(struct kvm_cpuid2) +
- sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES);
-
- X86CPU *cpu = X86_CPU(cs);
- CPUX86State *env = &cpu->env;
- uint32_t limit, i, j, cpuid_i;
+ uint32_t limit, i, j;
uint32_t unused;
struct kvm_cpuid_entry2 *c;
- uint32_t signature[3];
- int kvm_base = KVM_CPUID_SIGNATURE;
- int max_nested_state_len;
- int r;
- Error *local_err = NULL;
-
- memset(&cpuid_data, 0, sizeof(cpuid_data));
-
- cpuid_i = 0;
-
- has_xsave2 = kvm_check_extension(cs->kvm_state, KVM_CAP_XSAVE2);
-
- r = kvm_arch_set_tsc_khz(cs);
- if (r < 0) {
- return r;
- }
-
- /* vcpu's TSC frequency is either specified by user, or following
- * the value used by KVM if the former is not present. In the
- * latter case, we query it from KVM and record in env->tsc_khz,
- * so that vcpu's TSC frequency can be migrated later via this field.
- */
- if (!env->tsc_khz) {
- r = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
- kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) :
- -ENOTSUP;
- if (r > 0) {
- env->tsc_khz = r;
- }
- }
-
- env->apic_bus_freq = KVM_APIC_BUS_FREQUENCY;
-
- /*
- * kvm_hyperv_expand_features() is called here for the second time in case
- * KVM_CAP_SYS_HYPERV_CPUID is not supported. While we can't possibly handle
- * 'query-cpu-model-expansion' in this case as we don't have a KVM vCPU to
- * check which Hyper-V enlightenments are supported and which are not, we
- * can still proceed and check/expand Hyper-V enlightenments here so legacy
- * behavior is preserved.
- */
- if (!kvm_hyperv_expand_features(cpu, &local_err)) {
- error_report_err(local_err);
- return -ENOSYS;
- }
-
- if (hyperv_enabled(cpu)) {
- r = hyperv_init_vcpu(cpu);
- if (r) {
- return r;
- }
-
- cpuid_i = hyperv_fill_cpuids(cs, cpuid_data.entries);
- kvm_base = KVM_CPUID_SIGNATURE_NEXT;
- has_msr_hv_hypercall = true;
- }
-
- if (cs->kvm_state->xen_version) {
-#ifdef CONFIG_XEN_EMU
- struct kvm_cpuid_entry2 *xen_max_leaf;
-
- memcpy(signature, "XenVMMXenVMM", 12);
-
- xen_max_leaf = c = &cpuid_data.entries[cpuid_i++];
- c->function = kvm_base + XEN_CPUID_SIGNATURE;
- c->eax = kvm_base + XEN_CPUID_TIME;
- c->ebx = signature[0];
- c->ecx = signature[1];
- c->edx = signature[2];
-
- c = &cpuid_data.entries[cpuid_i++];
- c->function = kvm_base + XEN_CPUID_VENDOR;
- c->eax = cs->kvm_state->xen_version;
- c->ebx = 0;
- c->ecx = 0;
- c->edx = 0;
-
- c = &cpuid_data.entries[cpuid_i++];
- c->function = kvm_base + XEN_CPUID_HVM_MSR;
- /* Number of hypercall-transfer pages */
- c->eax = 1;
- /* Hypercall MSR base address */
- if (hyperv_enabled(cpu)) {
- c->ebx = XEN_HYPERCALL_MSR_HYPERV;
- kvm_xen_init(cs->kvm_state, c->ebx);
- } else {
- c->ebx = XEN_HYPERCALL_MSR;
- }
- c->ecx = 0;
- c->edx = 0;
-
- c = &cpuid_data.entries[cpuid_i++];
- c->function = kvm_base + XEN_CPUID_TIME;
- c->eax = ((!!tsc_is_stable_and_known(env) << 1) |
- (!!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP) << 2));
- /* default=0 (emulate if necessary) */
- c->ebx = 0;
- /* guest tsc frequency */
- c->ecx = env->user_tsc_khz;
- /* guest tsc incarnation (migration count) */
- c->edx = 0;
-
- c = &cpuid_data.entries[cpuid_i++];
- c->function = kvm_base + XEN_CPUID_HVM;
- xen_max_leaf->eax = kvm_base + XEN_CPUID_HVM;
- if (cs->kvm_state->xen_version >= XEN_VERSION(4, 5)) {
- c->function = kvm_base + XEN_CPUID_HVM;
-
- if (cpu->xen_vapic) {
- c->eax |= XEN_HVM_CPUID_APIC_ACCESS_VIRT;
- c->eax |= XEN_HVM_CPUID_X2APIC_VIRT;
- }
-
- c->eax |= XEN_HVM_CPUID_IOMMU_MAPPINGS;
-
- if (cs->kvm_state->xen_version >= XEN_VERSION(4, 6)) {
- c->eax |= XEN_HVM_CPUID_VCPU_ID_PRESENT;
- c->ebx = cs->cpu_index;
- }
-
- if (cs->kvm_state->xen_version >= XEN_VERSION(4, 17)) {
- c->eax |= XEN_HVM_CPUID_UPCALL_VECTOR;
- }
- }
-
- r = kvm_xen_init_vcpu(cs);
- if (r) {
- return r;
- }
-
- kvm_base += 0x100;
-#else /* CONFIG_XEN_EMU */
- /* This should never happen as kvm_arch_init() would have died first. */
- fprintf(stderr, "Cannot enable Xen CPUID without Xen support\n");
- abort();
-#endif
- } else if (cpu->expose_kvm) {
- memcpy(signature, "KVMKVMKVM\0\0\0", 12);
- c = &cpuid_data.entries[cpuid_i++];
- c->function = KVM_CPUID_SIGNATURE | kvm_base;
- c->eax = KVM_CPUID_FEATURES | kvm_base;
- c->ebx = signature[0];
- c->ecx = signature[1];
- c->edx = signature[2];
-
- c = &cpuid_data.entries[cpuid_i++];
- c->function = KVM_CPUID_FEATURES | kvm_base;
- c->eax = env->features[FEAT_KVM];
- c->edx = env->features[FEAT_KVM_HINTS];
- }
cpu_x86_cpuid(env, 0, 0, &limit, &unused, &unused, &unused);
- if (cpu->kvm_pv_enforce_cpuid) {
- r = kvm_vcpu_enable_cap(cs, KVM_CAP_ENFORCE_PV_FEATURE_CPUID, 0, 1);
- if (r < 0) {
- fprintf(stderr,
- "failed to enable KVM_CAP_ENFORCE_PV_FEATURE_CPUID: %s",
- strerror(-r));
- abort();
- }
- }
-
for (i = 0; i <= limit; i++) {
+ j = 0;
if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
- fprintf(stderr, "unsupported level value: 0x%x\n", limit);
- abort();
+ goto full;
}
- c = &cpuid_data.entries[cpuid_i++];
-
+ c = &entries[cpuid_i++];
switch (i) {
case 2: {
/* Keep reading function 2 till all the input is received */
@@ -1908,11 +1735,9 @@ int kvm_arch_init_vcpu(CPUState *cs)
for (j = 1; j < times; ++j) {
if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
- fprintf(stderr, "cpuid_data is full, no space for "
- "cpuid(eax:2):eax & 0xf = 0x%x\n", times);
- abort();
+ goto full;
}
- c = &cpuid_data.entries[cpuid_i++];
+ c = &entries[cpuid_i++];
c->function = i;
c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC;
cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
@@ -1951,11 +1776,9 @@ int kvm_arch_init_vcpu(CPUState *cs)
continue;
}
if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
- fprintf(stderr, "cpuid_data is full, no space for "
- "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
- abort();
+ goto full;
}
- c = &cpuid_data.entries[cpuid_i++];
+ c = &entries[cpuid_i++];
}
break;
case 0x12:
@@ -1970,11 +1793,9 @@ int kvm_arch_init_vcpu(CPUState *cs)
}
if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
- fprintf(stderr, "cpuid_data is full, no space for "
- "cpuid(eax:0x12,ecx:0x%x)\n", j);
- abort();
+ goto full;
}
- c = &cpuid_data.entries[cpuid_i++];
+ c = &entries[cpuid_i++];
}
break;
case 0x7:
@@ -1991,11 +1812,9 @@ int kvm_arch_init_vcpu(CPUState *cs)
for (j = 1; j <= times; ++j) {
if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
- fprintf(stderr, "cpuid_data is full, no space for "
- "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
- abort();
+ goto full;
}
- c = &cpuid_data.entries[cpuid_i++];
+ c = &entries[cpuid_i++];
c->function = i;
c->index = j;
c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
@@ -2048,11 +1867,11 @@ int kvm_arch_init_vcpu(CPUState *cs)
cpu_x86_cpuid(env, 0x80000000, 0, &limit, &unused, &unused, &unused);
for (i = 0x80000000; i <= limit; i++) {
+ j = 0;
if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
- fprintf(stderr, "unsupported xlevel value: 0x%x\n", limit);
- abort();
+ goto full;
}
- c = &cpuid_data.entries[cpuid_i++];
+ c = &entries[cpuid_i++];
switch (i) {
case 0x8000001d:
@@ -2067,11 +1886,9 @@ int kvm_arch_init_vcpu(CPUState *cs)
break;
}
if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
- fprintf(stderr, "cpuid_data is full, no space for "
- "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
- abort();
+ goto full;
}
- c = &cpuid_data.entries[cpuid_i++];
+ c = &entries[cpuid_i++];
}
break;
default:
@@ -2094,11 +1911,11 @@ int kvm_arch_init_vcpu(CPUState *cs)
cpu_x86_cpuid(env, 0xC0000000, 0, &limit, &unused, &unused, &unused);
for (i = 0xC0000000; i <= limit; i++) {
+ j = 0;
if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
- fprintf(stderr, "unsupported xlevel2 value: 0x%x\n", limit);
- abort();
+ goto full;
}
- c = &cpuid_data.entries[cpuid_i++];
+ c = &entries[cpuid_i++];
c->function = i;
c->flags = 0;
@@ -2106,6 +1923,194 @@ int kvm_arch_init_vcpu(CPUState *cs)
}
}
+ return cpuid_i;
+
+full:
+ fprintf(stderr, "cpuid_data is full, no space for "
+ "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
+ abort();
+}
+
+int kvm_arch_init_vcpu(CPUState *cs)
+{
+ struct {
+ struct kvm_cpuid2 cpuid;
+ struct kvm_cpuid_entry2 entries[KVM_MAX_CPUID_ENTRIES];
+ } cpuid_data;
+ /*
+ * The kernel defines these structs with padding fields so there
+ * should be no extra padding in our cpuid_data struct.
+ */
+ QEMU_BUILD_BUG_ON(sizeof(cpuid_data) !=
+ sizeof(struct kvm_cpuid2) +
+ sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES);
+
+ X86CPU *cpu = X86_CPU(cs);
+ CPUX86State *env = &cpu->env;
+ uint32_t cpuid_i;
+ struct kvm_cpuid_entry2 *c;
+ uint32_t signature[3];
+ int kvm_base = KVM_CPUID_SIGNATURE;
+ int max_nested_state_len;
+ int r;
+ Error *local_err = NULL;
+
+ memset(&cpuid_data, 0, sizeof(cpuid_data));
+
+ cpuid_i = 0;
+
+ has_xsave2 = kvm_check_extension(cs->kvm_state, KVM_CAP_XSAVE2);
+
+ r = kvm_arch_set_tsc_khz(cs);
+ if (r < 0) {
+ return r;
+ }
+
+ /* vcpu's TSC frequency is either specified by user, or following
+ * the value used by KVM if the former is not present. In the
+ * latter case, we query it from KVM and record in env->tsc_khz,
+ * so that vcpu's TSC frequency can be migrated later via this field.
+ */
+ if (!env->tsc_khz) {
+ r = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
+ kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) :
+ -ENOTSUP;
+ if (r > 0) {
+ env->tsc_khz = r;
+ }
+ }
+
+ env->apic_bus_freq = KVM_APIC_BUS_FREQUENCY;
+
+ /*
+ * kvm_hyperv_expand_features() is called here for the second time in case
+ * KVM_CAP_SYS_HYPERV_CPUID is not supported. While we can't possibly handle
+ * 'query-cpu-model-expansion' in this case as we don't have a KVM vCPU to
+ * check which Hyper-V enlightenments are supported and which are not, we
+ * can still proceed and check/expand Hyper-V enlightenments here so legacy
+ * behavior is preserved.
+ */
+ if (!kvm_hyperv_expand_features(cpu, &local_err)) {
+ error_report_err(local_err);
+ return -ENOSYS;
+ }
+
+ if (hyperv_enabled(cpu)) {
+ r = hyperv_init_vcpu(cpu);
+ if (r) {
+ return r;
+ }
+
+ cpuid_i = hyperv_fill_cpuids(cs, cpuid_data.entries);
+ kvm_base = KVM_CPUID_SIGNATURE_NEXT;
+ has_msr_hv_hypercall = true;
+ }
+
+ if (cs->kvm_state->xen_version) {
+#ifdef CONFIG_XEN_EMU
+ struct kvm_cpuid_entry2 *xen_max_leaf;
+
+ memcpy(signature, "XenVMMXenVMM", 12);
+
+ xen_max_leaf = c = &cpuid_data.entries[cpuid_i++];
+ c->function = kvm_base + XEN_CPUID_SIGNATURE;
+ c->eax = kvm_base + XEN_CPUID_TIME;
+ c->ebx = signature[0];
+ c->ecx = signature[1];
+ c->edx = signature[2];
+
+ c = &cpuid_data.entries[cpuid_i++];
+ c->function = kvm_base + XEN_CPUID_VENDOR;
+ c->eax = cs->kvm_state->xen_version;
+ c->ebx = 0;
+ c->ecx = 0;
+ c->edx = 0;
+
+ c = &cpuid_data.entries[cpuid_i++];
+ c->function = kvm_base + XEN_CPUID_HVM_MSR;
+ /* Number of hypercall-transfer pages */
+ c->eax = 1;
+ /* Hypercall MSR base address */
+ if (hyperv_enabled(cpu)) {
+ c->ebx = XEN_HYPERCALL_MSR_HYPERV;
+ kvm_xen_init(cs->kvm_state, c->ebx);
+ } else {
+ c->ebx = XEN_HYPERCALL_MSR;
+ }
+ c->ecx = 0;
+ c->edx = 0;
+
+ c = &cpuid_data.entries[cpuid_i++];
+ c->function = kvm_base + XEN_CPUID_TIME;
+ c->eax = ((!!tsc_is_stable_and_known(env) << 1) |
+ (!!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP) << 2));
+ /* default=0 (emulate if necessary) */
+ c->ebx = 0;
+ /* guest tsc frequency */
+ c->ecx = env->user_tsc_khz;
+ /* guest tsc incarnation (migration count) */
+ c->edx = 0;
+
+ c = &cpuid_data.entries[cpuid_i++];
+ c->function = kvm_base + XEN_CPUID_HVM;
+ xen_max_leaf->eax = kvm_base + XEN_CPUID_HVM;
+ if (cs->kvm_state->xen_version >= XEN_VERSION(4, 5)) {
+ c->function = kvm_base + XEN_CPUID_HVM;
+
+ if (cpu->xen_vapic) {
+ c->eax |= XEN_HVM_CPUID_APIC_ACCESS_VIRT;
+ c->eax |= XEN_HVM_CPUID_X2APIC_VIRT;
+ }
+
+ c->eax |= XEN_HVM_CPUID_IOMMU_MAPPINGS;
+
+ if (cs->kvm_state->xen_version >= XEN_VERSION(4, 6)) {
+ c->eax |= XEN_HVM_CPUID_VCPU_ID_PRESENT;
+ c->ebx = cs->cpu_index;
+ }
+
+ if (cs->kvm_state->xen_version >= XEN_VERSION(4, 17)) {
+ c->eax |= XEN_HVM_CPUID_UPCALL_VECTOR;
+ }
+ }
+
+ r = kvm_xen_init_vcpu(cs);
+ if (r) {
+ return r;
+ }
+
+ kvm_base += 0x100;
+#else /* CONFIG_XEN_EMU */
+ /* This should never happen as kvm_arch_init() would have died first. */
+ fprintf(stderr, "Cannot enable Xen CPUID without Xen support\n");
+ abort();
+#endif
+ } else if (cpu->expose_kvm) {
+ memcpy(signature, "KVMKVMKVM\0\0\0", 12);
+ c = &cpuid_data.entries[cpuid_i++];
+ c->function = KVM_CPUID_SIGNATURE | kvm_base;
+ c->eax = KVM_CPUID_FEATURES | kvm_base;
+ c->ebx = signature[0];
+ c->ecx = signature[1];
+ c->edx = signature[2];
+
+ c = &cpuid_data.entries[cpuid_i++];
+ c->function = KVM_CPUID_FEATURES | kvm_base;
+ c->eax = env->features[FEAT_KVM];
+ c->edx = env->features[FEAT_KVM_HINTS];
+ }
+
+ if (cpu->kvm_pv_enforce_cpuid) {
+ r = kvm_vcpu_enable_cap(cs, KVM_CAP_ENFORCE_PV_FEATURE_CPUID, 0, 1);
+ if (r < 0) {
+ fprintf(stderr,
+ "failed to enable KVM_CAP_ENFORCE_PV_FEATURE_CPUID: %s",
+ strerror(-r));
+ abort();
+ }
+ }
+
+ cpuid_i = kvm_x86_build_cpuid(env, cpuid_data.entries, cpuid_i);
cpuid_data.cpuid.nent = cpuid_i;
if (((env->cpuid_version >> 8)&0xF) >= 6
--
2.39.3

@ -0,0 +1,91 @@
From 03e275023b482ac79b4f92ca4ceef6de3caa634f Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 9 May 2024 19:00:40 +0200
Subject: [PATCH 045/100] i386: pc: remove unnecessary MachineClass overrides
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [45/91] c03d5b57014d0d02f6ce0cdfb19a34996d100dea (bonzini/rhel-qemu-kvm)
There is no need to override these fields of MachineClass because they are
already set to the right value in the superclass.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Zhao Liu <zhao1.liu@intel.com>
Message-ID: <20240509170044.190795-10-pbonzini@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit b348fdcdac9f9fc70be9ae56c54e41765e9aae24)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
hw/i386/pc.c | 3 ---
hw/i386/x86.c | 6 +++---
include/hw/i386/x86.h | 4 ----
3 files changed, 3 insertions(+), 10 deletions(-)
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 660a59c63b..0aca0cc79e 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1979,9 +1979,6 @@ static void pc_machine_class_init(ObjectClass *oc, void *data)
mc->async_pf_vmexit_disable = false;
mc->get_hotplug_handler = pc_get_hotplug_handler;
mc->hotplug_allowed = pc_hotplug_allowed;
- mc->cpu_index_to_instance_props = x86_cpu_index_to_props;
- mc->get_default_cpu_node_id = x86_get_default_cpu_node_id;
- mc->possible_cpu_arch_ids = x86_possible_cpu_arch_ids;
mc->auto_enable_numa_with_memhp = true;
mc->auto_enable_numa_with_memdev = true;
mc->has_hotpluggable_cpus = true;
diff --git a/hw/i386/x86.c b/hw/i386/x86.c
index c61f4ebfa6..fcef652c1e 100644
--- a/hw/i386/x86.c
+++ b/hw/i386/x86.c
@@ -443,7 +443,7 @@ void x86_cpu_pre_plug(HotplugHandler *hotplug_dev,
numa_cpu_pre_plug(cpu_slot, dev, errp);
}
-CpuInstanceProperties
+static CpuInstanceProperties
x86_cpu_index_to_props(MachineState *ms, unsigned cpu_index)
{
MachineClass *mc = MACHINE_GET_CLASS(ms);
@@ -453,7 +453,7 @@ x86_cpu_index_to_props(MachineState *ms, unsigned cpu_index)
return possible_cpus->cpus[cpu_index].props;
}
-int64_t x86_get_default_cpu_node_id(const MachineState *ms, int idx)
+static int64_t x86_get_default_cpu_node_id(const MachineState *ms, int idx)
{
X86CPUTopoIDs topo_ids;
X86MachineState *x86ms = X86_MACHINE(ms);
@@ -467,7 +467,7 @@ int64_t x86_get_default_cpu_node_id(const MachineState *ms, int idx)
return topo_ids.pkg_id % ms->numa_state->num_nodes;
}
-const CPUArchIdList *x86_possible_cpu_arch_ids(MachineState *ms)
+static const CPUArchIdList *x86_possible_cpu_arch_ids(MachineState *ms)
{
X86MachineState *x86ms = X86_MACHINE(ms);
unsigned int max_cpus = ms->smp.max_cpus;
diff --git a/include/hw/i386/x86.h b/include/hw/i386/x86.h
index d7b7d3f3ce..c2062db13f 100644
--- a/include/hw/i386/x86.h
+++ b/include/hw/i386/x86.h
@@ -114,10 +114,6 @@ uint32_t x86_cpu_apic_id_from_index(X86MachineState *pcms,
void x86_cpu_new(X86MachineState *pcms, int64_t apic_id, Error **errp);
void x86_cpus_init(X86MachineState *pcms, int default_cpu_version);
-CpuInstanceProperties x86_cpu_index_to_props(MachineState *ms,
- unsigned cpu_index);
-int64_t x86_get_default_cpu_node_id(const MachineState *ms, int idx);
-const CPUArchIdList *x86_possible_cpu_arch_ids(MachineState *ms);
CPUArchId *x86_find_cpu_slot(MachineState *ms, uint32_t id, int *idx);
void x86_rtc_set_cpus_count(ISADevice *rtc, uint16_t cpus_count);
void x86_cpu_pre_plug(HotplugHandler *hotplug_dev,
--
2.39.3

@ -0,0 +1,116 @@
From 652793962000d6906e219ceae36348a476b78c28 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 31 May 2024 12:44:44 +0200
Subject: [PATCH 065/100] i386/sev: Add a class method to determine KVM VM type
for SNP guests
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [65/91] c6cbeac0a6f691138df212b80efaa9b1143fdaa8 (bonzini/rhel-qemu-kvm)
SEV guests can use either KVM_X86_DEFAULT_VM, KVM_X86_SEV_VM,
or KVM_X86_SEV_ES_VM depending on the configuration and what
the host kernel supports. SNP guests on the other hand can only
ever use KVM_X86_SNP_VM, so split determination of VM type out
into a separate class method that can be set accordingly for
sev-guest vs. sev-snp-guest objects and add handling for SNP.
Signed-off-by: Pankaj Gupta <pankaj.gupta@amd.com>
Message-ID: <20240530111643.1091816-14-pankaj.gupta@amd.com>
[Remove unnecessary function pointer declaration. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit a808132f6d8e855bd83a400570ec91d2e00bebe3)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
target/i386/kvm/kvm.c | 1 +
target/i386/sev.c | 15 ++++++++++++---
2 files changed, 13 insertions(+), 3 deletions(-)
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 408568d053..75e75d9772 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -166,6 +166,7 @@ static const char *vm_type_name[] = {
[KVM_X86_DEFAULT_VM] = "default",
[KVM_X86_SEV_VM] = "SEV",
[KVM_X86_SEV_ES_VM] = "SEV-ES",
+ [KVM_X86_SNP_VM] = "SEV-SNP",
};
bool kvm_is_vm_type_supported(int type)
diff --git a/target/i386/sev.c b/target/i386/sev.c
index c3daaf1ad5..072cc4f853 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -885,6 +885,11 @@ out:
return sev_common->kvm_type;
}
+static int sev_snp_kvm_type(X86ConfidentialGuest *cg)
+{
+ return KVM_X86_SNP_VM;
+}
+
static int sev_common_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
{
char *devname;
@@ -894,6 +899,8 @@ static int sev_common_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
struct sev_user_data_status status = {};
SevCommonState *sev_common = SEV_COMMON(cgs);
SevCommonStateClass *klass = SEV_COMMON_GET_CLASS(cgs);
+ X86ConfidentialGuestClass *x86_klass =
+ X86_CONFIDENTIAL_GUEST_GET_CLASS(cgs);
sev_common->state = SEV_STATE_UNINIT;
@@ -964,7 +971,7 @@ static int sev_common_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
}
trace_kvm_sev_init();
- if (sev_kvm_type(X86_CONFIDENTIAL_GUEST(sev_common)) == KVM_X86_DEFAULT_VM) {
+ if (x86_klass->kvm_type(X86_CONFIDENTIAL_GUEST(sev_common)) == KVM_X86_DEFAULT_VM) {
cmd = sev_es_enabled() ? KVM_SEV_ES_INIT : KVM_SEV_INIT;
ret = sev_ioctl(sev_common->sev_fd, cmd, NULL, &fw_error);
@@ -1441,10 +1448,8 @@ static void
sev_common_class_init(ObjectClass *oc, void *data)
{
ConfidentialGuestSupportClass *klass = CONFIDENTIAL_GUEST_SUPPORT_CLASS(oc);
- X86ConfidentialGuestClass *x86_klass = X86_CONFIDENTIAL_GUEST_CLASS(oc);
klass->kvm_init = sev_common_kvm_init;
- x86_klass->kvm_type = sev_kvm_type;
object_class_property_add_str(oc, "sev-device",
sev_common_get_sev_device,
@@ -1529,10 +1534,12 @@ static void
sev_guest_class_init(ObjectClass *oc, void *data)
{
SevCommonStateClass *klass = SEV_COMMON_CLASS(oc);
+ X86ConfidentialGuestClass *x86_klass = X86_CONFIDENTIAL_GUEST_CLASS(oc);
klass->launch_start = sev_launch_start;
klass->launch_finish = sev_launch_finish;
klass->kvm_init = sev_kvm_init;
+ x86_klass->kvm_type = sev_kvm_type;
object_class_property_add_str(oc, "dh-cert-file",
sev_guest_get_dh_cert_file,
@@ -1770,8 +1777,10 @@ static void
sev_snp_guest_class_init(ObjectClass *oc, void *data)
{
SevCommonStateClass *klass = SEV_COMMON_CLASS(oc);
+ X86ConfidentialGuestClass *x86_klass = X86_CONFIDENTIAL_GUEST_CLASS(oc);
klass->kvm_init = sev_snp_kvm_init;
+ x86_klass->kvm_type = sev_snp_kvm_type;
object_class_property_add(oc, "policy", "uint64",
sev_snp_guest_get_policy,
--
2.39.3

@ -0,0 +1,84 @@
From 82a714b79851b5c2d1389d2fa7a01548c486a854 Mon Sep 17 00:00:00 2001
From: Michael Roth <michael.roth@amd.com>
Date: Thu, 30 May 2024 06:16:20 -0500
Subject: [PATCH 060/100] i386/sev: Add a sev_snp_enabled() helper
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [60/91] c35ead095028ccfb1e1be0fe010ca4f7688530a0 (bonzini/rhel-qemu-kvm)
Add a simple helper to check if the current guest type is SNP. Also have
SNP-enabled imply that SEV-ES is enabled as well, and fix up any places
where the sev_es_enabled() check is expecting a pure/non-SNP guest.
Signed-off-by: Michael Roth <michael.roth@amd.com>
Signed-off-by: Pankaj Gupta <pankaj.gupta@amd.com>
Message-ID: <20240530111643.1091816-9-pankaj.gupta@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit 99190f805dca9475fe244fbd8041961842657dc2)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
target/i386/sev.c | 13 ++++++++++++-
target/i386/sev.h | 2 ++
2 files changed, 14 insertions(+), 1 deletion(-)
diff --git a/target/i386/sev.c b/target/i386/sev.c
index a81b3228d4..4edfedc139 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -325,12 +325,21 @@ sev_enabled(void)
return !!object_dynamic_cast(OBJECT(cgs), TYPE_SEV_COMMON);
}
+bool
+sev_snp_enabled(void)
+{
+ ConfidentialGuestSupport *cgs = MACHINE(qdev_get_machine())->cgs;
+
+ return !!object_dynamic_cast(OBJECT(cgs), TYPE_SEV_SNP_GUEST);
+}
+
bool
sev_es_enabled(void)
{
ConfidentialGuestSupport *cgs = MACHINE(qdev_get_machine())->cgs;
- return sev_enabled() && (SEV_GUEST(cgs)->policy & SEV_POLICY_ES);
+ return sev_snp_enabled() ||
+ (sev_enabled() && SEV_GUEST(cgs)->policy & SEV_POLICY_ES);
}
uint32_t
@@ -946,7 +955,9 @@ static int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
"support", __func__);
goto err;
}
+ }
+ if (sev_es_enabled() && !sev_snp_enabled()) {
if (!(status.flags & SEV_STATUS_FLAGS_CONFIG_ES)) {
error_setg(errp, "%s: guest policy requires SEV-ES, but "
"host SEV-ES support unavailable",
diff --git a/target/i386/sev.h b/target/i386/sev.h
index bedc667eeb..94295ee74f 100644
--- a/target/i386/sev.h
+++ b/target/i386/sev.h
@@ -45,9 +45,11 @@ typedef struct SevKernelLoaderContext {
#ifdef CONFIG_SEV
bool sev_enabled(void);
bool sev_es_enabled(void);
+bool sev_snp_enabled(void);
#else
#define sev_enabled() 0
#define sev_es_enabled() 0
+#define sev_snp_enabled() 0
#endif
uint32_t sev_get_cbit_position(void);
--
2.39.3

@ -0,0 +1,187 @@
From 0e435819540b0d39da2c828aacc0f35ecaadbdf6 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Thu, 30 May 2024 06:16:28 -0500
Subject: [PATCH 068/100] i386/sev: Add handling to encrypt/finalize guest
launch data
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [68/91] fe77931d279aa8df061823da88a320fb5f72ffea (bonzini/rhel-qemu-kvm)
Process any queued up launch data and encrypt/measure it into the SNP
guest instance prior to initial guest launch.
This also updates the KVM_SEV_SNP_LAUNCH_UPDATE call to handle partial
update responses.
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Co-developed-by: Michael Roth <michael.roth@amd.com>
Signed-off-by: Michael Roth <michael.roth@amd.com>
Co-developed-by: Pankaj Gupta <pankaj.gupta@amd.com>
Signed-off-by: Pankaj Gupta <pankaj.gupta@amd.com>
Message-ID: <20240530111643.1091816-17-pankaj.gupta@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit 9f3a6999f9730a694d7db448a99f9c9cb6515992)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
target/i386/sev.c | 112 ++++++++++++++++++++++++++++++++++++++-
target/i386/trace-events | 2 +
2 files changed, 113 insertions(+), 1 deletion(-)
diff --git a/target/i386/sev.c b/target/i386/sev.c
index e89b87d2f5..ef2e592ca7 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -756,6 +756,76 @@ out:
return ret;
}
+static const char *
+snp_page_type_to_str(int type)
+{
+ switch (type) {
+ case KVM_SEV_SNP_PAGE_TYPE_NORMAL: return "Normal";
+ case KVM_SEV_SNP_PAGE_TYPE_ZERO: return "Zero";
+ case KVM_SEV_SNP_PAGE_TYPE_UNMEASURED: return "Unmeasured";
+ case KVM_SEV_SNP_PAGE_TYPE_SECRETS: return "Secrets";
+ case KVM_SEV_SNP_PAGE_TYPE_CPUID: return "Cpuid";
+ default: return "unknown";
+ }
+}
+
+static int
+sev_snp_launch_update(SevSnpGuestState *sev_snp_guest,
+ SevLaunchUpdateData *data)
+{
+ int ret, fw_error;
+ struct kvm_sev_snp_launch_update update = {0};
+
+ if (!data->hva || !data->len) {
+ error_report("SNP_LAUNCH_UPDATE called with invalid address"
+ "/ length: %p / %lx",
+ data->hva, data->len);
+ return 1;
+ }
+
+ update.uaddr = (__u64)(unsigned long)data->hva;
+ update.gfn_start = data->gpa >> TARGET_PAGE_BITS;
+ update.len = data->len;
+ update.type = data->type;
+
+ /*
+ * KVM_SEV_SNP_LAUNCH_UPDATE requires that GPA ranges have the private
+ * memory attribute set in advance.
+ */
+ ret = kvm_set_memory_attributes_private(data->gpa, data->len);
+ if (ret) {
+ error_report("SEV-SNP: failed to configure initial"
+ "private guest memory");
+ goto out;
+ }
+
+ while (update.len || ret == -EAGAIN) {
+ trace_kvm_sev_snp_launch_update(update.uaddr, update.gfn_start <<
+ TARGET_PAGE_BITS, update.len,
+ snp_page_type_to_str(update.type));
+
+ ret = sev_ioctl(SEV_COMMON(sev_snp_guest)->sev_fd,
+ KVM_SEV_SNP_LAUNCH_UPDATE,
+ &update, &fw_error);
+ if (ret && ret != -EAGAIN) {
+ error_report("SNP_LAUNCH_UPDATE ret=%d fw_error=%d '%s'",
+ ret, fw_error, fw_error_to_str(fw_error));
+ break;
+ }
+ }
+
+out:
+ if (!ret && update.gfn_start << TARGET_PAGE_BITS != data->gpa + data->len) {
+ error_report("SEV-SNP: expected update of GPA range %lx-%lx,"
+ "got GPA range %lx-%llx",
+ data->gpa, data->gpa + data->len, data->gpa,
+ update.gfn_start << TARGET_PAGE_BITS);
+ ret = -EIO;
+ }
+
+ return ret;
+}
+
static int
sev_launch_update_data(SevGuestState *sev_guest, uint8_t *addr, uint64_t len)
{
@@ -901,6 +971,46 @@ sev_launch_finish(SevCommonState *sev_common)
migrate_add_blocker(&sev_mig_blocker, &error_fatal);
}
+static void
+sev_snp_launch_finish(SevCommonState *sev_common)
+{
+ int ret, error;
+ Error *local_err = NULL;
+ SevLaunchUpdateData *data;
+ SevSnpGuestState *sev_snp = SEV_SNP_GUEST(sev_common);
+ struct kvm_sev_snp_launch_finish *finish = &sev_snp->kvm_finish_conf;
+
+ QTAILQ_FOREACH(data, &launch_update, next) {
+ ret = sev_snp_launch_update(sev_snp, data);
+ if (ret) {
+ exit(1);
+ }
+ }
+
+ trace_kvm_sev_snp_launch_finish(sev_snp->id_block, sev_snp->id_auth,
+ sev_snp->host_data);
+ ret = sev_ioctl(sev_common->sev_fd, KVM_SEV_SNP_LAUNCH_FINISH,
+ finish, &error);
+ if (ret) {
+ error_report("SNP_LAUNCH_FINISH ret=%d fw_error=%d '%s'",
+ ret, error, fw_error_to_str(error));
+ exit(1);
+ }
+
+ sev_set_guest_state(sev_common, SEV_STATE_RUNNING);
+
+ /* add migration blocker */
+ error_setg(&sev_mig_blocker,
+ "SEV-SNP: Migration is not implemented");
+ ret = migrate_add_blocker(&sev_mig_blocker, &local_err);
+ if (local_err) {
+ error_report_err(local_err);
+ error_free(sev_mig_blocker);
+ exit(1);
+ }
+}
+
+
static void
sev_vm_state_change(void *opaque, bool running, RunState state)
{
@@ -1832,10 +1942,10 @@ sev_snp_guest_class_init(ObjectClass *oc, void *data)
X86ConfidentialGuestClass *x86_klass = X86_CONFIDENTIAL_GUEST_CLASS(oc);
klass->launch_start = sev_snp_launch_start;
+ klass->launch_finish = sev_snp_launch_finish;
klass->kvm_init = sev_snp_kvm_init;
x86_klass->kvm_type = sev_snp_kvm_type;
-
object_class_property_add(oc, "policy", "uint64",
sev_snp_guest_get_policy,
sev_snp_guest_set_policy, NULL, NULL);
diff --git a/target/i386/trace-events b/target/i386/trace-events
index cb26d8a925..06b44ead2e 100644
--- a/target/i386/trace-events
+++ b/target/i386/trace-events
@@ -12,3 +12,5 @@ kvm_sev_launch_finish(void) ""
kvm_sev_launch_secret(uint64_t hpa, uint64_t hva, uint64_t secret, int len) "hpa 0x%" PRIx64 " hva 0x%" PRIx64 " data 0x%" PRIx64 " len %d"
kvm_sev_attestation_report(const char *mnonce, const char *data) "mnonce %s data %s"
kvm_sev_snp_launch_start(uint64_t policy, char *gosvw) "policy 0x%" PRIx64 " gosvw %s"
+kvm_sev_snp_launch_update(uint64_t src, uint64_t gpa, uint64_t len, const char *type) "src 0x%" PRIx64 " gpa 0x%" PRIx64 " len 0x%" PRIx64 " (%s page)"
+kvm_sev_snp_launch_finish(char *id_block, char *id_auth, char *host_data) "id_block %s id_auth %s host_data %s"
--
2.39.3

@ -0,0 +1,127 @@
From 2872c423fa44dcbf50b581a5c3feac064a0473a0 Mon Sep 17 00:00:00 2001
From: Michael Roth <michael.roth@amd.com>
Date: Tue, 9 Apr 2024 18:07:41 -0500
Subject: [PATCH 024/100] i386/sev: Add 'legacy-vm-type' parameter for SEV
guest objects
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [24/91] ce35d1b09fe8aa8772ff149543f7760455c1e6b5 (bonzini/rhel-qemu-kvm)
QEMU will currently automatically make use of the KVM_SEV_INIT2 API for
initializing SEV and SEV-ES guests versus the older
KVM_SEV_INIT/KVM_SEV_ES_INIT interfaces.
However, the older interfaces will silently avoid sync'ing FPU/XSAVE
state to the VMSA prior to encryption, thus relying on behavior and
measurements that assume the related fields to be all zero.
With KVM_SEV_INIT2, this state is now synced into the VMSA, resulting in
measurement changes and, theoretically, behavioral changes, though the
latter are unlikely to be seen in practice.
To allow a smooth transition to the newer interface, while still
providing a mechanism to maintain backward compatibility with VMs
created using the older interfaces, provide a new command-line
parameter:
-object sev-guest,legacy-vm-type=true,...
and have it default to false.
Signed-off-by: Michael Roth <michael.roth@amd.com>
Message-ID: <20240409230743.962513-2-michael.roth@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit 023267334da375226720e62963df9545aa8fc2fd)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
qapi/qom.json | 11 ++++++++++-
target/i386/sev.c | 18 +++++++++++++++++-
2 files changed, 27 insertions(+), 2 deletions(-)
diff --git a/qapi/qom.json b/qapi/qom.json
index 85e6b4f84a..38dde6d785 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -898,6 +898,14 @@
# designated guest firmware page for measured boot with -kernel
# (default: false) (since 6.2)
#
+# @legacy-vm-type: Use legacy KVM_SEV_INIT KVM interface for creating the VM.
+# The newer KVM_SEV_INIT2 interface syncs additional vCPU
+# state when initializing the VMSA structures, which will
+# result in a different guest measurement. Set this to
+# maintain compatibility with older QEMU or kernel versions
+# that rely on legacy KVM_SEV_INIT behavior.
+# (default: false) (since 9.1)
+#
# Since: 2.12
##
{ 'struct': 'SevGuestProperties',
@@ -908,7 +916,8 @@
'*handle': 'uint32',
'*cbitpos': 'uint32',
'reduced-phys-bits': 'uint32',
- '*kernel-hashes': 'bool' } }
+ '*kernel-hashes': 'bool',
+ '*legacy-vm-type': 'bool' } }
##
# @ThreadContextProperties:
diff --git a/target/i386/sev.c b/target/i386/sev.c
index 9dab4060b8..f4ee317cb0 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -67,6 +67,7 @@ struct SevGuestState {
uint32_t cbitpos;
uint32_t reduced_phys_bits;
bool kernel_hashes;
+ bool legacy_vm_type;
/* runtime state */
uint32_t handle;
@@ -356,6 +357,16 @@ static void sev_guest_set_kernel_hashes(Object *obj, bool value, Error **errp)
sev->kernel_hashes = value;
}
+static bool sev_guest_get_legacy_vm_type(Object *obj, Error **errp)
+{
+ return SEV_GUEST(obj)->legacy_vm_type;
+}
+
+static void sev_guest_set_legacy_vm_type(Object *obj, bool value, Error **errp)
+{
+ SEV_GUEST(obj)->legacy_vm_type = value;
+}
+
bool
sev_enabled(void)
{
@@ -863,7 +874,7 @@ static int sev_kvm_type(X86ConfidentialGuest *cg)
}
kvm_type = (sev->policy & SEV_POLICY_ES) ? KVM_X86_SEV_ES_VM : KVM_X86_SEV_VM;
- if (kvm_is_vm_type_supported(kvm_type)) {
+ if (kvm_is_vm_type_supported(kvm_type) && !sev->legacy_vm_type) {
sev->kvm_type = kvm_type;
} else {
sev->kvm_type = KVM_X86_DEFAULT_VM;
@@ -1381,6 +1392,11 @@ sev_guest_class_init(ObjectClass *oc, void *data)
sev_guest_set_kernel_hashes);
object_class_property_set_description(oc, "kernel-hashes",
"add kernel hashes to guest firmware for measured Linux boot");
+ object_class_property_add_bool(oc, "legacy-vm-type",
+ sev_guest_get_legacy_vm_type,
+ sev_guest_set_legacy_vm_type);
+ object_class_property_set_description(oc, "legacy-vm-type",
+ "use legacy VM type to maintain measurement compatibility with older QEMU or kernel versions.");
}
static void
--
2.39.3

@ -0,0 +1,203 @@
From a236548a903aa8350fff9601d481b2f529c8d4a7 Mon Sep 17 00:00:00 2001
From: Pankaj Gupta <pankaj.gupta@amd.com>
Date: Thu, 30 May 2024 06:16:21 -0500
Subject: [PATCH 061/100] i386/sev: Add sev_kvm_init() override for SEV class
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [61/91] b24fcbc8712e7394e029312229da023c63803969 (bonzini/rhel-qemu-kvm)
Some aspects of the SEV init routine are specific to SEV and not
applicable to SNP guests, so move the SEV-specific bits into a
separate class method and retain only the common functionality.
Co-developed-by: Michael Roth <michael.roth@amd.com>
Signed-off-by: Michael Roth <michael.roth@amd.com>
Signed-off-by: Pankaj Gupta <pankaj.gupta@amd.com>
Message-ID: <20240530111643.1091816-10-pankaj.gupta@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit 990da8d243a8c59dafcbed78b56a0e4ffb1605d9)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
target/i386/sev.c | 72 +++++++++++++++++++++++++++++++++--------------
1 file changed, 51 insertions(+), 21 deletions(-)
diff --git a/target/i386/sev.c b/target/i386/sev.c
index 4edfedc139..5519de1c6b 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -73,6 +73,7 @@ struct SevCommonStateClass {
/* public */
int (*launch_start)(SevCommonState *sev_common);
void (*launch_finish)(SevCommonState *sev_common);
+ int (*kvm_init)(ConfidentialGuestSupport *cgs, Error **errp);
};
/**
@@ -882,7 +883,7 @@ out:
return sev_common->kvm_type;
}
-static int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
+static int sev_common_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
{
SevCommonState *sev_common = SEV_COMMON(cgs);
char *devname;
@@ -892,12 +893,6 @@ static int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
struct sev_user_data_status status = {};
SevCommonStateClass *klass = SEV_COMMON_GET_CLASS(cgs);
- ret = ram_block_discard_disable(true);
- if (ret) {
- error_report("%s: cannot disable RAM discard", __func__);
- return -1;
- }
-
sev_common->state = SEV_STATE_UNINIT;
host_cpuid(0x8000001F, 0, NULL, &ebx, NULL, NULL);
@@ -911,7 +906,7 @@ static int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
if (host_cbitpos != sev_common->cbitpos) {
error_setg(errp, "%s: cbitpos check failed, host '%d' requested '%d'",
__func__, host_cbitpos, sev_common->cbitpos);
- goto err;
+ return -1;
}
/*
@@ -924,7 +919,7 @@ static int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
error_setg(errp, "%s: reduced_phys_bits check failed,"
" it should be in the range of 1 to 63, requested '%d'",
__func__, sev_common->reduced_phys_bits);
- goto err;
+ return -1;
}
devname = object_property_get_str(OBJECT(sev_common), "sev-device", NULL);
@@ -933,7 +928,7 @@ static int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
error_setg(errp, "%s: Failed to open %s '%s'", __func__,
devname, strerror(errno));
g_free(devname);
- goto err;
+ return -1;
}
g_free(devname);
@@ -943,7 +938,7 @@ static int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
error_setg(errp, "%s: failed to get platform status ret=%d "
"fw_error='%d: %s'", __func__, ret, fw_error,
fw_error_to_str(fw_error));
- goto err;
+ return -1;
}
sev_common->build_id = status.build;
sev_common->api_major = status.api_major;
@@ -953,7 +948,7 @@ static int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
if (!kvm_kernel_irqchip_allowed()) {
error_setg(errp, "%s: SEV-ES guests require in-kernel irqchip"
"support", __func__);
- goto err;
+ return -1;
}
}
@@ -962,7 +957,7 @@ static int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
error_setg(errp, "%s: guest policy requires SEV-ES, but "
"host SEV-ES support unavailable",
__func__);
- goto err;
+ return -1;
}
}
@@ -980,25 +975,59 @@ static int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
if (ret) {
error_setg(errp, "%s: failed to initialize ret=%d fw_error=%d '%s'",
__func__, ret, fw_error, fw_error_to_str(fw_error));
- goto err;
+ return -1;
}
ret = klass->launch_start(sev_common);
if (ret) {
error_setg(errp, "%s: failed to create encryption context", __func__);
- goto err;
+ return -1;
+ }
+
+ if (klass->kvm_init && klass->kvm_init(cgs, errp)) {
+ return -1;
}
- ram_block_notifier_add(&sev_ram_notifier);
- qemu_add_machine_init_done_notifier(&sev_machine_done_notify);
qemu_add_vm_change_state_handler(sev_vm_state_change, sev_common);
cgs->ready = true;
return 0;
-err:
- ram_block_discard_disable(false);
- return -1;
+}
+
+static int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
+{
+ int ret;
+
+ /*
+ * SEV/SEV-ES rely on pinned memory to back guest RAM so discarding
+ * isn't actually possible. With SNP, only guest_memfd pages are used
+ * for private guest memory, so discarding of shared memory is still
+ * possible..
+ */
+ ret = ram_block_discard_disable(true);
+ if (ret) {
+ error_setg(errp, "%s: cannot disable RAM discard", __func__);
+ return -1;
+ }
+
+ /*
+ * SEV uses these notifiers to register/pin pages prior to guest use,
+ * but SNP relies on guest_memfd for private pages, which has its
+ * own internal mechanisms for registering/pinning private memory.
+ */
+ ram_block_notifier_add(&sev_ram_notifier);
+
+ /*
+ * The machine done notify event is used for SEV guests to get the
+ * measurement of the encrypted images. When SEV-SNP is enabled, the
+ * measurement is part of the guest attestation process where it can
+ * be collected without any reliance on the VMM. So skip registering
+ * the notifier for SNP in favor of using guest attestation instead.
+ */
+ qemu_add_machine_init_done_notifier(&sev_machine_done_notify);
+
+ return 0;
}
int
@@ -1397,7 +1426,7 @@ sev_common_class_init(ObjectClass *oc, void *data)
ConfidentialGuestSupportClass *klass = CONFIDENTIAL_GUEST_SUPPORT_CLASS(oc);
X86ConfidentialGuestClass *x86_klass = X86_CONFIDENTIAL_GUEST_CLASS(oc);
- klass->kvm_init = sev_kvm_init;
+ klass->kvm_init = sev_common_kvm_init;
x86_klass->kvm_type = sev_kvm_type;
object_class_property_add_str(oc, "sev-device",
@@ -1486,6 +1515,7 @@ sev_guest_class_init(ObjectClass *oc, void *data)
klass->launch_start = sev_launch_start;
klass->launch_finish = sev_launch_finish;
+ klass->kvm_init = sev_kvm_init;
object_class_property_add_str(oc, "dh-cert-file",
sev_guest_get_dh_cert_file,
--
2.39.3

@ -0,0 +1,94 @@
From 35ceebdeccbf5dceb374c6f89a12e9981def570b Mon Sep 17 00:00:00 2001
From: Pankaj Gupta <pankaj.gupta@amd.com>
Date: Thu, 30 May 2024 06:16:22 -0500
Subject: [PATCH 062/100] i386/sev: Add snp_kvm_init() override for SNP class
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [62/91] 8fa537961c9262b99a4ffb99e1c25f080d76d1de (bonzini/rhel-qemu-kvm)
SNP does not support SMM and requires guest_memfd for
private guest memory, so add SNP specific kvm_init()
functionality in snp_kvm_init() class method.
Signed-off-by: Michael Roth <michael.roth@amd.com>
Co-developed-by: Pankaj Gupta <pankaj.gupta@amd.com>
Signed-off-by: Pankaj Gupta <pankaj.gupta@amd.com>
Message-ID: <20240530111643.1091816-11-pankaj.gupta@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit 125b95a6d465a03ff30816eff0b1889aec01f0c3)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
target/i386/sev.c | 24 +++++++++++++++++++++++-
1 file changed, 23 insertions(+), 1 deletion(-)
diff --git a/target/i386/sev.c b/target/i386/sev.c
index 5519de1c6b..6525b3c1a0 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -885,12 +885,12 @@ out:
static int sev_common_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
{
- SevCommonState *sev_common = SEV_COMMON(cgs);
char *devname;
int ret, fw_error, cmd;
uint32_t ebx;
uint32_t host_cbitpos;
struct sev_user_data_status status = {};
+ SevCommonState *sev_common = SEV_COMMON(cgs);
SevCommonStateClass *klass = SEV_COMMON_GET_CLASS(cgs);
sev_common->state = SEV_STATE_UNINIT;
@@ -1030,6 +1030,21 @@ static int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
return 0;
}
+static int sev_snp_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
+{
+ MachineState *ms = MACHINE(qdev_get_machine());
+ X86MachineState *x86ms = X86_MACHINE(ms);
+
+ if (x86ms->smm == ON_OFF_AUTO_AUTO) {
+ x86ms->smm = ON_OFF_AUTO_OFF;
+ } else if (x86ms->smm == ON_OFF_AUTO_ON) {
+ error_setg(errp, "SEV-SNP does not support SMM.");
+ return -1;
+ }
+
+ return 0;
+}
+
int
sev_encrypt_flash(uint8_t *ptr, uint64_t len, Error **errp)
{
@@ -1752,6 +1767,10 @@ sev_snp_guest_set_host_data(Object *obj, const char *value, Error **errp)
static void
sev_snp_guest_class_init(ObjectClass *oc, void *data)
{
+ SevCommonStateClass *klass = SEV_COMMON_CLASS(oc);
+
+ klass->kvm_init = sev_snp_kvm_init;
+
object_class_property_add(oc, "policy", "uint64",
sev_snp_guest_get_policy,
sev_snp_guest_set_policy, NULL, NULL);
@@ -1778,8 +1797,11 @@ sev_snp_guest_class_init(ObjectClass *oc, void *data)
static void
sev_snp_guest_instance_init(Object *obj)
{
+ ConfidentialGuestSupport *cgs = CONFIDENTIAL_GUEST_SUPPORT(obj);
SevSnpGuestState *sev_snp_guest = SEV_SNP_GUEST(obj);
+ cgs->require_guest_memfd = true;
+
/* default init/start/finish params for kvm */
sev_snp_guest->kvm_start_conf.policy = DEFAULT_SEV_SNP_POLICY;
}
--
2.39.3

@ -0,0 +1,262 @@
From 4013364679757161d6b9754bfc33ae38be0a1b7f Mon Sep 17 00:00:00 2001
From: Michael Roth <michael.roth@amd.com>
Date: Thu, 30 May 2024 06:16:32 -0500
Subject: [PATCH 072/100] i386/sev: Add support for SNP CPUID validation
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [72/91] 080e2942552dc7de8966e69d0d0d3b8951392030 (bonzini/rhel-qemu-kvm)
SEV-SNP firmware allows a special guest page to be populated with a
table of guest CPUID values so that they can be validated through
firmware before being loaded into encrypted guest memory where they can
be used in place of hypervisor-provided values[1].
As part of SEV-SNP guest initialization, use this interface to validate
the CPUID entries reported by KVM_GET_CPUID2 prior to initial guest
start and populate the CPUID page reserved by OVMF with the resulting
encrypted data.
[1] SEV SNP Firmware ABI Specification, Rev. 0.8, 8.13.2.6
Signed-off-by: Michael Roth <michael.roth@amd.com>
Signed-off-by: Pankaj Gupta <pankaj.gupta@amd.com>
Message-ID: <20240530111643.1091816-21-pankaj.gupta@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit 70943ad8e4dfbe5f77006b880290219be9d03553)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
target/i386/sev.c | 164 +++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 162 insertions(+), 2 deletions(-)
diff --git a/target/i386/sev.c b/target/i386/sev.c
index c57534fca2..06401f0526 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -200,6 +200,36 @@ static const char *const sev_fw_errlist[] = {
#define SEV_FW_MAX_ERROR ARRAY_SIZE(sev_fw_errlist)
+/* <linux/kvm.h> doesn't expose this, so re-use the max from kvm.c */
+#define KVM_MAX_CPUID_ENTRIES 100
+
+typedef struct KvmCpuidInfo {
+ struct kvm_cpuid2 cpuid;
+ struct kvm_cpuid_entry2 entries[KVM_MAX_CPUID_ENTRIES];
+} KvmCpuidInfo;
+
+#define SNP_CPUID_FUNCTION_MAXCOUNT 64
+#define SNP_CPUID_FUNCTION_UNKNOWN 0xFFFFFFFF
+
+typedef struct {
+ uint32_t eax_in;
+ uint32_t ecx_in;
+ uint64_t xcr0_in;
+ uint64_t xss_in;
+ uint32_t eax;
+ uint32_t ebx;
+ uint32_t ecx;
+ uint32_t edx;
+ uint64_t reserved;
+} __attribute__((packed)) SnpCpuidFunc;
+
+typedef struct {
+ uint32_t count;
+ uint32_t reserved1;
+ uint64_t reserved2;
+ SnpCpuidFunc entries[SNP_CPUID_FUNCTION_MAXCOUNT];
+} __attribute__((packed)) SnpCpuidInfo;
+
static int
sev_ioctl(int fd, int cmd, void *data, int *error)
{
@@ -788,6 +818,35 @@ out:
return ret;
}
+static void
+sev_snp_cpuid_report_mismatches(SnpCpuidInfo *old,
+ SnpCpuidInfo *new)
+{
+ size_t i;
+
+ if (old->count != new->count) {
+ error_report("SEV-SNP: CPUID validation failed due to count mismatch,"
+ "provided: %d, expected: %d", old->count, new->count);
+ return;
+ }
+
+ for (i = 0; i < old->count; i++) {
+ SnpCpuidFunc *old_func, *new_func;
+
+ old_func = &old->entries[i];
+ new_func = &new->entries[i];
+
+ if (memcmp(old_func, new_func, sizeof(SnpCpuidFunc))) {
+ error_report("SEV-SNP: CPUID validation failed for function 0x%x, index: 0x%x"
+ "provided: eax:0x%08x, ebx: 0x%08x, ecx: 0x%08x, edx: 0x%08x"
+ "expected: eax:0x%08x, ebx: 0x%08x, ecx: 0x%08x, edx: 0x%08x",
+ old_func->eax_in, old_func->ecx_in,
+ old_func->eax, old_func->ebx, old_func->ecx, old_func->edx,
+ new_func->eax, new_func->ebx, new_func->ecx, new_func->edx);
+ }
+ }
+}
+
static const char *
snp_page_type_to_str(int type)
{
@@ -806,6 +865,7 @@ sev_snp_launch_update(SevSnpGuestState *sev_snp_guest,
SevLaunchUpdateData *data)
{
int ret, fw_error;
+ SnpCpuidInfo snp_cpuid_info;
struct kvm_sev_snp_launch_update update = {0};
if (!data->hva || !data->len) {
@@ -815,6 +875,11 @@ sev_snp_launch_update(SevSnpGuestState *sev_snp_guest,
return 1;
}
+ if (data->type == KVM_SEV_SNP_PAGE_TYPE_CPUID) {
+ /* Save a copy for comparison in case the LAUNCH_UPDATE fails */
+ memcpy(&snp_cpuid_info, data->hva, sizeof(snp_cpuid_info));
+ }
+
update.uaddr = (__u64)(unsigned long)data->hva;
update.gfn_start = data->gpa >> TARGET_PAGE_BITS;
update.len = data->len;
@@ -842,6 +907,11 @@ sev_snp_launch_update(SevSnpGuestState *sev_snp_guest,
if (ret && ret != -EAGAIN) {
error_report("SNP_LAUNCH_UPDATE ret=%d fw_error=%d '%s'",
ret, fw_error, fw_error_to_str(fw_error));
+
+ if (data->type == KVM_SEV_SNP_PAGE_TYPE_CPUID) {
+ sev_snp_cpuid_report_mismatches(&snp_cpuid_info, data->hva);
+ error_report("SEV-SNP: failed update CPUID page");
+ }
break;
}
}
@@ -1004,7 +1074,8 @@ sev_launch_finish(SevCommonState *sev_common)
}
static int
-snp_launch_update_data(uint64_t gpa, void *hva, uint32_t len, int type)
+snp_launch_update_data(uint64_t gpa, void *hva,
+ uint32_t len, int type)
{
SevLaunchUpdateData *data;
@@ -1019,6 +1090,90 @@ snp_launch_update_data(uint64_t gpa, void *hva, uint32_t len, int type)
return 0;
}
+static int
+sev_snp_cpuid_info_fill(SnpCpuidInfo *snp_cpuid_info,
+ const KvmCpuidInfo *kvm_cpuid_info)
+{
+ size_t i;
+
+ if (kvm_cpuid_info->cpuid.nent > SNP_CPUID_FUNCTION_MAXCOUNT) {
+ error_report("SEV-SNP: CPUID entry count (%d) exceeds max (%d)",
+ kvm_cpuid_info->cpuid.nent, SNP_CPUID_FUNCTION_MAXCOUNT);
+ return -1;
+ }
+
+ memset(snp_cpuid_info, 0, sizeof(*snp_cpuid_info));
+
+ for (i = 0; i < kvm_cpuid_info->cpuid.nent; i++) {
+ const struct kvm_cpuid_entry2 *kvm_cpuid_entry;
+ SnpCpuidFunc *snp_cpuid_entry;
+
+ kvm_cpuid_entry = &kvm_cpuid_info->entries[i];
+ snp_cpuid_entry = &snp_cpuid_info->entries[i];
+
+ snp_cpuid_entry->eax_in = kvm_cpuid_entry->function;
+ if (kvm_cpuid_entry->flags == KVM_CPUID_FLAG_SIGNIFCANT_INDEX) {
+ snp_cpuid_entry->ecx_in = kvm_cpuid_entry->index;
+ }
+ snp_cpuid_entry->eax = kvm_cpuid_entry->eax;
+ snp_cpuid_entry->ebx = kvm_cpuid_entry->ebx;
+ snp_cpuid_entry->ecx = kvm_cpuid_entry->ecx;
+ snp_cpuid_entry->edx = kvm_cpuid_entry->edx;
+
+ /*
+ * Guest kernels will calculate EBX themselves using the 0xD
+ * subfunctions corresponding to the individual XSAVE areas, so only
+ * encode the base XSAVE size in the initial leaves, corresponding
+ * to the initial XCR0=1 state.
+ */
+ if (snp_cpuid_entry->eax_in == 0xD &&
+ (snp_cpuid_entry->ecx_in == 0x0 || snp_cpuid_entry->ecx_in == 0x1)) {
+ snp_cpuid_entry->ebx = 0x240;
+ snp_cpuid_entry->xcr0_in = 1;
+ snp_cpuid_entry->xss_in = 0;
+ }
+ }
+
+ snp_cpuid_info->count = i;
+
+ return 0;
+}
+
+static int
+snp_launch_update_cpuid(uint32_t cpuid_addr, void *hva, uint32_t cpuid_len)
+{
+ KvmCpuidInfo kvm_cpuid_info = {0};
+ SnpCpuidInfo snp_cpuid_info;
+ CPUState *cs = first_cpu;
+ int ret;
+ uint32_t i = 0;
+
+ assert(sizeof(snp_cpuid_info) <= cpuid_len);
+
+ /* get the cpuid list from KVM */
+ do {
+ kvm_cpuid_info.cpuid.nent = ++i;
+ ret = kvm_vcpu_ioctl(cs, KVM_GET_CPUID2, &kvm_cpuid_info);
+ } while (ret == -E2BIG);
+
+ if (ret) {
+ error_report("SEV-SNP: unable to query CPUID values for CPU: '%s'",
+ strerror(-ret));
+ return 1;
+ }
+
+ ret = sev_snp_cpuid_info_fill(&snp_cpuid_info, &kvm_cpuid_info);
+ if (ret) {
+ error_report("SEV-SNP: failed to generate CPUID table information");
+ return 1;
+ }
+
+ memcpy(hva, &snp_cpuid_info, sizeof(snp_cpuid_info));
+
+ return snp_launch_update_data(cpuid_addr, hva, cpuid_len,
+ KVM_SEV_SNP_PAGE_TYPE_CPUID);
+}
+
static int
snp_metadata_desc_to_page_type(int desc_type)
{
@@ -1053,7 +1208,12 @@ snp_populate_metadata_pages(SevSnpGuestState *sev_snp,
exit(1);
}
- ret = snp_launch_update_data(desc->base, hva, desc->len, type);
+ if (type == KVM_SEV_SNP_PAGE_TYPE_CPUID) {
+ ret = snp_launch_update_cpuid(desc->base, hva, desc->len);
+ } else {
+ ret = snp_launch_update_data(desc->base, hva, desc->len, type);
+ }
+
if (ret) {
error_report("%s: Failed to add metadata page gpa 0x%x+%x type %d",
__func__, desc->base, desc->len, desc->type);
--
2.39.3

@ -0,0 +1,127 @@
From b2cfd4d89026e76ba86ea7adea323f2c3a588790 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Thu, 30 May 2024 06:16:31 -0500
Subject: [PATCH 071/100] i386/sev: Add support for populating OVMF metadata
pages
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [71/91] b563442c0e2f6ea01937425d300b56d9e641fd57 (bonzini/rhel-qemu-kvm)
OVMF reserves various pages so they can be pre-initialized/validated
prior to launching the guest. Add support for populating these pages
with the expected content.
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Signed-off-by: Michael Roth <michael.roth@amd.com>
Co-developed-by: Pankaj Gupta <pankaj.gupta@amd.com>
Signed-off-by: Pankaj Gupta <pankaj.gupta@amd.com>
Message-ID: <20240530111643.1091816-20-pankaj.gupta@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit 3d8c2a7f4806ff39423312e503737fd76c34dcae)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
target/i386/sev.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 74 insertions(+)
diff --git a/target/i386/sev.c b/target/i386/sev.c
index 17281bb2c7..c57534fca2 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -1003,15 +1003,89 @@ sev_launch_finish(SevCommonState *sev_common)
migrate_add_blocker(&sev_mig_blocker, &error_fatal);
}
+static int
+snp_launch_update_data(uint64_t gpa, void *hva, uint32_t len, int type)
+{
+ SevLaunchUpdateData *data;
+
+ data = g_new0(SevLaunchUpdateData, 1);
+ data->gpa = gpa;
+ data->hva = hva;
+ data->len = len;
+ data->type = type;
+
+ QTAILQ_INSERT_TAIL(&launch_update, data, next);
+
+ return 0;
+}
+
+static int
+snp_metadata_desc_to_page_type(int desc_type)
+{
+ switch (desc_type) {
+ /* Add the umeasured prevalidated pages as a zero page */
+ case SEV_DESC_TYPE_SNP_SEC_MEM: return KVM_SEV_SNP_PAGE_TYPE_ZERO;
+ case SEV_DESC_TYPE_SNP_SECRETS: return KVM_SEV_SNP_PAGE_TYPE_SECRETS;
+ case SEV_DESC_TYPE_CPUID: return KVM_SEV_SNP_PAGE_TYPE_CPUID;
+ default:
+ return KVM_SEV_SNP_PAGE_TYPE_ZERO;
+ }
+}
+
+static void
+snp_populate_metadata_pages(SevSnpGuestState *sev_snp,
+ OvmfSevMetadata *metadata)
+{
+ OvmfSevMetadataDesc *desc;
+ int type, ret, i;
+ void *hva;
+ MemoryRegion *mr = NULL;
+
+ for (i = 0; i < metadata->num_desc; i++) {
+ desc = &metadata->descs[i];
+
+ type = snp_metadata_desc_to_page_type(desc->type);
+
+ hva = gpa2hva(&mr, desc->base, desc->len, NULL);
+ if (!hva) {
+ error_report("%s: Failed to get HVA for GPA 0x%x sz 0x%x",
+ __func__, desc->base, desc->len);
+ exit(1);
+ }
+
+ ret = snp_launch_update_data(desc->base, hva, desc->len, type);
+ if (ret) {
+ error_report("%s: Failed to add metadata page gpa 0x%x+%x type %d",
+ __func__, desc->base, desc->len, desc->type);
+ exit(1);
+ }
+ }
+}
+
static void
sev_snp_launch_finish(SevCommonState *sev_common)
{
int ret, error;
Error *local_err = NULL;
+ OvmfSevMetadata *metadata;
SevLaunchUpdateData *data;
SevSnpGuestState *sev_snp = SEV_SNP_GUEST(sev_common);
struct kvm_sev_snp_launch_finish *finish = &sev_snp->kvm_finish_conf;
+ /*
+ * To boot the SNP guest, the hypervisor is required to populate the CPUID
+ * and Secrets page before finalizing the launch flow. The location of
+ * the secrets and CPUID page is available through the OVMF metadata GUID.
+ */
+ metadata = pc_system_get_ovmf_sev_metadata_ptr();
+ if (metadata == NULL) {
+ error_report("%s: Failed to locate SEV metadata header", __func__);
+ exit(1);
+ }
+
+ /* Populate all the metadata pages */
+ snp_populate_metadata_pages(sev_snp, metadata);
+
QTAILQ_FOREACH(data, &launch_update, next) {
ret = sev_snp_launch_update(sev_snp, data);
if (ret) {
--
2.39.3

@ -0,0 +1,122 @@
From 0f7432f2b968298b64fd243df793b176f67a538f Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Thu, 30 May 2024 06:16:27 -0500
Subject: [PATCH 067/100] i386/sev: Add the SNP launch start context
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [67/91] 63759a25a413a7a9a7274fb4c3b8bc2528634855 (bonzini/rhel-qemu-kvm)
The SNP_LAUNCH_START is called first to create a cryptographic launch
context within the firmware.
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Signed-off-by: Michael Roth <michael.roth@amd.com>
Co-developed-by: Pankaj Gupta <pankaj.gupta@amd.com>
Signed-off-by: Pankaj Gupta <pankaj.gupta@amd.com>
Message-ID: <20240530111643.1091816-16-pankaj.gupta@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit d3107f882ec22cfb211eab7efa0c4e95f5ce11bb)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
target/i386/sev.c | 39 +++++++++++++++++++++++++++++++++++++++
target/i386/trace-events | 1 +
2 files changed, 40 insertions(+)
diff --git a/target/i386/sev.c b/target/i386/sev.c
index 43d1c48bd9..e89b87d2f5 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -39,6 +39,7 @@
#include "confidential-guest.h"
#include "hw/i386/pc.h"
#include "exec/address-spaces.h"
+#include "qemu/queue.h"
OBJECT_DECLARE_TYPE(SevCommonState, SevCommonStateClass, SEV_COMMON)
OBJECT_DECLARE_TYPE(SevGuestState, SevCommonStateClass, SEV_GUEST)
@@ -115,6 +116,16 @@ struct SevSnpGuestState {
#define DEFAULT_SEV_DEVICE "/dev/sev"
#define DEFAULT_SEV_SNP_POLICY 0x30000
+typedef struct SevLaunchUpdateData {
+ QTAILQ_ENTRY(SevLaunchUpdateData) next;
+ hwaddr gpa;
+ void *hva;
+ uint64_t len;
+ int type;
+} SevLaunchUpdateData;
+
+static QTAILQ_HEAD(, SevLaunchUpdateData) launch_update;
+
#define SEV_INFO_BLOCK_GUID "00f771de-1a7e-4fcb-890e-68c77e2fb44e"
typedef struct __attribute__((__packed__)) SevInfoBlock {
/* SEV-ES Reset Vector Address */
@@ -674,6 +685,31 @@ sev_read_file_base64(const char *filename, guchar **data, gsize *len)
return 0;
}
+static int
+sev_snp_launch_start(SevCommonState *sev_common)
+{
+ int fw_error, rc;
+ SevSnpGuestState *sev_snp_guest = SEV_SNP_GUEST(sev_common);
+ struct kvm_sev_snp_launch_start *start = &sev_snp_guest->kvm_start_conf;
+
+ trace_kvm_sev_snp_launch_start(start->policy,
+ sev_snp_guest->guest_visible_workarounds);
+
+ rc = sev_ioctl(sev_common->sev_fd, KVM_SEV_SNP_LAUNCH_START,
+ start, &fw_error);
+ if (rc < 0) {
+ error_report("%s: SNP_LAUNCH_START ret=%d fw_error=%d '%s'",
+ __func__, rc, fw_error, fw_error_to_str(fw_error));
+ return 1;
+ }
+
+ QTAILQ_INIT(&launch_update);
+
+ sev_set_guest_state(sev_common, SEV_STATE_LAUNCH_UPDATE);
+
+ return 0;
+}
+
static int
sev_launch_start(SevCommonState *sev_common)
{
@@ -1003,6 +1039,7 @@ static int sev_common_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
}
ret = klass->launch_start(sev_common);
+
if (ret) {
error_setg(errp, "%s: failed to create encryption context", __func__);
return -1;
@@ -1794,9 +1831,11 @@ sev_snp_guest_class_init(ObjectClass *oc, void *data)
SevCommonStateClass *klass = SEV_COMMON_CLASS(oc);
X86ConfidentialGuestClass *x86_klass = X86_CONFIDENTIAL_GUEST_CLASS(oc);
+ klass->launch_start = sev_snp_launch_start;
klass->kvm_init = sev_snp_kvm_init;
x86_klass->kvm_type = sev_snp_kvm_type;
+
object_class_property_add(oc, "policy", "uint64",
sev_snp_guest_get_policy,
sev_snp_guest_set_policy, NULL, NULL);
diff --git a/target/i386/trace-events b/target/i386/trace-events
index 2cd8726eeb..cb26d8a925 100644
--- a/target/i386/trace-events
+++ b/target/i386/trace-events
@@ -11,3 +11,4 @@ kvm_sev_launch_measurement(const char *value) "data %s"
kvm_sev_launch_finish(void) ""
kvm_sev_launch_secret(uint64_t hpa, uint64_t hva, uint64_t secret, int len) "hpa 0x%" PRIx64 " hva 0x%" PRIx64 " data 0x%" PRIx64 " len %d"
kvm_sev_attestation_report(const char *mnonce, const char *data) "mnonce %s data %s"
+kvm_sev_snp_launch_start(uint64_t policy, char *gosvw) "policy 0x%" PRIx64 " gosvw %s"
--
2.39.3

@ -0,0 +1,237 @@
From ec786a1ec0a76775e980862d77500f5196a937e3 Mon Sep 17 00:00:00 2001
From: Dov Murik <dovmurik@linux.ibm.com>
Date: Thu, 30 May 2024 06:16:35 -0500
Subject: [PATCH 080/100] i386/sev: Allow measured direct kernel boot on SNP
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [80/91] 11c629862519c1a279566febf5a537c63c5fcf61 (bonzini/rhel-qemu-kvm)
In SNP, the hashes page is designated by a specific metadata entry
published in AmdSev OVMF.
Therefore, if the user enabled kernel hashes (for measured direct boot),
QEMU should prepare the content of the hashes table and, during the
processing of the metadata entry, copy that content into the designated
page and encrypt it.
Note that in SNP (unlike SEV and SEV-ES) the measurement is done in
whole 4KB pages. Therefore QEMU zeros the whole page that includes the
hashes table, and fills in the kernel hashes area in that page, and then
encrypts the whole page. The rest of the page is reserved for SEV
launch secrets which are not usable anyway on SNP.
If the user disabled kernel hashes, QEMU pre-validates the kernel hashes
page as a zero page.
Signed-off-by: Dov Murik <dovmurik@linux.ibm.com>
Signed-off-by: Michael Roth <michael.roth@amd.com>
Signed-off-by: Pankaj Gupta <pankaj.gupta@amd.com>
Message-ID: <20240530111643.1091816-24-pankaj.gupta@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit c1996992cc882b00139f78067d6a64e2ec9cb0d8)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
include/hw/i386/pc.h | 2 +
target/i386/sev.c | 111 ++++++++++++++++++++++++++++++++-----------
2 files changed, 85 insertions(+), 28 deletions(-)
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 94b49310f5..ee3bfb7be9 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -175,6 +175,8 @@ typedef enum {
SEV_DESC_TYPE_SNP_SECRETS,
/* The section contains address that can be used as a CPUID page */
SEV_DESC_TYPE_CPUID,
+ /* The section contains the region for kernel hashes for measured direct boot */
+ SEV_DESC_TYPE_SNP_KERNEL_HASHES = 0x10,
} ovmf_sev_metadata_desc_type;
diff --git a/target/i386/sev.c b/target/i386/sev.c
index 3fce4c08eb..004c667ac1 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -115,6 +115,10 @@ struct SevCommonStateClass {
X86ConfidentialGuestClass parent_class;
/* public */
+ bool (*build_kernel_loader_hashes)(SevCommonState *sev_common,
+ SevHashTableDescriptor *area,
+ SevKernelLoaderContext *ctx,
+ Error **errp);
int (*launch_start)(SevCommonState *sev_common);
void (*launch_finish)(SevCommonState *sev_common);
int (*launch_update_data)(SevCommonState *sev_common, hwaddr gpa, uint8_t *ptr, uint64_t len);
@@ -154,6 +158,9 @@ struct SevSnpGuestState {
struct kvm_sev_snp_launch_start kvm_start_conf;
struct kvm_sev_snp_launch_finish kvm_finish_conf;
+
+ uint32_t kernel_hashes_offset;
+ PaddedSevHashTable *kernel_hashes_data;
};
#define DEFAULT_GUEST_POLICY 0x1 /* disable debug */
@@ -1189,6 +1196,23 @@ snp_launch_update_cpuid(uint32_t cpuid_addr, void *hva, uint32_t cpuid_len)
KVM_SEV_SNP_PAGE_TYPE_CPUID);
}
+static int
+snp_launch_update_kernel_hashes(SevSnpGuestState *sev_snp, uint32_t addr,
+ void *hva, uint32_t len)
+{
+ int type = KVM_SEV_SNP_PAGE_TYPE_ZERO;
+ if (sev_snp->parent_obj.kernel_hashes) {
+ assert(sev_snp->kernel_hashes_data);
+ assert((sev_snp->kernel_hashes_offset +
+ sizeof(*sev_snp->kernel_hashes_data)) <= len);
+ memset(hva, 0, len);
+ memcpy(hva + sev_snp->kernel_hashes_offset, sev_snp->kernel_hashes_data,
+ sizeof(*sev_snp->kernel_hashes_data));
+ type = KVM_SEV_SNP_PAGE_TYPE_NORMAL;
+ }
+ return snp_launch_update_data(addr, hva, len, type);
+}
+
static int
snp_metadata_desc_to_page_type(int desc_type)
{
@@ -1225,6 +1249,9 @@ snp_populate_metadata_pages(SevSnpGuestState *sev_snp,
if (type == KVM_SEV_SNP_PAGE_TYPE_CPUID) {
ret = snp_launch_update_cpuid(desc->base, hva, desc->len);
+ } else if (desc->type == SEV_DESC_TYPE_SNP_KERNEL_HASHES) {
+ ret = snp_launch_update_kernel_hashes(sev_snp, desc->base, hva,
+ desc->len);
} else {
ret = snp_launch_update_data(desc->base, hva, desc->len, type);
}
@@ -1823,6 +1850,58 @@ static bool build_kernel_loader_hashes(PaddedSevHashTable *padded_ht,
return true;
}
+static bool sev_snp_build_kernel_loader_hashes(SevCommonState *sev_common,
+ SevHashTableDescriptor *area,
+ SevKernelLoaderContext *ctx,
+ Error **errp)
+{
+ /*
+ * SNP: Populate the hashes table in an area that later in
+ * snp_launch_update_kernel_hashes() will be copied to the guest memory
+ * and encrypted.
+ */
+ SevSnpGuestState *sev_snp_guest = SEV_SNP_GUEST(sev_common);
+ sev_snp_guest->kernel_hashes_offset = area->base & ~TARGET_PAGE_MASK;
+ sev_snp_guest->kernel_hashes_data = g_new0(PaddedSevHashTable, 1);
+ return build_kernel_loader_hashes(sev_snp_guest->kernel_hashes_data, ctx, errp);
+}
+
+static bool sev_build_kernel_loader_hashes(SevCommonState *sev_common,
+ SevHashTableDescriptor *area,
+ SevKernelLoaderContext *ctx,
+ Error **errp)
+{
+ PaddedSevHashTable *padded_ht;
+ hwaddr mapped_len = sizeof(*padded_ht);
+ MemTxAttrs attrs = { 0 };
+ bool ret = true;
+
+ /*
+ * Populate the hashes table in the guest's memory at the OVMF-designated
+ * area for the SEV hashes table
+ */
+ padded_ht = address_space_map(&address_space_memory, area->base,
+ &mapped_len, true, attrs);
+ if (!padded_ht || mapped_len != sizeof(*padded_ht)) {
+ error_setg(errp, "SEV: cannot map hashes table guest memory area");
+ return false;
+ }
+
+ if (build_kernel_loader_hashes(padded_ht, ctx, errp)) {
+ if (sev_encrypt_flash(area->base, (uint8_t *)padded_ht,
+ sizeof(*padded_ht), errp) < 0) {
+ ret = false;
+ }
+ } else {
+ ret = false;
+ }
+
+ address_space_unmap(&address_space_memory, padded_ht,
+ mapped_len, true, mapped_len);
+
+ return ret;
+}
+
/*
* Add the hashes of the linux kernel/initrd/cmdline to an encrypted guest page
* which is included in SEV's initial memory measurement.
@@ -1831,11 +1910,8 @@ bool sev_add_kernel_loader_hashes(SevKernelLoaderContext *ctx, Error **errp)
{
uint8_t *data;
SevHashTableDescriptor *area;
- PaddedSevHashTable *padded_ht;
- hwaddr mapped_len = sizeof(*padded_ht);
- MemTxAttrs attrs = { 0 };
- bool ret = true;
SevCommonState *sev_common = SEV_COMMON(MACHINE(qdev_get_machine())->cgs);
+ SevCommonStateClass *klass = SEV_COMMON_GET_CLASS(sev_common);
/*
* Only add the kernel hashes if the sev-guest configuration explicitly
@@ -1858,30 +1934,7 @@ bool sev_add_kernel_loader_hashes(SevKernelLoaderContext *ctx, Error **errp)
return false;
}
- /*
- * Populate the hashes table in the guest's memory at the OVMF-designated
- * area for the SEV hashes table
- */
- padded_ht = address_space_map(&address_space_memory, area->base,
- &mapped_len, true, attrs);
- if (!padded_ht || mapped_len != sizeof(*padded_ht)) {
- error_setg(errp, "SEV: cannot map hashes table guest memory area");
- return false;
- }
-
- if (build_kernel_loader_hashes(padded_ht, ctx, errp)) {
- if (sev_encrypt_flash(area->base, (uint8_t *)padded_ht,
- sizeof(*padded_ht), errp) < 0) {
- ret = false;
- }
- } else {
- ret = false;
- }
-
- address_space_unmap(&address_space_memory, padded_ht,
- mapped_len, true, mapped_len);
-
- return ret;
+ return klass->build_kernel_loader_hashes(sev_common, area, ctx, errp);
}
static char *
@@ -1998,6 +2051,7 @@ sev_guest_class_init(ObjectClass *oc, void *data)
SevCommonStateClass *klass = SEV_COMMON_CLASS(oc);
X86ConfidentialGuestClass *x86_klass = X86_CONFIDENTIAL_GUEST_CLASS(oc);
+ klass->build_kernel_loader_hashes = sev_build_kernel_loader_hashes;
klass->launch_start = sev_launch_start;
klass->launch_finish = sev_launch_finish;
klass->launch_update_data = sev_launch_update_data;
@@ -2242,6 +2296,7 @@ sev_snp_guest_class_init(ObjectClass *oc, void *data)
SevCommonStateClass *klass = SEV_COMMON_CLASS(oc);
X86ConfidentialGuestClass *x86_klass = X86_CONFIDENTIAL_GUEST_CLASS(oc);
+ klass->build_kernel_loader_hashes = sev_snp_build_kernel_loader_hashes;
klass->launch_start = sev_snp_launch_start;
klass->launch_finish = sev_snp_launch_finish;
klass->launch_update_data = sev_snp_launch_update_data;
--
2.39.3

@ -0,0 +1,268 @@
From ab6197309551bd6ddd9f8239191f68dfac23684b Mon Sep 17 00:00:00 2001
From: Michael Roth <michael.roth@amd.com>
Date: Tue, 9 Jul 2024 23:10:05 -0500
Subject: [PATCH 090/100] i386/sev: Don't allow automatic fallback to legacy
KVM_SEV*_INIT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [90/91] 2b1345faa56f993bb6e13d63e11656c784e20412 (bonzini/rhel-qemu-kvm)
Currently if the 'legacy-vm-type' property of the sev-guest object is
'off', QEMU will attempt to use the newer KVM_SEV_INIT2 kernel
interface in conjunction with the newer KVM_X86_SEV_VM and
KVM_X86_SEV_ES_VM KVM VM types.
This can lead to measurement changes if, for instance, an SEV guest was
created on a host that originally had an older kernel that didn't
support KVM_SEV_INIT2, but is booted on the same host later on after the
host kernel was upgraded.
Instead, if legacy-vm-type is 'off', QEMU should fail if the
KVM_SEV_INIT2 interface is not provided by the current host kernel.
Modify the fallback handling accordingly.
In the future, VMSA features and other flags might be added to QEMU
which will require legacy-vm-type to be 'off' because they will rely
on the newer KVM_SEV_INIT2 interface. It may be difficult to convey to
users what values of legacy-vm-type are compatible with which
features/options, so as part of this rework, switch legacy-vm-type to a
tri-state OnOffAuto option. 'auto' in this case will automatically
switch to using the newer KVM_SEV_INIT2, but only if it is required to
make use of new VMSA features or other options only available via
KVM_SEV_INIT2.
Defining 'auto' in this way would avoid inadvertently breaking
compatibility with older kernels since it would only be used in cases
where users opt into newer features that are only available via
KVM_SEV_INIT2 and newer kernels, and provide better default behavior
than the legacy-vm-type=off behavior that was previously in place, so
make it the default for 9.1+ machine types.
Cc: Daniel P. Berrangé <berrange@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
cc: kvm@vger.kernel.org
Signed-off-by: Michael Roth <michael.roth@amd.com>
Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
Link: https://lore.kernel.org/r/20240710041005.83720-1-michael.roth@amd.com
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit 9d38d9dca2a81aaf5752d45d221021ef96d496cd)
RHEL: adjust compatibility setting, applying it to 9.4 machine type
---
hw/i386/pc.c | 2 +-
qapi/qom.json | 18 ++++++----
target/i386/sev.c | 85 +++++++++++++++++++++++++++++++++++++++--------
3 files changed, 83 insertions(+), 22 deletions(-)
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index b25d075b59..e9c5ea5d8f 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -352,7 +352,7 @@ const size_t pc_rhel_compat_len = G_N_ELEMENTS(pc_rhel_compat);
GlobalProperty pc_rhel_9_5_compat[] = {
/* pc_rhel_9_5_compat from pc_compat_pc_9_0 (backported from 9.1) */
{ TYPE_X86_CPU, "guest-phys-bits", "0" },
- { "sev-guest", "legacy-vm-type", "true" },
+ { "sev-guest", "legacy-vm-type", "on" },
};
const size_t pc_rhel_9_5_compat_len = G_N_ELEMENTS(pc_rhel_9_5_compat);
diff --git a/qapi/qom.json b/qapi/qom.json
index 8bd299265e..17bd5a0cf7 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -912,12 +912,16 @@
# @handle: SEV firmware handle (default: 0)
#
# @legacy-vm-type: Use legacy KVM_SEV_INIT KVM interface for creating the VM.
-# The newer KVM_SEV_INIT2 interface syncs additional vCPU
-# state when initializing the VMSA structures, which will
-# result in a different guest measurement. Set this to
-# maintain compatibility with older QEMU or kernel versions
-# that rely on legacy KVM_SEV_INIT behavior.
-# (default: false) (since 9.1)
+# The newer KVM_SEV_INIT2 interface, from Linux >= 6.10, syncs
+# additional vCPU state when initializing the VMSA structures,
+# which will result in a different guest measurement. Set
+# this to 'on' to force compatibility with older QEMU or kernel
+# versions that rely on legacy KVM_SEV_INIT behavior. 'auto'
+# will behave identically to 'on', but will automatically
+# switch to using KVM_SEV_INIT2 if the user specifies any
+# additional options that require it. If set to 'off', QEMU
+# will require KVM_SEV_INIT2 unconditionally.
+# (default: auto) (since 9.1)
#
# Since: 2.12
##
@@ -927,7 +931,7 @@
'*session-file': 'str',
'*policy': 'uint32',
'*handle': 'uint32',
- '*legacy-vm-type': 'bool' } }
+ '*legacy-vm-type': 'OnOffAuto' } }
##
# @SevSnpGuestProperties:
diff --git a/target/i386/sev.c b/target/i386/sev.c
index 491fab74fd..b921defb63 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -144,7 +144,7 @@ struct SevGuestState {
uint32_t policy;
char *dh_cert_file;
char *session_file;
- bool legacy_vm_type;
+ OnOffAuto legacy_vm_type;
};
struct SevSnpGuestState {
@@ -1334,6 +1334,17 @@ sev_vm_state_change(void *opaque, bool running, RunState state)
}
}
+/*
+ * This helper is to examine sev-guest properties and determine if any options
+ * have been set which rely on the newer KVM_SEV_INIT2 interface and associated
+ * KVM VM types.
+ */
+static bool sev_init2_required(SevGuestState *sev_guest)
+{
+ /* Currently no KVM_SEV_INIT2-specific options are exposed via QEMU */
+ return false;
+}
+
static int sev_kvm_type(X86ConfidentialGuest *cg)
{
SevCommonState *sev_common = SEV_COMMON(cg);
@@ -1344,14 +1355,39 @@ static int sev_kvm_type(X86ConfidentialGuest *cg)
goto out;
}
+ /* These are the only cases where legacy VM types can be used. */
+ if (sev_guest->legacy_vm_type == ON_OFF_AUTO_ON ||
+ (sev_guest->legacy_vm_type == ON_OFF_AUTO_AUTO &&
+ !sev_init2_required(sev_guest))) {
+ sev_common->kvm_type = KVM_X86_DEFAULT_VM;
+ goto out;
+ }
+
+ /*
+ * Newer VM types are required, either explicitly via legacy-vm-type=off, or
+ * implicitly via legacy-vm-type=auto along with additional sev-guest
+ * properties that require the newer VM types.
+ */
kvm_type = (sev_guest->policy & SEV_POLICY_ES) ?
KVM_X86_SEV_ES_VM : KVM_X86_SEV_VM;
- if (kvm_is_vm_type_supported(kvm_type) && !sev_guest->legacy_vm_type) {
- sev_common->kvm_type = kvm_type;
- } else {
- sev_common->kvm_type = KVM_X86_DEFAULT_VM;
+ if (!kvm_is_vm_type_supported(kvm_type)) {
+ if (sev_guest->legacy_vm_type == ON_OFF_AUTO_AUTO) {
+ error_report("SEV: host kernel does not support requested %s VM type, which is required "
+ "for the set of options specified. To allow use of the legacy "
+ "KVM_X86_DEFAULT_VM VM type, please disable any options that are not "
+ "compatible with the legacy VM type, or upgrade your kernel.",
+ kvm_type == KVM_X86_SEV_VM ? "KVM_X86_SEV_VM" : "KVM_X86_SEV_ES_VM");
+ } else {
+ error_report("SEV: host kernel does not support requested %s VM type. To allow use of "
+ "the legacy KVM_X86_DEFAULT_VM VM type, the 'legacy-vm-type' argument "
+ "must be set to 'on' or 'auto' for the sev-guest object.",
+ kvm_type == KVM_X86_SEV_VM ? "KVM_X86_SEV_VM" : "KVM_X86_SEV_ES_VM");
+ }
+
+ return -1;
}
+ sev_common->kvm_type = kvm_type;
out:
return sev_common->kvm_type;
}
@@ -1442,14 +1478,24 @@ static int sev_common_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
}
trace_kvm_sev_init();
- if (x86_klass->kvm_type(X86_CONFIDENTIAL_GUEST(sev_common)) == KVM_X86_DEFAULT_VM) {
+ switch (x86_klass->kvm_type(X86_CONFIDENTIAL_GUEST(sev_common))) {
+ case KVM_X86_DEFAULT_VM:
cmd = sev_es_enabled() ? KVM_SEV_ES_INIT : KVM_SEV_INIT;
ret = sev_ioctl(sev_common->sev_fd, cmd, NULL, &fw_error);
- } else {
+ break;
+ case KVM_X86_SEV_VM:
+ case KVM_X86_SEV_ES_VM:
+ case KVM_X86_SNP_VM: {
struct kvm_sev_init args = { 0 };
ret = sev_ioctl(sev_common->sev_fd, KVM_SEV_INIT2, &args, &fw_error);
+ break;
+ }
+ default:
+ error_setg(errp, "%s: host kernel does not support the requested SEV configuration.",
+ __func__);
+ return -1;
}
if (ret) {
@@ -2037,14 +2083,23 @@ sev_guest_set_session_file(Object *obj, const char *value, Error **errp)
SEV_GUEST(obj)->session_file = g_strdup(value);
}
-static bool sev_guest_get_legacy_vm_type(Object *obj, Error **errp)
+static void sev_guest_get_legacy_vm_type(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
{
- return SEV_GUEST(obj)->legacy_vm_type;
+ SevGuestState *sev_guest = SEV_GUEST(obj);
+ OnOffAuto legacy_vm_type = sev_guest->legacy_vm_type;
+
+ visit_type_OnOffAuto(v, name, &legacy_vm_type, errp);
}
-static void sev_guest_set_legacy_vm_type(Object *obj, bool value, Error **errp)
+static void sev_guest_set_legacy_vm_type(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
{
- SEV_GUEST(obj)->legacy_vm_type = value;
+ SevGuestState *sev_guest = SEV_GUEST(obj);
+
+ visit_type_OnOffAuto(v, name, &sev_guest->legacy_vm_type, errp);
}
static void
@@ -2070,9 +2125,9 @@ sev_guest_class_init(ObjectClass *oc, void *data)
sev_guest_set_session_file);
object_class_property_set_description(oc, "session-file",
"guest owners session parameters (encoded with base64)");
- object_class_property_add_bool(oc, "legacy-vm-type",
- sev_guest_get_legacy_vm_type,
- sev_guest_set_legacy_vm_type);
+ object_class_property_add(oc, "legacy-vm-type", "OnOffAuto",
+ sev_guest_get_legacy_vm_type,
+ sev_guest_set_legacy_vm_type, NULL, NULL);
object_class_property_set_description(oc, "legacy-vm-type",
"use legacy VM type to maintain measurement compatibility with older QEMU or kernel versions.");
}
@@ -2088,6 +2143,8 @@ sev_guest_instance_init(Object *obj)
object_property_add_uint32_ptr(obj, "policy", &sev_guest->policy,
OBJ_PROP_FLAG_READWRITE);
object_apply_compat_props(obj);
+
+ sev_guest->legacy_vm_type = ON_OFF_AUTO_AUTO;
}
/* guest info specific sev/sev-es */
--
2.39.3

@ -0,0 +1,46 @@
From ebb3c3536366c383fa09b0987a4efb68d018b7b8 Mon Sep 17 00:00:00 2001
From: Michael Roth <michael.roth@amd.com>
Date: Thu, 30 May 2024 06:16:24 -0500
Subject: [PATCH 064/100] i386/sev: Don't return launch measurements for
SEV-SNP guests
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
RH-MergeRequest: 245: SEV-SNP support
RH-Jira: RHEL-39544
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Bandan Das <bdas@redhat.com>
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
RH-Commit: [64/91] 5a29bb2d8b5a07aec6fd271ec37345e665e9cce4 (bonzini/rhel-qemu-kvm)
For SEV-SNP guests, launch measurement is queried from within the guest
during attestation, so don't attempt to return it as part of
query-sev-launch-measure.
Signed-off-by: Michael Roth <michael.roth@amd.com>
Signed-off-by: Pankaj Gupta <pankaj.gupta@amd.com>
Message-ID: <20240530111643.1091816-13-pankaj.gupta@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit 73ae63b162fc1fed520f53ad200712964d7d0264)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
target/i386/sev.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/target/i386/sev.c b/target/i386/sev.c
index 6525b3c1a0..c3daaf1ad5 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -795,7 +795,9 @@ sev_launch_get_measure(Notifier *notifier, void *unused)
static char *sev_get_launch_measurement(void)
{
- SevGuestState *sev_guest = SEV_GUEST(MACHINE(qdev_get_machine())->cgs);
+ ConfidentialGuestSupport *cgs = MACHINE(qdev_get_machine())->cgs;
+ SevGuestState *sev_guest =
+ (SevGuestState *)object_dynamic_cast(OBJECT(cgs), TYPE_SEV_GUEST);
if (sev_guest &&
SEV_COMMON(sev_guest)->state >= SEV_STATE_LAUNCH_SECRET) {
--
2.39.3

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save