From e8c1128bf982f2e5f7c9ca6c1de632feec3d9e94 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Fri, 27 Jun 2025 14:33:31 +0800 Subject: [PATCH 01/27] vfio/container: Fix potential SIGSEGV when recover from unmap-all-vaddr failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CPR overrides then restores dma_map in both outgoing and incoming QEMU, for different reasons. But it only sets saved_dma_map in the target. Fix it by always setting saved_dma_map. Fixes: eba1f657cbb1 ("vfio/container: recover from unmap-all-vaddr failure") Suggested-by: Steven Sistare Signed-off-by: Zhenzhong Duan Reviewed-by: Steve Sistare Link: https://lore.kernel.org/qemu-devel/20250627063332.5173-2-zhenzhong.duan@intel.com Signed-off-by: Cédric Le Goater --- hw/vfio/cpr-legacy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/vfio/cpr-legacy.c b/hw/vfio/cpr-legacy.c index a84c3247b7..0a5d1bd480 100644 --- a/hw/vfio/cpr-legacy.c +++ b/hw/vfio/cpr-legacy.c @@ -180,9 +180,9 @@ bool vfio_legacy_cpr_register_container(VFIOContainer *container, Error **errp) vmstate_register(NULL, -1, &vfio_container_vmstate, container); /* During incoming CPR, divert calls to dma_map. */ + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); + container->cpr.saved_dma_map = vioc->dma_map; if (cpr_is_incoming()) { - VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); - container->cpr.saved_dma_map = vioc->dma_map; vioc->dma_map = vfio_legacy_cpr_dma_map; } From 924c3ccb310e615bd350d4c77b269b19d95bf5e4 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Fri, 27 Jun 2025 14:33:32 +0800 Subject: [PATCH 02/27] vfio/container: Fix vfio_container_post_load() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When there are multiple VFIO containers, vioc->dma_map is restored multiple times, this made only first container work and remaining containers using vioc->dma_map restored by first container. Fix it by save and restore vioc->dma_map locally. saved_dma_map in VFIOContainerCPR becomes useless and is removed. Fixes: 7e9f21411302 ("vfio/container: restore DMA vaddr") Signed-off-by: Zhenzhong Duan Reviewed-by: Steve Sistare Link: https://lore.kernel.org/qemu-devel/20250627063332.5173-3-zhenzhong.duan@intel.com Signed-off-by: Cédric Le Goater --- hw/vfio/cpr-legacy.c | 23 +++++++++-------------- include/hw/vfio/vfio-cpr.h | 7 ++++--- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/hw/vfio/cpr-legacy.c b/hw/vfio/cpr-legacy.c index 0a5d1bd480..1216717546 100644 --- a/hw/vfio/cpr-legacy.c +++ b/hw/vfio/cpr-legacy.c @@ -99,20 +99,21 @@ static int vfio_container_post_load(void *opaque, int version_id) { VFIOContainer *container = opaque; VFIOContainerBase *bcontainer = &container->bcontainer; - VFIOGroup *group; + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); + dma_map_fn saved_dma_map = vioc->dma_map; Error *local_err = NULL; + /* During incoming CPR, divert calls to dma_map. */ + vioc->dma_map = vfio_legacy_cpr_dma_map; + if (!vfio_listener_register(bcontainer, &local_err)) { error_report_err(local_err); return -1; } - QLIST_FOREACH(group, &container->group_list, container_next) { - VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); + /* Restore original dma_map function */ + vioc->dma_map = saved_dma_map; - /* Restore original dma_map function */ - vioc->dma_map = container->cpr.saved_dma_map; - } return 0; } @@ -148,6 +149,7 @@ static int vfio_cpr_fail_notifier(NotifierWithReturn *notifier, */ VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); + dma_map_fn saved_dma_map = vioc->dma_map; vioc->dma_map = vfio_legacy_cpr_dma_map; container->cpr.remap_listener = (MemoryListener) { @@ -158,7 +160,7 @@ static int vfio_cpr_fail_notifier(NotifierWithReturn *notifier, bcontainer->space->as); memory_listener_unregister(&container->cpr.remap_listener); container->cpr.vaddr_unmapped = false; - vioc->dma_map = container->cpr.saved_dma_map; + vioc->dma_map = saved_dma_map; } return 0; } @@ -179,13 +181,6 @@ bool vfio_legacy_cpr_register_container(VFIOContainer *container, Error **errp) vmstate_register(NULL, -1, &vfio_container_vmstate, container); - /* During incoming CPR, divert calls to dma_map. */ - VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); - container->cpr.saved_dma_map = vioc->dma_map; - if (cpr_is_incoming()) { - vioc->dma_map = vfio_legacy_cpr_dma_map; - } - migration_add_notifier_mode(&container->cpr.transfer_notifier, vfio_cpr_fail_notifier, MIG_MODE_CPR_TRANSFER); diff --git a/include/hw/vfio/vfio-cpr.h b/include/hw/vfio/vfio-cpr.h index 8bf85b9f4e..dbb2a16b7a 100644 --- a/include/hw/vfio/vfio-cpr.h +++ b/include/hw/vfio/vfio-cpr.h @@ -16,14 +16,15 @@ struct VFIOContainer; struct VFIOContainerBase; struct VFIOGroup; +typedef int (*dma_map_fn)(const struct VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, void *vaddr, + bool readonly, MemoryRegion *mr); + typedef struct VFIOContainerCPR { Error *blocker; bool vaddr_unmapped; NotifierWithReturn transfer_notifier; MemoryListener remap_listener; - int (*saved_dma_map)(const struct VFIOContainerBase *bcontainer, - hwaddr iova, ram_addr_t size, - void *vaddr, bool readonly, MemoryRegion *mr); } VFIOContainerCPR; typedef struct VFIODeviceCPR { From f7c5dff26e0128246f5c220b38da51e56a756dbb Mon Sep 17 00:00:00 2001 From: Mark Cave-Ayland Date: Wed, 2 Jul 2025 12:59:49 +0100 Subject: [PATCH 03/27] vfio-user: do not register vfio-user container with cpr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As the full cpr implementation is yet to be merged upstream, do not register the vfio-user container with cpr. Full vfio-user support for cpr can be merged later as a follow-up series. Signed-off-by: Mark Cave-Ayland Reviewed-by: Cédric Le Goater Message-ID: <20250702120043.267634-1-mark.caveayland@nutanix.com> [ clg: Removed now useless "hw/vfio/vfio-cpr.h" include ] Signed-off-by: Cédric Le Goater --- hw/vfio-user/container.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/hw/vfio-user/container.c b/hw/vfio-user/container.c index 3133fef177..d318e6a339 100644 --- a/hw/vfio-user/container.c +++ b/hw/vfio-user/container.c @@ -13,7 +13,6 @@ #include "hw/vfio-user/container.h" #include "hw/vfio-user/device.h" #include "hw/vfio-user/trace.h" -#include "hw/vfio/vfio-cpr.h" #include "hw/vfio/vfio-device.h" #include "hw/vfio/vfio-listener.h" #include "qapi/error.h" @@ -225,14 +224,10 @@ vfio_user_container_connect(AddressSpace *as, VFIODevice *vbasedev, bcontainer = &container->bcontainer; - if (!vfio_cpr_register_container(bcontainer, errp)) { - goto free_container_exit; - } - ret = ram_block_uncoordinated_discard_disable(true); if (ret) { error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken"); - goto unregister_container_exit; + goto free_container_exit; } vioc = VFIO_IOMMU_GET_CLASS(bcontainer); @@ -261,9 +256,6 @@ listener_release_exit: enable_discards_exit: ram_block_uncoordinated_discard_disable(false); -unregister_container_exit: - vfio_cpr_unregister_container(bcontainer); - free_container_exit: object_unref(container); @@ -286,7 +278,6 @@ static void vfio_user_container_disconnect(VFIOUserContainer *container) vioc->release(bcontainer); } - vfio_cpr_unregister_container(bcontainer); object_unref(container); vfio_address_space_put(space); From 34ea448263a8dad7443bfeaf5eca75e49abc5865 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Wed, 25 Jun 2025 15:33:10 +0800 Subject: [PATCH 04/27] i386/tdx: Build TDX only for 64-bit target MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Build errors related to TDX were reported when QEMU built on 32-bit host[1][2]. Since TDX cannot work on 32-bit host and it's also not worth supporting TDX with 32-bit QEMU, limit TDX to 64-bit target only. [1] https://lore.kernel.org/qemu-devel/20250602173101.1052983-1-clg@redhat.com/ [2] https://lore.kernel.org/qemu-devel/b8171c39-6a92-4078-a59a-a63d7452e1e9@kaod.org/ Suggested-by: Cédric Le Goater Signed-off-by: Xiaoyao Li Reviewed-by: Zhao Liu Tested-by: Cédric Le Goater Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/20250625073310.2796298-1-xiaoyao.li@intel.com Signed-off-by: Cédric Le Goater --- hw/i386/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig index eb65bda6e0..14d23e27b5 100644 --- a/hw/i386/Kconfig +++ b/hw/i386/Kconfig @@ -13,7 +13,7 @@ config SGX config TDX bool select X86_FW_OVMF - depends on KVM + depends on KVM && X86_64 config PC bool From 4ba5cdfdf70843d7a74a7cb9704688c4c6957dd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 30 Jun 2025 19:23:01 +0200 Subject: [PATCH 05/27] b4: Drop linktrailermask MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When grabbing a patch series, the link trailer is replaced with a Message-ID, which is not useful compared to an URL. Fix that by dropping the linktrailermask config. Cc: Philippe Mathieu-Daudé Cc: Jiaxun Yang Fixes: 838cf72b5d2c ("Add a b4 configuration file") Link: https://lore.kernel.org/qemu-devel/20250630172301.519848-1-clg@redhat.com Signed-off-by: Cédric Le Goater --- .b4-config | 1 - 1 file changed, 1 deletion(-) diff --git a/.b4-config b/.b4-config index 4b9b2fe290..126f503ded 100644 --- a/.b4-config +++ b/.b4-config @@ -11,4 +11,3 @@ prep-perpatch-check-cmd = scripts/checkpatch.pl -q --terse --no-summary --mailback - searchmask = https://lore.kernel.org/qemu-devel/?x=m&t=1&q=%s linkmask = https://lore.kernel.org/qemu-devel/%s - linktrailermask = Message-ID: <%s> From 3f2a36d0ef6339ce78e107959da81f76283582f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 30 Jun 2025 19:20:48 +0200 Subject: [PATCH 06/27] Makefile: prune quilt source files for cscope MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both quilt, to apply patches, and cscope, to navigate in the code, are useful tools. Make sure source files that quilt saves when applying patches are not taken into account when building the cscope database. Link: https://lore.kernel.org/qemu-devel/20250630172048.519182-1-clg@redhat.com Signed-off-by: Cédric Le Goater --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index c92a3cf785..74c2da2037 100644 --- a/Makefile +++ b/Makefile @@ -227,6 +227,7 @@ distclean: clean recurse-distclean rm -Rf .sdk qemu-bundle find-src-path = find "$(SRC_PATH)" -path "$(SRC_PATH)/meson" -prune -o \ + -path "$(SRC_PATH)/.pc" -prune -o \ -type l -prune -o \( -name "*.[chsS]" -o -name "*.[ch].inc" \) .PHONY: ctags From 30edcb4d4e7a265c2912ca6978b150c7c75b654f Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 2 Jul 2025 14:58:38 -0700 Subject: [PATCH 07/27] vfio-pci: preserve MSI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Save the MSI message area as part of vfio-pci vmstate, and preserve the interrupt and notifier eventfd's. migrate_incoming loads the MSI data, then the vfio-pci post_load handler finds the eventfds in CPR state, rebuilds vector data structures, and attaches the interrupts to the new KVM instance. Signed-off-by: Steve Sistare Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/1751493538-202042-2-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/cpr.c | 97 ++++++++++++++++++++++++++++++++++++++ hw/vfio/pci.c | 52 +++++++++++++++++++- hw/vfio/pci.h | 2 + include/hw/vfio/vfio-cpr.h | 8 ++++ 4 files changed, 157 insertions(+), 2 deletions(-) diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c index fdbb58e203..e467373e8d 100644 --- a/hw/vfio/cpr.c +++ b/hw/vfio/cpr.c @@ -9,6 +9,8 @@ #include "hw/vfio/vfio-device.h" #include "hw/vfio/vfio-cpr.h" #include "hw/vfio/pci.h" +#include "hw/pci/msix.h" +#include "hw/pci/msi.h" #include "migration/cpr.h" #include "qapi/error.h" #include "system/runstate.h" @@ -40,6 +42,69 @@ void vfio_cpr_unregister_container(VFIOContainerBase *bcontainer) migration_remove_notifier(&bcontainer->cpr_reboot_notifier); } +#define STRDUP_VECTOR_FD_NAME(vdev, name) \ + g_strdup_printf("%s_%s", (vdev)->vbasedev.name, (name)) + +void vfio_cpr_save_vector_fd(VFIOPCIDevice *vdev, const char *name, int nr, + int fd) +{ + g_autofree char *fdname = STRDUP_VECTOR_FD_NAME(vdev, name); + cpr_save_fd(fdname, nr, fd); +} + +int vfio_cpr_load_vector_fd(VFIOPCIDevice *vdev, const char *name, int nr) +{ + g_autofree char *fdname = STRDUP_VECTOR_FD_NAME(vdev, name); + return cpr_find_fd(fdname, nr); +} + +void vfio_cpr_delete_vector_fd(VFIOPCIDevice *vdev, const char *name, int nr) +{ + g_autofree char *fdname = STRDUP_VECTOR_FD_NAME(vdev, name); + cpr_delete_fd(fdname, nr); +} + +static void vfio_cpr_claim_vectors(VFIOPCIDevice *vdev, int nr_vectors, + bool msix) +{ + int i, fd; + bool pending = false; + PCIDevice *pdev = &vdev->pdev; + + vdev->nr_vectors = nr_vectors; + vdev->msi_vectors = g_new0(VFIOMSIVector, nr_vectors); + vdev->interrupt = msix ? VFIO_INT_MSIX : VFIO_INT_MSI; + + vfio_pci_prepare_kvm_msi_virq_batch(vdev); + + for (i = 0; i < nr_vectors; i++) { + VFIOMSIVector *vector = &vdev->msi_vectors[i]; + + fd = vfio_cpr_load_vector_fd(vdev, "interrupt", i); + if (fd >= 0) { + vfio_pci_vector_init(vdev, i); + vfio_pci_msi_set_handler(vdev, i); + } + + if (vfio_cpr_load_vector_fd(vdev, "kvm_interrupt", i) >= 0) { + vfio_pci_add_kvm_msi_virq(vdev, vector, i, msix); + } else { + vdev->msi_vectors[i].virq = -1; + } + + if (msix && msix_is_pending(pdev, i) && msix_is_masked(pdev, i)) { + set_bit(i, vdev->msix->pending); + pending = true; + } + } + + vfio_pci_commit_kvm_msi_virq_batch(vdev); + + if (msix) { + memory_region_set_enabled(&pdev->msix_pba_mmio, pending); + } +} + /* * The kernel may change non-emulated config bits. Exclude them from the * changed-bits check in get_pci_config_device. @@ -58,13 +123,45 @@ static int vfio_cpr_pci_pre_load(void *opaque) return 0; } +static int vfio_cpr_pci_post_load(void *opaque, int version_id) +{ + VFIOPCIDevice *vdev = opaque; + PCIDevice *pdev = &vdev->pdev; + int nr_vectors; + + if (msix_enabled(pdev)) { + vfio_pci_msix_set_notifiers(vdev); + nr_vectors = vdev->msix->entries; + vfio_cpr_claim_vectors(vdev, nr_vectors, true); + + } else if (msi_enabled(pdev)) { + nr_vectors = msi_nr_vectors_allocated(pdev); + vfio_cpr_claim_vectors(vdev, nr_vectors, false); + + } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) { + g_assert_not_reached(); /* completed in a subsequent patch */ + } + + return 0; +} + +static bool pci_msix_present(void *opaque, int version_id) +{ + PCIDevice *pdev = opaque; + + return msix_present(pdev); +} + const VMStateDescription vfio_cpr_pci_vmstate = { .name = "vfio-cpr-pci", .version_id = 0, .minimum_version_id = 0, .pre_load = vfio_cpr_pci_pre_load, + .post_load = vfio_cpr_pci_post_load, .needed = cpr_incoming_needed, .fields = (VMStateField[]) { + VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice), + VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, pci_msix_present), VMSTATE_END_OF_LIST() } }; diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index fa25bded25..5f9f2640e5 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -29,6 +29,7 @@ #include "hw/pci/pci_bridge.h" #include "hw/qdev-properties.h" #include "hw/qdev-properties-system.h" +#include "hw/vfio/vfio-cpr.h" #include "migration/vmstate.h" #include "migration/cpr.h" #include "qobject/qdict.h" @@ -57,20 +58,33 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); static void vfio_msi_disable_common(VFIOPCIDevice *vdev); +/* Create new or reuse existing eventfd */ static bool vfio_notifier_init(VFIOPCIDevice *vdev, EventNotifier *e, const char *name, int nr, Error **errp) { - int ret = event_notifier_init(e, 0); + int fd, ret; + fd = vfio_cpr_load_vector_fd(vdev, name, nr); + if (fd >= 0) { + event_notifier_init_fd(e, fd); + return true; + } + + ret = event_notifier_init(e, 0); if (ret) { error_setg_errno(errp, -ret, "vfio_notifier_init %s failed", name); + return false; } - return !ret; + + fd = event_notifier_get_fd(e); + vfio_cpr_save_vector_fd(vdev, name, nr, fd); + return true; } static void vfio_notifier_cleanup(VFIOPCIDevice *vdev, EventNotifier *e, const char *name, int nr) { + vfio_cpr_delete_vector_fd(vdev, name, nr); event_notifier_cleanup(e); } @@ -394,6 +408,14 @@ static void vfio_msi_interrupt(void *opaque) notify(&vdev->pdev, nr); } +void vfio_pci_msi_set_handler(VFIOPCIDevice *vdev, int nr) +{ + VFIOMSIVector *vector = &vdev->msi_vectors[nr]; + int fd = event_notifier_get_fd(&vector->interrupt); + + qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL, vector); +} + /* * Get MSI-X enabled, but no vector enabled, by setting vector 0 with an invalid * fd to kernel. @@ -656,6 +678,15 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, static int vfio_msix_vector_use(PCIDevice *pdev, unsigned int nr, MSIMessage msg) { + /* + * Ignore the callback from msix_set_vector_notifiers during resume. + * The necessary subset of these actions is called from + * vfio_cpr_claim_vectors during post load. + */ + if (cpr_is_incoming()) { + return 0; + } + return vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt); } @@ -686,6 +717,12 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr) } } +void vfio_pci_msix_set_notifiers(VFIOPCIDevice *vdev) +{ + msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use, + vfio_msix_vector_release, NULL); +} + void vfio_pci_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev) { assert(!vdev->defer_kvm_irq_routing); @@ -2965,6 +3002,11 @@ void vfio_pci_register_err_notifier(VFIOPCIDevice *vdev) fd = event_notifier_get_fd(&vdev->err_notifier); qemu_set_fd_handler(fd, vfio_err_notifier_handler, NULL, vdev); + /* Do not alter irq_signaling during vfio_realize for cpr */ + if (cpr_is_incoming()) { + return; + } + if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0, VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); @@ -3032,6 +3074,12 @@ void vfio_pci_register_req_notifier(VFIOPCIDevice *vdev) fd = event_notifier_get_fd(&vdev->req_notifier); qemu_set_fd_handler(fd, vfio_req_notifier_handler, NULL, vdev); + /* Do not alter irq_signaling during vfio_realize for cpr */ + if (cpr_is_incoming()) { + vdev->req_enabled = true; + return; + } + if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0, VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index 5ba7330b27..495fae737d 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -218,6 +218,8 @@ void vfio_pci_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, void vfio_pci_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev); void vfio_pci_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev); bool vfio_pci_intx_enable(VFIOPCIDevice *vdev, Error **errp); +void vfio_pci_msix_set_notifiers(VFIOPCIDevice *vdev); +void vfio_pci_msi_set_handler(VFIOPCIDevice *vdev, int nr); uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len); void vfio_pci_write_config(PCIDevice *pdev, diff --git a/include/hw/vfio/vfio-cpr.h b/include/hw/vfio/vfio-cpr.h index dbb2a16b7a..f21578da3c 100644 --- a/include/hw/vfio/vfio-cpr.h +++ b/include/hw/vfio/vfio-cpr.h @@ -15,6 +15,7 @@ struct VFIOContainer; struct VFIOContainerBase; struct VFIOGroup; +struct VFIOPCIDevice; typedef int (*dma_map_fn)(const struct VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, @@ -53,6 +54,13 @@ void vfio_cpr_giommu_remap(struct VFIOContainerBase *bcontainer, bool vfio_cpr_ram_discard_register_listener( struct VFIOContainerBase *bcontainer, MemoryRegionSection *section); +void vfio_cpr_save_vector_fd(struct VFIOPCIDevice *vdev, const char *name, + int nr, int fd); +int vfio_cpr_load_vector_fd(struct VFIOPCIDevice *vdev, const char *name, + int nr); +void vfio_cpr_delete_vector_fd(struct VFIOPCIDevice *vdev, const char *name, + int nr); + extern const VMStateDescription vfio_cpr_pci_vmstate; #endif /* HW_VFIO_VFIO_CPR_H */ From 87aeaead5c75448d6e36bf6f9114862c8d523871 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 2 Jul 2025 14:58:39 -0700 Subject: [PATCH 08/27] vfio-pci: preserve INTx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Preserve vfio INTx state across cpr-transfer. Preserve VFIOINTx fields as follows: pin : Recover this from the vfio config in kernel space interrupt : Preserve its eventfd descriptor across exec. unmask : Ditto route.irq : This could perhaps be recovered in vfio_pci_post_load by calling pci_device_route_intx_to_irq(pin), whose implementation reads config space for a bridge device such as ich9. However, there is no guarantee that the bridge vmstate is read before vfio vmstate. Rather than fiddling with MigrationPriority for vmstate handlers, explicitly save route.irq in vfio vmstate. pending : save in vfio vmstate. mmap_timeout, mmap_timer : Re-initialize bool kvm_accel : Re-initialize In vfio_realize, defer calling vfio_intx_enable until the vmstate is available, in vfio_pci_post_load. Modify vfio_intx_enable and vfio_intx_kvm_enable to skip vfio initialization, but still perform kvm initialization. Signed-off-by: Steve Sistare Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/1751493538-202042-3-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/cpr.c | 27 ++++++++++++++++++++++++- hw/vfio/pci.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 79 insertions(+), 3 deletions(-) diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c index e467373e8d..f5555cabe7 100644 --- a/hw/vfio/cpr.c +++ b/hw/vfio/cpr.c @@ -139,7 +139,11 @@ static int vfio_cpr_pci_post_load(void *opaque, int version_id) vfio_cpr_claim_vectors(vdev, nr_vectors, false); } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) { - g_assert_not_reached(); /* completed in a subsequent patch */ + Error *local_err = NULL; + if (!vfio_pci_intx_enable(vdev, &local_err)) { + error_report_err(local_err); + return -1; + } } return 0; @@ -152,6 +156,26 @@ static bool pci_msix_present(void *opaque, int version_id) return msix_present(pdev); } +static const VMStateDescription vfio_intx_vmstate = { + .name = "vfio-cpr-intx", + .version_id = 0, + .minimum_version_id = 0, + .fields = (VMStateField[]) { + VMSTATE_BOOL(pending, VFIOINTx), + VMSTATE_UINT32(route.mode, VFIOINTx), + VMSTATE_INT32(route.irq, VFIOINTx), + VMSTATE_END_OF_LIST() + } +}; + +#define VMSTATE_VFIO_INTX(_field, _state) { \ + .name = (stringify(_field)), \ + .size = sizeof(VFIOINTx), \ + .vmsd = &vfio_intx_vmstate, \ + .flags = VMS_STRUCT, \ + .offset = vmstate_offset_value(_state, _field, VFIOINTx), \ +} + const VMStateDescription vfio_cpr_pci_vmstate = { .name = "vfio-cpr-pci", .version_id = 0, @@ -162,6 +186,7 @@ const VMStateDescription vfio_cpr_pci_vmstate = { .fields = (VMStateField[]) { VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice), VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, pci_msix_present), + VMSTATE_VFIO_INTX(intx, VFIOPCIDevice), VMSTATE_END_OF_LIST() } }; diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 5f9f2640e5..dd0b2a0b94 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -210,6 +210,36 @@ fail: #endif } +static bool vfio_cpr_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp) +{ +#ifdef CONFIG_KVM + if (vdev->no_kvm_intx || !kvm_irqfds_enabled() || + vdev->intx.route.mode != PCI_INTX_ENABLED || + !kvm_resamplefds_enabled()) { + return true; + } + + if (!vfio_notifier_init(vdev, &vdev->intx.unmask, "intx-unmask", 0, errp)) { + return false; + } + + if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, + &vdev->intx.interrupt, + &vdev->intx.unmask, + vdev->intx.route.irq)) { + error_setg_errno(errp, errno, "failed to setup resample irqfd"); + vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0); + return false; + } + + vdev->intx.kvm_accel = true; + trace_vfio_intx_enable_kvm(vdev->vbasedev.name); + return true; +#else + return true; +#endif +} + static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev) { #ifdef CONFIG_KVM @@ -305,7 +335,13 @@ static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp) return true; } - vfio_disable_interrupts(vdev); + /* + * Do not alter interrupt state during vfio_realize and cpr load. + * The incoming state is cleared thereafter. + */ + if (!cpr_is_incoming()) { + vfio_disable_interrupts(vdev); + } vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */ pci_config_set_interrupt_pin(vdev->pdev.config, pin); @@ -328,6 +364,14 @@ static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp) fd = event_notifier_get_fd(&vdev->intx.interrupt); qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev); + + if (cpr_is_incoming()) { + if (!vfio_cpr_intx_enable_kvm(vdev, &err)) { + warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); + } + goto skip_signaling; + } + if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0, VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) { qemu_set_fd_handler(fd, NULL, NULL, vdev); @@ -339,6 +383,7 @@ static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp) warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); } +skip_signaling: vdev->interrupt = VFIO_INT_INTx; trace_vfio_intx_enable(vdev->vbasedev.name); @@ -3237,7 +3282,13 @@ bool vfio_pci_interrupt_setup(VFIOPCIDevice *vdev, Error **errp) vfio_intx_routing_notifier); vdev->irqchip_change_notifier.notify = vfio_irqchip_change; kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier); - if (!vfio_intx_enable(vdev, errp)) { + + /* + * During CPR, do not call vfio_intx_enable at this time. Instead, + * call it from vfio_pci_post_load after the intx routing data has + * been loaded from vmstate. + */ + if (!cpr_is_incoming() && !vfio_intx_enable(vdev, errp)) { timer_free(vdev->intx.mmap_timer); pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier); From 7ed0919119b0e7a6b7db1dcaca3a2cb30c771dd1 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 2 Jul 2025 14:58:40 -0700 Subject: [PATCH 09/27] migration: close kvm after cpr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cpr-transfer breaks vfio network connectivity to and from the guest, and the host system log shows: irq bypass consumer (token 00000000a03c32e5) registration fails: -16 which is EBUSY. This occurs because KVM descriptors are still open in the old QEMU process. Close them. Cc: Paolo Bonzini Signed-off-by: Steve Sistare Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/qemu-devel/1751493538-202042-4-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- accel/kvm/kvm-all.c | 32 ++++++++++++++++++++++++++++++++ hw/vfio/cpr-legacy.c | 2 ++ hw/vfio/cpr.c | 21 +++++++++++++++++++++ hw/vfio/helpers.c | 11 +++++++++++ include/hw/vfio/vfio-cpr.h | 2 ++ include/hw/vfio/vfio-device.h | 2 ++ include/system/kvm.h | 1 + 7 files changed, 71 insertions(+) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index d095d1b98f..8141854617 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -515,16 +515,23 @@ static int do_kvm_destroy_vcpu(CPUState *cpu) goto err; } + /* If I am the CPU that created coalesced_mmio_ring, then discard it */ + if (s->coalesced_mmio_ring == (void *)cpu->kvm_run + PAGE_SIZE) { + s->coalesced_mmio_ring = NULL; + } + ret = munmap(cpu->kvm_run, mmap_size); if (ret < 0) { goto err; } + cpu->kvm_run = NULL; if (cpu->kvm_dirty_gfns) { ret = munmap(cpu->kvm_dirty_gfns, s->kvm_dirty_ring_bytes); if (ret < 0) { goto err; } + cpu->kvm_dirty_gfns = NULL; } kvm_park_vcpu(cpu); @@ -608,6 +615,31 @@ err: return ret; } +void kvm_close(void) +{ + CPUState *cpu; + + if (!kvm_state || kvm_state->fd == -1) { + return; + } + + CPU_FOREACH(cpu) { + cpu_remove_sync(cpu); + close(cpu->kvm_fd); + cpu->kvm_fd = -1; + close(cpu->kvm_vcpu_stats_fd); + cpu->kvm_vcpu_stats_fd = -1; + } + + if (kvm_state && kvm_state->fd != -1) { + close(kvm_state->vmfd); + kvm_state->vmfd = -1; + close(kvm_state->fd); + kvm_state->fd = -1; + } + kvm_state = NULL; +} + /* * dirty pages logging control */ diff --git a/hw/vfio/cpr-legacy.c b/hw/vfio/cpr-legacy.c index 1216717546..553b203e9b 100644 --- a/hw/vfio/cpr-legacy.c +++ b/hw/vfio/cpr-legacy.c @@ -179,6 +179,8 @@ bool vfio_legacy_cpr_register_container(VFIOContainer *container, Error **errp) MIG_MODE_CPR_TRANSFER, -1) == 0; } + vfio_cpr_add_kvm_notifier(); + vmstate_register(NULL, -1, &vfio_container_vmstate, container); migration_add_notifier_mode(&container->cpr.transfer_notifier, diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c index f5555cabe7..0e903cdd2f 100644 --- a/hw/vfio/cpr.c +++ b/hw/vfio/cpr.c @@ -190,3 +190,24 @@ const VMStateDescription vfio_cpr_pci_vmstate = { VMSTATE_END_OF_LIST() } }; + +static NotifierWithReturn kvm_close_notifier; + +static int vfio_cpr_kvm_close_notifier(NotifierWithReturn *notifier, + MigrationEvent *e, + Error **errp) +{ + if (e->type == MIG_EVENT_PRECOPY_DONE) { + vfio_kvm_device_close(); + } + return 0; +} + +void vfio_cpr_add_kvm_notifier(void) +{ + if (!kvm_close_notifier.notify) { + migration_add_notifier_mode(&kvm_close_notifier, + vfio_cpr_kvm_close_notifier, + MIG_MODE_CPR_TRANSFER); + } +} diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c index d0dbab1d17..9a5f621545 100644 --- a/hw/vfio/helpers.c +++ b/hw/vfio/helpers.c @@ -117,6 +117,17 @@ bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, int vfio_kvm_device_fd = -1; #endif +void vfio_kvm_device_close(void) +{ +#ifdef CONFIG_KVM + kvm_close(); + if (vfio_kvm_device_fd != -1) { + close(vfio_kvm_device_fd); + vfio_kvm_device_fd = -1; + } +#endif +} + int vfio_kvm_device_add_fd(int fd, Error **errp) { #ifdef CONFIG_KVM diff --git a/include/hw/vfio/vfio-cpr.h b/include/hw/vfio/vfio-cpr.h index f21578da3c..d37acc4a0a 100644 --- a/include/hw/vfio/vfio-cpr.h +++ b/include/hw/vfio/vfio-cpr.h @@ -63,4 +63,6 @@ void vfio_cpr_delete_vector_fd(struct VFIOPCIDevice *vdev, const char *name, extern const VMStateDescription vfio_cpr_pci_vmstate; +void vfio_cpr_add_kvm_notifier(void); + #endif /* HW_VFIO_VFIO_CPR_H */ diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h index c616652ee7..f503837ccc 100644 --- a/include/hw/vfio/vfio-device.h +++ b/include/hw/vfio/vfio-device.h @@ -283,4 +283,6 @@ void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp); void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, DeviceState *dev, bool ram_discard); int vfio_device_get_aw_bits(VFIODevice *vdev); + +void vfio_kvm_device_close(void); #endif /* HW_VFIO_VFIO_COMMON_H */ diff --git a/include/system/kvm.h b/include/system/kvm.h index 7cc60d26f2..4896a3c9c5 100644 --- a/include/system/kvm.h +++ b/include/system/kvm.h @@ -195,6 +195,7 @@ bool kvm_has_sync_mmu(void); int kvm_has_vcpu_events(void); int kvm_max_nested_state_length(void); int kvm_has_gsi_routing(void); +void kvm_close(void); /** * kvm_arm_supports_user_irq From ccfc6715cf54caafba4c2516af394b4ed979c615 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 2 Jul 2025 14:58:41 -0700 Subject: [PATCH 10/27] migration: cpr_get_fd_param helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the helper function cpr_get_fd_param, to use when preserving a file descriptor that is opened externally and passed to QEMU. cpr_get_fd_param returns a descriptor number either from a QEMU command-line parameter, from a getfd command, or from CPR state. When a descriptor is passed to new QEMU via SCM_RIGHTS, its number changes. Hence, during CPR, the command-line parameter is ignored in new QEMU, and over-ridden by the value found in CPR state. Similarly, if the descriptor was originally specified by a getfd command in old QEMU, the fd number is not known outside of QEMU, and it changes when sent to new QEMU via SCM_RIGHTS. Hence the user cannot send getfd to new QEMU, but when the user sends a hotplug command that references the fd, cpr_get_fd_param finds its value in CPR state. Signed-off-by: Steve Sistare Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/qemu-devel/1751493538-202042-5-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- include/migration/cpr.h | 2 ++ migration/cpr.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/include/migration/cpr.h b/include/migration/cpr.h index 07858e93fa..eb27a93301 100644 --- a/include/migration/cpr.h +++ b/include/migration/cpr.h @@ -32,6 +32,8 @@ void cpr_state_close(void); struct QIOChannel *cpr_state_ioc(void); bool cpr_incoming_needed(void *opaque); +int cpr_get_fd_param(const char *name, const char *fdname, int index, + Error **errp); QEMUFile *cpr_transfer_output(MigrationChannel *channel, Error **errp); QEMUFile *cpr_transfer_input(MigrationChannel *channel, Error **errp); diff --git a/migration/cpr.c b/migration/cpr.c index a50a57edca..535d587aee 100644 --- a/migration/cpr.c +++ b/migration/cpr.c @@ -13,6 +13,7 @@ #include "migration/qemu-file.h" #include "migration/savevm.h" #include "migration/vmstate.h" +#include "monitor/monitor.h" #include "system/runstate.h" #include "trace.h" @@ -264,3 +265,39 @@ bool cpr_incoming_needed(void *opaque) MigMode mode = migrate_mode(); return mode == MIG_MODE_CPR_TRANSFER; } + +/* + * cpr_get_fd_param: find a descriptor and return its value. + * + * @name: CPR name for the descriptor + * @fdname: An integer-valued string, or a name passed to a getfd command + * @index: CPR index of the descriptor + * @errp: returned error message + * + * If CPR is not being performed, then use @fdname to find the fd. + * If CPR is being performed, then ignore @fdname, and look for @name + * and @index in CPR state. + * + * On success returns the fd value, else returns -1. + */ +int cpr_get_fd_param(const char *name, const char *fdname, int index, + Error **errp) +{ + ERRP_GUARD(); + int fd; + + if (cpr_is_incoming()) { + fd = cpr_find_fd(name, index); + if (fd < 0) { + error_setg(errp, "cannot find saved value for fd %s", fdname); + } + } else { + fd = monitor_fd_param(monitor_cur(), fdname, errp); + if (fd >= 0) { + cpr_save_fd(name, index, fd); + } else { + error_prepend(errp, "Could not parse object fd %s:", fdname); + } + } + return fd; +} From e563dc88c21915e111ecef0756cc291e9e473c35 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 2 Jul 2025 14:58:42 -0700 Subject: [PATCH 11/27] backends/iommufd: iommufd_backend_map_file_dma MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define iommufd_backend_map_file_dma to implement IOMMU_IOAS_MAP_FILE. This will be called as a substitute for iommufd_backend_map_dma, so the error conditions for BARs are copied as-is from that function. Signed-off-by: Steve Sistare Reviewed-by: Zhenzhong Duan Link: https://lore.kernel.org/qemu-devel/1751493538-202042-6-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- backends/iommufd.c | 34 ++++++++++++++++++++++++++++++++++ backends/trace-events | 1 + include/system/iommufd.h | 3 +++ 3 files changed, 38 insertions(+) diff --git a/backends/iommufd.c b/backends/iommufd.c index c2c47abf7e..3a2ecc7f5b 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -172,6 +172,40 @@ int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, return ret; } +int iommufd_backend_map_file_dma(IOMMUFDBackend *be, uint32_t ioas_id, + hwaddr iova, ram_addr_t size, + int mfd, unsigned long start, bool readonly) +{ + int ret, fd = be->fd; + struct iommu_ioas_map_file map = { + .size = sizeof(map), + .flags = IOMMU_IOAS_MAP_READABLE | + IOMMU_IOAS_MAP_FIXED_IOVA, + .ioas_id = ioas_id, + .fd = mfd, + .start = start, + .iova = iova, + .length = size, + }; + + if (!readonly) { + map.flags |= IOMMU_IOAS_MAP_WRITEABLE; + } + + ret = ioctl(fd, IOMMU_IOAS_MAP_FILE, &map); + trace_iommufd_backend_map_file_dma(fd, ioas_id, iova, size, mfd, start, + readonly, ret); + if (ret) { + ret = -errno; + + /* TODO: Not support mapping hardware PCI BAR region for now. */ + if (errno == EFAULT) { + warn_report("IOMMU_IOAS_MAP_FILE failed: %m, PCI BAR?"); + } + } + return ret; +} + int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, ram_addr_t size) { diff --git a/backends/trace-events b/backends/trace-events index 7278214ea5..e5f3e70cd1 100644 --- a/backends/trace-events +++ b/backends/trace-events @@ -11,6 +11,7 @@ iommufd_backend_connect(int fd, bool owned, uint32_t users) "fd=%d owned=%d user iommufd_backend_disconnect(int fd, uint32_t users) "fd=%d users=%d" iommu_backend_set_fd(int fd) "pre-opened /dev/iommu fd=%d" iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, void *vaddr, bool readonly, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" addr=%p readonly=%d (%d)" +iommufd_backend_map_file_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int fd, unsigned long start, bool readonly, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" fd=%d start=%ld readonly=%d (%d)" iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas) " iommufd=%d ioas=%d" diff --git a/include/system/iommufd.h b/include/system/iommufd.h index 283861b924..2d24d93d17 100644 --- a/include/system/iommufd.h +++ b/include/system/iommufd.h @@ -43,6 +43,9 @@ void iommufd_backend_disconnect(IOMMUFDBackend *be); bool iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, Error **errp); void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id); +int iommufd_backend_map_file_dma(IOMMUFDBackend *be, uint32_t ioas_id, + hwaddr iova, ram_addr_t size, int fd, + unsigned long start, bool readonly); int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly); int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, From ab48cedc648a60a3e51db73acf12148d90f19c4c Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 2 Jul 2025 14:58:43 -0700 Subject: [PATCH 12/27] backends/iommufd: change process ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define the change process ioctl Signed-off-by: Steve Sistare Reviewed-by: Cédric Le Goater Reviewed-by: Zhenzhong Duan Link: https://lore.kernel.org/qemu-devel/1751493538-202042-7-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- backends/iommufd.c | 24 ++++++++++++++++++++++++ backends/trace-events | 1 + include/system/iommufd.h | 3 +++ 3 files changed, 28 insertions(+) diff --git a/backends/iommufd.c b/backends/iommufd.c index 3a2ecc7f5b..87f81a05f6 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -73,6 +73,30 @@ static void iommufd_backend_class_init(ObjectClass *oc, const void *data) object_class_property_add_str(oc, "fd", NULL, iommufd_backend_set_fd); } +bool iommufd_change_process_capable(IOMMUFDBackend *be) +{ + struct iommu_ioas_change_process args = {.size = sizeof(args)}; + + /* + * Call IOMMU_IOAS_CHANGE_PROCESS to verify it is a recognized ioctl. + * This is a no-op if the process has not changed since DMA was mapped. + */ + return !ioctl(be->fd, IOMMU_IOAS_CHANGE_PROCESS, &args); +} + +bool iommufd_change_process(IOMMUFDBackend *be, Error **errp) +{ + struct iommu_ioas_change_process args = {.size = sizeof(args)}; + bool ret = !ioctl(be->fd, IOMMU_IOAS_CHANGE_PROCESS, &args); + + if (!ret) { + error_setg_errno(errp, errno, "IOMMU_IOAS_CHANGE_PROCESS fd %d failed", + be->fd); + } + trace_iommufd_change_process(be->fd, ret); + return ret; +} + bool iommufd_backend_connect(IOMMUFDBackend *be, Error **errp) { int fd; diff --git a/backends/trace-events b/backends/trace-events index e5f3e70cd1..56132d3fd2 100644 --- a/backends/trace-events +++ b/backends/trace-events @@ -7,6 +7,7 @@ dbus_vmstate_loading(const char *id) "id: %s" dbus_vmstate_saving(const char *id) "id: %s" # iommufd.c +iommufd_change_process(int fd, bool ret) "fd=%d (%d)" iommufd_backend_connect(int fd, bool owned, uint32_t users) "fd=%d owned=%d users=%d" iommufd_backend_disconnect(int fd, uint32_t users) "fd=%d users=%d" iommu_backend_set_fd(int fd) "pre-opened /dev/iommu fd=%d" diff --git a/include/system/iommufd.h b/include/system/iommufd.h index 2d24d93d17..db5f2c716c 100644 --- a/include/system/iommufd.h +++ b/include/system/iommufd.h @@ -69,6 +69,9 @@ bool iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t id, uint32_t *entry_num, void *data, Error **errp); +bool iommufd_change_process_capable(IOMMUFDBackend *be); +bool iommufd_change_process(IOMMUFDBackend *be, Error **errp); + #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" OBJECT_DECLARE_TYPE(HostIOMMUDeviceIOMMUFD, HostIOMMUDeviceIOMMUFDClass, HOST_IOMMU_DEVICE_IOMMUFD) From d7ae4a740c8e49962ccfbbf41da0b3b8314518ce Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 2 Jul 2025 14:58:44 -0700 Subject: [PATCH 13/27] physmem: qemu_ram_get_fd_offset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define qemu_ram_get_fd_offset, so CPR can map a memory region using IOMMU_IOAS_MAP_FILE in a subsequent patch. Signed-off-by: Steve Sistare Reviewed-by: Peter Xu Reviewed-by: Zhenzhong Duan Link: https://lore.kernel.org/qemu-devel/1751493538-202042-8-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- include/exec/cpu-common.h | 1 + system/physmem.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h index a68485547d..9b658a3f48 100644 --- a/include/exec/cpu-common.h +++ b/include/exec/cpu-common.h @@ -85,6 +85,7 @@ void qemu_ram_unset_idstr(RAMBlock *block); const char *qemu_ram_get_idstr(RAMBlock *rb); void *qemu_ram_get_host_addr(RAMBlock *rb); ram_addr_t qemu_ram_get_offset(RAMBlock *rb); +ram_addr_t qemu_ram_get_fd_offset(RAMBlock *rb); ram_addr_t qemu_ram_get_used_length(RAMBlock *rb); ram_addr_t qemu_ram_get_max_length(RAMBlock *rb); bool qemu_ram_is_shared(RAMBlock *rb); diff --git a/system/physmem.c b/system/physmem.c index ff0ca40222..130c148ffb 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -1593,6 +1593,11 @@ ram_addr_t qemu_ram_get_offset(RAMBlock *rb) return rb->offset; } +ram_addr_t qemu_ram_get_fd_offset(RAMBlock *rb) +{ + return rb->fd_offset; +} + ram_addr_t qemu_ram_get_used_length(RAMBlock *rb) { return rb->used_length; From fb32965b6dd8a001815593642a5146fbd2e85651 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 2 Jul 2025 14:58:45 -0700 Subject: [PATCH 14/27] vfio/iommufd: use IOMMU_IOAS_MAP_FILE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use IOMMU_IOAS_MAP_FILE when the mapped region is backed by a file. Such a mapping can be preserved without modification during CPR, because it depends on the file's address space, which does not change, rather than on the process's address space, which does change. Signed-off-by: Steve Sistare Reviewed-by: Zhenzhong Duan Link: https://lore.kernel.org/qemu-devel/1751493538-202042-9-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/container-base.c | 9 +++++++++ hw/vfio/iommufd.c | 13 +++++++++++++ include/hw/vfio/vfio-container-base.h | 15 +++++++++++++++ 3 files changed, 37 insertions(+) diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index d834bd4822..56304978e1 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -78,7 +78,16 @@ int vfio_container_dma_map(VFIOContainerBase *bcontainer, void *vaddr, bool readonly, MemoryRegion *mr) { VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); + RAMBlock *rb = mr->ram_block; + int mfd = rb ? qemu_ram_get_fd(rb) : -1; + if (mfd >= 0 && vioc->dma_map_file) { + unsigned long start = vaddr - qemu_ram_get_host_addr(rb); + unsigned long offset = qemu_ram_get_fd_offset(rb); + + return vioc->dma_map_file(bcontainer, iova, size, mfd, start + offset, + readonly); + } g_assert(vioc->dma_map); return vioc->dma_map(bcontainer, iova, size, vaddr, readonly, mr); } diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index d3efef71af..962a1e2b1f 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -45,6 +45,18 @@ static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova, iova, size, vaddr, readonly); } +static int iommufd_cdev_map_file(const VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + int fd, unsigned long start, bool readonly) +{ + const VFIOIOMMUFDContainer *container = + container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + + return iommufd_backend_map_file_dma(container->be, + container->ioas_id, + iova, size, fd, start, readonly); +} + static int iommufd_cdev_unmap(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, IOMMUTLBEntry *iotlb, bool unmap_all) @@ -807,6 +819,7 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, const void *data) VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); vioc->dma_map = iommufd_cdev_map; + vioc->dma_map_file = iommufd_cdev_map_file; vioc->dma_unmap = iommufd_cdev_unmap; vioc->attach_device = iommufd_cdev_attach; vioc->detach_device = iommufd_cdev_detach; diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 3cd86ec59e..bded6e993f 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -167,6 +167,21 @@ struct VFIOIOMMUClass { int (*dma_map)(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly, MemoryRegion *mr); + /** + * @dma_map_file + * + * Map a file range for the container. + * + * @bcontainer: #VFIOContainerBase to use for map + * @iova: start address to map + * @size: size of the range to map + * @fd: descriptor of the file to map + * @start: starting file offset of the range to map + * @readonly: map read only if true + */ + int (*dma_map_file)(const VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + int fd, unsigned long start, bool readonly); /** * @dma_unmap * From b9b389b9e0f08b207786d501b790dab9cb2c09de Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 2 Jul 2025 14:58:46 -0700 Subject: [PATCH 15/27] vfio/iommufd: invariant device name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cpr-transfer will use the device name as a key to find the value of the device descriptor in new QEMU. However, if the descriptor number is specified by a command-line fd parameter, then vfio_device_get_name creates a name that includes the fd number. This causes a chicken-and-egg problem: new QEMU must know the fd number to construct a name to find the fd number. To fix, create an invariant name based on the id command-line parameter, if id is defined. The user will need to provide such an id to use CPR. Signed-off-by: Steve Sistare Reviewed-by: Cédric Le Goater Reviewed-by: Zhenzhong Duan Link: https://lore.kernel.org/qemu-devel/1751493538-202042-10-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/device.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/hw/vfio/device.c b/hw/vfio/device.c index d91c695b69..3cd365fb8b 100644 --- a/hw/vfio/device.c +++ b/hw/vfio/device.c @@ -316,12 +316,17 @@ bool vfio_device_get_name(VFIODevice *vbasedev, Error **errp) error_setg(errp, "Use FD passing only with iommufd backend"); return false; } - /* - * Give a name with fd so any function printing out vbasedev->name - * will not break. - */ if (!vbasedev->name) { - vbasedev->name = g_strdup_printf("VFIO_FD%d", vbasedev->fd); + + if (vbasedev->dev->id) { + vbasedev->name = g_strdup(vbasedev->dev->id); + return true; + } else { + /* + * Assign a name so any function printing it will not break. + */ + vbasedev->name = g_strdup_printf("VFIO_FD%d", vbasedev->fd); + } } } From 184053f04f6ad6b2950d4712063ffed43bb2720f Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 2 Jul 2025 14:58:47 -0700 Subject: [PATCH 16/27] vfio/iommufd: add vfio_device_free_name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define vfio_device_free_name to free the name created by vfio_device_get_name. A subsequent patch will do more there. No functional change. Signed-off-by: Steve Sistare Reviewed-by: Cédric Le Goater Reviewed-by: Zhenzhong Duan Link: https://lore.kernel.org/qemu-devel/1751493538-202042-11-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/ap.c | 4 ++-- hw/vfio/ccw.c | 4 ++-- hw/vfio/device.c | 5 +++++ hw/vfio/pci.c | 2 +- hw/vfio/platform.c | 2 +- include/hw/vfio/vfio-device.h | 1 + 6 files changed, 12 insertions(+), 6 deletions(-) diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index 1df4438149..7719f24579 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -265,7 +265,7 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp) error: error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->name); - g_free(vbasedev->name); + vfio_device_free_name(vbasedev); } static void vfio_ap_unrealize(DeviceState *dev) @@ -275,7 +275,7 @@ static void vfio_ap_unrealize(DeviceState *dev) vfio_ap_unregister_irq_notifier(vapdev, VFIO_AP_REQ_IRQ_INDEX); vfio_ap_unregister_irq_notifier(vapdev, VFIO_AP_CFG_CHG_IRQ_INDEX); vfio_device_detach(&vapdev->vdev); - g_free(vapdev->vdev.name); + vfio_device_free_name(&vapdev->vdev); } static const Property vfio_ap_properties[] = { diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index cea9d6e005..9560b8d851 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -619,7 +619,7 @@ out_io_notifier_err: out_region_err: vfio_device_detach(vbasedev); out_attach_dev_err: - g_free(vbasedev->name); + vfio_device_free_name(vbasedev); out_unrealize: if (cdc->unrealize) { cdc->unrealize(cdev); @@ -637,7 +637,7 @@ static void vfio_ccw_unrealize(DeviceState *dev) vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_IO_IRQ_INDEX); vfio_ccw_put_region(vcdev); vfio_device_detach(&vcdev->vdev); - g_free(vcdev->vdev.name); + vfio_device_free_name(&vcdev->vdev); if (cdc->unrealize) { cdc->unrealize(cdev); diff --git a/hw/vfio/device.c b/hw/vfio/device.c index 3cd365fb8b..97eddd04f5 100644 --- a/hw/vfio/device.c +++ b/hw/vfio/device.c @@ -333,6 +333,11 @@ bool vfio_device_get_name(VFIODevice *vbasedev, Error **errp) return true; } +void vfio_device_free_name(VFIODevice *vbasedev) +{ + g_clear_pointer(&vbasedev->name, g_free); +} + void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp) { ERRP_GUARD(); diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index dd0b2a0b94..1093b28df7 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2996,7 +2996,7 @@ void vfio_pci_put_device(VFIOPCIDevice *vdev) vfio_device_detach(&vdev->vbasedev); - g_free(vdev->vbasedev.name); + vfio_device_free_name(&vdev->vbasedev); g_free(vdev->msix); } diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index 9a21f2e50a..5c1795a26f 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -530,7 +530,7 @@ static bool vfio_base_device_init(VFIODevice *vbasedev, Error **errp) { /* @fd takes precedence over @sysfsdev which takes precedence over @host */ if (vbasedev->fd < 0 && vbasedev->sysfsdev) { - g_free(vbasedev->name); + vfio_device_free_name(vbasedev); vbasedev->name = g_path_get_basename(vbasedev->sysfsdev); } else if (vbasedev->fd < 0) { if (!vbasedev->name || strchr(vbasedev->name, '/')) { diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h index f503837ccc..1901a35aa9 100644 --- a/include/hw/vfio/vfio-device.h +++ b/include/hw/vfio/vfio-device.h @@ -279,6 +279,7 @@ int vfio_device_get_irq_info(VFIODevice *vbasedev, int index, /* Returns 0 on success, or a negative errno. */ bool vfio_device_get_name(VFIODevice *vbasedev, Error **errp); +void vfio_device_free_name(VFIODevice *vbasedev); void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp); void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, DeviceState *dev, bool ram_discard); From a434fd8f6462c1541927d22e07c58425d6cbd84b Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 2 Jul 2025 14:58:48 -0700 Subject: [PATCH 17/27] vfio/iommufd: device name blocker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If an invariant device name cannot be created, block CPR. Signed-off-by: Steve Sistare Reviewed-by: Zhenzhong Duan Link: https://lore.kernel.org/qemu-devel/1751493538-202042-12-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/device.c | 11 +++++++++++ include/hw/vfio/vfio-cpr.h | 1 + 2 files changed, 12 insertions(+) diff --git a/hw/vfio/device.c b/hw/vfio/device.c index 97eddd04f5..0ae3f3c660 100644 --- a/hw/vfio/device.c +++ b/hw/vfio/device.c @@ -28,6 +28,8 @@ #include "qapi/error.h" #include "qemu/error-report.h" #include "qemu/units.h" +#include "migration/cpr.h" +#include "migration/blocker.h" #include "monitor/monitor.h" #include "vfio-helpers.h" @@ -324,8 +326,16 @@ bool vfio_device_get_name(VFIODevice *vbasedev, Error **errp) } else { /* * Assign a name so any function printing it will not break. + * The fd number changes across processes, so this cannot be + * used as an invariant name for CPR. */ vbasedev->name = g_strdup_printf("VFIO_FD%d", vbasedev->fd); + error_setg(&vbasedev->cpr.id_blocker, + "vfio device with fd=%d needs an id property", + vbasedev->fd); + return migrate_add_blocker_modes(&vbasedev->cpr.id_blocker, + errp, MIG_MODE_CPR_TRANSFER, + -1) == 0; } } } @@ -336,6 +346,7 @@ bool vfio_device_get_name(VFIODevice *vbasedev, Error **errp) void vfio_device_free_name(VFIODevice *vbasedev) { g_clear_pointer(&vbasedev->name, g_free); + migrate_del_blocker(&vbasedev->cpr.id_blocker); } void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp) diff --git a/include/hw/vfio/vfio-cpr.h b/include/hw/vfio/vfio-cpr.h index d37acc4a0a..fa7d43ddd8 100644 --- a/include/hw/vfio/vfio-cpr.h +++ b/include/hw/vfio/vfio-cpr.h @@ -30,6 +30,7 @@ typedef struct VFIOContainerCPR { typedef struct VFIODeviceCPR { Error *mdev_blocker; + Error *id_blocker; } VFIODeviceCPR; bool vfio_legacy_cpr_register_container(struct VFIOContainer *container, From 06c6a65852af0b7648cdb6ff6cf2e66929a7b5f5 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 2 Jul 2025 14:58:49 -0700 Subject: [PATCH 18/27] vfio/iommufd: register container for cpr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Register a vfio iommufd container and device for CPR, replacing the generic CPR register call with a more specific iommufd register call. Add a blocker if the kernel does not support IOMMU_IOAS_CHANGE_PROCESS. This is mostly boiler plate. The fields to to saved and restored are added in subsequent patches. Signed-off-by: Steve Sistare Reviewed-by: Zhenzhong Duan Link: https://lore.kernel.org/qemu-devel/1751493538-202042-13-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- backends/iommufd.c | 10 +++++ hw/vfio/cpr-iommufd.c | 86 ++++++++++++++++++++++++++++++++++++++ hw/vfio/iommufd.c | 6 ++- hw/vfio/meson.build | 1 + include/hw/vfio/vfio-cpr.h | 12 ++++++ include/system/iommufd.h | 1 + 6 files changed, 114 insertions(+), 2 deletions(-) create mode 100644 hw/vfio/cpr-iommufd.c diff --git a/backends/iommufd.c b/backends/iommufd.c index 87f81a05f6..c554ce5385 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -108,6 +108,13 @@ bool iommufd_backend_connect(IOMMUFDBackend *be, Error **errp) } be->fd = fd; } + if (!be->users && !vfio_iommufd_cpr_register_iommufd(be, errp)) { + if (be->owned) { + close(be->fd); + be->fd = -1; + } + return false; + } be->users++; trace_iommufd_backend_connect(be->fd, be->owned, be->users); @@ -125,6 +132,9 @@ void iommufd_backend_disconnect(IOMMUFDBackend *be) be->fd = -1; } out: + if (!be->users) { + vfio_iommufd_cpr_unregister_iommufd(be); + } trace_iommufd_backend_disconnect(be->fd, be->users); } diff --git a/hw/vfio/cpr-iommufd.c b/hw/vfio/cpr-iommufd.c new file mode 100644 index 0000000000..2f58b43793 --- /dev/null +++ b/hw/vfio/cpr-iommufd.c @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2024-2025 Oracle and/or its affiliates. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "hw/vfio/vfio-cpr.h" +#include "migration/blocker.h" +#include "migration/cpr.h" +#include "migration/migration.h" +#include "migration/vmstate.h" +#include "system/iommufd.h" +#include "vfio-iommufd.h" + +static bool vfio_cpr_supported(IOMMUFDBackend *be, Error **errp) +{ + if (!iommufd_change_process_capable(be)) { + if (errp) { + error_setg(errp, "vfio iommufd backend does not support " + "IOMMU_IOAS_CHANGE_PROCESS"); + } + return false; + } + return true; +} + +static const VMStateDescription iommufd_cpr_vmstate = { + .name = "iommufd", + .version_id = 0, + .minimum_version_id = 0, + .needed = cpr_incoming_needed, + .fields = (VMStateField[]) { + VMSTATE_END_OF_LIST() + } +}; + +bool vfio_iommufd_cpr_register_iommufd(IOMMUFDBackend *be, Error **errp) +{ + Error **cpr_blocker = &be->cpr_blocker; + + if (!vfio_cpr_supported(be, cpr_blocker)) { + return migrate_add_blocker_modes(cpr_blocker, errp, + MIG_MODE_CPR_TRANSFER, -1) == 0; + } + + vmstate_register(NULL, -1, &iommufd_cpr_vmstate, be); + + return true; +} + +void vfio_iommufd_cpr_unregister_iommufd(IOMMUFDBackend *be) +{ + vmstate_unregister(NULL, &iommufd_cpr_vmstate, be); + migrate_del_blocker(&be->cpr_blocker); +} + +bool vfio_iommufd_cpr_register_container(VFIOIOMMUFDContainer *container, + Error **errp) +{ + VFIOContainerBase *bcontainer = &container->bcontainer; + + migration_add_notifier_mode(&bcontainer->cpr_reboot_notifier, + vfio_cpr_reboot_notifier, + MIG_MODE_CPR_REBOOT); + + vfio_cpr_add_kvm_notifier(); + + return true; +} + +void vfio_iommufd_cpr_unregister_container(VFIOIOMMUFDContainer *container) +{ + VFIOContainerBase *bcontainer = &container->bcontainer; + + migration_remove_notifier(&bcontainer->cpr_reboot_notifier); +} + +void vfio_iommufd_cpr_register_device(VFIODevice *vbasedev) +{ +} + +void vfio_iommufd_cpr_unregister_device(VFIODevice *vbasedev) +{ +} diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 962a1e2b1f..ff291be235 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -446,7 +446,7 @@ static void iommufd_cdev_container_destroy(VFIOIOMMUFDContainer *container) if (!QLIST_EMPTY(&bcontainer->device_list)) { return; } - vfio_cpr_unregister_container(bcontainer); + vfio_iommufd_cpr_unregister_container(container); vfio_listener_unregister(bcontainer); iommufd_backend_free_id(container->be, container->ioas_id); object_unref(container); @@ -592,7 +592,7 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, goto err_listener_register; } - if (!vfio_cpr_register_container(bcontainer, errp)) { + if (!vfio_iommufd_cpr_register_container(container, errp)) { goto err_listener_register; } @@ -623,6 +623,7 @@ found_container: } vfio_device_prepare(vbasedev, bcontainer, &dev_info); + vfio_iommufd_cpr_register_device(vbasedev); trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev->num_irqs, vbasedev->num_regions, vbasedev->flags); @@ -660,6 +661,7 @@ static void iommufd_cdev_detach(VFIODevice *vbasedev) iommufd_cdev_container_destroy(container); vfio_address_space_put(space); + vfio_iommufd_cpr_unregister_device(vbasedev); iommufd_cdev_unbind_and_disconnect(vbasedev); close(vbasedev->fd); } diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index 63ea393076..7a881740a6 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -31,6 +31,7 @@ system_ss.add(when: 'CONFIG_VFIO', if_true: files( )) system_ss.add(when: ['CONFIG_VFIO', 'CONFIG_IOMMUFD'], if_true: files( 'iommufd.c', + 'cpr-iommufd.c', )) system_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files( 'display.c', diff --git a/include/hw/vfio/vfio-cpr.h b/include/hw/vfio/vfio-cpr.h index fa7d43ddd8..87b4206d81 100644 --- a/include/hw/vfio/vfio-cpr.h +++ b/include/hw/vfio/vfio-cpr.h @@ -15,7 +15,10 @@ struct VFIOContainer; struct VFIOContainerBase; struct VFIOGroup; +struct VFIODevice; struct VFIOPCIDevice; +struct VFIOIOMMUFDContainer; +struct IOMMUFDBackend; typedef int (*dma_map_fn)(const struct VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, @@ -44,6 +47,15 @@ bool vfio_cpr_register_container(struct VFIOContainerBase *bcontainer, Error **errp); void vfio_cpr_unregister_container(struct VFIOContainerBase *bcontainer); +bool vfio_iommufd_cpr_register_container(struct VFIOIOMMUFDContainer *container, + Error **errp); +void vfio_iommufd_cpr_unregister_container( + struct VFIOIOMMUFDContainer *container); +bool vfio_iommufd_cpr_register_iommufd(struct IOMMUFDBackend *be, Error **errp); +void vfio_iommufd_cpr_unregister_iommufd(struct IOMMUFDBackend *be); +void vfio_iommufd_cpr_register_device(struct VFIODevice *vbasedev); +void vfio_iommufd_cpr_unregister_device(struct VFIODevice *vbasedev); + int vfio_cpr_group_get_device_fd(int d, const char *name); bool vfio_cpr_container_match(struct VFIOContainer *container, diff --git a/include/system/iommufd.h b/include/system/iommufd.h index db5f2c716c..c9c72ffc45 100644 --- a/include/system/iommufd.h +++ b/include/system/iommufd.h @@ -32,6 +32,7 @@ struct IOMMUFDBackend { /*< protected >*/ int fd; /* /dev/iommu file descriptor */ bool owned; /* is the /dev/iommu opened internally */ + Error *cpr_blocker;/* set if be does not support CPR */ uint32_t users; /*< public >*/ From a6f2f9c42f3a5418fc7000b1fd331b086b6133d9 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 2 Jul 2025 14:58:50 -0700 Subject: [PATCH 19/27] migration: vfio cpr state hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define a list of vfio devices in CPR state, in a subsection so that older QEMU can be live updated to this version. However, new QEMU will not be live updateable to old QEMU. This is acceptable because CPR is not yet commonly used, and updates to older versions are unusual. The contents of each device object will be defined by the vfio subsystem in a subsequent patch. Signed-off-by: Steve Sistare Reviewed-by: Zhenzhong Duan Link: https://lore.kernel.org/qemu-devel/1751493538-202042-14-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/cpr-iommufd.c | 2 ++ hw/vfio/iommufd-stubs.c | 18 ++++++++++++++++++ hw/vfio/meson.build | 1 + include/hw/vfio/vfio-cpr.h | 1 + include/migration/cpr.h | 12 ++++++++++++ migration/cpr.c | 15 ++++++--------- 6 files changed, 40 insertions(+), 9 deletions(-) create mode 100644 hw/vfio/iommufd-stubs.c diff --git a/hw/vfio/cpr-iommufd.c b/hw/vfio/cpr-iommufd.c index 2f58b43793..f95773b02c 100644 --- a/hw/vfio/cpr-iommufd.c +++ b/hw/vfio/cpr-iommufd.c @@ -14,6 +14,8 @@ #include "system/iommufd.h" #include "vfio-iommufd.h" +const VMStateDescription vmstate_cpr_vfio_devices; /* TBD in a later patch */ + static bool vfio_cpr_supported(IOMMUFDBackend *be, Error **errp) { if (!iommufd_change_process_capable(be)) { diff --git a/hw/vfio/iommufd-stubs.c b/hw/vfio/iommufd-stubs.c new file mode 100644 index 0000000000..0be5276175 --- /dev/null +++ b/hw/vfio/iommufd-stubs.c @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2025 Oracle and/or its affiliates. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "migration/cpr.h" +#include "migration/vmstate.h" + +const VMStateDescription vmstate_cpr_vfio_devices = { + .name = CPR_STATE "/vfio devices", + .version_id = 1, + .minimum_version_id = 1, + .fields = (const VMStateField[]){ + VMSTATE_END_OF_LIST() + } +}; diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index 7a881740a6..bfaf6be805 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -33,6 +33,7 @@ system_ss.add(when: ['CONFIG_VFIO', 'CONFIG_IOMMUFD'], if_true: files( 'iommufd.c', 'cpr-iommufd.c', )) +system_ss.add(when: 'CONFIG_IOMMUFD', if_false: files('iommufd-stubs.c')) system_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files( 'display.c', )) diff --git a/include/hw/vfio/vfio-cpr.h b/include/hw/vfio/vfio-cpr.h index 87b4206d81..286e3d4e9a 100644 --- a/include/hw/vfio/vfio-cpr.h +++ b/include/hw/vfio/vfio-cpr.h @@ -75,6 +75,7 @@ void vfio_cpr_delete_vector_fd(struct VFIOPCIDevice *vdev, const char *name, int nr); extern const VMStateDescription vfio_cpr_pci_vmstate; +extern const VMStateDescription vmstate_cpr_vfio_devices; void vfio_cpr_add_kvm_notifier(void); diff --git a/include/migration/cpr.h b/include/migration/cpr.h index eb27a93301..3fc19a74ef 100644 --- a/include/migration/cpr.h +++ b/include/migration/cpr.h @@ -9,11 +9,23 @@ #define MIGRATION_CPR_H #include "qapi/qapi-types-migration.h" +#include "qemu/queue.h" #define MIG_MODE_NONE -1 #define QEMU_CPR_FILE_MAGIC 0x51435052 #define QEMU_CPR_FILE_VERSION 0x00000001 +#define CPR_STATE "CprState" + +typedef QLIST_HEAD(CprFdList, CprFd) CprFdList; +typedef QLIST_HEAD(CprVFIODeviceList, CprVFIODevice) CprVFIODeviceList; + +typedef struct CprState { + CprFdList fds; + CprVFIODeviceList vfio_devices; +} CprState; + +extern CprState cpr_state; void cpr_save_fd(const char *name, int id, int fd); void cpr_delete_fd(const char *name, int id); diff --git a/migration/cpr.c b/migration/cpr.c index 535d587aee..42ad0b0d50 100644 --- a/migration/cpr.c +++ b/migration/cpr.c @@ -7,6 +7,7 @@ #include "qemu/osdep.h" #include "qapi/error.h" +#include "hw/vfio/vfio-device.h" #include "migration/cpr.h" #include "migration/misc.h" #include "migration/options.h" @@ -20,13 +21,7 @@ /*************************************************************************/ /* cpr state container for all information to be saved. */ -typedef QLIST_HEAD(CprFdList, CprFd) CprFdList; - -typedef struct CprState { - CprFdList fds; -} CprState; - -static CprState cpr_state; +CprState cpr_state; /****************************************************************************/ @@ -127,8 +122,6 @@ int cpr_open_fd(const char *path, int flags, const char *name, int id, } /*************************************************************************/ -#define CPR_STATE "CprState" - static const VMStateDescription vmstate_cpr_state = { .name = CPR_STATE, .version_id = 1, @@ -136,6 +129,10 @@ static const VMStateDescription vmstate_cpr_state = { .fields = (VMStateField[]) { VMSTATE_QLIST_V(fds, CprState, 1, vmstate_cpr_fd, CprFd, next), VMSTATE_END_OF_LIST() + }, + .subsections = (const VMStateDescription * const []) { + &vmstate_cpr_vfio_devices, + NULL } }; /*************************************************************************/ From f2f3e4667e4d6026f39ab17f355f79b2f8431e19 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 2 Jul 2025 14:58:51 -0700 Subject: [PATCH 20/27] vfio/iommufd: cpr state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VFIO iommufd devices will need access to ioas_id, devid, and hwpt_id in new QEMU at realize time, so add them to CPR state. Define CprVFIODevice as the object which holds the state and is serialized to the vmstate file. Define accessors to copy state between VFIODevice and CprVFIODevice. Signed-off-by: Steve Sistare Reviewed-by: Zhenzhong Duan Link: https://lore.kernel.org/qemu-devel/1751493538-202042-15-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/cpr-iommufd.c | 98 +++++++++++++++++++++++++++++++++++++- hw/vfio/iommufd.c | 2 + hw/vfio/trace-events | 3 ++ include/hw/vfio/vfio-cpr.h | 3 ++ 4 files changed, 105 insertions(+), 1 deletion(-) diff --git a/hw/vfio/cpr-iommufd.c b/hw/vfio/cpr-iommufd.c index f95773b02c..4166201e3f 100644 --- a/hw/vfio/cpr-iommufd.c +++ b/hw/vfio/cpr-iommufd.c @@ -7,14 +7,98 @@ #include "qemu/osdep.h" #include "qapi/error.h" #include "hw/vfio/vfio-cpr.h" +#include "hw/vfio/vfio-device.h" #include "migration/blocker.h" #include "migration/cpr.h" #include "migration/migration.h" #include "migration/vmstate.h" #include "system/iommufd.h" #include "vfio-iommufd.h" +#include "trace.h" -const VMStateDescription vmstate_cpr_vfio_devices; /* TBD in a later patch */ +typedef struct CprVFIODevice { + char *name; + unsigned int namelen; + uint32_t ioas_id; + int devid; + uint32_t hwpt_id; + QLIST_ENTRY(CprVFIODevice) next; +} CprVFIODevice; + +static const VMStateDescription vmstate_cpr_vfio_device = { + .name = "cpr vfio device", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT32(namelen, CprVFIODevice), + VMSTATE_VBUFFER_ALLOC_UINT32(name, CprVFIODevice, 0, NULL, namelen), + VMSTATE_INT32(devid, CprVFIODevice), + VMSTATE_UINT32(ioas_id, CprVFIODevice), + VMSTATE_UINT32(hwpt_id, CprVFIODevice), + VMSTATE_END_OF_LIST() + } +}; + +const VMStateDescription vmstate_cpr_vfio_devices = { + .name = CPR_STATE "/vfio devices", + .version_id = 1, + .minimum_version_id = 1, + .fields = (const VMStateField[]){ + VMSTATE_QLIST_V(vfio_devices, CprState, 1, vmstate_cpr_vfio_device, + CprVFIODevice, next), + VMSTATE_END_OF_LIST() + } +}; + +static void vfio_cpr_save_device(VFIODevice *vbasedev) +{ + CprVFIODevice *elem = g_new0(CprVFIODevice, 1); + + elem->name = g_strdup(vbasedev->name); + elem->namelen = strlen(vbasedev->name) + 1; + elem->ioas_id = vbasedev->cpr.ioas_id; + elem->devid = vbasedev->devid; + elem->hwpt_id = vbasedev->cpr.hwpt_id; + QLIST_INSERT_HEAD(&cpr_state.vfio_devices, elem, next); +} + +static CprVFIODevice *find_device(const char *name) +{ + CprVFIODeviceList *head = &cpr_state.vfio_devices; + CprVFIODevice *elem; + + QLIST_FOREACH(elem, head, next) { + if (!strcmp(elem->name, name)) { + return elem; + } + } + return NULL; +} + +static void vfio_cpr_delete_device(const char *name) +{ + CprVFIODevice *elem = find_device(name); + + if (elem) { + QLIST_REMOVE(elem, next); + g_free(elem->name); + g_free(elem); + } +} + +static bool vfio_cpr_find_device(VFIODevice *vbasedev) +{ + CprVFIODevice *elem = find_device(vbasedev->name); + + if (elem) { + vbasedev->cpr.ioas_id = elem->ioas_id; + vbasedev->devid = elem->devid; + vbasedev->cpr.hwpt_id = elem->hwpt_id; + trace_vfio_cpr_find_device(elem->ioas_id, elem->devid, elem->hwpt_id); + return true; + } + return false; +} static bool vfio_cpr_supported(IOMMUFDBackend *be, Error **errp) { @@ -81,8 +165,20 @@ void vfio_iommufd_cpr_unregister_container(VFIOIOMMUFDContainer *container) void vfio_iommufd_cpr_register_device(VFIODevice *vbasedev) { + if (!cpr_is_incoming()) { + vfio_cpr_save_device(vbasedev); + } } void vfio_iommufd_cpr_unregister_device(VFIODevice *vbasedev) { + vfio_cpr_delete_device(vbasedev->name); +} + +void vfio_cpr_load_device(VFIODevice *vbasedev) +{ + if (cpr_is_incoming()) { + bool ret = vfio_cpr_find_device(vbasedev); + g_assert(ret); + } } diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index ff291be235..f0d57ea65f 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -515,6 +515,8 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, const VFIOIOMMUClass *iommufd_vioc = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); + vfio_cpr_load_device(vbasedev); + if (vbasedev->fd < 0) { devfd = iommufd_cdev_getfd(vbasedev->sysfsdev, errp); if (devfd < 0) { diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index e1728c4ef6..8ec0ad0cde 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -197,6 +197,9 @@ iommufd_cdev_alloc_ioas(int iommufd, int ioas_id) " [iommufd=%d] new IOMMUFD con iommufd_cdev_device_info(char *name, int devfd, int num_irqs, int num_regions, int flags) " %s (%d) num_irqs=%d num_regions=%d flags=%d" iommufd_cdev_pci_hot_reset_dep_devices(int domain, int bus, int slot, int function, int dev_id) "\t%04x:%02x:%02x.%x devid %d" +# cpr-iommufd.c +vfio_cpr_find_device(uint32_t ioas_id, int devid, uint32_t hwpt_id) "ioas_id %u, devid %d, hwpt_id %u" + # device.c vfio_device_get_region_info_type(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%08x" vfio_device_reset_handler(void) "" diff --git a/include/hw/vfio/vfio-cpr.h b/include/hw/vfio/vfio-cpr.h index 286e3d4e9a..2878372495 100644 --- a/include/hw/vfio/vfio-cpr.h +++ b/include/hw/vfio/vfio-cpr.h @@ -34,6 +34,8 @@ typedef struct VFIOContainerCPR { typedef struct VFIODeviceCPR { Error *mdev_blocker; Error *id_blocker; + uint32_t hwpt_id; + uint32_t ioas_id; } VFIODeviceCPR; bool vfio_legacy_cpr_register_container(struct VFIOContainer *container, @@ -55,6 +57,7 @@ bool vfio_iommufd_cpr_register_iommufd(struct IOMMUFDBackend *be, Error **errp); void vfio_iommufd_cpr_unregister_iommufd(struct IOMMUFDBackend *be); void vfio_iommufd_cpr_register_device(struct VFIODevice *vbasedev); void vfio_iommufd_cpr_unregister_device(struct VFIODevice *vbasedev); +void vfio_cpr_load_device(struct VFIODevice *vbasedev); int vfio_cpr_group_get_device_fd(int d, const char *name); From 2a3f0a59bd6479f75fa5335f82b85b4f9cd7ed4e Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 2 Jul 2025 14:58:52 -0700 Subject: [PATCH 21/27] vfio/iommufd: preserve descriptors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Save the iommu and vfio device fd in CPR state when it is created. After CPR, the fd number is found in CPR state and reused. Signed-off-by: Steve Sistare Reviewed-by: Zhenzhong Duan Link: https://lore.kernel.org/qemu-devel/1751493538-202042-16-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- backends/iommufd.c | 35 +++++++++++++++++++++++++++++------ hw/vfio/cpr-iommufd.c | 10 ++++++++++ hw/vfio/device.c | 9 +-------- 3 files changed, 40 insertions(+), 14 deletions(-) diff --git a/backends/iommufd.c b/backends/iommufd.c index c554ce5385..e0917923bf 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -16,12 +16,18 @@ #include "qemu/module.h" #include "qom/object_interfaces.h" #include "qemu/error-report.h" +#include "migration/cpr.h" #include "monitor/monitor.h" #include "trace.h" #include "hw/vfio/vfio-device.h" #include #include +static const char *iommufd_fd_name(IOMMUFDBackend *be) +{ + return object_get_canonical_path_component(OBJECT(be)); +} + static void iommufd_backend_init(Object *obj) { IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); @@ -64,11 +70,27 @@ static bool iommufd_backend_can_be_deleted(UserCreatable *uc) return !be->users; } +static void iommufd_backend_complete(UserCreatable *uc, Error **errp) +{ + IOMMUFDBackend *be = IOMMUFD_BACKEND(uc); + const char *name = iommufd_fd_name(be); + + if (!be->owned) { + /* fd came from the command line. Fetch updated value from cpr state. */ + if (cpr_is_incoming()) { + be->fd = cpr_find_fd(name, 0); + } else { + cpr_save_fd(name, 0, be->fd); + } + } +} + static void iommufd_backend_class_init(ObjectClass *oc, const void *data) { UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc); ucc->can_be_deleted = iommufd_backend_can_be_deleted; + ucc->complete = iommufd_backend_complete; object_class_property_add_str(oc, "fd", NULL, iommufd_backend_set_fd); } @@ -102,7 +124,7 @@ bool iommufd_backend_connect(IOMMUFDBackend *be, Error **errp) int fd; if (be->owned && !be->users) { - fd = qemu_open("/dev/iommu", O_RDWR, errp); + fd = cpr_open_fd("/dev/iommu", O_RDWR, iommufd_fd_name(be), 0, errp); if (fd < 0) { return false; } @@ -127,14 +149,15 @@ void iommufd_backend_disconnect(IOMMUFDBackend *be) goto out; } be->users--; - if (!be->users && be->owned) { - close(be->fd); - be->fd = -1; - } -out: if (!be->users) { vfio_iommufd_cpr_unregister_iommufd(be); + if (be->owned) { + cpr_delete_fd(iommufd_fd_name(be), 0); + close(be->fd); + be->fd = -1; + } } +out: trace_iommufd_backend_disconnect(be->fd, be->users); } diff --git a/hw/vfio/cpr-iommufd.c b/hw/vfio/cpr-iommufd.c index 4166201e3f..a72b68daa8 100644 --- a/hw/vfio/cpr-iommufd.c +++ b/hw/vfio/cpr-iommufd.c @@ -166,12 +166,18 @@ void vfio_iommufd_cpr_unregister_container(VFIOIOMMUFDContainer *container) void vfio_iommufd_cpr_register_device(VFIODevice *vbasedev) { if (!cpr_is_incoming()) { + /* + * Beware fd may have already been saved by vfio_device_set_fd, + * so call resave to avoid a duplicate entry. + */ + cpr_resave_fd(vbasedev->name, 0, vbasedev->fd); vfio_cpr_save_device(vbasedev); } } void vfio_iommufd_cpr_unregister_device(VFIODevice *vbasedev) { + cpr_delete_fd(vbasedev->name, 0); vfio_cpr_delete_device(vbasedev->name); } @@ -180,5 +186,9 @@ void vfio_cpr_load_device(VFIODevice *vbasedev) if (cpr_is_incoming()) { bool ret = vfio_cpr_find_device(vbasedev); g_assert(ret); + + if (vbasedev->fd < 0) { + vbasedev->fd = cpr_find_fd(vbasedev->name, 0); + } } } diff --git a/hw/vfio/device.c b/hw/vfio/device.c index 0ae3f3c660..96cf21462c 100644 --- a/hw/vfio/device.c +++ b/hw/vfio/device.c @@ -351,14 +351,7 @@ void vfio_device_free_name(VFIODevice *vbasedev) void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp) { - ERRP_GUARD(); - int fd = monitor_fd_param(monitor_cur(), str, errp); - - if (fd < 0) { - error_prepend(errp, "Could not parse remote object fd %s:", str); - return; - } - vbasedev->fd = fd; + vbasedev->fd = cpr_get_fd_param(vbasedev->dev->id, str, 0, errp); } static VFIODeviceIOOps vfio_device_io_ops_ioctl; From 4296ee07455e48c169eb110fbca6ef724c119381 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 2 Jul 2025 14:58:53 -0700 Subject: [PATCH 22/27] vfio/iommufd: reconstruct device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reconstruct userland device state after CPR. During vfio_realize, skip all ioctls that configure the device, as it was already configured in old QEMU. Skip bind, and use the devid from CPR state. Skip allocation of, and attachment to, ioas_id. Recover ioas_id from CPR state, and use it to find a matching container, if any, before creating a new one. This reconstruction is not complete. hwpt_id is handled in a subsequent patch. Signed-off-by: Steve Sistare Reviewed-by: Zhenzhong Duan Link: https://lore.kernel.org/qemu-devel/1751493538-202042-17-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/iommufd.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index f0d57ea65f..a650517a1d 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -25,6 +25,7 @@ #include "system/reset.h" #include "qemu/cutils.h" #include "qemu/chardev_open.h" +#include "migration/cpr.h" #include "pci.h" #include "vfio-iommufd.h" #include "vfio-helpers.h" @@ -121,6 +122,10 @@ static bool iommufd_cdev_connect_and_bind(VFIODevice *vbasedev, Error **errp) goto err_kvm_device_add; } + if (cpr_is_incoming()) { + goto skip_bind; + } + /* Bind device to iommufd */ bind.iommufd = iommufd->fd; if (ioctl(vbasedev->fd, VFIO_DEVICE_BIND_IOMMUFD, &bind)) { @@ -132,6 +137,8 @@ static bool iommufd_cdev_connect_and_bind(VFIODevice *vbasedev, Error **errp) vbasedev->devid = bind.out_devid; trace_iommufd_cdev_connect_and_bind(bind.iommufd, vbasedev->name, vbasedev->fd, vbasedev->devid); + +skip_bind: return true; err_bind: iommufd_cdev_kvm_device_del(vbasedev); @@ -421,7 +428,9 @@ static bool iommufd_cdev_attach_container(VFIODevice *vbasedev, return iommufd_cdev_autodomains_get(vbasedev, container, errp); } - return !iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp); + /* If CPR, we are already attached to ioas_id. */ + return cpr_is_incoming() || + !iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp); } static void iommufd_cdev_detach_container(VFIODevice *vbasedev, @@ -510,6 +519,7 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, VFIOAddressSpace *space; struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) }; int ret, devfd; + bool res; uint32_t ioas_id; Error *err = NULL; const VFIOIOMMUClass *iommufd_vioc = @@ -540,7 +550,16 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, vbasedev->iommufd != container->be) { continue; } - if (!iommufd_cdev_attach_container(vbasedev, container, &err)) { + + if (!cpr_is_incoming()) { + res = iommufd_cdev_attach_container(vbasedev, container, &err); + } else if (vbasedev->cpr.ioas_id == container->ioas_id) { + res = true; + } else { + continue; + } + + if (!res) { const char *msg = error_get_pretty(err); trace_iommufd_cdev_fail_attach_existing_container(msg); @@ -557,6 +576,11 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, } } + if (cpr_is_incoming()) { + ioas_id = vbasedev->cpr.ioas_id; + goto skip_ioas_alloc; + } + /* Need to allocate a new dedicated container */ if (!iommufd_backend_alloc_ioas(vbasedev->iommufd, &ioas_id, errp)) { goto err_alloc_ioas; @@ -564,10 +588,12 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, trace_iommufd_cdev_alloc_ioas(vbasedev->iommufd->fd, ioas_id); +skip_ioas_alloc: container = VFIO_IOMMU_IOMMUFD(object_new(TYPE_VFIO_IOMMU_IOMMUFD)); container->be = vbasedev->iommufd; container->ioas_id = ioas_id; QLIST_INIT(&container->hwpt_list); + vbasedev->cpr.ioas_id = ioas_id; bcontainer = &container->bcontainer; vfio_address_space_insert(space, bcontainer); From 010643eeb1521c0240c83ee8678544de0cd30b78 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 2 Jul 2025 14:58:54 -0700 Subject: [PATCH 23/27] vfio/iommufd: reconstruct hwpt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Skip allocation of, and attachment to, hwpt_id. Recover it from CPR state. Signed-off-by: Steve Sistare Reviewed-by: Zhenzhong Duan Link: https://lore.kernel.org/qemu-devel/1751493538-202042-18-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/iommufd.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index a650517a1d..48c590b6a9 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -332,7 +332,14 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev, /* Try to find a domain */ QLIST_FOREACH(hwpt, &container->hwpt_list, next) { - ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp); + if (!cpr_is_incoming()) { + ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp); + } else if (vbasedev->cpr.hwpt_id == hwpt->hwpt_id) { + ret = 0; + } else { + continue; + } + if (ret) { /* -EINVAL means the domain is incompatible with the device. */ if (ret == -EINVAL) { @@ -349,6 +356,7 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev, return false; } else { vbasedev->hwpt = hwpt; + vbasedev->cpr.hwpt_id = hwpt->hwpt_id; QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next); vbasedev->iommu_dirty_tracking = iommufd_hwpt_dirty_tracking(hwpt); return true; @@ -371,6 +379,11 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev, flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING; } + if (cpr_is_incoming()) { + hwpt_id = vbasedev->cpr.hwpt_id; + goto skip_alloc; + } + if (!iommufd_backend_alloc_hwpt(iommufd, vbasedev->devid, container->ioas_id, flags, IOMMU_HWPT_DATA_NONE, 0, NULL, @@ -378,19 +391,20 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev, return false; } + ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt_id, errp); + if (ret) { + iommufd_backend_free_id(container->be, hwpt_id); + return false; + } + +skip_alloc: hwpt = g_malloc0(sizeof(*hwpt)); hwpt->hwpt_id = hwpt_id; hwpt->hwpt_flags = flags; QLIST_INIT(&hwpt->device_list); - ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp); - if (ret) { - iommufd_backend_free_id(container->be, hwpt->hwpt_id); - g_free(hwpt); - return false; - } - vbasedev->hwpt = hwpt; + vbasedev->cpr.hwpt_id = hwpt->hwpt_id; vbasedev->iommu_dirty_tracking = iommufd_hwpt_dirty_tracking(hwpt); QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next); QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next); From 5c066c4be2328c46f03e9166ea720b13bf68ea5d Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 2 Jul 2025 14:58:55 -0700 Subject: [PATCH 24/27] vfio/iommufd: change process MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Finish CPR by change the owning process of the iommufd device in post load. Signed-off-by: Steve Sistare Reviewed-by: Zhenzhong Duan Link: https://lore.kernel.org/qemu-devel/1751493538-202042-19-git-send-email-steven.sistare@oracle.com [ clg: Fixed missing "qemu/error-report.h" include ] Signed-off-by: Cédric Le Goater --- hw/vfio/cpr-iommufd.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/hw/vfio/cpr-iommufd.c b/hw/vfio/cpr-iommufd.c index a72b68daa8..148a06d552 100644 --- a/hw/vfio/cpr-iommufd.c +++ b/hw/vfio/cpr-iommufd.c @@ -5,6 +5,7 @@ */ #include "qemu/osdep.h" +#include "qemu/error-report.h" #include "qapi/error.h" #include "hw/vfio/vfio-cpr.h" #include "hw/vfio/vfio-device.h" @@ -112,10 +113,40 @@ static bool vfio_cpr_supported(IOMMUFDBackend *be, Error **errp) return true; } +static int iommufd_cpr_pre_save(void *opaque) +{ + IOMMUFDBackend *be = opaque; + + /* + * The process has not changed yet, but proactively try the ioctl, + * and it will fail if any DMA mappings are not supported. + */ + if (!iommufd_change_process_capable(be)) { + error_report("some memory regions do not support " + "IOMMU_IOAS_CHANGE_PROCESS"); + return -1; + } + return 0; +} + +static int iommufd_cpr_post_load(void *opaque, int version_id) +{ + IOMMUFDBackend *be = opaque; + Error *local_err = NULL; + + if (!iommufd_change_process(be, &local_err)) { + error_report_err(local_err); + return -1; + } + return 0; +} + static const VMStateDescription iommufd_cpr_vmstate = { .name = "iommufd", .version_id = 0, .minimum_version_id = 0, + .pre_save = iommufd_cpr_pre_save, + .post_load = iommufd_cpr_post_load, .needed = cpr_incoming_needed, .fields = (VMStateField[]) { VMSTATE_END_OF_LIST() From 6ff4cccd13155e718e630fe16a72d3cc9decde3b Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 2 Jul 2025 14:58:56 -0700 Subject: [PATCH 25/27] iommufd: preserve DMA mappings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During cpr-transfer load in new QEMU, the vfio_memory_listener causes spurious calls to map and unmap DMA regions, as devices are created and the address space is built. This memory was already already mapped by the device in old QEMU, so suppress the map and unmap callbacks during incoming CPR. Signed-off-by: Steve Sistare Reviewed-by: Zhenzhong Duan Link: https://lore.kernel.org/qemu-devel/1751493538-202042-20-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- backends/iommufd.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/backends/iommufd.c b/backends/iommufd.c index e0917923bf..2a33c7ab0b 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -245,6 +245,10 @@ int iommufd_backend_map_file_dma(IOMMUFDBackend *be, uint32_t ioas_id, .length = size, }; + if (cpr_is_incoming()) { + return 0; + } + if (!readonly) { map.flags |= IOMMU_IOAS_MAP_WRITEABLE; } @@ -274,6 +278,10 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, .length = size, }; + if (cpr_is_incoming()) { + return 0; + } + ret = ioctl(fd, IOMMU_IOAS_UNMAP, &unmap); /* * IOMMUFD takes mapping as some kind of object, unmapping From 99cedd5d552130b9b27743c40ca9012e1f4f0371 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 2 Jul 2025 14:58:57 -0700 Subject: [PATCH 26/27] vfio/container: delete old cpr register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vfio_cpr_[un]register_container is no longer used since they were subsumed by container type-specific registration. Delete them. Signed-off-by: Steve Sistare Reviewed-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/1751493538-202042-21-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/cpr.c | 13 ------------- include/hw/vfio/vfio-cpr.h | 4 ---- 2 files changed, 17 deletions(-) diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c index 0e903cdd2f..af0f12a7ad 100644 --- a/hw/vfio/cpr.c +++ b/hw/vfio/cpr.c @@ -29,19 +29,6 @@ int vfio_cpr_reboot_notifier(NotifierWithReturn *notifier, return 0; } -bool vfio_cpr_register_container(VFIOContainerBase *bcontainer, Error **errp) -{ - migration_add_notifier_mode(&bcontainer->cpr_reboot_notifier, - vfio_cpr_reboot_notifier, - MIG_MODE_CPR_REBOOT); - return true; -} - -void vfio_cpr_unregister_container(VFIOContainerBase *bcontainer) -{ - migration_remove_notifier(&bcontainer->cpr_reboot_notifier); -} - #define STRDUP_VECTOR_FD_NAME(vdev, name) \ g_strdup_printf("%s_%s", (vdev)->vbasedev.name, (name)) diff --git a/include/hw/vfio/vfio-cpr.h b/include/hw/vfio/vfio-cpr.h index 2878372495..80ad20d216 100644 --- a/include/hw/vfio/vfio-cpr.h +++ b/include/hw/vfio/vfio-cpr.h @@ -45,10 +45,6 @@ void vfio_legacy_cpr_unregister_container(struct VFIOContainer *container); int vfio_cpr_reboot_notifier(NotifierWithReturn *notifier, MigrationEvent *e, Error **errp); -bool vfio_cpr_register_container(struct VFIOContainerBase *bcontainer, - Error **errp); -void vfio_cpr_unregister_container(struct VFIOContainerBase *bcontainer); - bool vfio_iommufd_cpr_register_container(struct VFIOIOMMUFDContainer *container, Error **errp); void vfio_iommufd_cpr_unregister_container( From 7437caad2052d920452ff7b9b7bc84f5e8e55c90 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 2 Jul 2025 14:58:58 -0700 Subject: [PATCH 27/27] vfio: doc changes for cpr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update documentation to say that cpr-transfer supports vfio and iommufd. Signed-off-by: Steve Sistare Reviewed-by: Cédric Le Goater Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/qemu-devel/1751493538-202042-22-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- docs/devel/migration/CPR.rst | 5 ++--- qapi/migration.json | 6 ++++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/devel/migration/CPR.rst b/docs/devel/migration/CPR.rst index 7897873c86..0a0fd4f6dc 100644 --- a/docs/devel/migration/CPR.rst +++ b/docs/devel/migration/CPR.rst @@ -152,8 +152,7 @@ cpr-transfer mode This mode allows the user to transfer a guest to a new QEMU instance on the same host with minimal guest pause time, by preserving guest RAM in place, albeit with new virtual addresses in new QEMU. Devices -and their pinned memory pages will also be preserved in a future QEMU -release. +and their pinned memory pages are also preserved for VFIO and IOMMUFD. The user starts new QEMU on the same host as old QEMU, with command- line arguments to create the same machine, plus the ``-incoming`` @@ -322,6 +321,6 @@ Futures cpr-transfer mode is based on a capability to transfer open file descriptors from old to new QEMU. In the future, descriptors for -vfio, iommufd, vhost, and char devices could be transferred, +vhost, and char devices could be transferred, preserving those devices and their kernel state without interruption, even if they do not explicitly support live migration. diff --git a/qapi/migration.json b/qapi/migration.json index 4963f6ca12..e8a7d3b2a9 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -620,8 +620,10 @@ # # @cpr-transfer: This mode allows the user to transfer a guest to a # new QEMU instance on the same host with minimal guest pause -# time by preserving guest RAM in place. Devices and their pinned -# pages will also be preserved in a future QEMU release. +# time by preserving guest RAM in place. +# +# Devices and their pinned pages are also preserved for VFIO and +# IOMMUFD. (since 10.1) # # The user starts new QEMU on the same host as old QEMU, with # command-line arguments to create the same machine, plus the