vfio queue:

* Added small cleanups for b4 and scope
 * Restricted TDX build to 64-bit target
 * Fixed issues introduced in first part of VFIO live update support
 * Added full VFIO live update support
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCAAdFiEEoPZlSPBIlev+awtgUaNDx8/77KEFAmhnlBMACgkQUaNDx8/7
 7KFOxw//dIPpGcYIjEGpIkIh6NF3VK6xmDAG0aZEeM+5fCzdor2DPkD7ZPyqND3S
 /YkR8GSOHd+Qm5W+73LHOdV5RFMt4wagyHiAKUMpEFHY7ZLduxIXlACoUo+F5cnh
 SUnhC6KX7Gu1/Nndb4X4w6SNOyhoRKtQ2EqpRsrGdIaBkX8s6w2jF/INPTPdpg73
 lulJZCAFNzyIWytck9ohJf8To9IsvkCXTF6mcywURa9MBaAarRttXoFjuZsXb7zn
 NqGVtantNAaJmKu26X3ScUWn9P02WryhPB6KT7+B3G/b87Su1cnbAwYakNSFPJIx
 I/gaw0EPzHM+b6mavA4IdvKDJGR7GMvpJEGqUEpntc6FJ3+g1B7qsedgeBUc/RKB
 UaRmtYbvlMv5wSmaLcxsT3S3BnABbrd4EedZX5uOBFMrtnTiOqrMUEcoMaf5ogvN
 KlJkrjNQkfHxTbp5G+nXHuTzae3k2Ylm196b2yhgARfUL70jiak/B+ADeezVcVmW
 6ZpotrAvMxu9RlFdxTSbL0/lR0rfKZTecqMOSFA+FlmjcTJ0QW1SbweMdsfgW/uU
 /2Hfmw6zUQ80/tMqYMztFWsiov7C8a8ZMmuZwDQp+AdCVGgFEigfNJVQYgujbqKz
 g9Ta9cNPyvF5hpnml5u8IzAzM95HrhIPFmmpUBZyWOCeL6chSHk=
 =Cu7b
 -----END PGP SIGNATURE-----

Merge tag 'pull-vfio-20250704' of https://github.com/legoater/qemu into staging

vfio queue:

* Added small cleanups for b4 and scope
* Restricted TDX build to 64-bit target
* Fixed issues introduced in first part of VFIO live update support
* Added full VFIO live update support

# -----BEGIN PGP SIGNATURE-----
#
# iQIzBAABCAAdFiEEoPZlSPBIlev+awtgUaNDx8/77KEFAmhnlBMACgkQUaNDx8/7
# 7KFOxw//dIPpGcYIjEGpIkIh6NF3VK6xmDAG0aZEeM+5fCzdor2DPkD7ZPyqND3S
# /YkR8GSOHd+Qm5W+73LHOdV5RFMt4wagyHiAKUMpEFHY7ZLduxIXlACoUo+F5cnh
# SUnhC6KX7Gu1/Nndb4X4w6SNOyhoRKtQ2EqpRsrGdIaBkX8s6w2jF/INPTPdpg73
# lulJZCAFNzyIWytck9ohJf8To9IsvkCXTF6mcywURa9MBaAarRttXoFjuZsXb7zn
# NqGVtantNAaJmKu26X3ScUWn9P02WryhPB6KT7+B3G/b87Su1cnbAwYakNSFPJIx
# I/gaw0EPzHM+b6mavA4IdvKDJGR7GMvpJEGqUEpntc6FJ3+g1B7qsedgeBUc/RKB
# UaRmtYbvlMv5wSmaLcxsT3S3BnABbrd4EedZX5uOBFMrtnTiOqrMUEcoMaf5ogvN
# KlJkrjNQkfHxTbp5G+nXHuTzae3k2Ylm196b2yhgARfUL70jiak/B+ADeezVcVmW
# 6ZpotrAvMxu9RlFdxTSbL0/lR0rfKZTecqMOSFA+FlmjcTJ0QW1SbweMdsfgW/uU
# /2Hfmw6zUQ80/tMqYMztFWsiov7C8a8ZMmuZwDQp+AdCVGgFEigfNJVQYgujbqKz
# g9Ta9cNPyvF5hpnml5u8IzAzM95HrhIPFmmpUBZyWOCeL6chSHk=
# =Cu7b
# -----END PGP SIGNATURE-----
# gpg: Signature made Fri 04 Jul 2025 04:42:59 EDT
# gpg:                using RSA key A0F66548F04895EBFE6B0B6051A343C7CFFBECA1
# gpg: Good signature from "Cédric Le Goater <clg@redhat.com>" [full]
# gpg:                 aka "Cédric Le Goater <clg@kaod.org>" [full]
# Primary key fingerprint: A0F6 6548 F048 95EB FE6B  0B60 51A3 43C7 CFFB ECA1

* tag 'pull-vfio-20250704' of https://github.com/legoater/qemu: (27 commits)
  vfio: doc changes for cpr
  vfio/container: delete old cpr register
  iommufd: preserve DMA mappings
  vfio/iommufd: change process
  vfio/iommufd: reconstruct hwpt
  vfio/iommufd: reconstruct device
  vfio/iommufd: preserve descriptors
  vfio/iommufd: cpr state
  migration: vfio cpr state hook
  vfio/iommufd: register container for cpr
  vfio/iommufd: device name blocker
  vfio/iommufd: add vfio_device_free_name
  vfio/iommufd: invariant device name
  vfio/iommufd: use IOMMU_IOAS_MAP_FILE
  physmem: qemu_ram_get_fd_offset
  backends/iommufd: change process ioctl
  backends/iommufd: iommufd_backend_map_file_dma
  migration: cpr_get_fd_param helper
  migration: close kvm after cpr
  vfio-pci: preserve INTx
  ...

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
This commit is contained in:
Stefan Hajnoczi 2025-07-04 08:58:39 -04:00
commit 563ac3d181
33 changed files with 888 additions and 92 deletions

View file

@ -11,4 +11,3 @@
prep-perpatch-check-cmd = scripts/checkpatch.pl -q --terse --no-summary --mailback -
searchmask = https://lore.kernel.org/qemu-devel/?x=m&t=1&q=%s
linkmask = https://lore.kernel.org/qemu-devel/%s
linktrailermask = Message-ID: <%s>

View file

@ -227,6 +227,7 @@ distclean: clean recurse-distclean
rm -Rf .sdk qemu-bundle
find-src-path = find "$(SRC_PATH)" -path "$(SRC_PATH)/meson" -prune -o \
-path "$(SRC_PATH)/.pc" -prune -o \
-type l -prune -o \( -name "*.[chsS]" -o -name "*.[ch].inc" \)
.PHONY: ctags

View file

@ -515,16 +515,23 @@ static int do_kvm_destroy_vcpu(CPUState *cpu)
goto err;
}
/* If I am the CPU that created coalesced_mmio_ring, then discard it */
if (s->coalesced_mmio_ring == (void *)cpu->kvm_run + PAGE_SIZE) {
s->coalesced_mmio_ring = NULL;
}
ret = munmap(cpu->kvm_run, mmap_size);
if (ret < 0) {
goto err;
}
cpu->kvm_run = NULL;
if (cpu->kvm_dirty_gfns) {
ret = munmap(cpu->kvm_dirty_gfns, s->kvm_dirty_ring_bytes);
if (ret < 0) {
goto err;
}
cpu->kvm_dirty_gfns = NULL;
}
kvm_park_vcpu(cpu);
@ -608,6 +615,31 @@ err:
return ret;
}
/*
 * Tear down all KVM state: remove every vCPU and close its fds, then
 * close the VM fd and the /dev/kvm fd, and clear kvm_state.
 * Safe to call when KVM was never initialized (early return below).
 */
void kvm_close(void)
{
    CPUState *cpu;

    /* Nothing to do if KVM is not initialized or already closed. */
    if (!kvm_state || kvm_state->fd == -1) {
        return;
    }

    CPU_FOREACH(cpu) {
        /* Stop the vCPU thread before closing its fds. */
        cpu_remove_sync(cpu);
        close(cpu->kvm_fd);
        cpu->kvm_fd = -1;
        close(cpu->kvm_vcpu_stats_fd);
        cpu->kvm_vcpu_stats_fd = -1;
    }

    /* NOTE(review): this condition is always true given the early return
     * above (nothing in the loop modifies kvm_state) — could be dropped. */
    if (kvm_state && kvm_state->fd != -1) {
        close(kvm_state->vmfd);
        kvm_state->vmfd = -1;
        close(kvm_state->fd);
        kvm_state->fd = -1;
    }
    kvm_state = NULL;
}
/*
* dirty pages logging control
*/

View file

@ -16,12 +16,18 @@
#include "qemu/module.h"
#include "qom/object_interfaces.h"
#include "qemu/error-report.h"
#include "migration/cpr.h"
#include "monitor/monitor.h"
#include "trace.h"
#include "hw/vfio/vfio-device.h"
#include <sys/ioctl.h>
#include <linux/iommufd.h>
/*
 * QOM path component of the backend object; used below as the
 * process-invariant key for saving/finding the fd in CPR state.
 */
static const char *iommufd_fd_name(IOMMUFDBackend *be)
{
    return object_get_canonical_path_component(OBJECT(be));
}
static void iommufd_backend_init(Object *obj)
{
IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
@ -64,26 +70,73 @@ static bool iommufd_backend_can_be_deleted(UserCreatable *uc)
return !be->users;
}
/*
 * UserCreatable::complete handler.  For an externally supplied fd
 * (be->owned == false), sync it with CPR state: on the incoming side
 * fetch the preserved fd, otherwise record the current one so it can
 * be transferred to new QEMU.
 */
static void iommufd_backend_complete(UserCreatable *uc, Error **errp)
{
    IOMMUFDBackend *be = IOMMUFD_BACKEND(uc);
    const char *name = iommufd_fd_name(be);

    if (!be->owned) {
        /* fd came from the command line. Fetch updated value from cpr state. */
        if (cpr_is_incoming()) {
            be->fd = cpr_find_fd(name, 0);
        } else {
            cpr_save_fd(name, 0, be->fd);
        }
    }
}
/* Class init: wire up user-creatable hooks and the "fd" property. */
static void iommufd_backend_class_init(ObjectClass *oc, const void *data)
{
    UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc);

    ucc->can_be_deleted = iommufd_backend_can_be_deleted;
    /* complete handler syncs a command-line fd with CPR state */
    ucc->complete = iommufd_backend_complete;

    object_class_property_add_str(oc, "fd", NULL, iommufd_backend_set_fd);
}
/*
 * Probe whether the kernel recognizes IOMMU_IOAS_CHANGE_PROCESS for
 * this iommufd.  Returns true if the ioctl succeeds.
 */
bool iommufd_change_process_capable(IOMMUFDBackend *be)
{
    struct iommu_ioas_change_process args = {.size = sizeof(args)};

    /*
     * Call IOMMU_IOAS_CHANGE_PROCESS to verify it is a recognized ioctl.
     * This is a no-op if the process has not changed since DMA was mapped.
     */
    return !ioctl(be->fd, IOMMU_IOAS_CHANGE_PROCESS, &args);
}
/*
 * Issue IOMMU_IOAS_CHANGE_PROCESS so the kernel re-associates the
 * iommufd's DMA mappings with the current process (used on the
 * incoming side of CPR).  Returns true on success; on failure sets
 * *errp from errno.
 */
bool iommufd_change_process(IOMMUFDBackend *be, Error **errp)
{
    struct iommu_ioas_change_process args = {.size = sizeof(args)};
    bool ret = !ioctl(be->fd, IOMMU_IOAS_CHANGE_PROCESS, &args);

    if (!ret) {
        error_setg_errno(errp, errno, "IOMMU_IOAS_CHANGE_PROCESS fd %d failed",
                         be->fd);
    }
    trace_iommufd_change_process(be->fd, ret);
    return ret;
}
bool iommufd_backend_connect(IOMMUFDBackend *be, Error **errp)
{
int fd;
if (be->owned && !be->users) {
fd = qemu_open("/dev/iommu", O_RDWR, errp);
fd = cpr_open_fd("/dev/iommu", O_RDWR, iommufd_fd_name(be), 0, errp);
if (fd < 0) {
return false;
}
be->fd = fd;
}
if (!be->users && !vfio_iommufd_cpr_register_iommufd(be, errp)) {
if (be->owned) {
close(be->fd);
be->fd = -1;
}
return false;
}
be->users++;
trace_iommufd_backend_connect(be->fd, be->owned, be->users);
@ -96,10 +149,14 @@ void iommufd_backend_disconnect(IOMMUFDBackend *be)
goto out;
}
be->users--;
if (!be->users && be->owned) {
if (!be->users) {
vfio_iommufd_cpr_unregister_iommufd(be);
if (be->owned) {
cpr_delete_fd(iommufd_fd_name(be), 0);
close(be->fd);
be->fd = -1;
}
}
out:
trace_iommufd_backend_disconnect(be->fd, be->users);
}
@ -172,6 +229,44 @@ int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova,
return ret;
}
/*
 * Map [start, start + size) of file @mfd at fixed IOVA @iova in IOAS
 * @ioas_id using IOMMU_IOAS_MAP_FILE.  Returns 0 on success, -errno on
 * failure.  A no-op during incoming CPR, when the preserved kernel
 * state already holds the mappings.
 */
int iommufd_backend_map_file_dma(IOMMUFDBackend *be, uint32_t ioas_id,
                                 hwaddr iova, ram_addr_t size,
                                 int mfd, unsigned long start, bool readonly)
{
    int ret, fd = be->fd;
    struct iommu_ioas_map_file map = {
        .size = sizeof(map),
        .flags = IOMMU_IOAS_MAP_READABLE |
                 IOMMU_IOAS_MAP_FIXED_IOVA,
        .ioas_id = ioas_id,
        .fd = mfd,
        .start = start,
        .iova = iova,
        .length = size,
    };

    if (cpr_is_incoming()) {
        return 0;
    }

    if (!readonly) {
        map.flags |= IOMMU_IOAS_MAP_WRITEABLE;
    }

    ret = ioctl(fd, IOMMU_IOAS_MAP_FILE, &map);
    /* NOTE(review): the trace call sits between the ioctl and the errno
     * reads below — assumes tracing preserves errno; confirm. */
    trace_iommufd_backend_map_file_dma(fd, ioas_id, iova, size, mfd, start,
                                       readonly, ret);
    if (ret) {
        ret = -errno;
        /* TODO: Not support mapping hardware PCI BAR region for now. */
        if (errno == EFAULT) {
            warn_report("IOMMU_IOAS_MAP_FILE failed: %m, PCI BAR?");
        }
    }
    return ret;
}
int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
hwaddr iova, ram_addr_t size)
{
@ -183,6 +278,10 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
.length = size,
};
if (cpr_is_incoming()) {
return 0;
}
ret = ioctl(fd, IOMMU_IOAS_UNMAP, &unmap);
/*
* IOMMUFD takes mapping as some kind of object, unmapping

View file

@ -7,10 +7,12 @@ dbus_vmstate_loading(const char *id) "id: %s"
dbus_vmstate_saving(const char *id) "id: %s"
# iommufd.c
iommufd_change_process(int fd, bool ret) "fd=%d (%d)"
iommufd_backend_connect(int fd, bool owned, uint32_t users) "fd=%d owned=%d users=%d"
iommufd_backend_disconnect(int fd, uint32_t users) "fd=%d users=%d"
iommu_backend_set_fd(int fd) "pre-opened /dev/iommu fd=%d"
iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, void *vaddr, bool readonly, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" addr=%p readonly=%d (%d)"
iommufd_backend_map_file_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int fd, unsigned long start, bool readonly, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" fd=%d start=%ld readonly=%d (%d)"
iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas) " iommufd=%d ioas=%d"

View file

@ -152,8 +152,7 @@ cpr-transfer mode
This mode allows the user to transfer a guest to a new QEMU instance
on the same host with minimal guest pause time, by preserving guest
RAM in place, albeit with new virtual addresses in new QEMU. Devices
and their pinned memory pages will also be preserved in a future QEMU
release.
and their pinned memory pages are also preserved for VFIO and IOMMUFD.
The user starts new QEMU on the same host as old QEMU, with command-
line arguments to create the same machine, plus the ``-incoming``
@ -322,6 +321,6 @@ Futures
cpr-transfer mode is based on a capability to transfer open file
descriptors from old to new QEMU. In the future, descriptors for
vfio, iommufd, vhost, and char devices could be transferred,
vhost, and char devices could be transferred,
preserving those devices and their kernel state without interruption,
even if they do not explicitly support live migration.

View file

@ -13,7 +13,7 @@ config SGX
config TDX
bool
select X86_FW_OVMF
depends on KVM
depends on KVM && X86_64
config PC
bool

View file

@ -13,7 +13,6 @@
#include "hw/vfio-user/container.h"
#include "hw/vfio-user/device.h"
#include "hw/vfio-user/trace.h"
#include "hw/vfio/vfio-cpr.h"
#include "hw/vfio/vfio-device.h"
#include "hw/vfio/vfio-listener.h"
#include "qapi/error.h"
@ -225,14 +224,10 @@ vfio_user_container_connect(AddressSpace *as, VFIODevice *vbasedev,
bcontainer = &container->bcontainer;
if (!vfio_cpr_register_container(bcontainer, errp)) {
goto free_container_exit;
}
ret = ram_block_uncoordinated_discard_disable(true);
if (ret) {
error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
goto unregister_container_exit;
goto free_container_exit;
}
vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
@ -261,9 +256,6 @@ listener_release_exit:
enable_discards_exit:
ram_block_uncoordinated_discard_disable(false);
unregister_container_exit:
vfio_cpr_unregister_container(bcontainer);
free_container_exit:
object_unref(container);
@ -286,7 +278,6 @@ static void vfio_user_container_disconnect(VFIOUserContainer *container)
vioc->release(bcontainer);
}
vfio_cpr_unregister_container(bcontainer);
object_unref(container);
vfio_address_space_put(space);

View file

@ -265,7 +265,7 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp)
error:
error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->name);
g_free(vbasedev->name);
vfio_device_free_name(vbasedev);
}
static void vfio_ap_unrealize(DeviceState *dev)
@ -275,7 +275,7 @@ static void vfio_ap_unrealize(DeviceState *dev)
vfio_ap_unregister_irq_notifier(vapdev, VFIO_AP_REQ_IRQ_INDEX);
vfio_ap_unregister_irq_notifier(vapdev, VFIO_AP_CFG_CHG_IRQ_INDEX);
vfio_device_detach(&vapdev->vdev);
g_free(vapdev->vdev.name);
vfio_device_free_name(&vapdev->vdev);
}
static const Property vfio_ap_properties[] = {

View file

@ -619,7 +619,7 @@ out_io_notifier_err:
out_region_err:
vfio_device_detach(vbasedev);
out_attach_dev_err:
g_free(vbasedev->name);
vfio_device_free_name(vbasedev);
out_unrealize:
if (cdc->unrealize) {
cdc->unrealize(cdev);
@ -637,7 +637,7 @@ static void vfio_ccw_unrealize(DeviceState *dev)
vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_IO_IRQ_INDEX);
vfio_ccw_put_region(vcdev);
vfio_device_detach(&vcdev->vdev);
g_free(vcdev->vdev.name);
vfio_device_free_name(&vcdev->vdev);
if (cdc->unrealize) {
cdc->unrealize(cdev);

View file

@ -78,7 +78,16 @@ int vfio_container_dma_map(VFIOContainerBase *bcontainer,
void *vaddr, bool readonly, MemoryRegion *mr)
{
VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
RAMBlock *rb = mr->ram_block;
int mfd = rb ? qemu_ram_get_fd(rb) : -1;
if (mfd >= 0 && vioc->dma_map_file) {
unsigned long start = vaddr - qemu_ram_get_host_addr(rb);
unsigned long offset = qemu_ram_get_fd_offset(rb);
return vioc->dma_map_file(bcontainer, iova, size, mfd, start + offset,
readonly);
}
g_assert(vioc->dma_map);
return vioc->dma_map(bcontainer, iova, size, vaddr, readonly, mr);
}

225
hw/vfio/cpr-iommufd.c Normal file
View file

@ -0,0 +1,225 @@
/*
* Copyright (c) 2024-2025 Oracle and/or its affiliates.
*
* SPDX-License-Identifier: GPL-2.0-or-later
*/
#include "qemu/osdep.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "hw/vfio/vfio-cpr.h"
#include "hw/vfio/vfio-device.h"
#include "migration/blocker.h"
#include "migration/cpr.h"
#include "migration/migration.h"
#include "migration/vmstate.h"
#include "system/iommufd.h"
#include "vfio-iommufd.h"
#include "trace.h"
/*
 * Per-device record preserved across CPR: the device name plus the
 * iommufd IDs needed to reconstruct its attachment in new QEMU.
 */
typedef struct CprVFIODevice {
    char *name;              /* device name, lookup key */
    unsigned int namelen;    /* strlen(name) + 1, for the VBUFFER field */
    uint32_t ioas_id;        /* IOAS the device was attached to */
    int devid;               /* iommufd device id */
    uint32_t hwpt_id;        /* hw pagetable id */
    QLIST_ENTRY(CprVFIODevice) next;
} CprVFIODevice;
/* Wire format for one preserved VFIO device (one CprVFIODevice). */
static const VMStateDescription vmstate_cpr_vfio_device = {
    .name = "cpr vfio device",
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        /* namelen must precede name: it sizes the VBUFFER allocation */
        VMSTATE_UINT32(namelen, CprVFIODevice),
        VMSTATE_VBUFFER_ALLOC_UINT32(name, CprVFIODevice, 0, NULL, namelen),
        VMSTATE_INT32(devid, CprVFIODevice),
        VMSTATE_UINT32(ioas_id, CprVFIODevice),
        VMSTATE_UINT32(hwpt_id, CprVFIODevice),
        VMSTATE_END_OF_LIST()
    }
};
/*
 * List of all preserved VFIO devices, stored in CPR state.  A
 * field-less stub with the same name exists for builds without
 * iommufd support (see iommufd-stubs.c).
 */
const VMStateDescription vmstate_cpr_vfio_devices = {
    .name = CPR_STATE "/vfio devices",
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (const VMStateField[]){
        VMSTATE_QLIST_V(vfio_devices, CprState, 1, vmstate_cpr_vfio_device,
                        CprVFIODevice, next),
        VMSTATE_END_OF_LIST()
    }
};
/*
 * Snapshot the IDs needed to reconstruct @vbasedev after CPR and link
 * the record into the global CPR device list.
 */
static void vfio_cpr_save_device(VFIODevice *vbasedev)
{
    CprVFIODevice *entry = g_new0(CprVFIODevice, 1);

    entry->namelen = strlen(vbasedev->name) + 1;
    entry->name = g_strdup(vbasedev->name);
    entry->devid = vbasedev->devid;
    entry->ioas_id = vbasedev->cpr.ioas_id;
    entry->hwpt_id = vbasedev->cpr.hwpt_id;

    QLIST_INSERT_HEAD(&cpr_state.vfio_devices, entry, next);
}
/* Look up the saved CPR record whose name matches @name, or NULL. */
static CprVFIODevice *find_device(const char *name)
{
    CprVFIODevice *entry;

    QLIST_FOREACH(entry, &cpr_state.vfio_devices, next) {
        if (strcmp(entry->name, name) == 0) {
            return entry;
        }
    }
    return NULL;
}
/* Remove and free the saved CPR record for @name, if one exists. */
static void vfio_cpr_delete_device(const char *name)
{
    CprVFIODevice *entry = find_device(name);

    if (!entry) {
        return;
    }

    QLIST_REMOVE(entry, next);
    g_free(entry->name);
    g_free(entry);
}
/*
 * Incoming CPR: copy the preserved ioas/devid/hwpt IDs into @vbasedev.
 * Returns false if no record was saved under the device's name.
 */
static bool vfio_cpr_find_device(VFIODevice *vbasedev)
{
    CprVFIODevice *elem = find_device(vbasedev->name);

    if (elem) {
        vbasedev->cpr.ioas_id = elem->ioas_id;
        vbasedev->devid = elem->devid;
        vbasedev->cpr.hwpt_id = elem->hwpt_id;
        trace_vfio_cpr_find_device(elem->ioas_id, elem->devid, elem->hwpt_id);
        return true;
    }
    return false;
}
/*
 * Return true if the kernel supports CPR for this iommufd backend.
 * On false, fill *errp (when non-NULL) — used as the migration
 * blocker reason by the caller.
 */
static bool vfio_cpr_supported(IOMMUFDBackend *be, Error **errp)
{
    if (!iommufd_change_process_capable(be)) {
        if (errp) {
            error_setg(errp, "vfio iommufd backend does not support "
                       "IOMMU_IOAS_CHANGE_PROCESS");
        }
        return false;
    }
    return true;
}
/*
 * Outgoing CPR: probe IOMMU_IOAS_CHANGE_PROCESS before transfer so an
 * unsupported mapping fails the save now, not on the incoming side.
 */
static int iommufd_cpr_pre_save(void *opaque)
{
    IOMMUFDBackend *be = opaque;

    /*
     * The process has not changed yet, but proactively try the ioctl,
     * and it will fail if any DMA mappings are not supported.
     */
    if (!iommufd_change_process_capable(be)) {
        error_report("some memory regions do not support "
                     "IOMMU_IOAS_CHANGE_PROCESS");
        return -1;
    }
    return 0;
}
/*
 * Incoming CPR: tell the kernel that this (new) process now owns the
 * preserved DMA mappings.  Returns -1 to fail the load on error.
 */
static int iommufd_cpr_post_load(void *opaque, int version_id)
{
    IOMMUFDBackend *be = opaque;
    Error *local_err = NULL;

    if (!iommufd_change_process(be, &local_err)) {
        error_report_err(local_err);
        return -1;
    }
    return 0;
}
/*
 * Field-less vmstate whose pre_save/post_load hooks drive the
 * change-process handshake; only sent when CPR is in progress
 * (.needed = cpr_incoming_needed).
 */
static const VMStateDescription iommufd_cpr_vmstate = {
    .name = "iommufd",
    .version_id = 0,
    .minimum_version_id = 0,
    .pre_save = iommufd_cpr_pre_save,
    .post_load = iommufd_cpr_post_load,
    .needed = cpr_incoming_needed,
    .fields = (VMStateField[]) {
        VMSTATE_END_OF_LIST()
    }
};
/*
 * Register CPR support for an iommufd backend.  If the kernel lacks
 * IOMMU_IOAS_CHANGE_PROCESS, install a cpr-transfer migration blocker
 * instead (returning false only if adding the blocker fails);
 * otherwise register the iommufd CPR vmstate handlers.
 */
bool vfio_iommufd_cpr_register_iommufd(IOMMUFDBackend *be, Error **errp)
{
    Error **cpr_blocker = &be->cpr_blocker;

    if (!vfio_cpr_supported(be, cpr_blocker)) {
        return migrate_add_blocker_modes(cpr_blocker, errp,
                                         MIG_MODE_CPR_TRANSFER, -1) == 0;
    }

    vmstate_register(NULL, -1, &iommufd_cpr_vmstate, be);
    return true;
}
/* Undo vfio_iommufd_cpr_register_iommufd: drop vmstate and blocker. */
void vfio_iommufd_cpr_unregister_iommufd(IOMMUFDBackend *be)
{
    vmstate_unregister(NULL, &iommufd_cpr_vmstate, be);
    migrate_del_blocker(&be->cpr_blocker);
}
/*
 * Per-container CPR registration: add the cpr-reboot mode notifier and
 * make sure the (process-wide) kvm close notifier is installed.
 */
bool vfio_iommufd_cpr_register_container(VFIOIOMMUFDContainer *container,
                                         Error **errp)
{
    VFIOContainerBase *bcontainer = &container->bcontainer;

    migration_add_notifier_mode(&bcontainer->cpr_reboot_notifier,
                                vfio_cpr_reboot_notifier,
                                MIG_MODE_CPR_REBOOT);

    vfio_cpr_add_kvm_notifier();

    return true;
}
/* Undo vfio_iommufd_cpr_register_container for this container. */
void vfio_iommufd_cpr_unregister_container(VFIOIOMMUFDContainer *container)
{
    VFIOContainerBase *bcontainer = &container->bcontainer;

    migration_remove_notifier(&bcontainer->cpr_reboot_notifier);
}
/*
 * Save the device's fd and reconstruction record into CPR state —
 * outgoing side only; the incoming side already has them.
 */
void vfio_iommufd_cpr_register_device(VFIODevice *vbasedev)
{
    if (!cpr_is_incoming()) {
        /*
         * Beware fd may have already been saved by vfio_device_set_fd,
         * so call resave to avoid a duplicate entry.
         */
        cpr_resave_fd(vbasedev->name, 0, vbasedev->fd);
        vfio_cpr_save_device(vbasedev);
    }
}
/* Drop the device's fd and record from CPR state. */
void vfio_iommufd_cpr_unregister_device(VFIODevice *vbasedev)
{
    cpr_delete_fd(vbasedev->name, 0);
    vfio_cpr_delete_device(vbasedev->name);
}
/*
 * Incoming CPR: restore the device's ioas/devid/hwpt IDs from CPR
 * state (the record must exist — asserted), and its fd if one was not
 * supplied explicitly.  No-op outside incoming CPR.
 */
void vfio_cpr_load_device(VFIODevice *vbasedev)
{
    if (cpr_is_incoming()) {
        bool ret = vfio_cpr_find_device(vbasedev);
        g_assert(ret);

        if (vbasedev->fd < 0) {
            vbasedev->fd = cpr_find_fd(vbasedev->name, 0);
        }
    }
}

View file

@ -99,20 +99,21 @@ static int vfio_container_post_load(void *opaque, int version_id)
{
VFIOContainer *container = opaque;
VFIOContainerBase *bcontainer = &container->bcontainer;
VFIOGroup *group;
VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
dma_map_fn saved_dma_map = vioc->dma_map;
Error *local_err = NULL;
/* During incoming CPR, divert calls to dma_map. */
vioc->dma_map = vfio_legacy_cpr_dma_map;
if (!vfio_listener_register(bcontainer, &local_err)) {
error_report_err(local_err);
return -1;
}
QLIST_FOREACH(group, &container->group_list, container_next) {
VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
/* Restore original dma_map function */
vioc->dma_map = container->cpr.saved_dma_map;
}
vioc->dma_map = saved_dma_map;
return 0;
}
@ -148,6 +149,7 @@ static int vfio_cpr_fail_notifier(NotifierWithReturn *notifier,
*/
VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
dma_map_fn saved_dma_map = vioc->dma_map;
vioc->dma_map = vfio_legacy_cpr_dma_map;
container->cpr.remap_listener = (MemoryListener) {
@ -158,7 +160,7 @@ static int vfio_cpr_fail_notifier(NotifierWithReturn *notifier,
bcontainer->space->as);
memory_listener_unregister(&container->cpr.remap_listener);
container->cpr.vaddr_unmapped = false;
vioc->dma_map = container->cpr.saved_dma_map;
vioc->dma_map = saved_dma_map;
}
return 0;
}
@ -177,14 +179,9 @@ bool vfio_legacy_cpr_register_container(VFIOContainer *container, Error **errp)
MIG_MODE_CPR_TRANSFER, -1) == 0;
}
vmstate_register(NULL, -1, &vfio_container_vmstate, container);
vfio_cpr_add_kvm_notifier();
/* During incoming CPR, divert calls to dma_map. */
if (cpr_is_incoming()) {
VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
container->cpr.saved_dma_map = vioc->dma_map;
vioc->dma_map = vfio_legacy_cpr_dma_map;
}
vmstate_register(NULL, -1, &vfio_container_vmstate, container);
migration_add_notifier_mode(&container->cpr.transfer_notifier,
vfio_cpr_fail_notifier,

View file

@ -9,6 +9,8 @@
#include "hw/vfio/vfio-device.h"
#include "hw/vfio/vfio-cpr.h"
#include "hw/vfio/pci.h"
#include "hw/pci/msix.h"
#include "hw/pci/msi.h"
#include "migration/cpr.h"
#include "qapi/error.h"
#include "system/runstate.h"
@ -27,17 +29,67 @@ int vfio_cpr_reboot_notifier(NotifierWithReturn *notifier,
return 0;
}
bool vfio_cpr_register_container(VFIOContainerBase *bcontainer, Error **errp)
#define STRDUP_VECTOR_FD_NAME(vdev, name) \
g_strdup_printf("%s_%s", (vdev)->vbasedev.name, (name))
void vfio_cpr_save_vector_fd(VFIOPCIDevice *vdev, const char *name, int nr,
int fd)
{
migration_add_notifier_mode(&bcontainer->cpr_reboot_notifier,
vfio_cpr_reboot_notifier,
MIG_MODE_CPR_REBOOT);
return true;
g_autofree char *fdname = STRDUP_VECTOR_FD_NAME(vdev, name);
cpr_save_fd(fdname, nr, fd);
}
void vfio_cpr_unregister_container(VFIOContainerBase *bcontainer)
int vfio_cpr_load_vector_fd(VFIOPCIDevice *vdev, const char *name, int nr)
{
migration_remove_notifier(&bcontainer->cpr_reboot_notifier);
g_autofree char *fdname = STRDUP_VECTOR_FD_NAME(vdev, name);
return cpr_find_fd(fdname, nr);
}
void vfio_cpr_delete_vector_fd(VFIOPCIDevice *vdev, const char *name, int nr)
{
g_autofree char *fdname = STRDUP_VECTOR_FD_NAME(vdev, name);
cpr_delete_fd(fdname, nr);
}
/*
 * Incoming CPR: rebuild MSI/MSI-X vector state from fds preserved by
 * old QEMU.  For each vector, restore the "interrupt" eventfd handler
 * if saved, and the "kvm_interrupt" KVM bypass route if saved.  For
 * MSI-X, re-mark vectors that are pending and masked, and enable the
 * PBA region iff any are pending.
 */
static void vfio_cpr_claim_vectors(VFIOPCIDevice *vdev, int nr_vectors,
                                   bool msix)
{
    int i, fd;
    bool pending = false;
    PCIDevice *pdev = &vdev->pdev;

    vdev->nr_vectors = nr_vectors;
    vdev->msi_vectors = g_new0(VFIOMSIVector, nr_vectors);
    vdev->interrupt = msix ? VFIO_INT_MSIX : VFIO_INT_MSI;

    /* Batch the per-vector virq additions; committed below. */
    vfio_pci_prepare_kvm_msi_virq_batch(vdev);

    for (i = 0; i < nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];

        fd = vfio_cpr_load_vector_fd(vdev, "interrupt", i);
        if (fd >= 0) {
            vfio_pci_vector_init(vdev, i);
            vfio_pci_msi_set_handler(vdev, i);
        }

        if (vfio_cpr_load_vector_fd(vdev, "kvm_interrupt", i) >= 0) {
            vfio_pci_add_kvm_msi_virq(vdev, vector, i, msix);
        } else {
            /* No saved KVM route for this vector. */
            vdev->msi_vectors[i].virq = -1;
        }

        if (msix && msix_is_pending(pdev, i) && msix_is_masked(pdev, i)) {
            set_bit(i, vdev->msix->pending);
            pending = true;
        }
    }

    vfio_pci_commit_kvm_msi_virq_batch(vdev);

    if (msix) {
        memory_region_set_enabled(&pdev->msix_pba_mmio, pending);
    }
}
/*
@ -58,13 +110,91 @@ static int vfio_cpr_pci_pre_load(void *opaque)
return 0;
}
/*
 * Incoming CPR: re-establish the interrupt mode old QEMU was using,
 * based on the just-loaded PCI config space — MSI-X, MSI, or (if the
 * device has an interrupt pin) INTx.
 */
static int vfio_cpr_pci_post_load(void *opaque, int version_id)
{
    VFIOPCIDevice *vdev = opaque;
    PCIDevice *pdev = &vdev->pdev;
    int nr_vectors;

    if (msix_enabled(pdev)) {
        vfio_pci_msix_set_notifiers(vdev);
        nr_vectors = vdev->msix->entries;
        vfio_cpr_claim_vectors(vdev, nr_vectors, true);

    } else if (msi_enabled(pdev)) {
        nr_vectors = msi_nr_vectors_allocated(pdev);
        vfio_cpr_claim_vectors(vdev, nr_vectors, false);

    } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) {
        /* Legacy INTx; failure here fails the load. */
        Error *local_err = NULL;
        if (!vfio_pci_intx_enable(vdev, &local_err)) {
            error_report_err(local_err);
            return -1;
        }
    }

    return 0;
}
/* VMSTATE_MSIX_TEST predicate: include MSI-X state only if present. */
static bool pci_msix_present(void *opaque, int version_id)
{
    PCIDevice *pdev = opaque;

    return msix_present(pdev);
}
/* Saved INTx state: pending flag plus the irqchip route (mode, irq). */
static const VMStateDescription vfio_intx_vmstate = {
    .name = "vfio-cpr-intx",
    .version_id = 0,
    .minimum_version_id = 0,
    .fields = (VMStateField[]) {
        VMSTATE_BOOL(pending, VFIOINTx),
        VMSTATE_UINT32(route.mode, VFIOINTx),
        VMSTATE_INT32(route.irq, VFIOINTx),
        VMSTATE_END_OF_LIST()
    }
};
/* Embed a VFIOINTx field as a VMS_STRUCT using vfio_intx_vmstate. */
#define VMSTATE_VFIO_INTX(_field, _state) { \
    .name = (stringify(_field)), \
    .size = sizeof(VFIOINTx), \
    .vmsd = &vfio_intx_vmstate, \
    .flags = VMS_STRUCT, \
    .offset = vmstate_offset_value(_state, _field, VFIOINTx), \
}
/*
 * Per-device CPR state for vfio-pci: PCI config space, the MSI-X table
 * when present, and INTx state.  Only sent when CPR is in progress
 * (.needed = cpr_incoming_needed).
 */
const VMStateDescription vfio_cpr_pci_vmstate = {
    .name = "vfio-cpr-pci",
    .version_id = 0,
    .minimum_version_id = 0,
    .pre_load = vfio_cpr_pci_pre_load,
    .post_load = vfio_cpr_pci_post_load,
    .needed = cpr_incoming_needed,
    .fields = (VMStateField[]) {
        VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice),
        VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, pci_msix_present),
        VMSTATE_VFIO_INTX(intx, VFIOPCIDevice),
        VMSTATE_END_OF_LIST()
    }
};
/* Process-wide notifier; registered once by vfio_cpr_add_kvm_notifier. */
static NotifierWithReturn kvm_close_notifier;

/*
 * CPR transfer: once precopy has finished (PRECOPY_DONE), close the
 * KVM and vfio-kvm device fds — presumably so new QEMU can take
 * ownership of the kernel state.
 */
static int vfio_cpr_kvm_close_notifier(NotifierWithReturn *notifier,
                                       MigrationEvent *e,
                                       Error **errp)
{
    if (e->type == MIG_EVENT_PRECOPY_DONE) {
        vfio_kvm_device_close();
    }
    return 0;
}
/*
 * Install the kvm close notifier for CPR transfer mode.  Idempotent:
 * the .notify check makes repeated calls register only once.
 */
void vfio_cpr_add_kvm_notifier(void)
{
    if (!kvm_close_notifier.notify) {
        migration_add_notifier_mode(&kvm_close_notifier,
                                    vfio_cpr_kvm_close_notifier,
                                    MIG_MODE_CPR_TRANSFER);
    }
}

View file

@ -28,6 +28,8 @@
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qemu/units.h"
#include "migration/cpr.h"
#include "migration/blocker.h"
#include "monitor/monitor.h"
#include "vfio-helpers.h"
@ -316,28 +318,40 @@ bool vfio_device_get_name(VFIODevice *vbasedev, Error **errp)
error_setg(errp, "Use FD passing only with iommufd backend");
return false;
}
/*
* Give a name with fd so any function printing out vbasedev->name
* will not break.
*/
if (!vbasedev->name) {
if (vbasedev->dev->id) {
vbasedev->name = g_strdup(vbasedev->dev->id);
return true;
} else {
/*
* Assign a name so any function printing it will not break.
* The fd number changes across processes, so this cannot be
* used as an invariant name for CPR.
*/
vbasedev->name = g_strdup_printf("VFIO_FD%d", vbasedev->fd);
error_setg(&vbasedev->cpr.id_blocker,
"vfio device with fd=%d needs an id property",
vbasedev->fd);
return migrate_add_blocker_modes(&vbasedev->cpr.id_blocker,
errp, MIG_MODE_CPR_TRANSFER,
-1) == 0;
}
}
}
return true;
}
/*
 * Free vbasedev->name (leaving it NULL) and drop the CPR id blocker
 * that vfio_device_get_name may have installed for fd-passed devices
 * without an id property.
 */
void vfio_device_free_name(VFIODevice *vbasedev)
{
    g_clear_pointer(&vbasedev->name, g_free);
    migrate_del_blocker(&vbasedev->cpr.id_blocker);
}
void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp)
{
ERRP_GUARD();
int fd = monitor_fd_param(monitor_cur(), str, errp);
if (fd < 0) {
error_prepend(errp, "Could not parse remote object fd %s:", str);
return;
}
vbasedev->fd = fd;
vbasedev->fd = cpr_get_fd_param(vbasedev->dev->id, str, 0, errp);
}
static VFIODeviceIOOps vfio_device_io_ops_ioctl;

View file

@ -117,6 +117,17 @@ bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
int vfio_kvm_device_fd = -1;
#endif
/*
 * Close all KVM state and the vfio-kvm device fd.  Called from the
 * CPR transfer notifier after precopy completes.  No-op without
 * CONFIG_KVM.
 */
void vfio_kvm_device_close(void)
{
#ifdef CONFIG_KVM
    kvm_close();
    if (vfio_kvm_device_fd != -1) {
        close(vfio_kvm_device_fd);
        vfio_kvm_device_fd = -1;
    }
#endif
}
int vfio_kvm_device_add_fd(int fd, Error **errp)
{
#ifdef CONFIG_KVM

18
hw/vfio/iommufd-stubs.c Normal file
View file

@ -0,0 +1,18 @@
/*
* Copyright (c) 2025 Oracle and/or its affiliates.
*
* SPDX-License-Identifier: GPL-2.0-or-later
*/
#include "qemu/osdep.h"
#include "migration/cpr.h"
#include "migration/vmstate.h"
/*
 * Field-less stub of the CPR vfio-device list — counterpart of the
 * full definition in cpr-iommufd.c.  NOTE(review): presumably linked
 * when iommufd support is not built; confirm against the build rules.
 */
const VMStateDescription vmstate_cpr_vfio_devices = {
    .name = CPR_STATE "/vfio devices",
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (const VMStateField[]){
        VMSTATE_END_OF_LIST()
    }
};

View file

@ -25,6 +25,7 @@
#include "system/reset.h"
#include "qemu/cutils.h"
#include "qemu/chardev_open.h"
#include "migration/cpr.h"
#include "pci.h"
#include "vfio-iommufd.h"
#include "vfio-helpers.h"
@ -45,6 +46,18 @@ static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova,
iova, size, vaddr, readonly);
}
/*
 * dma_map_file hook for the iommufd container: map [start, start+size)
 * of file @fd at @iova in the container's IOAS.
 */
static int iommufd_cdev_map_file(const VFIOContainerBase *bcontainer,
                                 hwaddr iova, ram_addr_t size,
                                 int fd, unsigned long start, bool readonly)
{
    const VFIOIOMMUFDContainer *container =
        container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer);

    return iommufd_backend_map_file_dma(container->be,
                                        container->ioas_id,
                                        iova, size, fd, start, readonly);
}
static int iommufd_cdev_unmap(const VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
IOMMUTLBEntry *iotlb, bool unmap_all)
@ -109,6 +122,10 @@ static bool iommufd_cdev_connect_and_bind(VFIODevice *vbasedev, Error **errp)
goto err_kvm_device_add;
}
if (cpr_is_incoming()) {
goto skip_bind;
}
/* Bind device to iommufd */
bind.iommufd = iommufd->fd;
if (ioctl(vbasedev->fd, VFIO_DEVICE_BIND_IOMMUFD, &bind)) {
@ -120,6 +137,8 @@ static bool iommufd_cdev_connect_and_bind(VFIODevice *vbasedev, Error **errp)
vbasedev->devid = bind.out_devid;
trace_iommufd_cdev_connect_and_bind(bind.iommufd, vbasedev->name,
vbasedev->fd, vbasedev->devid);
skip_bind:
return true;
err_bind:
iommufd_cdev_kvm_device_del(vbasedev);
@ -313,7 +332,14 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
/* Try to find a domain */
QLIST_FOREACH(hwpt, &container->hwpt_list, next) {
if (!cpr_is_incoming()) {
ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp);
} else if (vbasedev->cpr.hwpt_id == hwpt->hwpt_id) {
ret = 0;
} else {
continue;
}
if (ret) {
/* -EINVAL means the domain is incompatible with the device. */
if (ret == -EINVAL) {
@ -330,6 +356,7 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
return false;
} else {
vbasedev->hwpt = hwpt;
vbasedev->cpr.hwpt_id = hwpt->hwpt_id;
QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next);
vbasedev->iommu_dirty_tracking = iommufd_hwpt_dirty_tracking(hwpt);
return true;
@ -352,6 +379,11 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
}
if (cpr_is_incoming()) {
hwpt_id = vbasedev->cpr.hwpt_id;
goto skip_alloc;
}
if (!iommufd_backend_alloc_hwpt(iommufd, vbasedev->devid,
container->ioas_id, flags,
IOMMU_HWPT_DATA_NONE, 0, NULL,
@ -359,19 +391,20 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
return false;
}
ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt_id, errp);
if (ret) {
iommufd_backend_free_id(container->be, hwpt_id);
return false;
}
skip_alloc:
hwpt = g_malloc0(sizeof(*hwpt));
hwpt->hwpt_id = hwpt_id;
hwpt->hwpt_flags = flags;
QLIST_INIT(&hwpt->device_list);
ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp);
if (ret) {
iommufd_backend_free_id(container->be, hwpt->hwpt_id);
g_free(hwpt);
return false;
}
vbasedev->hwpt = hwpt;
vbasedev->cpr.hwpt_id = hwpt->hwpt_id;
vbasedev->iommu_dirty_tracking = iommufd_hwpt_dirty_tracking(hwpt);
QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next);
QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next);
@ -409,7 +442,9 @@ static bool iommufd_cdev_attach_container(VFIODevice *vbasedev,
return iommufd_cdev_autodomains_get(vbasedev, container, errp);
}
return !iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp);
/* If CPR, we are already attached to ioas_id. */
return cpr_is_incoming() ||
!iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp);
}
static void iommufd_cdev_detach_container(VFIODevice *vbasedev,
@ -434,7 +469,7 @@ static void iommufd_cdev_container_destroy(VFIOIOMMUFDContainer *container)
if (!QLIST_EMPTY(&bcontainer->device_list)) {
return;
}
vfio_cpr_unregister_container(bcontainer);
vfio_iommufd_cpr_unregister_container(container);
vfio_listener_unregister(bcontainer);
iommufd_backend_free_id(container->be, container->ioas_id);
object_unref(container);
@ -498,11 +533,14 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev,
VFIOAddressSpace *space;
struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
int ret, devfd;
bool res;
uint32_t ioas_id;
Error *err = NULL;
const VFIOIOMMUClass *iommufd_vioc =
VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD));
vfio_cpr_load_device(vbasedev);
if (vbasedev->fd < 0) {
devfd = iommufd_cdev_getfd(vbasedev->sysfsdev, errp);
if (devfd < 0) {
@ -526,7 +564,16 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev,
vbasedev->iommufd != container->be) {
continue;
}
if (!iommufd_cdev_attach_container(vbasedev, container, &err)) {
if (!cpr_is_incoming()) {
res = iommufd_cdev_attach_container(vbasedev, container, &err);
} else if (vbasedev->cpr.ioas_id == container->ioas_id) {
res = true;
} else {
continue;
}
if (!res) {
const char *msg = error_get_pretty(err);
trace_iommufd_cdev_fail_attach_existing_container(msg);
@ -543,6 +590,11 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev,
}
}
if (cpr_is_incoming()) {
ioas_id = vbasedev->cpr.ioas_id;
goto skip_ioas_alloc;
}
/* Need to allocate a new dedicated container */
if (!iommufd_backend_alloc_ioas(vbasedev->iommufd, &ioas_id, errp)) {
goto err_alloc_ioas;
@ -550,10 +602,12 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev,
trace_iommufd_cdev_alloc_ioas(vbasedev->iommufd->fd, ioas_id);
skip_ioas_alloc:
container = VFIO_IOMMU_IOMMUFD(object_new(TYPE_VFIO_IOMMU_IOMMUFD));
container->be = vbasedev->iommufd;
container->ioas_id = ioas_id;
QLIST_INIT(&container->hwpt_list);
vbasedev->cpr.ioas_id = ioas_id;
bcontainer = &container->bcontainer;
vfio_address_space_insert(space, bcontainer);
@ -580,7 +634,7 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev,
goto err_listener_register;
}
if (!vfio_cpr_register_container(bcontainer, errp)) {
if (!vfio_iommufd_cpr_register_container(container, errp)) {
goto err_listener_register;
}
@ -611,6 +665,7 @@ found_container:
}
vfio_device_prepare(vbasedev, bcontainer, &dev_info);
vfio_iommufd_cpr_register_device(vbasedev);
trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev->num_irqs,
vbasedev->num_regions, vbasedev->flags);
@ -648,6 +703,7 @@ static void iommufd_cdev_detach(VFIODevice *vbasedev)
iommufd_cdev_container_destroy(container);
vfio_address_space_put(space);
vfio_iommufd_cpr_unregister_device(vbasedev);
iommufd_cdev_unbind_and_disconnect(vbasedev);
close(vbasedev->fd);
}
@ -807,6 +863,7 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, const void *data)
VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);
vioc->dma_map = iommufd_cdev_map;
vioc->dma_map_file = iommufd_cdev_map_file;
vioc->dma_unmap = iommufd_cdev_unmap;
vioc->attach_device = iommufd_cdev_attach;
vioc->detach_device = iommufd_cdev_detach;

View file

@ -31,7 +31,9 @@ system_ss.add(when: 'CONFIG_VFIO', if_true: files(
))
system_ss.add(when: ['CONFIG_VFIO', 'CONFIG_IOMMUFD'], if_true: files(
'iommufd.c',
'cpr-iommufd.c',
))
system_ss.add(when: 'CONFIG_IOMMUFD', if_false: files('iommufd-stubs.c'))
system_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files(
'display.c',
))

View file

@ -29,6 +29,7 @@
#include "hw/pci/pci_bridge.h"
#include "hw/qdev-properties.h"
#include "hw/qdev-properties-system.h"
#include "hw/vfio/vfio-cpr.h"
#include "migration/vmstate.h"
#include "migration/cpr.h"
#include "qobject/qdict.h"
@ -57,20 +58,33 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
static void vfio_msi_disable_common(VFIOPCIDevice *vdev);
/* Create new or reuse existing eventfd */
static bool vfio_notifier_init(VFIOPCIDevice *vdev, EventNotifier *e,
                               const char *name, int nr, Error **errp)
{
    int fd, ret;

    /*
     * On CPR (live update) restore, reuse the eventfd that the previous
     * QEMU process saved under (name, nr) instead of creating a new one,
     * so the kernel-side wiring survives the exec.
     */
    fd = vfio_cpr_load_vector_fd(vdev, name, nr);
    if (fd >= 0) {
        event_notifier_init_fd(e, fd);
        return true;
    }

    ret = event_notifier_init(e, 0);
    if (ret) {
        error_setg_errno(errp, -ret, "vfio_notifier_init %s failed", name);
        return false;
    }

    /* Record the new fd so a future CPR save can preserve it. */
    fd = event_notifier_get_fd(e);
    vfio_cpr_save_vector_fd(vdev, name, nr, fd);
    return true;
}
/*
 * Tear down an eventfd created by vfio_notifier_init: first drop the
 * (name, nr) entry from the saved CPR fd set — presumably so a later CPR
 * save does not preserve a dead fd (confirm against vfio_cpr_delete_vector_fd)
 * — then close the notifier itself.
 */
static void vfio_notifier_cleanup(VFIOPCIDevice *vdev, EventNotifier *e,
                                  const char *name, int nr)
{
    vfio_cpr_delete_vector_fd(vdev, name, nr);
    event_notifier_cleanup(e);
}
@ -196,6 +210,36 @@ fail:
#endif
}
/*
 * CPR variant of INTx KVM acceleration setup: (re)create the unmask
 * eventfd and register it as the resample fd for the INTx irqfd with the
 * KVM irqchip.  Unlike the non-CPR path, this does not touch VFIO irq
 * signaling — the kernel-side trigger wiring is assumed to have been
 * preserved across the live update (NOTE(review): confirm against the
 * non-CPR vfio_intx_enable_kvm, which is outside this view).
 *
 * Returns true on success or when KVM acceleration is not applicable;
 * false (with @errp set) only on an actual setup failure.
 */
static bool vfio_cpr_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp)
{
#ifdef CONFIG_KVM
    /* Same applicability checks as normal INTx KVM accel: bail out
     * (successfully) if irqfds/resamplefds are unavailable, INTx routing
     * is not enabled, or the user disabled KVM INTx. */
    if (vdev->no_kvm_intx || !kvm_irqds_enabled() ||
        vdev->intx.route.mode != PCI_INTX_ENABLED ||
        !kvm_resamplefds_enabled()) {
        return true;
    }

    /* Get new or CPR-preserved unmask eventfd (see vfio_notifier_init). */
    if (!vfio_notifier_init(vdev, &vdev->intx.unmask, "intx-unmask", 0, errp)) {
        return false;
    }

    /* Wire interrupt eventfd -> GSI with unmask as the resample fd. */
    if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state,
                                           &vdev->intx.interrupt,
                                           &vdev->intx.unmask,
                                           vdev->intx.route.irq)) {
        error_setg_errno(errp, errno, "failed to setup resample irqfd");
        vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0);
        return false;
    }

    vdev->intx.kvm_accel = true;

    trace_vfio_intx_enable_kvm(vdev->vbasedev.name);
    return true;
#else
    return true;
#endif
}
static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev)
{
#ifdef CONFIG_KVM
@ -291,7 +335,13 @@ static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
return true;
}
/*
* Do not alter interrupt state during vfio_realize and cpr load.
* The incoming state is cleared thereafter.
*/
if (!cpr_is_incoming()) {
vfio_disable_interrupts(vdev);
}
vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
pci_config_set_interrupt_pin(vdev->pdev.config, pin);
@ -314,6 +364,14 @@ static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
fd = event_notifier_get_fd(&vdev->intx.interrupt);
qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev);
if (cpr_is_incoming()) {
if (!vfio_cpr_intx_enable_kvm(vdev, &err)) {
warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
}
goto skip_signaling;
}
if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) {
qemu_set_fd_handler(fd, NULL, NULL, vdev);
@ -325,6 +383,7 @@ static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
}
skip_signaling:
vdev->interrupt = VFIO_INT_INTx;
trace_vfio_intx_enable(vdev->vbasedev.name);
@ -394,6 +453,14 @@ static void vfio_msi_interrupt(void *opaque)
notify(&vdev->pdev, nr);
}
/*
 * Attach vfio_msi_interrupt as the read handler for MSI vector @nr's
 * eventfd, so events on that fd are dispatched to the vector.
 */
void vfio_pci_msi_set_handler(VFIOPCIDevice *vdev, int nr)
{
    VFIOMSIVector *vector = &vdev->msi_vectors[nr];

    qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                        vfio_msi_interrupt, NULL, vector);
}
/*
* Get MSI-X enabled, but no vector enabled, by setting vector 0 with an invalid
* fd to kernel.
@ -656,6 +723,15 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
/*
 * msix_set_vector_notifiers "use" callback for MSI-X vector @nr.
 */
static int vfio_msix_vector_use(PCIDevice *pdev,
                                unsigned int nr, MSIMessage msg)
{
    /*
     * Ignore the callback from msix_set_vector_notifiers during resume.
     * The necessary subset of these actions is called from
     * vfio_cpr_claim_vectors during post load.
     */
    return cpr_is_incoming() ?
           0 : vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt);
}
@ -686,6 +762,12 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
}
}
/*
 * Register the MSI-X per-vector use/release callbacks with the PCI core.
 * Exported helper — presumably invoked from the CPR post-load path so the
 * restored device re-registers its notifiers (confirm against callers).
 */
void vfio_pci_msix_set_notifiers(VFIOPCIDevice *vdev)
{
    msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
                              vfio_msix_vector_release, NULL);
}
void vfio_pci_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev)
{
assert(!vdev->defer_kvm_irq_routing);
@ -2914,7 +2996,7 @@ void vfio_pci_put_device(VFIOPCIDevice *vdev)
vfio_device_detach(&vdev->vbasedev);
g_free(vdev->vbasedev.name);
vfio_device_free_name(&vdev->vbasedev);
g_free(vdev->msix);
}
@ -2965,6 +3047,11 @@ void vfio_pci_register_err_notifier(VFIOPCIDevice *vdev)
fd = event_notifier_get_fd(&vdev->err_notifier);
qemu_set_fd_handler(fd, vfio_err_notifier_handler, NULL, vdev);
/* Do not alter irq_signaling during vfio_realize for cpr */
if (cpr_is_incoming()) {
return;
}
if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0,
VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
@ -3032,6 +3119,12 @@ void vfio_pci_register_req_notifier(VFIOPCIDevice *vdev)
fd = event_notifier_get_fd(&vdev->req_notifier);
qemu_set_fd_handler(fd, vfio_req_notifier_handler, NULL, vdev);
/* Do not alter irq_signaling during vfio_realize for cpr */
if (cpr_is_incoming()) {
vdev->req_enabled = true;
return;
}
if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0,
VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
@ -3189,7 +3282,13 @@ bool vfio_pci_interrupt_setup(VFIOPCIDevice *vdev, Error **errp)
vfio_intx_routing_notifier);
vdev->irqchip_change_notifier.notify = vfio_irqchip_change;
kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier);
if (!vfio_intx_enable(vdev, errp)) {
/*
* During CPR, do not call vfio_intx_enable at this time. Instead,
* call it from vfio_pci_post_load after the intx routing data has
* been loaded from vmstate.
*/
if (!cpr_is_incoming() && !vfio_intx_enable(vdev, errp)) {
timer_free(vdev->intx.mmap_timer);
pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);

View file

@ -218,6 +218,8 @@ void vfio_pci_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
void vfio_pci_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev);
void vfio_pci_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev);
bool vfio_pci_intx_enable(VFIOPCIDevice *vdev, Error **errp);
void vfio_pci_msix_set_notifiers(VFIOPCIDevice *vdev);
void vfio_pci_msi_set_handler(VFIOPCIDevice *vdev, int nr);
uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
void vfio_pci_write_config(PCIDevice *pdev,

View file

@ -530,7 +530,7 @@ static bool vfio_base_device_init(VFIODevice *vbasedev, Error **errp)
{
/* @fd takes precedence over @sysfsdev which takes precedence over @host */
if (vbasedev->fd < 0 && vbasedev->sysfsdev) {
g_free(vbasedev->name);
vfio_device_free_name(vbasedev);
vbasedev->name = g_path_get_basename(vbasedev->sysfsdev);
} else if (vbasedev->fd < 0) {
if (!vbasedev->name || strchr(vbasedev->name, '/')) {

View file

@ -197,6 +197,9 @@ iommufd_cdev_alloc_ioas(int iommufd, int ioas_id) " [iommufd=%d] new IOMMUFD con
iommufd_cdev_device_info(char *name, int devfd, int num_irqs, int num_regions, int flags) " %s (%d) num_irqs=%d num_regions=%d flags=%d"
iommufd_cdev_pci_hot_reset_dep_devices(int domain, int bus, int slot, int function, int dev_id) "\t%04x:%02x:%02x.%x devid %d"
# cpr-iommufd.c
vfio_cpr_find_device(uint32_t ioas_id, int devid, uint32_t hwpt_id) "ioas_id %u, devid %d, hwpt_id %u"
# device.c
vfio_device_get_region_info_type(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%08x"
vfio_device_reset_handler(void) ""

View file

@ -85,6 +85,7 @@ void qemu_ram_unset_idstr(RAMBlock *block);
const char *qemu_ram_get_idstr(RAMBlock *rb);
void *qemu_ram_get_host_addr(RAMBlock *rb);
ram_addr_t qemu_ram_get_offset(RAMBlock *rb);
ram_addr_t qemu_ram_get_fd_offset(RAMBlock *rb);
ram_addr_t qemu_ram_get_used_length(RAMBlock *rb);
ram_addr_t qemu_ram_get_max_length(RAMBlock *rb);
bool qemu_ram_is_shared(RAMBlock *rb);

View file

@ -167,6 +167,21 @@ struct VFIOIOMMUClass {
int (*dma_map)(const VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
void *vaddr, bool readonly, MemoryRegion *mr);
/**
* @dma_map_file
*
* Map a file range for the container.
*
* @bcontainer: #VFIOContainerBase to use for map
* @iova: start address to map
* @size: size of the range to map
* @fd: descriptor of the file to map
* @start: starting file offset of the range to map
* @readonly: map read only if true
*/
int (*dma_map_file)(const VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
int fd, unsigned long start, bool readonly);
/**
* @dma_unmap
*

View file

@ -15,19 +15,27 @@
struct VFIOContainer;
struct VFIOContainerBase;
struct VFIOGroup;
struct VFIODevice;
struct VFIOPCIDevice;
struct VFIOIOMMUFDContainer;
struct IOMMUFDBackend;
typedef int (*dma_map_fn)(const struct VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size, void *vaddr,
bool readonly, MemoryRegion *mr);
typedef struct VFIOContainerCPR {
Error *blocker;
bool vaddr_unmapped;
NotifierWithReturn transfer_notifier;
MemoryListener remap_listener;
int (*saved_dma_map)(const struct VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
void *vaddr, bool readonly, MemoryRegion *mr);
} VFIOContainerCPR;
typedef struct VFIODeviceCPR {
Error *mdev_blocker;
Error *id_blocker;
uint32_t hwpt_id;
uint32_t ioas_id;
} VFIODeviceCPR;
bool vfio_legacy_cpr_register_container(struct VFIOContainer *container,
@ -37,9 +45,15 @@ void vfio_legacy_cpr_unregister_container(struct VFIOContainer *container);
int vfio_cpr_reboot_notifier(NotifierWithReturn *notifier, MigrationEvent *e,
Error **errp);
bool vfio_cpr_register_container(struct VFIOContainerBase *bcontainer,
bool vfio_iommufd_cpr_register_container(struct VFIOIOMMUFDContainer *container,
Error **errp);
void vfio_cpr_unregister_container(struct VFIOContainerBase *bcontainer);
void vfio_iommufd_cpr_unregister_container(
struct VFIOIOMMUFDContainer *container);
bool vfio_iommufd_cpr_register_iommufd(struct IOMMUFDBackend *be, Error **errp);
void vfio_iommufd_cpr_unregister_iommufd(struct IOMMUFDBackend *be);
void vfio_iommufd_cpr_register_device(struct VFIODevice *vbasedev);
void vfio_iommufd_cpr_unregister_device(struct VFIODevice *vbasedev);
void vfio_cpr_load_device(struct VFIODevice *vbasedev);
int vfio_cpr_group_get_device_fd(int d, const char *name);
@ -52,6 +66,16 @@ void vfio_cpr_giommu_remap(struct VFIOContainerBase *bcontainer,
bool vfio_cpr_ram_discard_register_listener(
struct VFIOContainerBase *bcontainer, MemoryRegionSection *section);
void vfio_cpr_save_vector_fd(struct VFIOPCIDevice *vdev, const char *name,
int nr, int fd);
int vfio_cpr_load_vector_fd(struct VFIOPCIDevice *vdev, const char *name,
int nr);
void vfio_cpr_delete_vector_fd(struct VFIOPCIDevice *vdev, const char *name,
int nr);
extern const VMStateDescription vfio_cpr_pci_vmstate;
extern const VMStateDescription vmstate_cpr_vfio_devices;
void vfio_cpr_add_kvm_notifier(void);
#endif /* HW_VFIO_VFIO_CPR_H */

View file

@ -279,8 +279,11 @@ int vfio_device_get_irq_info(VFIODevice *vbasedev, int index,
/* Returns 0 on success, or a negative errno. */
bool vfio_device_get_name(VFIODevice *vbasedev, Error **errp);
void vfio_device_free_name(VFIODevice *vbasedev);
void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp);
void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops,
DeviceState *dev, bool ram_discard);
int vfio_device_get_aw_bits(VFIODevice *vdev);
void vfio_kvm_device_close(void);
#endif /* HW_VFIO_VFIO_COMMON_H */

View file

@ -9,11 +9,23 @@
#define MIGRATION_CPR_H
#include "qapi/qapi-types-migration.h"
#include "qemu/queue.h"
#define MIG_MODE_NONE -1
#define QEMU_CPR_FILE_MAGIC 0x51435052
#define QEMU_CPR_FILE_VERSION 0x00000001
#define CPR_STATE "CprState"
typedef QLIST_HEAD(CprFdList, CprFd) CprFdList;
typedef QLIST_HEAD(CprVFIODeviceList, CprVFIODevice) CprVFIODeviceList;
typedef struct CprState {
CprFdList fds;
CprVFIODeviceList vfio_devices;
} CprState;
extern CprState cpr_state;
void cpr_save_fd(const char *name, int id, int fd);
void cpr_delete_fd(const char *name, int id);
@ -32,6 +44,8 @@ void cpr_state_close(void);
struct QIOChannel *cpr_state_ioc(void);
bool cpr_incoming_needed(void *opaque);
int cpr_get_fd_param(const char *name, const char *fdname, int index,
Error **errp);
QEMUFile *cpr_transfer_output(MigrationChannel *channel, Error **errp);
QEMUFile *cpr_transfer_input(MigrationChannel *channel, Error **errp);

View file

@ -32,6 +32,7 @@ struct IOMMUFDBackend {
/*< protected >*/
int fd; /* /dev/iommu file descriptor */
bool owned; /* is the /dev/iommu opened internally */
Error *cpr_blocker;/* set if be does not support CPR */
uint32_t users;
/*< public >*/
@ -43,6 +44,9 @@ void iommufd_backend_disconnect(IOMMUFDBackend *be);
bool iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
Error **errp);
void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id);
int iommufd_backend_map_file_dma(IOMMUFDBackend *be, uint32_t ioas_id,
hwaddr iova, ram_addr_t size, int fd,
unsigned long start, bool readonly);
int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova,
ram_addr_t size, void *vaddr, bool readonly);
int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
@ -66,6 +70,9 @@ bool iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t id,
uint32_t *entry_num, void *data,
Error **errp);
bool iommufd_change_process_capable(IOMMUFDBackend *be);
bool iommufd_change_process(IOMMUFDBackend *be, Error **errp);
#define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd"
OBJECT_DECLARE_TYPE(HostIOMMUDeviceIOMMUFD, HostIOMMUDeviceIOMMUFDClass,
HOST_IOMMU_DEVICE_IOMMUFD)

View file

@ -195,6 +195,7 @@ bool kvm_has_sync_mmu(void);
int kvm_has_vcpu_events(void);
int kvm_max_nested_state_length(void);
int kvm_has_gsi_routing(void);
void kvm_close(void);
/**
* kvm_arm_supports_user_irq

View file

@ -7,25 +7,21 @@
#include "qemu/osdep.h"
#include "qapi/error.h"
#include "hw/vfio/vfio-device.h"
#include "migration/cpr.h"
#include "migration/misc.h"
#include "migration/options.h"
#include "migration/qemu-file.h"
#include "migration/savevm.h"
#include "migration/vmstate.h"
#include "monitor/monitor.h"
#include "system/runstate.h"
#include "trace.h"
/*************************************************************************/
/* cpr state container for all information to be saved. */
typedef QLIST_HEAD(CprFdList, CprFd) CprFdList;
typedef struct CprState {
CprFdList fds;
} CprState;
static CprState cpr_state;
CprState cpr_state;
/****************************************************************************/
@ -126,8 +122,6 @@ int cpr_open_fd(const char *path, int flags, const char *name, int id,
}
/*************************************************************************/
#define CPR_STATE "CprState"
static const VMStateDescription vmstate_cpr_state = {
.name = CPR_STATE,
.version_id = 1,
@ -135,6 +129,10 @@ static const VMStateDescription vmstate_cpr_state = {
.fields = (VMStateField[]) {
VMSTATE_QLIST_V(fds, CprState, 1, vmstate_cpr_fd, CprFd, next),
VMSTATE_END_OF_LIST()
},
.subsections = (const VMStateDescription * const []) {
&vmstate_cpr_vfio_devices,
NULL
}
};
/*************************************************************************/
@ -264,3 +262,39 @@ bool cpr_incoming_needed(void *opaque)
MigMode mode = migrate_mode();
return mode == MIG_MODE_CPR_TRANSFER;
}
/*
 * cpr_get_fd_param: find a descriptor and return its value.
 *
 * @name: CPR name for the descriptor
 * @fdname: An integer-valued string, or a name passed to a getfd command
 * @index: CPR index of the descriptor
 * @errp: returned error message
 *
 * Outside of CPR, resolve @fdname via the monitor and record the result
 * in CPR state under (@name, @index).  During an incoming CPR, ignore
 * @fdname and look the descriptor up in the saved CPR state instead.
 *
 * On success returns the fd value, else returns -1.
 */
int cpr_get_fd_param(const char *name, const char *fdname, int index,
                     Error **errp)
{
    int fd;

    ERRP_GUARD();

    if (!cpr_is_incoming()) {
        fd = monitor_fd_param(monitor_cur(), fdname, errp);
        if (fd >= 0) {
            cpr_save_fd(name, index, fd);
        } else {
            error_prepend(errp, "Could not parse object fd %s:", fdname);
        }
        return fd;
    }

    fd = cpr_find_fd(name, index);
    if (fd < 0) {
        error_setg(errp, "cannot find saved value for fd %s", fdname);
    }
    return fd;
}

View file

@ -620,8 +620,10 @@
#
# @cpr-transfer: This mode allows the user to transfer a guest to a
# new QEMU instance on the same host with minimal guest pause
# time by preserving guest RAM in place. Devices and their pinned
# pages will also be preserved in a future QEMU release.
# time by preserving guest RAM in place.
#
# Devices and their pinned pages are also preserved for VFIO and
# IOMMUFD. (since 10.1)
#
# The user starts new QEMU on the same host as old QEMU, with
# command-line arguments to create the same machine, plus the

View file

@ -1593,6 +1593,11 @@ ram_addr_t qemu_ram_get_offset(RAMBlock *rb)
return rb->offset;
}
/*
 * Return the offset of this RAM block's data within its backing file
 * descriptor (rb->fd_offset).
 */
ram_addr_t qemu_ram_get_fd_offset(RAMBlock *rb)
{
    return rb->fd_offset;
}
ram_addr_t qemu_ram_get_used_length(RAMBlock *rb)
{
return rb->used_length;