vfio/container: preserve descriptors

At vfio creation time, save the values of the vfio container, group, and
device descriptors in CPR state.  On qemu restart, vfio_realize() finds and
uses the saved descriptors.

During reuse, device and iommu state is already configured, so operations
in vfio_realize() that would modify the configuration, such as vfio ioctls,
are skipped.  The result is that vfio_realize() constructs qemu data
structures that reflect the current state of the device.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Link: https://lore.kernel.org/qemu-devel/1749569991-25171-5-git-send-email-steven.sistare@oracle.com
Signed-off-by: Cédric Le Goater <clg@redhat.com>
This commit is contained in:
Steve Sistare 2025-06-10 08:39:17 -07:00 committed by Cédric Le Goater
parent 54857b0816
commit c29a65ed68
3 changed files with 100 additions and 15 deletions

View file

@ -31,6 +31,8 @@
#include "system/reset.h" #include "system/reset.h"
#include "trace.h" #include "trace.h"
#include "qapi/error.h" #include "qapi/error.h"
#include "migration/cpr.h"
#include "migration/blocker.h"
#include "pci.h" #include "pci.h"
#include "hw/vfio/vfio-container.h" #include "hw/vfio/vfio-container.h"
#include "vfio-helpers.h" #include "vfio-helpers.h"
@ -425,7 +427,12 @@ static VFIOContainer *vfio_create_container(int fd, VFIOGroup *group,
return NULL; return NULL;
} }
if (!vfio_set_iommu(fd, group->fd, &iommu_type, errp)) { /*
* During CPR, just set the container type and skip the ioctls, as the
* container and group are already configured in the kernel.
*/
if (!cpr_is_incoming() &&
!vfio_set_iommu(fd, group->fd, &iommu_type, errp)) {
return NULL; return NULL;
} }
@ -592,6 +599,11 @@ static bool vfio_container_group_add(VFIOContainer *container, VFIOGroup *group,
group->container = container; group->container = container;
QLIST_INSERT_HEAD(&container->group_list, group, container_next); QLIST_INSERT_HEAD(&container->group_list, group, container_next);
vfio_group_add_kvm_device(group); vfio_group_add_kvm_device(group);
/*
* Remember the container fd for each group, so we can attach to the same
* container after CPR.
*/
cpr_resave_fd("vfio_container_for_group", group->groupid, container->fd);
return true; return true;
} }
@ -601,6 +613,7 @@ static void vfio_container_group_del(VFIOContainer *container, VFIOGroup *group)
group->container = NULL; group->container = NULL;
vfio_group_del_kvm_device(group); vfio_group_del_kvm_device(group);
vfio_ram_block_discard_disable(container, false); vfio_ram_block_discard_disable(container, false);
cpr_delete_fd("vfio_container_for_group", group->groupid);
} }
static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as, static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as,
@ -615,17 +628,34 @@ static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as,
bool group_was_added = false; bool group_was_added = false;
space = vfio_address_space_get(as); space = vfio_address_space_get(as);
fd = cpr_find_fd("vfio_container_for_group", group->groupid);
QLIST_FOREACH(bcontainer, &space->containers, next) { if (!cpr_is_incoming()) {
container = container_of(bcontainer, VFIOContainer, bcontainer); QLIST_FOREACH(bcontainer, &space->containers, next) {
if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { container = container_of(bcontainer, VFIOContainer, bcontainer);
return vfio_container_group_add(container, group, errp); if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
return vfio_container_group_add(container, group, errp);
}
} }
}
fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp); fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp);
if (fd < 0) { if (fd < 0) {
goto fail; goto fail;
}
} else {
/*
* For incoming CPR, the group is already attached in the kernel.
* If a container with matching fd is found, then update the
* userland group list and return. If not, then after the loop,
* create the container struct and group list.
*/
QLIST_FOREACH(bcontainer, &space->containers, next) {
container = container_of(bcontainer, VFIOContainer, bcontainer);
if (vfio_cpr_container_match(container, group, fd)) {
return vfio_container_group_add(container, group, errp);
}
}
} }
ret = ioctl(fd, VFIO_GET_API_VERSION); ret = ioctl(fd, VFIO_GET_API_VERSION);
@ -697,6 +727,7 @@ static void vfio_container_disconnect(VFIOGroup *group)
QLIST_REMOVE(group, container_next); QLIST_REMOVE(group, container_next);
group->container = NULL; group->container = NULL;
cpr_delete_fd("vfio_container_for_group", group->groupid);
/* /*
* Explicitly release the listener first before unset container, * Explicitly release the listener first before unset container,
@ -750,7 +781,7 @@ static VFIOGroup *vfio_group_get(int groupid, AddressSpace *as, Error **errp)
group = g_malloc0(sizeof(*group)); group = g_malloc0(sizeof(*group));
snprintf(path, sizeof(path), "/dev/vfio/%d", groupid); snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
group->fd = qemu_open(path, O_RDWR, errp); group->fd = cpr_open_fd(path, O_RDWR, "vfio_group", groupid, errp);
if (group->fd < 0) { if (group->fd < 0) {
goto free_group_exit; goto free_group_exit;
} }
@ -782,6 +813,7 @@ static VFIOGroup *vfio_group_get(int groupid, AddressSpace *as, Error **errp)
return group; return group;
close_fd_exit: close_fd_exit:
cpr_delete_fd("vfio_group", groupid);
close(group->fd); close(group->fd);
free_group_exit: free_group_exit:
@ -803,6 +835,7 @@ static void vfio_group_put(VFIOGroup *group)
vfio_container_disconnect(group); vfio_container_disconnect(group);
QLIST_REMOVE(group, next); QLIST_REMOVE(group, next);
trace_vfio_group_put(group->fd); trace_vfio_group_put(group->fd);
cpr_delete_fd("vfio_group", group->groupid);
close(group->fd); close(group->fd);
g_free(group); g_free(group);
} }
@ -813,7 +846,7 @@ static bool vfio_device_get(VFIOGroup *group, const char *name,
g_autofree struct vfio_device_info *info = NULL; g_autofree struct vfio_device_info *info = NULL;
int fd; int fd;
fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name); fd = vfio_cpr_group_get_device_fd(group->fd, name);
if (fd < 0) { if (fd < 0) {
error_setg_errno(errp, errno, "error getting device from group %d", error_setg_errno(errp, errno, "error getting device from group %d",
group->groupid); group->groupid);
@ -826,8 +859,7 @@ static bool vfio_device_get(VFIOGroup *group, const char *name,
info = vfio_get_device_info(fd); info = vfio_get_device_info(fd);
if (!info) { if (!info) {
error_setg_errno(errp, errno, "error getting device info"); error_setg_errno(errp, errno, "error getting device info");
close(fd); goto fail;
return false;
} }
/* /*
@ -841,8 +873,7 @@ static bool vfio_device_get(VFIOGroup *group, const char *name,
if (!QLIST_EMPTY(&group->device_list)) { if (!QLIST_EMPTY(&group->device_list)) {
error_setg(errp, "Inconsistent setting of support for discarding " error_setg(errp, "Inconsistent setting of support for discarding "
"RAM (e.g., balloon) within group"); "RAM (e.g., balloon) within group");
close(fd); goto fail;
return false;
} }
if (!group->ram_block_discard_allowed) { if (!group->ram_block_discard_allowed) {
@ -860,6 +891,11 @@ static bool vfio_device_get(VFIOGroup *group, const char *name,
trace_vfio_device_get(name, info->flags, info->num_regions, info->num_irqs); trace_vfio_device_get(name, info->flags, info->num_regions, info->num_irqs);
return true; return true;
fail:
close(fd);
cpr_delete_fd(name, 0);
return false;
} }
static void vfio_device_put(VFIODevice *vbasedev) static void vfio_device_put(VFIODevice *vbasedev)
@ -870,6 +906,7 @@ static void vfio_device_put(VFIODevice *vbasedev)
QLIST_REMOVE(vbasedev, next); QLIST_REMOVE(vbasedev, next);
vbasedev->group = NULL; vbasedev->group = NULL;
trace_vfio_device_put(vbasedev->fd); trace_vfio_device_put(vbasedev->fd);
cpr_delete_fd(vbasedev->name, 0);
close(vbasedev->fd); close(vbasedev->fd);
} }

View file

@ -8,6 +8,7 @@
#include <linux/vfio.h> #include <linux/vfio.h>
#include "qemu/osdep.h" #include "qemu/osdep.h"
#include "hw/vfio/vfio-container.h" #include "hw/vfio/vfio-container.h"
#include "hw/vfio/vfio-device.h"
#include "migration/blocker.h" #include "migration/blocker.h"
#include "migration/cpr.h" #include "migration/cpr.h"
#include "migration/migration.h" #include "migration/migration.h"
@ -66,3 +67,44 @@ void vfio_legacy_cpr_unregister_container(VFIOContainer *container)
migrate_del_blocker(&container->cpr.blocker); migrate_del_blocker(&container->cpr.blocker);
vmstate_unregister(NULL, &vfio_container_vmstate, container); vmstate_unregister(NULL, &vfio_container_vmstate, container);
} }
/*
 * Return a device descriptor for @name.
 *
 * If CPR state already holds a descriptor saved under (@name, 0), that
 * descriptor is returned and nothing else is done.  Otherwise a descriptor
 * is requested from the group descriptor @d with VFIO_GROUP_GET_DEVICE_FD;
 * on success it is saved in CPR state under the same key before being
 * returned.  A negative ioctl result is returned unchanged and nothing is
 * saved.
 */
int vfio_cpr_group_get_device_fd(int d, const char *name)
{
    const int cpr_id = 0;
    int device_fd = cpr_find_fd(name, cpr_id);

    /* A previously saved descriptor takes precedence over the ioctl. */
    if (device_fd >= 0) {
        return device_fd;
    }

    device_fd = ioctl(d, VFIO_GROUP_GET_DEVICE_FD, name);
    if (device_fd < 0) {
        return device_fd;
    }

    cpr_save_fd(name, cpr_id, device_fd);
    return device_fd;
}
/*
 * Return true when fstat() succeeds on both @fd1 and @fd2 and the two
 * results carry the same st_dev value.  If fstat() fails on @fd1, @fd2 is
 * not examined.
 */
static bool same_device(int fd1, int fd2)
{
    struct stat first;
    struct stat second;

    if (fstat(fd1, &first) != 0) {
        return false;
    }
    if (fstat(fd2, &second) != 0) {
        return false;
    }
    return first.st_dev == second.st_dev;
}
/*
 * Decide whether @fd, the descriptor looked up for @group, designates
 * @container.
 *
 * Returns true when @fd equals container->fd, or when same_device() reports
 * that the two descriptors match.  In the second case @fd is a duplicate:
 * its CPR entry for @group is deleted, @fd is closed, and container->fd is
 * saved under the same key in its place.  Returns false otherwise, leaving
 * @fd and CPR state untouched.
 */
bool vfio_cpr_container_match(VFIOContainer *container, VFIOGroup *group,
                              int fd)
{
    if (container->fd != fd) {
        if (!same_device(container->fd, fd)) {
            return false;
        }

        /*
         * Distinct descriptor numbers for the same device.  The container
         * fd gets cpr_save'd once per groupid, so passing the descriptors
         * via SCM_RIGHTS yields duplicates.  Collapse this one onto the
         * container's own fd.
         */
        cpr_delete_fd("vfio_container_for_group", group->groupid);
        close(fd);
        cpr_save_fd("vfio_container_for_group", group->groupid,
                    container->fd);
    }

    return true;
}

View file

@ -13,6 +13,7 @@
struct VFIOContainer; struct VFIOContainer;
struct VFIOContainerBase; struct VFIOContainerBase;
struct VFIOGroup;
typedef struct VFIOContainerCPR { typedef struct VFIOContainerCPR {
Error *blocker; Error *blocker;
@ -30,4 +31,9 @@ bool vfio_cpr_register_container(struct VFIOContainerBase *bcontainer,
Error **errp); Error **errp);
void vfio_cpr_unregister_container(struct VFIOContainerBase *bcontainer); void vfio_cpr_unregister_container(struct VFIOContainerBase *bcontainer);
/*
 * Return the device fd for @name: the one saved in CPR state if present,
 * else one obtained from group fd @d with VFIO_GROUP_GET_DEVICE_FD, which
 * is saved in CPR state when the ioctl succeeds.
 */
int vfio_cpr_group_get_device_fd(int d, const char *name);
/*
 * Return true if @fd is @container's fd, or if fstat() reports the same
 * st_dev for both.  In the latter case the duplicate @fd is closed and the
 * CPR entry for @group is replaced with the container's fd.
 */
bool vfio_cpr_container_match(struct VFIOContainer *container,
struct VFIOGroup *group, int fd);
#endif /* HW_VFIO_VFIO_CPR_H */ #endif /* HW_VFIO_VFIO_CPR_H */