vfio/migration: Multifd device state transfer support - received buffers queuing

The multifd received data needs to be reassembled since device state
packets sent via different multifd channels can arrive out-of-order.

Therefore, each VFIO device state packet carries a header indicating its
position in the stream.
The raw device state data is saved into a VFIOStateBuffer for later
in-order loading into the device.

The last such VFIO device state packet should have
VFIO_DEVICE_STATE_CONFIG_STATE flag set and carry the device config state.

Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Link: https://lore.kernel.org/qemu-devel/e3bff515a8d61c582b94b409eb12a45b1a143a69.1741124640.git.maciej.szmigiero@oracle.com
[ clg: - Reordered savevm_vfio_handlers
       - Added load_state_buffer documentation ]
Signed-off-by: Cédric Le Goater <clg@redhat.com>
This commit is contained in:
Maciej S. Szmigiero 2025-03-04 23:03:53 +01:00 committed by Cédric Le Goater
parent 6bcffb1cad
commit 3228d311ab
5 changed files with 178 additions and 0 deletions

View file

@ -76,6 +76,10 @@ VFIO implements the device hooks for the iterative approach as follows:
* A ``load_state`` function that loads the config section and the data
sections that are generated by the save functions above.
* A ``load_state_buffer`` function that loads the device state and the device
config that arrived via multifd channels.
It's used only in the multifd mode.
* ``cleanup`` functions for both save and load that perform any migration
related cleanup.
@ -194,6 +198,9 @@ Live migration resume path
(RESTORE_VM, _ACTIVE, _STOP)
|
For each device, .load_state() is called for that device section data
transmitted via the main migration channel.
For data transmitted via multifd channels .load_state_buffer() is called
instead.
(RESTORE_VM, _ACTIVE, _RESUMING)
|
At the end, .load_cleanup() is called for each device and vCPUs are started

View file

@ -32,18 +32,181 @@ typedef struct VFIODeviceStatePacket {
uint8_t data[0];
} QEMU_PACKED VFIODeviceStatePacket;
/* type safety */
typedef struct VFIOStateBuffers {
GArray *array;
} VFIOStateBuffers;
typedef struct VFIOStateBuffer {
bool is_present;
char *data;
size_t len;
} VFIOStateBuffer;
typedef struct VFIOMultifd {
VFIOStateBuffers load_bufs;
QemuCond load_bufs_buffer_ready_cond;
QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */
uint32_t load_buf_idx;
uint32_t load_buf_idx_last;
} VFIOMultifd;
static void vfio_state_buffer_clear(gpointer data)
{
VFIOStateBuffer *lb = data;
if (!lb->is_present) {
return;
}
g_clear_pointer(&lb->data, g_free);
lb->is_present = false;
}
static void vfio_state_buffers_init(VFIOStateBuffers *bufs)
{
bufs->array = g_array_new(FALSE, TRUE, sizeof(VFIOStateBuffer));
g_array_set_clear_func(bufs->array, vfio_state_buffer_clear);
}
static void vfio_state_buffers_destroy(VFIOStateBuffers *bufs)
{
g_clear_pointer(&bufs->array, g_array_unref);
}
static void vfio_state_buffers_assert_init(VFIOStateBuffers *bufs)
{
assert(bufs->array);
}
static unsigned int vfio_state_buffers_size_get(VFIOStateBuffers *bufs)
{
return bufs->array->len;
}
static void vfio_state_buffers_size_set(VFIOStateBuffers *bufs,
unsigned int size)
{
g_array_set_size(bufs->array, size);
}
static VFIOStateBuffer *vfio_state_buffers_at(VFIOStateBuffers *bufs,
unsigned int idx)
{
return &g_array_index(bufs->array, VFIOStateBuffer, idx);
}
/* called with load_bufs_mutex locked */
static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev,
VFIODeviceStatePacket *packet,
size_t packet_total_size,
Error **errp)
{
VFIOMigration *migration = vbasedev->migration;
VFIOMultifd *multifd = migration->multifd;
VFIOStateBuffer *lb;
vfio_state_buffers_assert_init(&multifd->load_bufs);
if (packet->idx >= vfio_state_buffers_size_get(&multifd->load_bufs)) {
vfio_state_buffers_size_set(&multifd->load_bufs, packet->idx + 1);
}
lb = vfio_state_buffers_at(&multifd->load_bufs, packet->idx);
if (lb->is_present) {
error_setg(errp, "%s: state buffer %" PRIu32 " already filled",
vbasedev->name, packet->idx);
return false;
}
assert(packet->idx >= multifd->load_buf_idx);
lb->data = g_memdup2(&packet->data, packet_total_size - sizeof(*packet));
lb->len = packet_total_size - sizeof(*packet);
lb->is_present = true;
return true;
}
bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size,
Error **errp)
{
VFIODevice *vbasedev = opaque;
VFIOMigration *migration = vbasedev->migration;
VFIOMultifd *multifd = migration->multifd;
VFIODeviceStatePacket *packet = (VFIODeviceStatePacket *)data;
if (!vfio_multifd_transfer_enabled(vbasedev)) {
error_setg(errp,
"%s: got device state packet but not doing multifd transfer",
vbasedev->name);
return false;
}
assert(multifd);
if (data_size < sizeof(*packet)) {
error_setg(errp, "%s: packet too short at %zu (min is %zu)",
vbasedev->name, data_size, sizeof(*packet));
return false;
}
if (packet->version != VFIO_DEVICE_STATE_PACKET_VER_CURRENT) {
error_setg(errp, "%s: packet has unknown version %" PRIu32,
vbasedev->name, packet->version);
return false;
}
if (packet->idx == UINT32_MAX) {
error_setg(errp, "%s: packet index is invalid", vbasedev->name);
return false;
}
trace_vfio_load_state_device_buffer_incoming(vbasedev->name, packet->idx);
/*
* Holding BQL here would violate the lock order and can cause
* a deadlock once we attempt to lock load_bufs_mutex below.
*/
assert(!bql_locked());
WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
/* config state packet should be the last one in the stream */
if (packet->flags & VFIO_DEVICE_STATE_CONFIG_STATE) {
multifd->load_buf_idx_last = packet->idx;
}
if (!vfio_load_state_buffer_insert(vbasedev, packet, data_size,
errp)) {
return false;
}
qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
}
return true;
}
static VFIOMultifd *vfio_multifd_new(void)
{
VFIOMultifd *multifd = g_new(VFIOMultifd, 1);
vfio_state_buffers_init(&multifd->load_bufs);
qemu_mutex_init(&multifd->load_bufs_mutex);
multifd->load_buf_idx = 0;
multifd->load_buf_idx_last = UINT32_MAX;
qemu_cond_init(&multifd->load_bufs_buffer_ready_cond);
return multifd;
}
static void vfio_multifd_free(VFIOMultifd *multifd)
{
vfio_state_buffers_destroy(&multifd->load_bufs);
qemu_cond_destroy(&multifd->load_bufs_buffer_ready_cond);
qemu_mutex_destroy(&multifd->load_bufs_mutex);
g_free(multifd);
}

View file

@ -20,4 +20,7 @@ void vfio_multifd_cleanup(VFIODevice *vbasedev);
bool vfio_multifd_transfer_supported(void);
bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev);
bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size,
Error **errp);
#endif

View file

@ -802,6 +802,10 @@ static const SaveVMHandlers savevm_vfio_handlers = {
.load_cleanup = vfio_load_cleanup,
.load_state = vfio_load_state,
.switchover_ack_needed = vfio_switchover_ack_needed,
/*
* Multifd support
*/
.load_state_buffer = vfio_multifd_load_state_buffer,
};
/* ---------------------------------------------------------------------- */

View file

@ -154,6 +154,7 @@ vfio_load_device_config_state_start(const char *name) " (%s)"
vfio_load_device_config_state_end(const char *name) " (%s)"
vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64
vfio_load_state_device_data(const char *name, uint64_t data_size, int ret) " (%s) size %"PRIu64" ret %d"
vfio_load_state_device_buffer_incoming(const char *name, uint32_t idx) " (%s) idx %"PRIu32
vfio_migration_realize(const char *name) " (%s)"
vfio_migration_set_device_state(const char *name, const char *state) " (%s) state %s"
vfio_migration_set_state(const char *name, const char *new_state, const char *recover_state) " (%s) new state %s, recover state %s"