vfio/migration: Multifd device state transfer support - load thread

Add a thread which loads the VFIO device state buffers that were received
via multifd.

Each VFIO device that has multifd device state transfer enabled has one
such thread, which is created using migration core API
qemu_loadvm_start_load_thread().

Since it's important to finish loading device state transferred via the
main migration channel (via save_live_iterate SaveVMHandler) before
starting loading the data asynchronously transferred via multifd the thread
doing the actual loading of the multifd transferred data is only started
from switchover_start SaveVMHandler.

switchover_start handler is called when MIG_CMD_SWITCHOVER_START
sub-command of QEMU_VM_COMMAND is received via the main migration channel.

This sub-command is only sent after all save_live_iterate data have already
been posted so it is safe to commence loading of the multifd-transferred
device state upon receiving it - loading of save_live_iterate data happens
synchronously in the main migration thread (much like the processing of
MIG_CMD_SWITCHOVER_START) so by the time MIG_CMD_SWITCHOVER_START is
processed all the proceeding data must have already been loaded.

Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Link: https://lore.kernel.org/qemu-devel/9abe612d775aaf42e31646796acd2363c723a57a.1741124640.git.maciej.szmigiero@oracle.com
[ clg: - Reordered savevm_vfio_handlers
       - Added switchover_start documentation ]
Signed-off-by: Cédric Le Goater <clg@redhat.com>
This commit is contained in:
Maciej S. Szmigiero 2025-03-04 23:03:54 +01:00 committed by Cédric Le Goater
parent 3228d311ab
commit c59748c1ff
5 changed files with 251 additions and 0 deletions

View file

@ -44,8 +44,12 @@ typedef struct VFIOStateBuffer {
} VFIOStateBuffer;
typedef struct VFIOMultifd {
bool load_bufs_thread_running;
bool load_bufs_thread_want_exit;
VFIOStateBuffers load_bufs;
QemuCond load_bufs_buffer_ready_cond;
QemuCond load_bufs_thread_finished_cond;
QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */
uint32_t load_buf_idx;
uint32_t load_buf_idx_last;
@ -186,6 +190,178 @@ bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size,
return true;
}
static bool vfio_load_bufs_thread_load_config(VFIODevice *vbasedev,
Error **errp)
{
error_setg(errp, "not yet there");
return false;
}
static VFIOStateBuffer *vfio_load_state_buffer_get(VFIOMultifd *multifd)
{
VFIOStateBuffer *lb;
unsigned int bufs_len;
bufs_len = vfio_state_buffers_size_get(&multifd->load_bufs);
if (multifd->load_buf_idx >= bufs_len) {
assert(multifd->load_buf_idx == bufs_len);
return NULL;
}
lb = vfio_state_buffers_at(&multifd->load_bufs,
multifd->load_buf_idx);
if (!lb->is_present) {
return NULL;
}
return lb;
}
static bool vfio_load_state_buffer_write(VFIODevice *vbasedev,
VFIOStateBuffer *lb,
Error **errp)
{
VFIOMigration *migration = vbasedev->migration;
VFIOMultifd *multifd = migration->multifd;
g_autofree char *buf = NULL;
char *buf_cur;
size_t buf_len;
if (!lb->len) {
return true;
}
trace_vfio_load_state_device_buffer_load_start(vbasedev->name,
multifd->load_buf_idx);
/* lb might become re-allocated when we drop the lock */
buf = g_steal_pointer(&lb->data);
buf_cur = buf;
buf_len = lb->len;
while (buf_len > 0) {
ssize_t wr_ret;
int errno_save;
/*
* Loading data to the device takes a while,
* drop the lock during this process.
*/
qemu_mutex_unlock(&multifd->load_bufs_mutex);
wr_ret = write(migration->data_fd, buf_cur, buf_len);
errno_save = errno;
qemu_mutex_lock(&multifd->load_bufs_mutex);
if (wr_ret < 0) {
error_setg(errp,
"%s: writing state buffer %" PRIu32 " failed: %d",
vbasedev->name, multifd->load_buf_idx, errno_save);
return false;
}
assert(wr_ret <= buf_len);
buf_len -= wr_ret;
buf_cur += wr_ret;
}
trace_vfio_load_state_device_buffer_load_end(vbasedev->name,
multifd->load_buf_idx);
return true;
}
static bool vfio_load_bufs_thread_want_exit(VFIOMultifd *multifd,
bool *should_quit)
{
return multifd->load_bufs_thread_want_exit || qatomic_read(should_quit);
}
/*
* This thread is spawned by vfio_multifd_switchover_start() which gets
* called upon encountering the switchover point marker in main migration
* stream.
*
* It exits after either:
* * completing loading the remaining device state and device config, OR:
* * encountering some error while doing the above, OR:
* * being forcefully aborted by the migration core by it setting should_quit
* or by vfio_load_cleanup_load_bufs_thread() setting
* multifd->load_bufs_thread_want_exit.
*/
static bool vfio_load_bufs_thread(void *opaque, bool *should_quit, Error **errp)
{
VFIODevice *vbasedev = opaque;
VFIOMigration *migration = vbasedev->migration;
VFIOMultifd *multifd = migration->multifd;
bool ret = false;
trace_vfio_load_bufs_thread_start(vbasedev->name);
assert(multifd);
QEMU_LOCK_GUARD(&multifd->load_bufs_mutex);
assert(multifd->load_bufs_thread_running);
while (true) {
VFIOStateBuffer *lb;
/*
* Always check cancellation first after the buffer_ready wait below in
* case that cond was signalled by vfio_load_cleanup_load_bufs_thread().
*/
if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) {
error_setg(errp, "operation cancelled");
goto thread_exit;
}
assert(multifd->load_buf_idx <= multifd->load_buf_idx_last);
lb = vfio_load_state_buffer_get(multifd);
if (!lb) {
trace_vfio_load_state_device_buffer_starved(vbasedev->name,
multifd->load_buf_idx);
qemu_cond_wait(&multifd->load_bufs_buffer_ready_cond,
&multifd->load_bufs_mutex);
continue;
}
if (multifd->load_buf_idx == multifd->load_buf_idx_last) {
break;
}
if (multifd->load_buf_idx == 0) {
trace_vfio_load_state_device_buffer_start(vbasedev->name);
}
if (!vfio_load_state_buffer_write(vbasedev, lb, errp)) {
goto thread_exit;
}
if (multifd->load_buf_idx == multifd->load_buf_idx_last - 1) {
trace_vfio_load_state_device_buffer_end(vbasedev->name);
}
multifd->load_buf_idx++;
}
if (!vfio_load_bufs_thread_load_config(vbasedev, errp)) {
goto thread_exit;
}
ret = true;
thread_exit:
/*
* Notify possibly waiting vfio_load_cleanup_load_bufs_thread() that
* this thread is exiting.
*/
multifd->load_bufs_thread_running = false;
qemu_cond_signal(&multifd->load_bufs_thread_finished_cond);
trace_vfio_load_bufs_thread_end(vbasedev->name);
return ret;
}
static VFIOMultifd *vfio_multifd_new(void)
{
VFIOMultifd *multifd = g_new(VFIOMultifd, 1);
@ -198,11 +374,41 @@ static VFIOMultifd *vfio_multifd_new(void)
multifd->load_buf_idx_last = UINT32_MAX;
qemu_cond_init(&multifd->load_bufs_buffer_ready_cond);
multifd->load_bufs_thread_running = false;
multifd->load_bufs_thread_want_exit = false;
qemu_cond_init(&multifd->load_bufs_thread_finished_cond);
return multifd;
}
/*
* Terminates vfio_load_bufs_thread by setting
* multifd->load_bufs_thread_want_exit and signalling all the conditions
* the thread could be blocked on.
*
* Waits for the thread to signal that it had finished.
*/
static void vfio_load_cleanup_load_bufs_thread(VFIOMultifd *multifd)
{
/* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
bql_unlock();
WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
while (multifd->load_bufs_thread_running) {
multifd->load_bufs_thread_want_exit = true;
qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
qemu_cond_wait(&multifd->load_bufs_thread_finished_cond,
&multifd->load_bufs_mutex);
}
}
bql_lock();
}
static void vfio_multifd_free(VFIOMultifd *multifd)
{
vfio_load_cleanup_load_bufs_thread(multifd);
qemu_cond_destroy(&multifd->load_bufs_thread_finished_cond);
vfio_state_buffers_destroy(&multifd->load_bufs);
qemu_cond_destroy(&multifd->load_bufs_buffer_ready_cond);
qemu_mutex_destroy(&multifd->load_bufs_mutex);
@ -244,3 +450,23 @@ bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp)
return true;
}
int vfio_multifd_switchover_start(VFIODevice *vbasedev)
{
VFIOMigration *migration = vbasedev->migration;
VFIOMultifd *multifd = migration->multifd;
assert(multifd);
/* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
bql_unlock();
WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
assert(!multifd->load_bufs_thread_running);
multifd->load_bufs_thread_running = true;
}
bql_lock();
qemu_loadvm_start_load_thread(vfio_load_bufs_thread, vbasedev);
return 0;
}