mirror of
https://github.com/Motorhead1991/qemu.git
synced 2025-08-02 23:33:54 -06:00
vfio/migration: Add VFIO migration pre-copy support
Pre-copy support allows the VFIO device data to be transferred while the VM is running. This helps to accommodate VFIO devices that have a large amount of data that needs to be transferred, and it can reduce migration downtime. Pre-copy support is optional in VFIO migration protocol v2. Implement pre-copy of VFIO migration protocol v2 and use it for devices that support it. A full description of pre-copy can be found in the following Linux commit: 4db52602a607 ("vfio: Extend the device migration protocol with PRE_COPY"). Signed-off-by: Avihai Horon <avihaih@nvidia.com> Reviewed-by: Cédric Le Goater <clg@redhat.com> Tested-by: YangHang Liu <yanghliu@redhat.com> Acked-by: Alex Williamson <alex.williamson@redhat.com> Signed-off-by: Cédric Le Goater <clg@redhat.com>
This commit is contained in:
parent
6cd1fe1159
commit
eda7362af9
5 changed files with 190 additions and 22 deletions
|
@ -492,7 +492,8 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
|
|||
}
|
||||
|
||||
if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF &&
|
||||
migration->device_state == VFIO_DEVICE_STATE_RUNNING) {
|
||||
(migration->device_state == VFIO_DEVICE_STATE_RUNNING ||
|
||||
migration->device_state == VFIO_DEVICE_STATE_PRE_COPY)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -537,7 +538,8 @@ static bool vfio_devices_all_running_and_mig_active(VFIOContainer *container)
|
|||
return false;
|
||||
}
|
||||
|
||||
if (migration->device_state == VFIO_DEVICE_STATE_RUNNING) {
|
||||
if (migration->device_state == VFIO_DEVICE_STATE_RUNNING ||
|
||||
migration->device_state == VFIO_DEVICE_STATE_PRE_COPY) {
|
||||
continue;
|
||||
} else {
|
||||
return false;
|
||||
|
|
|
@ -68,6 +68,8 @@ static const char *mig_state_to_str(enum vfio_device_mig_state state)
|
|||
return "STOP_COPY";
|
||||
case VFIO_DEVICE_STATE_RESUMING:
|
||||
return "RESUMING";
|
||||
case VFIO_DEVICE_STATE_PRE_COPY:
|
||||
return "PRE_COPY";
|
||||
default:
|
||||
return "UNKNOWN STATE";
|
||||
}
|
||||
|
@ -241,6 +243,25 @@ static int vfio_query_stop_copy_size(VFIODevice *vbasedev,
|
|||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Issue VFIO_MIG_GET_PRECOPY_INFO on the migration data fd and cache the
 * reported initial_bytes / dirty_bytes in @migration.
 *
 * Both cached sizes are cleared before the ioctl is issued, so they are left
 * at zero when the query fails.
 *
 * Returns 0 on success and -errno if the ioctl fails.
 */
static int vfio_query_precopy_size(VFIOMigration *migration)
{
    struct vfio_precopy_info info = { .argsz = sizeof(info) };
    int err = 0;

    migration->precopy_init_size = 0;
    migration->precopy_dirty_size = 0;

    if (ioctl(migration->data_fd, VFIO_MIG_GET_PRECOPY_INFO, &info) != 0) {
        err = -errno;
    } else {
        migration->precopy_init_size = info.initial_bytes;
        migration->precopy_dirty_size = info.dirty_bytes;
    }

    return err;
}
|
||||
|
||||
/* Returns the size of saved data on success and -errno on error */
|
||||
static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration)
|
||||
{
|
||||
|
@ -249,6 +270,14 @@ static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration)
|
|||
data_size = read(migration->data_fd, migration->data_buffer,
|
||||
migration->data_buffer_size);
|
||||
if (data_size < 0) {
|
||||
/*
|
||||
* Pre-copy emptied all the device state for now. For more information,
|
||||
* please refer to the Linux kernel VFIO uAPI.
|
||||
*/
|
||||
if (errno == ENOMSG) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return -errno;
|
||||
}
|
||||
if (data_size == 0) {
|
||||
|
@ -265,6 +294,38 @@ static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration)
|
|||
return qemu_file_get_error(f) ?: data_size;
|
||||
}
|
||||
|
||||
/*
 * Adjust the cached pre-copy size estimates after @data_size bytes were saved.
 *
 * A @data_size of zero means pre-copy emptied all the device state for now,
 * so both estimates are reset to zero. Otherwise the saved bytes are deducted
 * from precopy_init_size first, and whatever is left over is deducted from
 * precopy_dirty_size. Neither estimate is ever decremented below zero.
 */
static void vfio_update_estimated_pending_data(VFIOMigration *migration,
                                               uint64_t data_size)
{
    uint64_t consumed;

    if (data_size == 0) {
        migration->precopy_init_size = 0;
        migration->precopy_dirty_size = 0;
        return;
    }

    /* Charge the initial-bytes estimate first (a no-op once it is zero). */
    consumed = MIN(migration->precopy_init_size, data_size);
    migration->precopy_init_size -= consumed;
    data_size -= consumed;

    /* Any remainder is charged against the dirty-bytes estimate. */
    consumed = MIN(migration->precopy_dirty_size, data_size);
    migration->precopy_dirty_size -= consumed;
}
|
||||
|
||||
/*
 * Returns true when VFIO_MIGRATION_PRE_COPY is set in the mig_flags of the
 * device's migration state, false otherwise.
 */
static bool vfio_precopy_supported(VFIODevice *vbasedev)
{
    return (vbasedev->migration->mig_flags & VFIO_MIGRATION_PRE_COPY) != 0;
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
static int vfio_save_setup(QEMUFile *f, void *opaque)
|
||||
|
@ -285,6 +346,28 @@ static int vfio_save_setup(QEMUFile *f, void *opaque)
|
|||
return -ENOMEM;
|
||||
}
|
||||
|
||||
if (vfio_precopy_supported(vbasedev)) {
|
||||
int ret;
|
||||
|
||||
switch (migration->device_state) {
|
||||
case VFIO_DEVICE_STATE_RUNNING:
|
||||
ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_PRE_COPY,
|
||||
VFIO_DEVICE_STATE_RUNNING);
|
||||
if (ret) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
vfio_query_precopy_size(migration);
|
||||
|
||||
break;
|
||||
case VFIO_DEVICE_STATE_STOP:
|
||||
/* vfio_save_complete_precopy() will go to STOP_COPY */
|
||||
break;
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
trace_vfio_save_setup(vbasedev->name, migration->data_buffer_size);
|
||||
|
||||
qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
|
||||
|
@ -299,26 +382,42 @@ static void vfio_save_cleanup(void *opaque)
|
|||
|
||||
g_free(migration->data_buffer);
|
||||
migration->data_buffer = NULL;
|
||||
migration->precopy_init_size = 0;
|
||||
migration->precopy_dirty_size = 0;
|
||||
vfio_migration_cleanup(vbasedev);
|
||||
trace_vfio_save_cleanup(vbasedev->name);
|
||||
}
|
||||
|
||||
static void vfio_state_pending_estimate(void *opaque, uint64_t *must_precopy,
|
||||
uint64_t *can_postcopy)
|
||||
{
|
||||
VFIODevice *vbasedev = opaque;
|
||||
VFIOMigration *migration = vbasedev->migration;
|
||||
|
||||
if (migration->device_state != VFIO_DEVICE_STATE_PRE_COPY) {
|
||||
return;
|
||||
}
|
||||
|
||||
*must_precopy +=
|
||||
migration->precopy_init_size + migration->precopy_dirty_size;
|
||||
|
||||
trace_vfio_state_pending_estimate(vbasedev->name, *must_precopy,
|
||||
*can_postcopy,
|
||||
migration->precopy_init_size,
|
||||
migration->precopy_dirty_size);
|
||||
}
|
||||
|
||||
/*
|
||||
* Migration size of VFIO devices can be as little as a few KBs or as big as
|
||||
* many GBs. This value should be big enough to cover the worst case.
|
||||
*/
|
||||
#define VFIO_MIG_STOP_COPY_SIZE (100 * GiB)
|
||||
|
||||
/*
|
||||
* Only exact function is implemented and not estimate function. The reason is
|
||||
* that during pre-copy phase of migration the estimate function is called
|
||||
* repeatedly while pending RAM size is over the threshold, thus migration
|
||||
* can't converge and querying the VFIO device pending data size is useless.
|
||||
*/
|
||||
static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy,
|
||||
uint64_t *can_postcopy)
|
||||
{
|
||||
VFIODevice *vbasedev = opaque;
|
||||
VFIOMigration *migration = vbasedev->migration;
|
||||
uint64_t stop_copy_size = VFIO_MIG_STOP_COPY_SIZE;
|
||||
|
||||
/*
|
||||
|
@ -328,8 +427,48 @@ static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy,
|
|||
vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
|
||||
*must_precopy += stop_copy_size;
|
||||
|
||||
if (migration->device_state == VFIO_DEVICE_STATE_PRE_COPY) {
|
||||
vfio_query_precopy_size(migration);
|
||||
|
||||
*must_precopy +=
|
||||
migration->precopy_init_size + migration->precopy_dirty_size;
|
||||
}
|
||||
|
||||
trace_vfio_state_pending_exact(vbasedev->name, *must_precopy, *can_postcopy,
|
||||
stop_copy_size);
|
||||
stop_copy_size, migration->precopy_init_size,
|
||||
migration->precopy_dirty_size);
|
||||
}
|
||||
|
||||
static bool vfio_is_active_iterate(void *opaque)
|
||||
{
|
||||
VFIODevice *vbasedev = opaque;
|
||||
VFIOMigration *migration = vbasedev->migration;
|
||||
|
||||
return migration->device_state == VFIO_DEVICE_STATE_PRE_COPY;
|
||||
}
|
||||
|
||||
/*
 * save_live_iterate handler: saves one block of device data via
 * vfio_save_block(), writes the VFIO_MIG_FLAG_END_OF_STATE marker, and
 * updates the cached pre-copy size estimates by the amount saved.
 *
 * Returns the negative value from vfio_save_block() on failure; in that case
 * no end-of-state marker is written. Otherwise always returns 1.
 */
static int vfio_save_iterate(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    ssize_t saved = vfio_save_block(f, migration);

    if (saved >= 0) {
        qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);

        vfio_update_estimated_pending_data(migration, saved);

        trace_vfio_save_iterate(vbasedev->name, migration->precopy_init_size,
                                migration->precopy_dirty_size);

        /*
         * The device's pre-copy dirty_bytes is not guaranteed to ever reach
         * zero, so report 1 unconditionally; that way the handlers that
         * follow are not potentially blocked.
         */
        return 1;
    }

    return saved;
}
|
||||
|
||||
static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
|
||||
|
@ -338,7 +477,7 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
|
|||
ssize_t data_size;
|
||||
int ret;
|
||||
|
||||
/* We reach here with device state STOP only */
|
||||
/* We reach here with device state STOP or STOP_COPY only */
|
||||
ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
|
||||
VFIO_DEVICE_STATE_STOP);
|
||||
if (ret) {
|
||||
|
@ -457,7 +596,10 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
|
|||
static const SaveVMHandlers savevm_vfio_handlers = {
|
||||
.save_setup = vfio_save_setup,
|
||||
.save_cleanup = vfio_save_cleanup,
|
||||
.state_pending_estimate = vfio_state_pending_estimate,
|
||||
.state_pending_exact = vfio_state_pending_exact,
|
||||
.is_active_iterate = vfio_is_active_iterate,
|
||||
.save_live_iterate = vfio_save_iterate,
|
||||
.save_live_complete_precopy = vfio_save_complete_precopy,
|
||||
.save_state = vfio_save_state,
|
||||
.load_setup = vfio_load_setup,
|
||||
|
@ -470,13 +612,18 @@ static const SaveVMHandlers savevm_vfio_handlers = {
|
|||
static void vfio_vmstate_change(void *opaque, bool running, RunState state)
|
||||
{
|
||||
VFIODevice *vbasedev = opaque;
|
||||
VFIOMigration *migration = vbasedev->migration;
|
||||
enum vfio_device_mig_state new_state;
|
||||
int ret;
|
||||
|
||||
if (running) {
|
||||
new_state = VFIO_DEVICE_STATE_RUNNING;
|
||||
} else {
|
||||
new_state = VFIO_DEVICE_STATE_STOP;
|
||||
new_state =
|
||||
(migration->device_state == VFIO_DEVICE_STATE_PRE_COPY &&
|
||||
(state == RUN_STATE_FINISH_MIGRATE || state == RUN_STATE_PAUSED)) ?
|
||||
VFIO_DEVICE_STATE_STOP_COPY :
|
||||
VFIO_DEVICE_STATE_STOP;
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -162,6 +162,8 @@ vfio_save_block(const char *name, int data_size) " (%s) data_size %d"
|
|||
vfio_save_cleanup(const char *name) " (%s)"
|
||||
vfio_save_complete_precopy(const char *name, int ret) " (%s) ret %d"
|
||||
vfio_save_device_config_state(const char *name) " (%s)"
|
||||
vfio_save_iterate(const char *name, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy initial size 0x%"PRIx64" precopy dirty size 0x%"PRIx64
|
||||
vfio_save_setup(const char *name, uint64_t data_buffer_size) " (%s) data buffer size 0x%"PRIx64
|
||||
vfio_state_pending_exact(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t stopcopy_size) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" stopcopy size 0x%"PRIx64
|
||||
vfio_state_pending_estimate(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" precopy initial size 0x%"PRIx64" precopy dirty size 0x%"PRIx64
|
||||
vfio_state_pending_exact(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t stopcopy_size, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" stopcopy size 0x%"PRIx64" precopy initial size 0x%"PRIx64" precopy dirty size 0x%"PRIx64
|
||||
vfio_vmstate_change(const char *name, int running, const char *reason, const char *dev_state) " (%s) running %d reason %s device state %s"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue