Merge tag 'pull-vfio-20250306' of https://github.com/legoater/qemu into staging

vfio queue:

* Added property documentation
* Added minor fixes
* Implemented basic PCI PM capability backing
* Promoted new IGD maintainer
* Deprecated vfio-platform
* Extended VFIO migration with multifd support

# -----BEGIN PGP SIGNATURE-----
#
# iQIyBAABCAAdFiEEoPZlSPBIlev+awtgUaNDx8/77KEFAmfJrZoACgkQUaNDx8/7
# 7KFE2A/0Dmief9u/dDJIKGIDa+iawcf4hu8iX4v5pB0DlGniT3rgK8WMGnhDpPxq
# Q4wsKfo+JJ2q6msInrT7Ckqyydu9nQztI3vwmfMuWxLhTMyH28K96ptwPqIZBjOx
# rPTEXfnVX4W3tpn1+48S+vefWVa/gkBkIvv7RpK18rMBXv1kDeyOvc/d2dbAt7ft
# zJc4f8gH3jfQzGwmnYVZU1yPrZN7p6zhYR/AD3RQOY97swgZIEyYxXhOuTPiCuEC
# zC+2AMKi9nmnCG6x/mnk7l2yJXSlv7lJdqcjYZhJ9EOIYfiUGTREYIgQbARcafE/
# 4KSg2QR35BoUd4YrmEWxXJCRf3XnyWXDY36dDKVhC0OHng1F/U44HuL4QxwoTIay
# s1SP/DHcvDiPAewVTvdgt7Iwfn9xGhcQO2pkrxBoNLB5JYwW+R6mG7WXeDv1o3GT
# QosTu1fXZezQqFd4v6+q5iRNS2KtBZLTspwAmVdywEFUs+ZLBRlC+bodYlinZw6B
# Yl/z0LfAEh4J55QmX2espbp8MH1+mALuW2H2tgSGSrTBX1nwxZFI5veFzPepgF2S
# eTx69BMjiNMwzIjq1T7e9NpDCceiW0fXDu7IK1MzYhqg1nM9lX9AidhFTeiF2DB2
# EPb3ljy/8fyxcPKa1T9X47hQaSjbMwofaO8Snoh0q0jokY246Q==
# =hIBw
# -----END PGP SIGNATURE-----
# gpg: Signature made Thu 06 Mar 2025 22:13:46 HKT
# gpg:                using RSA key A0F66548F04895EBFE6B0B6051A343C7CFFBECA1
# gpg: Good signature from "Cédric Le Goater <clg@redhat.com>" [full]
# gpg:                 aka "Cédric Le Goater <clg@kaod.org>" [full]
# Primary key fingerprint: A0F6 6548 F048 95EB FE6B 0B60 51A3 43C7 CFFB ECA1

* tag 'pull-vfio-20250306' of https://github.com/legoater/qemu: (42 commits)
  hw/core/machine: Add compat for x-migration-multifd-transfer VFIO property
  vfio/migration: Make x-migration-multifd-transfer VFIO property mutable
  vfio/migration: Add x-migration-multifd-transfer VFIO property
  vfio/migration: Multifd device state transfer support - send side
  vfio/migration: Multifd device state transfer support - config loading support
  migration/qemu-file: Define g_autoptr() cleanup function for QEMUFile
  vfio/migration: Multifd device state transfer support - load thread
  vfio/migration: Multifd device state transfer support - received buffers queuing
  vfio/migration: Setup and cleanup multifd transfer in these general methods
  vfio/migration: Multifd setup/cleanup functions and associated VFIOMultifd
  vfio/migration: Multifd device state transfer - add support checking function
  vfio/migration: Multifd device state transfer support - basic types
  vfio/migration: Move migration channel flags to vfio-common.h header file
  vfio/migration: Add vfio_add_bytes_transferred()
  vfio/migration: Convert bytes_transferred counter to atomic
  vfio/migration: Add load_device_config_state_start trace event
  migration: Add save_live_complete_precopy_thread handler
  migration/multifd: Add multifd_device_state_supported()
  migration/multifd: Make MultiFDSendData a struct
  migration/multifd: Device state transfer support - send side
  ...

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
commit 2400fad572
54 changed files with 2303 additions and 216 deletions
MAINTAINERS

@@ -2186,10 +2186,17 @@ M: Cédric Le Goater <clg@redhat.com>
 S: Supported
 F: hw/vfio/*
 F: include/hw/vfio/
 F: docs/igd-assign.txt
 F: docs/devel/migration/vfio.rst
 F: qapi/vfio.json
 
+vfio-igd
+M: Alex Williamson <alex.williamson@redhat.com>
+M: Cédric Le Goater <clg@redhat.com>
+M: Tomita Moeko <tomitamoeko@gmail.com>
+S: Supported
+F: hw/vfio/igd.c
+F: docs/igd-assign.txt
+
 vfio-ccw
 M: Eric Farman <farman@linux.ibm.com>
 M: Matthew Rosato <mjrosato@linux.ibm.com>
docs/about/deprecated.rst

@@ -434,6 +434,31 @@ Stream ``reconnect`` (since 9.2)
 The ``reconnect`` option only allows specifying second-granularity timeouts,
 which is not enough for all types of use cases, use ``reconnect-ms`` instead.
 
+VFIO device options
+'''''''''''''''''''
+
+``-device vfio-calxeda-xgmac`` (since 10.0)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The vfio-calxeda-xgmac device allows assigning a host Calxeda Highbank
+10Gb XGMAC Ethernet controller device ("calxeda,hb-xgmac" compatibility
+string) to a guest. Calxeda HW has since been e-wasted and there is no
+point in keeping that device.
+
+``-device vfio-amd-xgbe`` (since 10.0)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The vfio-amd-xgbe device allows assigning a host AMD 10GbE controller
+to a guest ("amd,xgbe-seattle-v1a" compatibility string). AMD "Seattle"
+is no longer supported and there is no point in keeping that device.
+
+``-device vfio-platform`` (since 10.0)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The vfio-platform device allows assigning a host platform device
+to a guest in a generic manner. Integrating a new device into
+the vfio-platform infrastructure requires some adaptation at
+both the kernel and QEMU level. No such attempt has been made for years,
+and the conclusion is that vfio-platform has not gained any traction.
+PCIe passthrough shall be the mainline solution.
+
 CPU device properties
 '''''''''''''''''''''
 
docs/devel/migration/vfio.rst

@@ -67,15 +67,35 @@ VFIO implements the device hooks for the iterative approach as follows:
 * A ``switchover_ack_needed`` function that checks if the VFIO device uses
   "switchover-ack" migration capability when this capability is enabled.
 
-* A ``save_state`` function to save the device config space if it is present.
+* A ``switchover_start`` function that in the multifd mode starts a thread that
+  reassembles the multifd received data and loads it in-order into the device.
+  In the non-multifd mode this function is a NOP.
+
+* A ``save_state`` function to save the device config space if it is present
+  in the non-multifd mode.
+  In the multifd mode it just emits a dummy EOS marker.
 
 * A ``save_live_complete_precopy`` function that sets the VFIO device in
   _STOP_COPY state and iteratively copies the data for the VFIO device until
   the vendor driver indicates that no data remains.
+  In the multifd mode it just emits a dummy EOS marker.
+
+* A ``save_live_complete_precopy_thread`` function that in the multifd mode
+  provides a thread handler performing the multifd device state transfer.
+  It sets the VFIO device to _STOP_COPY state, iteratively reads the data
+  from the VFIO device and queues it for multifd transmission until the vendor
+  driver indicates that no data remains.
+  After that, it saves the device config space and queues it for multifd
+  transfer too.
+  In the non-multifd mode this thread is a NOP.
 
 * A ``load_state`` function that loads the config section and the data
   sections that are generated by the save functions above.
 
+* A ``load_state_buffer`` function that loads the device state and the device
+  config that arrived via multifd channels.
+  It's used only in the multifd mode.
+
 * ``cleanup`` functions for both save and load that perform any migration
   related cleanup.
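
[Editor's note] For orientation, the handler table these hooks end up in, condensed from the savevm_vfio_handlers change in hw/vfio/migration.c further down in this diff (unrelated iterative-save fields elided):

    static const SaveVMHandlers savevm_vfio_handlers = {
        .save_setup = vfio_save_setup,
        /* ... iterative save/load hooks elided ... */
        .load_cleanup = vfio_load_cleanup,
        .load_state = vfio_load_state,
        .switchover_ack_needed = vfio_switchover_ack_needed,
        /* Multifd support */
        .load_state_buffer = vfio_multifd_load_state_buffer,
        .switchover_start = vfio_switchover_start,
        .save_live_complete_precopy_thread = vfio_multifd_save_complete_precopy_thread,
    };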

@@ -176,8 +196,11 @@ Live migration save path
 Then the VFIO device is put in _STOP_COPY state
                      (FINISH_MIGRATE, _ACTIVE, _STOP_COPY)
 .save_live_complete_precopy() is called for each active device
-For the VFIO device, iterate in .save_live_complete_precopy() until
+For the VFIO device: in the non-multifd mode iterate in
+.save_live_complete_precopy() until
 pending data is 0
+In the multifd mode this iteration is done in
+.save_live_complete_precopy_thread() instead.
 
                      (POSTMIGRATE, _COMPLETED, _STOP_COPY)
 Migration thread schedules cleanup bottom half and exits

@@ -194,6 +217,9 @@ Live migration resume path
                      (RESTORE_VM, _ACTIVE, _STOP)
 
 For each device, .load_state() is called for that device section data
+transmitted via the main migration channel.
+For data transmitted via multifd channels, .load_state_buffer() is called
+instead.
                      (RESTORE_VM, _ACTIVE, _RESUMING)
 
 At the end, .load_cleanup() is called for each device and vCPUs are started

@@ -206,3 +232,18 @@ Postcopy
 ========
 
 Postcopy migration is currently not supported for VFIO devices.
+
+Multifd
+=======
+
+Starting from QEMU version 10.0 it is possible to transfer VFIO device
+_STOP_COPY state via multifd channels. This helps reduce downtime, especially
+with multiple VFIO devices or with devices having a large migration state.
+As an additional benefit, setting the VFIO device to _STOP_COPY state and
+saving its config space is also parallelized (run in a separate thread) in
+such a migration mode.
+
+The multifd VFIO device state transfer is controlled by the
+"x-migration-multifd-transfer" VFIO device property. This property defaults to
+AUTO, which means that VFIO device state transfer via multifd channels is
+attempted in configurations that otherwise support it.
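
[Editor's note] A usage sketch; the PCI address and the rest of the command line are illustrative, the property itself is the one added by this series:

    qemu-system-x86_64 ... \
        -device vfio-pci,host=0000:01:00.0,x-migration-multifd-transfer=on

Per the vfio_multifd_setup() logic below, "on" makes migration setup fail when the configuration cannot support multifd device state transfer, while "auto" silently falls back to the main-channel transfer.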
hw/core/machine.c

@@ -44,6 +44,8 @@ GlobalProperty hw_compat_9_2[] = {
     { "virtio-balloon-pci-non-transitional", "vectors", "0" },
     { "virtio-mem-pci", "vectors", "0" },
     { "migration", "multifd-clean-tls-termination", "false" },
+    { "migration", "send-switchover-start", "off"},
+    { "vfio-pci", "x-migration-multifd-transfer", "off" },
 };
 const size_t hw_compat_9_2_len = G_N_ELEMENTS(hw_compat_9_2);
hw/net/e1000e.c

@@ -372,8 +372,7 @@ static int
 e1000e_add_pm_capability(PCIDevice *pdev, uint8_t offset, uint16_t pmc)
 {
     Error *local_err = NULL;
-    int ret = pci_add_capability(pdev, PCI_CAP_ID_PM, offset,
-                                 PCI_PM_SIZEOF, &local_err);
+    int ret = pci_pm_init(pdev, offset, &local_err);
 
     if (local_err) {
         error_report_err(local_err);
hw/net/eepro100.c

@@ -551,9 +551,7 @@ static void e100_pci_reset(EEPRO100State *s, Error **errp)
     if (info->power_management) {
         /* Power Management Capabilities */
         int cfg_offset = 0xdc;
-        int r = pci_add_capability(&s->dev, PCI_CAP_ID_PM,
-                                   cfg_offset, PCI_PM_SIZEOF,
-                                   errp);
+        int r = pci_pm_init(&s->dev, cfg_offset, errp);
         if (r < 0) {
             return;
         }
hw/net/igb.c

@@ -356,8 +356,7 @@ static int
 igb_add_pm_capability(PCIDevice *pdev, uint8_t offset, uint16_t pmc)
 {
     Error *local_err = NULL;
-    int ret = pci_add_capability(pdev, PCI_CAP_ID_PM, offset,
-                                 PCI_PM_SIZEOF, &local_err);
+    int ret = pci_pm_init(pdev, offset, &local_err);
 
     if (local_err) {
         error_report_err(local_err);
hw/nvme/ctrl.c

@@ -8600,8 +8600,7 @@ static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset)
     Error *err = NULL;
     int ret;
 
-    ret = pci_add_capability(pci_dev, PCI_CAP_ID_PM, offset,
-                             PCI_PM_SIZEOF, &err);
+    ret = pci_pm_init(pci_dev, offset, &err);
     if (err) {
         error_report_err(err);
         return ret;
hw/pci-bridge/pcie_pci_bridge.c

@@ -52,11 +52,10 @@ static void pcie_pci_bridge_realize(PCIDevice *d, Error **errp)
         goto cap_error;
     }
 
-    pos = pci_add_capability(d, PCI_CAP_ID_PM, 0, PCI_PM_SIZEOF, errp);
+    pos = pci_pm_init(d, 0, errp);
     if (pos < 0) {
         goto pm_error;
     }
     d->exp.pm_cap = pos;
     pci_set_word(d->config + pos + PCI_PM_PMC, 0x3);
 
     pcie_cap_arifwd_init(d);
hw/pci/pci.c
@@ -435,6 +435,84 @@ static void pci_msi_trigger(PCIDevice *dev, MSIMessage msg)
                  attrs, NULL);
 }
 
+/*
+ * Register and track a PM capability. If wmask is also enabled for the power
+ * state field of the pmcsr register, guest writes may change the device PM
+ * state. BAR access is only enabled while the device is in the D0 state.
+ * Return the capability offset or negative error code.
+ */
+int pci_pm_init(PCIDevice *d, uint8_t offset, Error **errp)
+{
+    int cap = pci_add_capability(d, PCI_CAP_ID_PM, offset, PCI_PM_SIZEOF, errp);
+
+    if (cap < 0) {
+        return cap;
+    }
+
+    d->pm_cap = cap;
+    d->cap_present |= QEMU_PCI_CAP_PM;
+
+    return cap;
+}
+
+static uint8_t pci_pm_state(PCIDevice *d)
+{
+    uint16_t pmcsr;
+
+    if (!(d->cap_present & QEMU_PCI_CAP_PM)) {
+        return 0;
+    }
+
+    pmcsr = pci_get_word(d->config + d->pm_cap + PCI_PM_CTRL);
+
+    return pmcsr & PCI_PM_CTRL_STATE_MASK;
+}
+
+/*
+ * Update the PM capability state based on the new value stored in config
+ * space respective to the old, pre-write state provided. If the new value
+ * is rejected (unsupported or invalid transition) restore the old value.
+ * Return the resulting PM state.
+ */
+static uint8_t pci_pm_update(PCIDevice *d, uint32_t addr, int l, uint8_t old)
+{
+    uint16_t pmc;
+    uint8_t new;
+
+    if (!(d->cap_present & QEMU_PCI_CAP_PM) ||
+        !range_covers_byte(addr, l, d->pm_cap + PCI_PM_CTRL)) {
+        return old;
+    }
+
+    new = pci_pm_state(d);
+    if (new == old) {
+        return old;
+    }
+
+    pmc = pci_get_word(d->config + d->pm_cap + PCI_PM_PMC);
+
+    /*
+     * Transitions to D1 & D2 are only allowed if supported. Devices may
+     * only transition to higher D-states or to D0.
+     */
+    if ((!(pmc & PCI_PM_CAP_D1) && new == 1) ||
+        (!(pmc & PCI_PM_CAP_D2) && new == 2) ||
+        (old && new && new < old)) {
+        pci_word_test_and_clear_mask(d->config + d->pm_cap + PCI_PM_CTRL,
+                                     PCI_PM_CTRL_STATE_MASK);
+        pci_word_test_and_set_mask(d->config + d->pm_cap + PCI_PM_CTRL,
+                                   old);
+        trace_pci_pm_bad_transition(d->name, pci_dev_bus_num(d),
+                                    PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
+                                    old, new);
+        return old;
+    }
+
+    trace_pci_pm_transition(d->name, pci_dev_bus_num(d), PCI_SLOT(d->devfn),
+                            PCI_FUNC(d->devfn), old, new);
+    return new;
+}
+
 static void pci_reset_regions(PCIDevice *dev)
 {
     int r;

@@ -474,6 +552,11 @@ static void pci_do_device_reset(PCIDevice *dev)
                               pci_get_word(dev->wmask + PCI_INTERRUPT_LINE) |
                               pci_get_word(dev->w1cmask + PCI_INTERRUPT_LINE));
     dev->config[PCI_CACHE_LINE_SIZE] = 0x0;
+    /* Default PM state is D0 */
+    if (dev->cap_present & QEMU_PCI_CAP_PM) {
+        pci_word_test_and_clear_mask(dev->config + dev->pm_cap + PCI_PM_CTRL,
+                                     PCI_PM_CTRL_STATE_MASK);
+    }
     pci_reset_regions(dev);
     pci_update_mappings(dev);

@@ -1606,7 +1689,7 @@ static void pci_update_mappings(PCIDevice *d)
             continue;
 
         new_addr = pci_bar_address(d, i, r->type, r->size);
-        if (!d->enabled) {
+        if (!d->enabled || pci_pm_state(d)) {
             new_addr = PCI_BAR_UNMAPPED;
         }
 
@@ -1672,6 +1755,7 @@ uint32_t pci_default_read_config(PCIDevice *d,
 
 void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val_in, int l)
 {
+    uint8_t new_pm_state, old_pm_state = pci_pm_state(d);
     int i, was_irq_disabled = pci_irq_disabled(d);
     uint32_t val = val_in;
 
@@ -1684,11 +1768,16 @@ void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val_in, int
         d->config[addr + i] = (d->config[addr + i] & ~wmask) | (val & wmask);
         d->config[addr + i] &= ~(val & w1cmask); /* W1C: Write 1 to Clear */
     }
+
+    new_pm_state = pci_pm_update(d, addr, l, old_pm_state);
+
     if (ranges_overlap(addr, l, PCI_BASE_ADDRESS_0, 24) ||
         ranges_overlap(addr, l, PCI_ROM_ADDRESS, 4) ||
         ranges_overlap(addr, l, PCI_ROM_ADDRESS1, 4) ||
-        range_covers_byte(addr, l, PCI_COMMAND))
+        range_covers_byte(addr, l, PCI_COMMAND) ||
+        !!new_pm_state != !!old_pm_state) {
         pci_update_mappings(d);
+    }
 
     if (ranges_overlap(addr, l, PCI_COMMAND, 2)) {
         pci_update_irq_disabled(d, was_irq_disabled);
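
[Editor's note] Per the comment on pci_pm_init() above, the helper alone does not make the PowerState field guest-writable; a caller that wants D-state tracking enables wmask explicitly, as vfio-pci does later in this diff. A minimal sketch of that pattern (the device function name is hypothetical):

    /* Register the PM capability, then let the guest change the PM state.
     * pci_pm_update() then validates D-state writes, and
     * pci_update_mappings() keeps BARs unmapped while not in D0.
     */
    static void mydev_init_pm(PCIDevice *pdev, uint8_t offset, Error **errp)
    {
        int cap = pci_pm_init(pdev, offset, errp);

        if (cap < 0) {
            return;
        }

        pci_set_word(pdev->wmask + cap + PCI_PM_CTRL, PCI_PM_CTRL_STATE_MASK);
    }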
hw/pci/trace-events

@@ -1,6 +1,8 @@
 # See docs/devel/tracing.rst for syntax documentation.
 
 # pci.c
+pci_pm_bad_transition(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, uint8_t old, uint8_t new) "%s %02x:%02x.%x REJECTED PM transition D%d->D%d"
+pci_pm_transition(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, uint8_t old, uint8_t new) "%s %02x:%02x.%x PM transition D%d->D%d"
 pci_update_mappings_del(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, int bar, uint64_t addr, uint64_t size) "%s %02x:%02x.%x %d,0x%"PRIx64"+0x%"PRIx64
 pci_update_mappings_add(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, int bar, uint64_t addr, uint64_t size) "%s %02x:%02x.%x %d,0x%"PRIx64"+0x%"PRIx64
 pci_route_irq(int dev_irq, const char *dev_path, int parent_irq, const char *parent_path) "IRQ %d @%s -> IRQ %d @%s"
hw/vfio/amd-xgbe.c

@@ -15,12 +15,14 @@
 #include "hw/vfio/vfio-amd-xgbe.h"
 #include "migration/vmstate.h"
 #include "qemu/module.h"
+#include "qemu/error-report.h"
 
 static void amd_xgbe_realize(DeviceState *dev, Error **errp)
 {
     VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(dev);
     VFIOAmdXgbeDeviceClass *k = VFIO_AMD_XGBE_DEVICE_GET_CLASS(dev);
 
+    warn_report("-device vfio-amd-xgbe is deprecated");
     vdev->compat = g_strdup("amd,xgbe-seattle-v1a");
     vdev->num_compat = 1;
hw/vfio/ap.c

@@ -257,6 +257,15 @@ static void vfio_ap_class_init(ObjectClass *klass, void *data)
     dc->hotpluggable = true;
     device_class_set_legacy_reset(dc, vfio_ap_reset);
     dc->bus_type = TYPE_AP_BUS;
+
+    object_class_property_set_description(klass, /* 3.1 */
+                                          "sysfsdev",
+                                          "Host sysfs path of assigned device");
+#ifdef CONFIG_IOMMUFD
+    object_class_property_set_description(klass, /* 9.0 */
+                                          "iommufd",
+                                          "Set host IOMMUFD backend device");
+#endif
 }
 
 static const TypeInfo vfio_ap_info = {
hw/vfio/calxeda-xgmac.c

@@ -15,12 +15,14 @@
 #include "hw/vfio/vfio-calxeda-xgmac.h"
 #include "migration/vmstate.h"
 #include "qemu/module.h"
+#include "qemu/error-report.h"
 
 static void calxeda_xgmac_realize(DeviceState *dev, Error **errp)
 {
     VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(dev);
     VFIOCalxedaXgmacDeviceClass *k = VFIO_CALXEDA_XGMAC_DEVICE_GET_CLASS(dev);
 
+    warn_report("-device vfio-calxeda-xgmac is deprecated");
     vdev->compat = g_strdup("calxeda,hb-xgmac");
     vdev->num_compat = 1;
hw/vfio/ccw.c

@@ -51,17 +51,8 @@ struct VFIOCCWDevice {
     EventNotifier crw_notifier;
     EventNotifier req_notifier;
     bool force_orb_pfch;
-    bool warned_orb_pfch;
 };
 
-static inline void warn_once_pfch(VFIOCCWDevice *vcdev, SubchDev *sch,
-                                  const char *msg)
-{
-    warn_report_once_cond(&vcdev->warned_orb_pfch,
-                          "vfio-ccw (devno %x.%x.%04x): %s",
-                          sch->cssid, sch->ssid, sch->devno, msg);
-}
-
 static void vfio_ccw_compute_needs_reset(VFIODevice *vdev)
 {
     vdev->needs_reset = false;

@@ -83,7 +74,8 @@ static IOInstEnding vfio_ccw_handle_request(SubchDev *sch)
 
     if (!(sch->orb.ctrl0 & ORB_CTRL0_MASK_PFCH) && vcdev->force_orb_pfch) {
         sch->orb.ctrl0 |= ORB_CTRL0_MASK_PFCH;
-        warn_once_pfch(vcdev, sch, "PFCH flag forced");
+        warn_report_once("vfio-ccw (devno %x.%x.%04x): PFCH flag forced",
+                         sch->cssid, sch->ssid, sch->devno);
     }
 
     QEMU_BUILD_BUG_ON(sizeof(region->orb_area) != sizeof(ORB));

@@ -717,6 +709,21 @@ static void vfio_ccw_class_init(ObjectClass *klass, void *data)
     cdc->handle_halt = vfio_ccw_handle_halt;
     cdc->handle_clear = vfio_ccw_handle_clear;
     cdc->handle_store = vfio_ccw_handle_store;
+
+    object_class_property_set_description(klass, /* 2.10 */
+                                          "sysfsdev",
+                                          "Host sysfs path of assigned device");
+    object_class_property_set_description(klass, /* 3.0 */
+                                          "force-orb-pfch",
+                                          "Force unlimited prefetch");
+#ifdef CONFIG_IOMMUFD
+    object_class_property_set_description(klass, /* 9.0 */
+                                          "iommufd",
+                                          "Set host IOMMUFD backend device");
+#endif
+    object_class_property_set_description(klass, /* 9.2 */
+                                          "loadparm",
+                                          "Define which devices that can be used for booting");
 }
 
 static const TypeInfo vfio_ccw_info = {
hw/vfio/meson.build

@@ -5,6 +5,7 @@ vfio_ss.add(files(
   'container-base.c',
   'container.c',
   'migration.c',
+  'migration-multifd.c',
   'cpr.c',
 ))
 vfio_ss.add(when: 'CONFIG_PSERIES', if_true: files('spapr.c'))
hw/vfio/migration-multifd.c (new file, 679 lines)
@@ -0,0 +1,679 @@
/*
 * Multifd VFIO migration
 *
 * Copyright (C) 2024,2025 Oracle and/or its affiliates.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 */

#include "qemu/osdep.h"
#include "hw/vfio/vfio-common.h"
#include "migration/misc.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qemu/lockable.h"
#include "qemu/main-loop.h"
#include "qemu/thread.h"
#include "io/channel-buffer.h"
#include "migration/qemu-file.h"
#include "migration-multifd.h"
#include "trace.h"

#define VFIO_DEVICE_STATE_CONFIG_STATE (1)

#define VFIO_DEVICE_STATE_PACKET_VER_CURRENT (0)

typedef struct VFIODeviceStatePacket {
    uint32_t version;
    uint32_t idx;
    uint32_t flags;
    uint8_t data[0];
} QEMU_PACKED VFIODeviceStatePacket;

/* type safety */
typedef struct VFIOStateBuffers {
    GArray *array;
} VFIOStateBuffers;

typedef struct VFIOStateBuffer {
    bool is_present;
    char *data;
    size_t len;
} VFIOStateBuffer;

typedef struct VFIOMultifd {
    bool load_bufs_thread_running;
    bool load_bufs_thread_want_exit;

    VFIOStateBuffers load_bufs;
    QemuCond load_bufs_buffer_ready_cond;
    QemuCond load_bufs_thread_finished_cond;
    QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */
    uint32_t load_buf_idx;
    uint32_t load_buf_idx_last;
} VFIOMultifd;

static void vfio_state_buffer_clear(gpointer data)
{
    VFIOStateBuffer *lb = data;

    if (!lb->is_present) {
        return;
    }

    g_clear_pointer(&lb->data, g_free);
    lb->is_present = false;
}

static void vfio_state_buffers_init(VFIOStateBuffers *bufs)
{
    bufs->array = g_array_new(FALSE, TRUE, sizeof(VFIOStateBuffer));
    g_array_set_clear_func(bufs->array, vfio_state_buffer_clear);
}

static void vfio_state_buffers_destroy(VFIOStateBuffers *bufs)
{
    g_clear_pointer(&bufs->array, g_array_unref);
}

static void vfio_state_buffers_assert_init(VFIOStateBuffers *bufs)
{
    assert(bufs->array);
}

static unsigned int vfio_state_buffers_size_get(VFIOStateBuffers *bufs)
{
    return bufs->array->len;
}

static void vfio_state_buffers_size_set(VFIOStateBuffers *bufs,
                                        unsigned int size)
{
    g_array_set_size(bufs->array, size);
}

static VFIOStateBuffer *vfio_state_buffers_at(VFIOStateBuffers *bufs,
                                              unsigned int idx)
{
    return &g_array_index(bufs->array, VFIOStateBuffer, idx);
}

/* called with load_bufs_mutex locked */
static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev,
                                          VFIODeviceStatePacket *packet,
                                          size_t packet_total_size,
                                          Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    VFIOStateBuffer *lb;

    vfio_state_buffers_assert_init(&multifd->load_bufs);
    if (packet->idx >= vfio_state_buffers_size_get(&multifd->load_bufs)) {
        vfio_state_buffers_size_set(&multifd->load_bufs, packet->idx + 1);
    }

    lb = vfio_state_buffers_at(&multifd->load_bufs, packet->idx);
    if (lb->is_present) {
        error_setg(errp, "%s: state buffer %" PRIu32 " already filled",
                   vbasedev->name, packet->idx);
        return false;
    }

    assert(packet->idx >= multifd->load_buf_idx);

    lb->data = g_memdup2(&packet->data, packet_total_size - sizeof(*packet));
    lb->len = packet_total_size - sizeof(*packet);
    lb->is_present = true;

    return true;
}

bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size,
                                    Error **errp)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    VFIODeviceStatePacket *packet = (VFIODeviceStatePacket *)data;

    if (!vfio_multifd_transfer_enabled(vbasedev)) {
        error_setg(errp,
                   "%s: got device state packet but not doing multifd transfer",
                   vbasedev->name);
        return false;
    }

    assert(multifd);

    if (data_size < sizeof(*packet)) {
        error_setg(errp, "%s: packet too short at %zu (min is %zu)",
                   vbasedev->name, data_size, sizeof(*packet));
        return false;
    }

    if (packet->version != VFIO_DEVICE_STATE_PACKET_VER_CURRENT) {
        error_setg(errp, "%s: packet has unknown version %" PRIu32,
                   vbasedev->name, packet->version);
        return false;
    }

    if (packet->idx == UINT32_MAX) {
        error_setg(errp, "%s: packet index is invalid", vbasedev->name);
        return false;
    }

    trace_vfio_load_state_device_buffer_incoming(vbasedev->name, packet->idx);

    /*
     * Holding BQL here would violate the lock order and can cause
     * a deadlock once we attempt to lock load_bufs_mutex below.
     */
    assert(!bql_locked());

    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
        /* config state packet should be the last one in the stream */
        if (packet->flags & VFIO_DEVICE_STATE_CONFIG_STATE) {
            multifd->load_buf_idx_last = packet->idx;
        }

        if (!vfio_load_state_buffer_insert(vbasedev, packet, data_size,
                                           errp)) {
            return false;
        }

        qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
    }

    return true;
}

static bool vfio_load_bufs_thread_load_config(VFIODevice *vbasedev,
                                              Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    VFIOStateBuffer *lb;
    g_autoptr(QIOChannelBuffer) bioc = NULL;
    g_autoptr(QEMUFile) f_out = NULL, f_in = NULL;
    uint64_t mig_header;
    int ret;

    assert(multifd->load_buf_idx == multifd->load_buf_idx_last);
    lb = vfio_state_buffers_at(&multifd->load_bufs, multifd->load_buf_idx);
    assert(lb->is_present);

    bioc = qio_channel_buffer_new(lb->len);
    qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-load");

    f_out = qemu_file_new_output(QIO_CHANNEL(bioc));
    qemu_put_buffer(f_out, (uint8_t *)lb->data, lb->len);

    ret = qemu_fflush(f_out);
    if (ret) {
        error_setg(errp, "%s: load config state flush failed: %d",
                   vbasedev->name, ret);
        return false;
    }

    qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL);
    f_in = qemu_file_new_input(QIO_CHANNEL(bioc));

    mig_header = qemu_get_be64(f_in);
    if (mig_header != VFIO_MIG_FLAG_DEV_CONFIG_STATE) {
        error_setg(errp, "%s: expected FLAG_DEV_CONFIG_STATE but got %" PRIx64,
                   vbasedev->name, mig_header);
        return false;
    }

    bql_lock();
    ret = vfio_load_device_config_state(f_in, vbasedev);
    bql_unlock();

    if (ret < 0) {
        error_setg(errp, "%s: vfio_load_device_config_state() failed: %d",
                   vbasedev->name, ret);
        return false;
    }

    return true;
}

static VFIOStateBuffer *vfio_load_state_buffer_get(VFIOMultifd *multifd)
{
    VFIOStateBuffer *lb;
    unsigned int bufs_len;

    bufs_len = vfio_state_buffers_size_get(&multifd->load_bufs);
    if (multifd->load_buf_idx >= bufs_len) {
        assert(multifd->load_buf_idx == bufs_len);
        return NULL;
    }

    lb = vfio_state_buffers_at(&multifd->load_bufs,
                               multifd->load_buf_idx);
    if (!lb->is_present) {
        return NULL;
    }

    return lb;
}

static bool vfio_load_state_buffer_write(VFIODevice *vbasedev,
                                         VFIOStateBuffer *lb,
                                         Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    g_autofree char *buf = NULL;
    char *buf_cur;
    size_t buf_len;

    if (!lb->len) {
        return true;
    }

    trace_vfio_load_state_device_buffer_load_start(vbasedev->name,
                                                   multifd->load_buf_idx);

    /* lb might become re-allocated when we drop the lock */
    buf = g_steal_pointer(&lb->data);
    buf_cur = buf;
    buf_len = lb->len;
    while (buf_len > 0) {
        ssize_t wr_ret;
        int errno_save;

        /*
         * Loading data to the device takes a while,
         * drop the lock during this process.
         */
        qemu_mutex_unlock(&multifd->load_bufs_mutex);
        wr_ret = write(migration->data_fd, buf_cur, buf_len);
        errno_save = errno;
        qemu_mutex_lock(&multifd->load_bufs_mutex);

        if (wr_ret < 0) {
            error_setg(errp,
                       "%s: writing state buffer %" PRIu32 " failed: %d",
                       vbasedev->name, multifd->load_buf_idx, errno_save);
            return false;
        }

        assert(wr_ret <= buf_len);
        buf_len -= wr_ret;
        buf_cur += wr_ret;
    }

    trace_vfio_load_state_device_buffer_load_end(vbasedev->name,
                                                 multifd->load_buf_idx);

    return true;
}

static bool vfio_load_bufs_thread_want_exit(VFIOMultifd *multifd,
                                            bool *should_quit)
{
    return multifd->load_bufs_thread_want_exit || qatomic_read(should_quit);
}

/*
 * This thread is spawned by vfio_multifd_switchover_start() which gets
 * called upon encountering the switchover point marker in main migration
 * stream.
 *
 * It exits after either:
 * * completing loading the remaining device state and device config, OR:
 * * encountering some error while doing the above, OR:
 * * being forcefully aborted by the migration core by it setting should_quit
 *   or by vfio_load_cleanup_load_bufs_thread() setting
 *   multifd->load_bufs_thread_want_exit.
 */
static bool vfio_load_bufs_thread(void *opaque, bool *should_quit, Error **errp)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    bool ret = false;

    trace_vfio_load_bufs_thread_start(vbasedev->name);

    assert(multifd);
    QEMU_LOCK_GUARD(&multifd->load_bufs_mutex);

    assert(multifd->load_bufs_thread_running);

    while (true) {
        VFIOStateBuffer *lb;

        /*
         * Always check cancellation first after the buffer_ready wait below in
         * case that cond was signalled by vfio_load_cleanup_load_bufs_thread().
         */
        if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) {
            error_setg(errp, "operation cancelled");
            goto thread_exit;
        }

        assert(multifd->load_buf_idx <= multifd->load_buf_idx_last);

        lb = vfio_load_state_buffer_get(multifd);
        if (!lb) {
            trace_vfio_load_state_device_buffer_starved(vbasedev->name,
                                                        multifd->load_buf_idx);
            qemu_cond_wait(&multifd->load_bufs_buffer_ready_cond,
                           &multifd->load_bufs_mutex);
            continue;
        }

        if (multifd->load_buf_idx == multifd->load_buf_idx_last) {
            break;
        }

        if (multifd->load_buf_idx == 0) {
            trace_vfio_load_state_device_buffer_start(vbasedev->name);
        }

        if (!vfio_load_state_buffer_write(vbasedev, lb, errp)) {
            goto thread_exit;
        }

        if (multifd->load_buf_idx == multifd->load_buf_idx_last - 1) {
            trace_vfio_load_state_device_buffer_end(vbasedev->name);
        }

        multifd->load_buf_idx++;
    }

    if (!vfio_load_bufs_thread_load_config(vbasedev, errp)) {
        goto thread_exit;
    }

    ret = true;

thread_exit:
    /*
     * Notify possibly waiting vfio_load_cleanup_load_bufs_thread() that
     * this thread is exiting.
     */
    multifd->load_bufs_thread_running = false;
    qemu_cond_signal(&multifd->load_bufs_thread_finished_cond);

    trace_vfio_load_bufs_thread_end(vbasedev->name);

    return ret;
}

static VFIOMultifd *vfio_multifd_new(void)
{
    VFIOMultifd *multifd = g_new(VFIOMultifd, 1);

    vfio_state_buffers_init(&multifd->load_bufs);

    qemu_mutex_init(&multifd->load_bufs_mutex);

    multifd->load_buf_idx = 0;
    multifd->load_buf_idx_last = UINT32_MAX;
    qemu_cond_init(&multifd->load_bufs_buffer_ready_cond);

    multifd->load_bufs_thread_running = false;
    multifd->load_bufs_thread_want_exit = false;
    qemu_cond_init(&multifd->load_bufs_thread_finished_cond);

    return multifd;
}

/*
 * Terminates vfio_load_bufs_thread by setting
 * multifd->load_bufs_thread_want_exit and signalling all the conditions
 * the thread could be blocked on.
 *
 * Waits for the thread to signal that it had finished.
 */
static void vfio_load_cleanup_load_bufs_thread(VFIOMultifd *multifd)
{
    /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
    bql_unlock();
    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
        while (multifd->load_bufs_thread_running) {
            multifd->load_bufs_thread_want_exit = true;

            qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
            qemu_cond_wait(&multifd->load_bufs_thread_finished_cond,
                           &multifd->load_bufs_mutex);
        }
    }
    bql_lock();
}

static void vfio_multifd_free(VFIOMultifd *multifd)
{
    vfio_load_cleanup_load_bufs_thread(multifd);

    qemu_cond_destroy(&multifd->load_bufs_thread_finished_cond);
    vfio_state_buffers_destroy(&multifd->load_bufs);
    qemu_cond_destroy(&multifd->load_bufs_buffer_ready_cond);
    qemu_mutex_destroy(&multifd->load_bufs_mutex);

    g_free(multifd);
}

void vfio_multifd_cleanup(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    g_clear_pointer(&migration->multifd, vfio_multifd_free);
}

bool vfio_multifd_transfer_supported(void)
{
    return multifd_device_state_supported() &&
        migrate_send_switchover_start();
}

bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    return migration->multifd_transfer;
}

bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;

    /*
     * Make a copy of this setting at the start in case it is changed
     * mid-migration.
     */
    if (vbasedev->migration_multifd_transfer == ON_OFF_AUTO_AUTO) {
        migration->multifd_transfer = vfio_multifd_transfer_supported();
    } else {
        migration->multifd_transfer =
            vbasedev->migration_multifd_transfer == ON_OFF_AUTO_ON;
    }

    if (!vfio_multifd_transfer_enabled(vbasedev)) {
        /* Nothing further to check or do */
        return true;
    }

    if (!vfio_multifd_transfer_supported()) {
        error_setg(errp,
                   "%s: Multifd device transfer requested but unsupported in the current config",
                   vbasedev->name);
        return false;
    }

    if (alloc_multifd) {
        assert(!migration->multifd);
        migration->multifd = vfio_multifd_new();
    }

    return true;
}

void vfio_multifd_emit_dummy_eos(VFIODevice *vbasedev, QEMUFile *f)
{
    assert(vfio_multifd_transfer_enabled(vbasedev));

    /*
     * Emit dummy NOP data on the main migration channel since the actual
     * device state transfer is done via multifd channels.
     */
    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
}

static bool
vfio_save_complete_precopy_thread_config_state(VFIODevice *vbasedev,
                                               char *idstr,
                                               uint32_t instance_id,
                                               uint32_t idx,
                                               Error **errp)
{
    g_autoptr(QIOChannelBuffer) bioc = NULL;
    g_autoptr(QEMUFile) f = NULL;
    int ret;
    g_autofree VFIODeviceStatePacket *packet = NULL;
    size_t packet_len;

    bioc = qio_channel_buffer_new(0);
    qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-save");

    f = qemu_file_new_output(QIO_CHANNEL(bioc));

    if (vfio_save_device_config_state(f, vbasedev, errp)) {
        return false;
    }

    ret = qemu_fflush(f);
    if (ret) {
        error_setg(errp, "%s: save config state flush failed: %d",
                   vbasedev->name, ret);
        return false;
    }

    packet_len = sizeof(*packet) + bioc->usage;
    packet = g_malloc0(packet_len);
    packet->version = VFIO_DEVICE_STATE_PACKET_VER_CURRENT;
    packet->idx = idx;
    packet->flags = VFIO_DEVICE_STATE_CONFIG_STATE;
    memcpy(&packet->data, bioc->data, bioc->usage);

    if (!multifd_queue_device_state(idstr, instance_id,
                                    (char *)packet, packet_len)) {
        error_setg(errp, "%s: multifd config data queuing failed",
                   vbasedev->name);
        return false;
    }

    vfio_mig_add_bytes_transferred(packet_len);

    return true;
}

/*
 * This thread is spawned by the migration core directly via
 * .save_live_complete_precopy_thread SaveVMHandler.
 *
 * It exits after either:
 * * completing saving the remaining device state and device config, OR:
 * * encountering some error while doing the above, OR:
 * * being forcefully aborted by the migration core by
 *   multifd_device_state_save_thread_should_exit() returning true.
 */
bool
vfio_multifd_save_complete_precopy_thread(SaveLiveCompletePrecopyThreadData *d,
                                          Error **errp)
{
    VFIODevice *vbasedev = d->handler_opaque;
    VFIOMigration *migration = vbasedev->migration;
    bool ret = false;
    g_autofree VFIODeviceStatePacket *packet = NULL;
    uint32_t idx;

    if (!vfio_multifd_transfer_enabled(vbasedev)) {
        /* Nothing to do, vfio_save_complete_precopy() does the transfer. */
        return true;
    }

    trace_vfio_save_complete_precopy_thread_start(vbasedev->name,
                                                  d->idstr, d->instance_id);

    /* We reach here with device state STOP or STOP_COPY only */
    if (vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
                                 VFIO_DEVICE_STATE_STOP, errp)) {
        goto thread_exit;
    }

    packet = g_malloc0(sizeof(*packet) + migration->data_buffer_size);
    packet->version = VFIO_DEVICE_STATE_PACKET_VER_CURRENT;

    for (idx = 0; ; idx++) {
        ssize_t data_size;
        size_t packet_size;

        if (multifd_device_state_save_thread_should_exit()) {
            error_setg(errp, "operation cancelled");
            goto thread_exit;
        }

        data_size = read(migration->data_fd, &packet->data,
                         migration->data_buffer_size);
        if (data_size < 0) {
            error_setg(errp, "%s: reading state buffer %" PRIu32 " failed: %d",
                       vbasedev->name, idx, errno);
            goto thread_exit;
        } else if (data_size == 0) {
            break;
        }

        packet->idx = idx;
        packet_size = sizeof(*packet) + data_size;

        if (!multifd_queue_device_state(d->idstr, d->instance_id,
                                        (char *)packet, packet_size)) {
            error_setg(errp, "%s: multifd data queuing failed", vbasedev->name);
            goto thread_exit;
        }

        vfio_mig_add_bytes_transferred(packet_size);
    }

    if (!vfio_save_complete_precopy_thread_config_state(vbasedev,
                                                        d->idstr,
                                                        d->instance_id,
                                                        idx, errp)) {
        goto thread_exit;
    }

    ret = true;

thread_exit:
    trace_vfio_save_complete_precopy_thread_end(vbasedev->name, ret);

    return ret;
}

int vfio_multifd_switchover_start(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;

    assert(multifd);

    /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
    bql_unlock();
    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
        assert(!multifd->load_bufs_thread_running);
        multifd->load_bufs_thread_running = true;
    }
    bql_lock();

    qemu_loadvm_start_load_thread(vfio_load_bufs_thread, vbasedev);

    return 0;
}
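
[Editor's note] A restatement of the wire format these functions exchange, as a sketch; field meanings are taken from the VFIODeviceStatePacket definition above, and the ordering note follows from the load-thread logic:

    /*
     * Each multifd device-state message carries one VFIODeviceStatePacket:
     *
     *   uint32_t version;   VFIO_DEVICE_STATE_PACKET_VER_CURRENT (0)
     *   uint32_t idx;       buffer index, increasing from 0
     *   uint32_t flags;     VFIO_DEVICE_STATE_CONFIG_STATE marks the final
     *                       packet, which carries the config space
     *   uint8_t  data[];    raw device state read from the migration data_fd
     *
     * The receiver queues buffers by idx; the load thread writes them to its
     * data_fd strictly in order, and the config packet is always last.
     */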
hw/vfio/migration-multifd.h (new file, 34 lines)
@@ -0,0 +1,34 @@
/*
 * Multifd VFIO migration
 *
 * Copyright (C) 2024,2025 Oracle and/or its affiliates.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 */

#ifndef HW_VFIO_MIGRATION_MULTIFD_H
#define HW_VFIO_MIGRATION_MULTIFD_H

#include "hw/vfio/vfio-common.h"

bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp);
void vfio_multifd_cleanup(VFIODevice *vbasedev);

bool vfio_multifd_transfer_supported(void);
bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev);

bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size,
                                    Error **errp);

void vfio_multifd_emit_dummy_eos(VFIODevice *vbasedev, QEMUFile *f);

bool
vfio_multifd_save_complete_precopy_thread(SaveLiveCompletePrecopyThreadData *d,
                                          Error **errp);

int vfio_multifd_switchover_start(VFIODevice *vbasedev);

#endif
hw/vfio/migration.c

@@ -23,6 +23,7 @@
 #include "migration/qemu-file.h"
 #include "migration/register.h"
 #include "migration/blocker.h"
+#include "migration-multifd.h"
 #include "qapi/error.h"
 #include "qapi/qapi-events-vfio.h"
 #include "exec/ramlist.h"

@@ -31,23 +32,6 @@
 #include "trace.h"
 #include "hw/hw.h"
 
-/*
- * Flags to be used as unique delimiters for VFIO devices in the migration
- * stream. These flags are composed as:
- * 0xffffffff => MSB 32-bit all 1s
- * 0xef10     => Magic ID, represents emulated (virtual) function IO
- * 0x0000     => 16-bits reserved for flags
- *
- * The beginning of state information is marked by _DEV_CONFIG_STATE,
- * _DEV_SETUP_STATE, or _DEV_DATA_STATE, respectively. The end of a
- * certain state information is marked by _END_OF_STATE.
- */
-#define VFIO_MIG_FLAG_END_OF_STATE      (0xffffffffef100001ULL)
-#define VFIO_MIG_FLAG_DEV_CONFIG_STATE  (0xffffffffef100002ULL)
-#define VFIO_MIG_FLAG_DEV_SETUP_STATE   (0xffffffffef100003ULL)
-#define VFIO_MIG_FLAG_DEV_DATA_STATE    (0xffffffffef100004ULL)
-#define VFIO_MIG_FLAG_DEV_INIT_DATA_SENT (0xffffffffef100005ULL)
-
 /*
  * This is an arbitrary size based on migration of mlx5 devices, where typically
  * total device migration size is on the order of 100s of MB. Testing with

@@ -55,7 +39,7 @@
  */
 #define VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE (1 * MiB)
 
-static int64_t bytes_transferred;
+static unsigned long bytes_transferred;
 
 static const char *mig_state_to_str(enum vfio_device_mig_state state)
 {

@@ -136,10 +120,10 @@ static void vfio_migration_set_device_state(VFIODevice *vbasedev,
     vfio_migration_send_event(vbasedev);
 }
 
-static int vfio_migration_set_state(VFIODevice *vbasedev,
-                                    enum vfio_device_mig_state new_state,
-                                    enum vfio_device_mig_state recover_state,
-                                    Error **errp)
+int vfio_migration_set_state(VFIODevice *vbasedev,
+                             enum vfio_device_mig_state new_state,
+                             enum vfio_device_mig_state recover_state,
+                             Error **errp)
 {
     VFIOMigration *migration = vbasedev->migration;
     uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +

@@ -254,8 +238,7 @@ static int vfio_load_buffer(QEMUFile *f, VFIODevice *vbasedev,
     return ret;
 }
 
-static int vfio_save_device_config_state(QEMUFile *f, void *opaque,
-                                         Error **errp)
+int vfio_save_device_config_state(QEMUFile *f, void *opaque, Error **errp)
 {
     VFIODevice *vbasedev = opaque;
     int ret;

@@ -280,11 +263,13 @@ static int vfio_save_device_config_state(QEMUFile *f, void *opaque,
     return ret;
 }
 
-static int vfio_load_device_config_state(QEMUFile *f, void *opaque)
+int vfio_load_device_config_state(QEMUFile *f, void *opaque)
 {
     VFIODevice *vbasedev = opaque;
     uint64_t data;
 
+    trace_vfio_load_device_config_state_start(vbasedev->name);
+
     if (vbasedev->ops && vbasedev->ops->vfio_load_config) {
         int ret;
 

@@ -303,7 +288,7 @@ static int vfio_load_device_config_state(QEMUFile *f, void *opaque)
         return -EINVAL;
     }
 
-    trace_vfio_load_device_config_state(vbasedev->name);
+    trace_vfio_load_device_config_state_end(vbasedev->name);
     return qemu_file_get_error(f);
 }
 

@@ -389,7 +374,7 @@ static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration)
     qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE);
     qemu_put_be64(f, data_size);
     qemu_put_buffer(f, migration->data_buffer, data_size);
-    bytes_transferred += data_size;
+    vfio_mig_add_bytes_transferred(data_size);
 
     trace_vfio_save_block(migration->vbasedev->name, data_size);
 

@@ -467,6 +452,10 @@ static int vfio_save_setup(QEMUFile *f, void *opaque, Error **errp)
     uint64_t stop_copy_size = VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE;
     int ret;
 
+    if (!vfio_multifd_setup(vbasedev, false, errp)) {
+        return -EINVAL;
+    }
+
     qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE);
 
     vfio_query_stop_copy_size(vbasedev, &stop_copy_size);

@@ -523,6 +512,9 @@ static void vfio_save_cleanup(void *opaque)
     Error *local_err = NULL;
     int ret;
 
+    /* Currently a NOP, done for symmetry with load_cleanup() */
+    vfio_multifd_cleanup(vbasedev);
+
     /*
      * Changing device state from STOP_COPY to STOP can take time. Do it here,
      * after migration has completed, so it won't increase downtime.

@@ -645,6 +637,11 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
     int ret;
     Error *local_err = NULL;
 
+    if (vfio_multifd_transfer_enabled(vbasedev)) {
+        vfio_multifd_emit_dummy_eos(vbasedev, f);
+        return 0;
+    }
+
     trace_vfio_save_complete_precopy_start(vbasedev->name);
 
     /* We reach here with device state STOP or STOP_COPY only */

@@ -676,6 +673,11 @@ static void vfio_save_state(QEMUFile *f, void *opaque)
     Error *local_err = NULL;
     int ret;
 
+    if (vfio_multifd_transfer_enabled(vbasedev)) {
+        vfio_multifd_emit_dummy_eos(vbasedev, f);
+        return;
+    }
+
     ret = vfio_save_device_config_state(f, opaque, &local_err);
     if (ret) {
         error_prepend(&local_err,

@@ -688,15 +690,28 @@ static void vfio_save_state(QEMUFile *f, void *opaque)
 static int vfio_load_setup(QEMUFile *f, void *opaque, Error **errp)
 {
     VFIODevice *vbasedev = opaque;
+    VFIOMigration *migration = vbasedev->migration;
+    int ret;
 
-    return vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING,
-                                    vbasedev->migration->device_state, errp);
+    if (!vfio_multifd_setup(vbasedev, true, errp)) {
+        return -EINVAL;
+    }
+
+    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING,
+                                   migration->device_state, errp);
+    if (ret) {
+        return ret;
+    }
+
+    return 0;
 }
 
 static int vfio_load_cleanup(void *opaque)
 {
     VFIODevice *vbasedev = opaque;
 
+    vfio_multifd_cleanup(vbasedev);
+
     vfio_migration_cleanup(vbasedev);
     trace_vfio_load_cleanup(vbasedev->name);
 

@@ -717,6 +732,13 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
     switch (data) {
     case VFIO_MIG_FLAG_DEV_CONFIG_STATE:
     {
+        if (vfio_multifd_transfer_enabled(vbasedev)) {
+            error_report("%s: got DEV_CONFIG_STATE in main migration "
+                         "channel but doing multifd transfer",
+                         vbasedev->name);
+            return -EINVAL;
+        }
+
         return vfio_load_device_config_state(f, opaque);
     }
     case VFIO_MIG_FLAG_DEV_SETUP_STATE:

@@ -782,6 +804,17 @@ static bool vfio_switchover_ack_needed(void *opaque)
     return vfio_precopy_supported(vbasedev);
 }
 
+static int vfio_switchover_start(void *opaque)
+{
+    VFIODevice *vbasedev = opaque;
+
+    if (vfio_multifd_transfer_enabled(vbasedev)) {
+        return vfio_multifd_switchover_start(vbasedev);
+    }
+
+    return 0;
+}
+
 static const SaveVMHandlers savevm_vfio_handlers = {
     .save_prepare = vfio_save_prepare,
     .save_setup = vfio_save_setup,

@@ -796,6 +829,12 @@ static const SaveVMHandlers savevm_vfio_handlers = {
     .load_cleanup = vfio_load_cleanup,
     .load_state = vfio_load_state,
     .switchover_ack_needed = vfio_switchover_ack_needed,
+    /*
+     * Multifd support
+     */
+    .load_state_buffer = vfio_multifd_load_state_buffer,
+    .switchover_start = vfio_switchover_start,
+    .save_live_complete_precopy_thread = vfio_multifd_save_complete_precopy_thread,
 };
 
 /* ---------------------------------------------------------------------- */

@@ -1011,12 +1050,17 @@ static int vfio_block_migration(VFIODevice *vbasedev, Error *err, Error **errp)
 
 int64_t vfio_mig_bytes_transferred(void)
 {
-    return bytes_transferred;
+    return MIN(qatomic_read(&bytes_transferred), INT64_MAX);
 }
 
 void vfio_reset_bytes_transferred(void)
 {
-    bytes_transferred = 0;
+    qatomic_set(&bytes_transferred, 0);
 }
 
+void vfio_mig_add_bytes_transferred(unsigned long val)
+{
+    qatomic_add(&bytes_transferred, val);
+}
+
 /*
hw/vfio/pci.c
@ -2215,8 +2215,12 @@ static bool vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp)
|
|||
break;
|
||||
case PCI_CAP_ID_PM:
|
||||
vfio_check_pm_reset(vdev, pos);
|
||||
vdev->pm_cap = pos;
|
||||
ret = pci_add_capability(pdev, cap_id, pos, size, errp) >= 0;
|
||||
ret = pci_pm_init(pdev, pos, errp) >= 0;
|
||||
/*
|
||||
* PCI-core config space emulation needs write access to the power
|
||||
* state enabled for tracking BAR mapping relative to PM state.
|
||||
*/
|
||||
pci_set_word(pdev->wmask + pos + PCI_PM_CTRL, PCI_PM_CTRL_STATE_MASK);
|
||||
break;
|
||||
case PCI_CAP_ID_AF:
|
||||
vfio_check_af_flr(vdev, pos);
|
||||
|
@ -2406,26 +2410,6 @@ void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
|
|||
|
||||
vfio_disable_interrupts(vdev);
|
||||
|
||||
/* Make sure the device is in D0 */
|
||||
if (vdev->pm_cap) {
|
||||
uint16_t pmcsr;
|
||||
uint8_t state;
|
||||
|
||||
pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
|
||||
state = pmcsr & PCI_PM_CTRL_STATE_MASK;
|
||||
if (state) {
|
||||
pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
|
||||
vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2);
|
||||
/* vfio handles the necessary delay here */
|
||||
pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
|
||||
state = pmcsr & PCI_PM_CTRL_STATE_MASK;
|
||||
if (state) {
|
||||
error_report("vfio: Unable to power on device, stuck in D%d",
|
||||
state);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master.
|
||||
* Also put INTx Disable in known state.
|
||||
|
@ -2434,6 +2418,26 @@ void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
|
|||
cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
|
||||
PCI_COMMAND_INTX_DISABLE);
|
||||
vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
|
||||
|
||||
/* Make sure the device is in D0 */
|
||||
if (pdev->pm_cap) {
|
||||
uint16_t pmcsr;
|
||||
uint8_t state;
|
||||
|
||||
pmcsr = vfio_pci_read_config(pdev, pdev->pm_cap + PCI_PM_CTRL, 2);
|
||||
state = pmcsr & PCI_PM_CTRL_STATE_MASK;
|
||||
if (state) {
|
||||
pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
|
||||
vfio_pci_write_config(pdev, pdev->pm_cap + PCI_PM_CTRL, pmcsr, 2);
|
||||
/* vfio handles the necessary delay here */
|
||||
pmcsr = vfio_pci_read_config(pdev, pdev->pm_cap + PCI_PM_CTRL, 2);
|
||||
state = pmcsr & PCI_PM_CTRL_STATE_MASK;
|
||||
if (state) {
|
||||
error_report("vfio: Unable to power on device, stuck in D%d",
|
||||
state);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void vfio_pci_post_reset(VFIOPCIDevice *vdev)
|
||||
|
@ -3353,6 +3357,8 @@ static void vfio_instance_init(Object *obj)
|
|||
pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS;
|
||||
}
|
||||
|
||||
static PropertyInfo vfio_pci_migration_multifd_transfer_prop;
|
||||
|
||||
static const Property vfio_pci_dev_properties[] = {
|
||||
DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host),
|
||||
DEFINE_PROP_UUID_NODEFAULT("vf-token", VFIOPCIDevice, vf_token),
|
||||
|
@@ -3377,6 +3383,10 @@ static const Property vfio_pci_dev_properties[] = {
                    VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false),
    DEFINE_PROP_ON_OFF_AUTO("enable-migration", VFIOPCIDevice,
                            vbasedev.enable_migration, ON_OFF_AUTO_AUTO),
+    DEFINE_PROP("x-migration-multifd-transfer", VFIOPCIDevice,
+                vbasedev.migration_multifd_transfer,
+                vfio_pci_migration_multifd_transfer_prop, OnOffAuto,
+                .set_default = true, .defval.i = ON_OFF_AUTO_AUTO),
    DEFINE_PROP_BOOL("migration-events", VFIOPCIDevice,
                     vbasedev.migration_events, false),
    DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false),
@@ -3433,6 +3443,126 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
    pdc->exit = vfio_exitfn;
    pdc->config_read = vfio_pci_read_config;
    pdc->config_write = vfio_pci_write_config;
+
+    object_class_property_set_description(klass, /* 1.3 */
+            "host",
+            "Host PCI address [domain:]<bus:slot.function> of assigned device");
+    object_class_property_set_description(klass, /* 1.3 */
+            "x-intx-mmap-timeout-ms",
+            "When EOI is not provided by KVM/QEMU, wait time "
+            "(milliseconds) to re-enable device direct access "
+            "after INTx (DEBUG)");
+    object_class_property_set_description(klass, /* 1.5 */
+            "x-vga",
+            "Expose VGA address spaces for device");
+    object_class_property_set_description(klass, /* 2.3 */
+            "x-req",
+            "Disable device request notification support (DEBUG)");
+    object_class_property_set_description(klass, /* 2.4 and 2.5 */
+            "x-no-mmap",
+            "Disable MMAP for device. Allows to trace MMIO "
+            "accesses (DEBUG)");
+    object_class_property_set_description(klass, /* 2.5 */
+            "x-no-kvm-intx",
+            "Disable direct VFIO->KVM INTx injection. Allows to "
+            "trace INTx interrupts (DEBUG)");
+    object_class_property_set_description(klass, /* 2.5 */
+            "x-no-kvm-msi",
+            "Disable direct VFIO->KVM MSI injection. Allows to "
+            "trace MSI interrupts (DEBUG)");
+    object_class_property_set_description(klass, /* 2.5 */
+            "x-no-kvm-msix",
+            "Disable direct VFIO->KVM MSIx injection. Allows to "
+            "trace MSIx interrupts (DEBUG)");
+    object_class_property_set_description(klass, /* 2.5 */
+            "x-pci-vendor-id",
+            "Override PCI Vendor ID with provided value (DEBUG)");
+    object_class_property_set_description(klass, /* 2.5 */
+            "x-pci-device-id",
+            "Override PCI device ID with provided value (DEBUG)");
+    object_class_property_set_description(klass, /* 2.5 */
+            "x-pci-sub-vendor-id",
+            "Override PCI Subsystem Vendor ID with provided value "
+            "(DEBUG)");
+    object_class_property_set_description(klass, /* 2.5 */
+            "x-pci-sub-device-id",
+            "Override PCI Subsystem Device ID with provided value "
+            "(DEBUG)");
+    object_class_property_set_description(klass, /* 2.6 */
+            "sysfsdev",
+            "Host sysfs path of assigned device");
+    object_class_property_set_description(klass, /* 2.7 */
+            "x-igd-opregion",
+            "Expose host IGD OpRegion to guest");
+    object_class_property_set_description(klass, /* 2.7 (See c4c45e943e51) */
+            "x-igd-gms",
+            "Override IGD data stolen memory size (32MiB units)");
+    object_class_property_set_description(klass, /* 2.11 */
+            "x-nv-gpudirect-clique",
+            "Add NVIDIA GPUDirect capability indicating P2P DMA "
+            "clique for device [0-15]");
+    object_class_property_set_description(klass, /* 2.12 */
+            "x-no-geforce-quirks",
+            "Disable GeForce quirks (for NVIDIA Quadro/GRID/Tesla). "
+            "Improves performance");
+    object_class_property_set_description(klass, /* 2.12 */
+            "display",
+            "Enable display support for device, ex. vGPU");
+    object_class_property_set_description(klass, /* 2.12 */
+            "x-msix-relocation",
+            "Specify MSI-X MMIO relocation to the end of specified "
+            "existing BAR or new BAR to avoid virtualization overhead "
+            "due to adjacent device registers");
+    object_class_property_set_description(klass, /* 3.0 */
+            "x-no-kvm-ioeventfd",
+            "Disable registration of ioeventfds with KVM (DEBUG)");
+    object_class_property_set_description(klass, /* 3.0 */
+            "x-no-vfio-ioeventfd",
+            "Disable linking of KVM ioeventfds to VFIO ioeventfds "
+            "(DEBUG)");
+    object_class_property_set_description(klass, /* 3.1 */
+            "x-balloon-allowed",
+            "Override allowing ballooning with device (DEBUG, DANGER)");
+    object_class_property_set_description(klass, /* 3.2 */
+            "xres",
+            "Set X display resolution the vGPU should use");
+    object_class_property_set_description(klass, /* 3.2 */
+            "yres",
+            "Set Y display resolution the vGPU should use");
+    object_class_property_set_description(klass, /* 5.2 */
+            "x-pre-copy-dirty-page-tracking",
+            "Disable dirty pages tracking during iterative phase "
+            "(DEBUG)");
+    object_class_property_set_description(klass, /* 5.2, 8.0 non-experimental */
+            "enable-migration",
+            "Enable device migration. Also requires a host VFIO PCI "
+            "variant or mdev driver with migration support enabled");
+    object_class_property_set_description(klass, /* 8.1 */
+            "vf-token",
+            "Specify UUID VF token. Required for VF when PF is owned "
+            "by another VFIO driver");
+#ifdef CONFIG_IOMMUFD
+    object_class_property_set_description(klass, /* 9.0 */
+            "iommufd",
+            "Set host IOMMUFD backend device");
+#endif
+    object_class_property_set_description(klass, /* 9.1 */
+            "x-device-dirty-page-tracking",
+            "Disable device dirty page tracking and use "
+            "container-based dirty page tracking (DEBUG)");
+    object_class_property_set_description(klass, /* 9.1 */
+            "migration-events",
+            "Emit VFIO migration QAPI event when a VFIO device "
+            "changes its migration state. For management applications");
+    object_class_property_set_description(klass, /* 9.1 */
+            "skip-vsc-check",
+            "Skip config space check for Vendor Specific Capability. "
+            "Setting to false will enforce strict checking of VSC content "
+            "(DEBUG)");
+    object_class_property_set_description(klass, /* 10.0 */
+            "x-migration-multifd-transfer",
+            "Transfer this device state via "
+            "multifd channels when live migrating it");
}

static const TypeInfo vfio_pci_dev_info = {
@@ -3461,6 +3591,15 @@ static void vfio_pci_nohotplug_dev_class_init(ObjectClass *klass, void *data)

    device_class_set_props(dc, vfio_pci_dev_nohotplug_properties);
    dc->hotpluggable = false;
+
+    object_class_property_set_description(klass, /* 3.1 */
+            "ramfb",
+            "Enable ramfb to provide pre-boot graphics for devices "
+            "enabling display option");
+    object_class_property_set_description(klass, /* 8.2 */
+            "x-ramfb-migrate",
+            "Override default migration support for ramfb support "
+            "(DEBUG)");
}

static const TypeInfo vfio_pci_nohotplug_dev_info = {
@@ -3472,6 +3611,17 @@ static const TypeInfo vfio_pci_nohotplug_dev_info = {

static void register_vfio_pci_dev_type(void)
{
+    /*
+     * An ordinary ON_OFF_AUTO property isn't runtime-mutable, but the source
+     * VM can run for a long time before being migrated, so it is desirable
+     * to have a fallback mechanism to the old way of transferring VFIO
+     * device state if it turns out to be necessary.
+     * The following makes this type of property have the same mutability
+     * level as ordinary migration parameters.
+     */
+    vfio_pci_migration_multifd_transfer_prop = qdev_prop_on_off_auto;
+    vfio_pci_migration_multifd_transfer_prop.realized_set_allowed = true;
+
    type_register_static(&vfio_pci_dev_info);
    type_register_static(&vfio_pci_nohotplug_dev_info);
}
@@ -160,7 +160,6 @@ struct VFIOPCIDevice {
    int32_t bootindex;
    uint32_t igd_gms;
    OffAutoPCIBAR msix_relo;
-    uint8_t pm_cap;
    uint8_t nv_gpudirect_clique;
    bool pci_aer;
    bool req_enabled;
@@ -575,6 +575,7 @@ static void vfio_platform_realize(DeviceState *dev, Error **errp)
    VFIODevice *vbasedev = &vdev->vbasedev;
    int i;

+    warn_report("-device vfio-platform is deprecated");
    qemu_mutex_init(&vdev->intp_mutex);

    trace_vfio_platform_realize(vbasedev->sysfsdev ?
@@ -672,6 +673,30 @@ static void vfio_platform_class_init(ObjectClass *klass, void *data)
    dc->desc = "VFIO-based platform device assignment";
    sbc->connect_irq_notifier = vfio_start_irqfd_injection;
    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+
+    object_class_property_set_description(klass, /* 2.4 */
+            "host",
+            "Host device name of assigned device");
+    object_class_property_set_description(klass, /* 2.4 and 2.5 */
+            "x-no-mmap",
+            "Disable MMAP for device. Allows to trace MMIO "
+            "accesses (DEBUG)");
+    object_class_property_set_description(klass, /* 2.4 */
+            "mmap-timeout-ms",
+            "When EOI is not provided by KVM/QEMU, wait time "
+            "(milliseconds) to re-enable device direct access "
+            "after level interrupt (DEBUG)");
+    object_class_property_set_description(klass, /* 2.4 */
+            "x-irqfd",
+            "Allow disabling irqfd support (DEBUG)");
+    object_class_property_set_description(klass, /* 2.6 */
+            "sysfsdev",
+            "Host sysfs path of assigned device");
+#ifdef CONFIG_IOMMUFD
+    object_class_property_set_description(klass, /* 9.0 */
+            "iommufd",
+            "Set host IOMMUFD backend device");
+#endif
}

static const TypeInfo vfio_platform_dev_info = {
@@ -149,10 +149,19 @@ vfio_display_edid_update(uint32_t prefx, uint32_t prefy) "%ux%u"
vfio_display_edid_write_error(void) ""

# migration.c
vfio_load_bufs_thread_start(const char *name) " (%s)"
vfio_load_bufs_thread_end(const char *name) " (%s)"
vfio_load_cleanup(const char *name) " (%s)"
vfio_load_device_config_state(const char *name) " (%s)"
vfio_load_device_config_state_start(const char *name) " (%s)"
vfio_load_device_config_state_end(const char *name) " (%s)"
vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64
vfio_load_state_device_data(const char *name, uint64_t data_size, int ret) " (%s) size %"PRIu64" ret %d"
vfio_load_state_device_buffer_incoming(const char *name, uint32_t idx) " (%s) idx %"PRIu32
vfio_load_state_device_buffer_start(const char *name) " (%s)"
vfio_load_state_device_buffer_starved(const char *name, uint32_t idx) " (%s) idx %"PRIu32
vfio_load_state_device_buffer_load_start(const char *name, uint32_t idx) " (%s) idx %"PRIu32
vfio_load_state_device_buffer_load_end(const char *name, uint32_t idx) " (%s) idx %"PRIu32
vfio_load_state_device_buffer_end(const char *name) " (%s)"
vfio_migration_realize(const char *name) " (%s)"
vfio_migration_set_device_state(const char *name, const char *state) " (%s) state %s"
vfio_migration_set_state(const char *name, const char *new_state, const char *recover_state) " (%s) new state %s, recover state %s"

@@ -162,6 +171,8 @@ vfio_save_block_precopy_empty_hit(const char *name) " (%s)"
vfio_save_cleanup(const char *name) " (%s)"
vfio_save_complete_precopy(const char *name, int ret) " (%s) ret %d"
vfio_save_complete_precopy_start(const char *name) " (%s)"
+vfio_save_complete_precopy_thread_start(const char *name, const char *idstr, uint32_t instance_id) " (%s) idstr %s instance %"PRIu32
+vfio_save_complete_precopy_thread_end(const char *name, int ret) " (%s) ret %d"
vfio_save_device_config_state(const char *name) " (%s)"
vfio_save_iterate(const char *name, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy initial size %"PRIu64" precopy dirty size %"PRIu64
vfio_save_iterate_start(const char *name) " (%s)"
@@ -2204,14 +2204,11 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp)
        pos = pcie_endpoint_cap_init(pci_dev, 0);
        assert(pos > 0);

-        pos = pci_add_capability(pci_dev, PCI_CAP_ID_PM, 0,
-                                 PCI_PM_SIZEOF, errp);
+        pos = pci_pm_init(pci_dev, 0, errp);
        if (pos < 0) {
            return;
        }

-        pci_dev->exp.pm_cap = pos;
-
        /*
         * Indicates that this function complies with revision 1.2 of the
         * PCI Power Management Interface Specification.

@@ -2310,11 +2307,11 @@ static bool virtio_pci_no_soft_reset(PCIDevice *dev)
{
    uint16_t pmcsr;

-    if (!pci_is_express(dev) || !dev->exp.pm_cap) {
+    if (!pci_is_express(dev) || !(dev->cap_present & QEMU_PCI_CAP_PM)) {
        return false;
    }

-    pmcsr = pci_get_word(dev->config + dev->exp.pm_cap + PCI_PM_CTRL);
+    pmcsr = pci_get_word(dev->config + dev->pm_cap + PCI_PM_CTRL);

    /*
     * When No_Soft_Reset bit is set and the device

@@ -2343,7 +2340,7 @@ static void virtio_pci_bus_reset_hold(Object *obj, ResetType type)

    if (proxy->flags & VIRTIO_PCI_FLAG_INIT_PM) {
        pci_word_test_and_clear_mask(
-            dev->config + dev->exp.pm_cap + PCI_PM_CTRL,
+            dev->config + dev->pm_cap + PCI_PM_CTRL,
            PCI_PM_CTRL_STATE_MASK);
    }
}
@@ -54,7 +54,7 @@ typedef void QEMUBHFunc(void *opaque);
typedef bool AioPollFn(void *opaque);
typedef void IOHandler(void *opaque);

-struct ThreadPool;
+struct ThreadPoolAio;
struct LinuxAioState;
typedef struct LuringState LuringState;

@@ -207,7 +207,7 @@ struct AioContext {
    /* Thread pool for performing work and receiving completion callbacks.
     * Has its own locking.
     */
-    struct ThreadPool *thread_pool;
+    struct ThreadPoolAio *thread_pool;

#ifdef CONFIG_LINUX_AIO
    struct LinuxAioState *linux_aio;

@@ -500,8 +500,8 @@ void aio_set_event_notifier_poll(AioContext *ctx,
 */
GSource *aio_get_g_source(AioContext *ctx);

-/* Return the ThreadPool bound to this AioContext */
-struct ThreadPool *aio_get_thread_pool(AioContext *ctx);
+/* Return the ThreadPoolAio bound to this AioContext */
+struct ThreadPoolAio *aio_get_thread_pool(AioContext *ctx);

/* Setup the LinuxAioState bound to this AioContext */
struct LinuxAioState *aio_setup_linux_aio(AioContext *ctx, Error **errp);
@@ -24,20 +24,70 @@

typedef int ThreadPoolFunc(void *opaque);

-typedef struct ThreadPool ThreadPool;
+typedef struct ThreadPoolAio ThreadPoolAio;

-ThreadPool *thread_pool_new(struct AioContext *ctx);
-void thread_pool_free(ThreadPool *pool);
+ThreadPoolAio *thread_pool_new_aio(struct AioContext *ctx);
+void thread_pool_free_aio(ThreadPoolAio *pool);

/*
- * thread_pool_submit* API: submit I/O requests in the thread's
+ * thread_pool_submit_{aio,co} API: submit I/O requests in the thread's
 * current AioContext.
 */
BlockAIOCB *thread_pool_submit_aio(ThreadPoolFunc *func, void *arg,
                                   BlockCompletionFunc *cb, void *opaque);
int coroutine_fn thread_pool_submit_co(ThreadPoolFunc *func, void *arg);
-void thread_pool_submit(ThreadPoolFunc *func, void *arg);
+void thread_pool_update_params(ThreadPoolAio *pool, struct AioContext *ctx);

-void thread_pool_update_params(ThreadPool *pool, struct AioContext *ctx);
+/* ------------------------------------------- */
+/* Generic thread pool types and methods below */
+typedef struct ThreadPool ThreadPool;
+
+/* Create a new thread pool. Never returns NULL. */
+ThreadPool *thread_pool_new(void);
+
+/*
+ * Free the thread pool.
+ * Waits for all the previously submitted work to complete before performing
+ * the actual freeing operation.
+ */
+void thread_pool_free(ThreadPool *pool);
+
+/*
+ * Submit a new work (task) for the pool.
+ *
+ * @opaque_destroy is an optional GDestroyNotify for the @opaque argument
+ * to the work function at @func.
+ */
+void thread_pool_submit(ThreadPool *pool, ThreadPoolFunc *func,
+                        void *opaque, GDestroyNotify opaque_destroy);
+
+/*
+ * Submit a new work (task) for the pool, making sure it starts getting
+ * processed immediately, launching a new thread for it if necessary.
+ *
+ * @opaque_destroy is an optional GDestroyNotify for the @opaque argument
+ * to the work function at @func.
+ */
+void thread_pool_submit_immediate(ThreadPool *pool, ThreadPoolFunc *func,
+                                  void *opaque, GDestroyNotify opaque_destroy);
+
+/*
+ * Wait for all previously submitted work to complete before returning.
+ *
+ * Can be used as a barrier between two sets of tasks executed on a thread
+ * pool without destroying it or in a performance sensitive path where the
+ * caller just wants to wait for all tasks to complete while deferring the
+ * pool free operation for later, less performance sensitive time.
+ */
+void thread_pool_wait(ThreadPool *pool);
+
+/* Set the maximum number of threads in the pool. */
+bool thread_pool_set_max_threads(ThreadPool *pool, int max_threads);
+
+/*
+ * Adjust the maximum number of threads in the pool to give each task its
+ * own thread (exactly one thread per task).
+ */
+bool thread_pool_adjust_max_threads_to_work(ThreadPool *pool);
+
#endif
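A usage sketch for the generic pool API above; example_task, its argument, and the g_free destroy notify are illustrative, not taken from the patch:

/* Minimal sketch of the generic thread pool lifecycle. */
static int example_task(void *opaque)
{
    /* runs on one of the pool threads */
    return 0;
}

static void example_pool_usage(void)
{
    ThreadPool *pool = thread_pool_new();
    char *arg = g_strdup("task argument");

    thread_pool_submit(pool, example_task, arg, g_free);
    thread_pool_submit_immediate(pool, example_task, NULL, NULL);
    thread_pool_wait(pool);  /* barrier: all submitted work has completed */
    thread_pool_free(pool);  /* also waits for remaining work, then frees */
}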
@@ -216,6 +216,8 @@ enum {
    QEMU_PCIE_ARI_NEXTFN_1 = (1 << QEMU_PCIE_ARI_NEXTFN_1_BITNR),
#define QEMU_PCIE_EXT_TAG_BITNR 13
    QEMU_PCIE_EXT_TAG = (1 << QEMU_PCIE_EXT_TAG_BITNR),
+#define QEMU_PCI_CAP_PM_BITNR 14
+    QEMU_PCI_CAP_PM = (1 << QEMU_PCI_CAP_PM_BITNR),
};

typedef struct PCIINTxRoute {

@@ -676,5 +678,6 @@ static inline void pci_irq_deassert(PCIDevice *pci_dev)
MSIMessage pci_get_msi_message(PCIDevice *dev, int vector);
void pci_set_enabled(PCIDevice *pci_dev, bool state);
void pci_set_power(PCIDevice *pci_dev, bool state);
+int pci_pm_init(PCIDevice *pci_dev, uint8_t offset, Error **errp);

#endif
@@ -105,6 +105,9 @@ struct PCIDevice {
    /* Capability bits */
    uint32_t cap_present;

+    /* Offset of PM capability in config space */
+    uint8_t pm_cap;
+
    /* Offset of MSI-X capability in config space */
    uint8_t msix_cap;
@@ -58,8 +58,6 @@ typedef enum {
struct PCIExpressDevice {
    /* Offset of express capability in config space */
    uint8_t exp_cap;
-    /* Offset of Power Management capability in config space */
-    uint8_t pm_cap;

    /* SLOT */
    bool hpev_notified; /* Logical AND of conditions for hot plug event.
@@ -36,6 +36,23 @@

#define VFIO_MSG_PREFIX "vfio %s: "

+/*
+ * Flags to be used as unique delimiters for VFIO devices in the migration
+ * stream. These flags are composed as:
+ * 0xffffffff => MSB 32-bit all 1s
+ * 0xef10     => Magic ID, represents emulated (virtual) function IO
+ * 0x0000     => 16 bits reserved for flags
+ *
+ * The beginning of state information is marked by _DEV_CONFIG_STATE,
+ * _DEV_SETUP_STATE, or _DEV_DATA_STATE, respectively. The end of a
+ * certain state information is marked by _END_OF_STATE.
+ */
+#define VFIO_MIG_FLAG_END_OF_STATE      (0xffffffffef100001ULL)
+#define VFIO_MIG_FLAG_DEV_CONFIG_STATE  (0xffffffffef100002ULL)
+#define VFIO_MIG_FLAG_DEV_SETUP_STATE   (0xffffffffef100003ULL)
+#define VFIO_MIG_FLAG_DEV_DATA_STATE    (0xffffffffef100004ULL)
+#define VFIO_MIG_FLAG_DEV_INIT_DATA_SENT (0xffffffffef100005ULL)

enum {
    VFIO_DEVICE_TYPE_PCI = 0,
    VFIO_DEVICE_TYPE_PLATFORM = 1,
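A sketch of how these delimiters frame one subsection of the stream: the save side emits the start marker, the payload, then the end marker, and the load side scans for the same pair. save_payload() below is a hypothetical stand-in for the actual payload serialization:

/* Hedged sketch of the framing described above. */
static void save_framed_section(QEMUFile *f, void *payload_opaque)
{
    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_CONFIG_STATE); /* section start */
    save_payload(f, payload_opaque);                  /* hypothetical helper */
    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);     /* section end */
}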
@@ -61,6 +78,8 @@ typedef struct VFIORegion {
    uint8_t nr; /* cache the region number for debug */
} VFIORegion;

+typedef struct VFIOMultifd VFIOMultifd;
+
typedef struct VFIOMigration {
    struct VFIODevice *vbasedev;
    VMChangeStateEntry *vm_state;
@@ -72,6 +91,8 @@ typedef struct VFIOMigration {
    uint64_t mig_flags;
    uint64_t precopy_init_size;
    uint64_t precopy_dirty_size;
+    bool multifd_transfer;
+    VFIOMultifd *multifd;
    bool initial_data_sent;

    bool event_save_iterate_started;
@@ -133,6 +154,7 @@ typedef struct VFIODevice {
    bool no_mmap;
    bool ram_block_discard_allowed;
    OnOffAuto enable_migration;
+    OnOffAuto migration_multifd_transfer;
    bool migration_events;
    VFIODeviceOps *ops;
    unsigned int num_irqs;
@@ -274,9 +296,13 @@ void vfio_unblock_multiple_devices_migration(void);
bool vfio_viommu_preset(VFIODevice *vbasedev);
int64_t vfio_mig_bytes_transferred(void);
void vfio_reset_bytes_transferred(void);
+void vfio_mig_add_bytes_transferred(unsigned long val);
bool vfio_device_state_is_running(VFIODevice *vbasedev);
bool vfio_device_state_is_precopy(VFIODevice *vbasedev);

+int vfio_save_device_config_state(QEMUFile *f, void *opaque, Error **errp);
+int vfio_load_device_config_state(QEMUFile *f, void *opaque);
+
#ifdef CONFIG_LINUX
int vfio_get_region_info(VFIODevice *vbasedev, int index,
                         struct vfio_region_info **info);
@@ -291,6 +317,11 @@ struct vfio_info_cap_header *
vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id);
struct vfio_info_cap_header *
vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id);
+
+int vfio_migration_set_state(VFIODevice *vbasedev,
+                             enum vfio_device_mig_state new_state,
+                             enum vfio_device_mig_state recover_state,
+                             Error **errp);
#endif

bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp);
@@ -10,6 +10,10 @@
#ifndef QEMU_MIGRATION_CLIENT_OPTIONS_H
#define QEMU_MIGRATION_CLIENT_OPTIONS_H

+
+/* properties */
+bool migrate_send_switchover_start(void);
+
/* capabilities */

bool migrate_background_snapshot(void);
@@ -45,9 +45,12 @@ bool migrate_ram_is_ignored(RAMBlock *block);
/* migration/block.c */

AnnounceParameters *migrate_announce_params(void);

/* migration/savevm.c */

void dump_vmstate_json_to_file(FILE *out_fp);
+void qemu_loadvm_start_load_thread(MigrationLoadThread function,
+                                   void *opaque);

/* migration/migration.c */
void migration_object_init(void);
@@ -115,4 +118,26 @@ bool migrate_is_uri(const char *uri);
bool migrate_uri_parse(const char *uri, MigrationChannel **channel,
                       Error **errp);

+/* migration/multifd-device-state.c */
+typedef struct SaveLiveCompletePrecopyThreadData {
+    SaveLiveCompletePrecopyThreadHandler hdlr;
+    char *idstr;
+    uint32_t instance_id;
+    void *handler_opaque;
+} SaveLiveCompletePrecopyThreadData;
+
+bool multifd_queue_device_state(char *idstr, uint32_t instance_id,
+                                char *data, size_t len);
+bool multifd_device_state_supported(void);
+
+void
+multifd_spawn_device_state_save_thread(SaveLiveCompletePrecopyThreadHandler hdlr,
+                                       char *idstr, uint32_t instance_id,
+                                       void *opaque);
+
+bool multifd_device_state_save_thread_should_exit(void);
+
+void multifd_abort_device_state_save_threads(void);
+bool multifd_join_device_state_save_threads(void);
+
#endif
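A minimal sketch (not the actual QEMU call site) of how the migration core can drive these helpers: spawn one worker per device, then join them all at switchover. The handler and opaque values are whatever the device registered; the idstr string here is illustrative:

/* Hedged orchestration sketch built on the declarations above. */
static bool example_save_device_states(SaveLiveCompletePrecopyThreadHandler hdlr,
                                       void *device_opaque)
{
    if (!multifd_device_state_supported()) {
        return false; /* caller falls back to the main migration channel */
    }

    /* one worker per device; idstr/instance_id are illustrative */
    multifd_spawn_device_state_save_thread(hdlr, (char *)"example-device", 0,
                                           device_opaque);

    /* returns false if any worker recorded an error in MigrationState */
    return multifd_join_device_state_save_threads();
}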
@@ -69,7 +69,9 @@ typedef struct SaveVMHandlers {
    /**
     * @save_cleanup
     *
-     * Uninitializes the data structures on the source
+     * Uninitializes the data structures on the source.
+     * Note that this handler can be called even if save_setup
+     * wasn't called earlier.
     *
     * @opaque: data pointer passed to register_savevm_live()
     */
@@ -103,6 +105,25 @@ typedef struct SaveVMHandlers {
     */
    int (*save_live_complete_precopy)(QEMUFile *f, void *opaque);

+    /**
+     * @save_live_complete_precopy_thread (invoked in a separate thread)
+     *
+     * Called at the end of a precopy phase from a separate worker thread
+     * in configurations where multifd device state transfer is supported
+     * in order to perform asynchronous transmission of the remaining data
+     * in parallel with @save_live_complete_precopy handlers.
+     * When postcopy is enabled, devices that support postcopy will skip
+     * this step.
+     *
+     * @d: a #SaveLiveCompletePrecopyThreadData containing parameters that
+     * the handler may need, including this device section idstr and
+     * instance_id, and the opaque data pointer passed to
+     * register_savevm_live().
+     * @errp: pointer to Error*, to store an error if it happens.
+     *
+     * Returns true to indicate success and false for errors.
+     */
+    SaveLiveCompletePrecopyThreadHandler save_live_complete_precopy_thread;
+
    /* This runs both outside and inside the BQL. */

    /**
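A skeleton of such a handler, assuming a hypothetical device_read_chunk() that produces the remaining device state in chunks; the real VFIO implementation reads from the device's migration file descriptor instead:

/* Hedged handler sketch using the multifd device-state helpers. */
static bool example_complete_precopy_thread(SaveLiveCompletePrecopyThreadData *d,
                                            Error **errp)
{
    char buf[4096];
    size_t len;

    while (device_read_chunk(d->handler_opaque, buf, sizeof(buf), &len)) {
        if (multifd_device_state_save_thread_should_exit()) {
            return true; /* another worker failed; exit without an error */
        }
        if (!multifd_queue_device_state(d->idstr, d->instance_id, buf, len)) {
            error_setg(errp, "failed to queue device state buffer");
            return false;
        }
    }
    return true;
}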
@@ -227,6 +248,21 @@ typedef struct SaveVMHandlers {
     */
    int (*load_state)(QEMUFile *f, void *opaque, int version_id);

+    /**
+     * @load_state_buffer (invoked outside the BQL)
+     *
+     * Load device state buffer provided to qemu_loadvm_load_state_buffer().
+     *
+     * @opaque: data pointer passed to register_savevm_live()
+     * @buf: the data buffer to load
+     * @len: the data length in buffer
+     * @errp: pointer to Error*, to store an error if it happens.
+     *
+     * Returns true to indicate success and false for errors.
+     */
+    bool (*load_state_buffer)(void *opaque, char *buf, size_t len,
+                              Error **errp);
+
    /**
     * @load_setup
     *
@@ -244,6 +280,8 @@ typedef struct SaveVMHandlers {
     * @load_cleanup
     *
     * Uninitializes the data structures on the destination.
+     * Note that this handler can be called even if load_setup
+     * wasn't called earlier.
     *
     * @opaque: data pointer passed to register_savevm_live()
     *
|
|||
* otherwise
|
||||
*/
|
||||
bool (*switchover_ack_needed)(void *opaque);
|
||||
|
||||
/**
|
||||
* @switchover_start
|
||||
*
|
||||
* Notifies that the switchover has started. Called only on
|
||||
* the destination.
|
||||
*
|
||||
* @opaque: data pointer passed to register_savevm_live()
|
||||
*
|
||||
* Returns zero to indicate success and negative for error
|
||||
*/
|
||||
int (*switchover_start)(void *opaque);
|
||||
} SaveVMHandlers;
|
||||
|
||||
/**
|
||||
|
|
|
@@ -437,6 +437,8 @@ Error *error_copy(const Error *err);
 */
void error_free(Error *err);

+G_DEFINE_AUTOPTR_CLEANUP_FUNC(Error, error_free)
+
/*
 * Convenience function to assert that *@errp is set, then silently free it.
 */
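A minimal usage sketch of the new cleanup function: local_err is freed automatically when it goes out of scope, so early returns need no explicit error_free() call. some_operation() is illustrative:

/* Sketch of g_autoptr(Error) in an error-returning helper. */
static int example_autoptr_error(void)
{
    g_autoptr(Error) local_err = NULL;

    if (!some_operation(&local_err)) { /* hypothetical callee */
        return -1; /* local_err is freed automatically here */
    }
    return 0;
}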
@@ -108,6 +108,7 @@ typedef struct QString QString;
typedef struct RAMBlock RAMBlock;
typedef struct Range Range;
typedef struct ReservedRegion ReservedRegion;
+typedef struct SaveLiveCompletePrecopyThreadData SaveLiveCompletePrecopyThreadData;
typedef struct SHPCDevice SHPCDevice;
typedef struct SSIBus SSIBus;
typedef struct TCGCPUOps TCGCPUOps;

@@ -131,5 +132,9 @@ typedef struct IRQState *qemu_irq;
 * Function types
 */
typedef void (*qemu_irq_handler)(void *opaque, int n, int level);
+typedef bool (*MigrationLoadThread)(void *opaque, bool *should_quit,
+                                    Error **errp);
+typedef bool (*SaveLiveCompletePrecopyThreadHandler)(SaveLiveCompletePrecopyThreadData *d,
+                                                     Error **errp);

#endif /* QEMU_TYPEDEFS_H */
@@ -452,6 +452,9 @@ static int colo_do_checkpoint_transaction(MigrationState *s,
        bql_unlock();
        goto out;
    }

+    qemu_savevm_maybe_send_switchover_start(s->to_dst_file);
+
    /* Note: device state is saved into buffer */
    ret = qemu_save_device_state(fb);
@@ -25,6 +25,7 @@ system_ss.add(files(
  'migration-hmp-cmds.c',
  'migration.c',
  'multifd.c',
+  'multifd-device-state.c',
  'multifd-nocomp.c',
  'multifd-zlib.c',
  'multifd-zero-page.c',
@@ -46,6 +46,8 @@ static void migration_global_dump(Monitor *mon)
                   ms->send_configuration ? "on" : "off");
    monitor_printf(mon, "send-section-footer: %s\n",
                   ms->send_section_footer ? "on" : "off");
+    monitor_printf(mon, "send-switchover-start: %s\n",
+                   ms->send_switchover_start ? "on" : "off");
    monitor_printf(mon, "clear-bitmap-shift: %u\n",
                   ms->clear_bitmap_shift);
}
@@ -402,11 +402,24 @@ void migration_incoming_state_destroy(void)
    struct MigrationIncomingState *mis = migration_incoming_get_current();

    multifd_recv_cleanup();

    /*
     * RAM state cleanup needs to happen after multifd cleanup, because
     * multifd threads can use some of its states (receivedmap).
+     * The VFIO load_cleanup() implementation is BQL-sensitive. It requires
+     * that the BQL is NOT taken when recycling load threads, so that it
+     * won't block the load threads from making progress on address space
+     * modification operations.
+     *
+     * To make it work, we could try to not take BQL for all load_cleanup(),
+     * or conditionally unlock BQL only if bql_locked() in VFIO.
+     *
+     * Since most existing call sites take BQL for load_cleanup(), make
+     * it simple by taking BQL always as the rule, so that VFIO can unlock
+     * BQL and retake unconditionally.
     */
-    qemu_loadvm_state_cleanup();
+    assert(bql_locked());
+    qemu_loadvm_state_cleanup(mis);

    if (mis->to_src_file) {
        /* Tell source that we are done */
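A sketch of the unlock/retake rule stated in the comment above, as a hypothetical load_cleanup() implementation would apply it:

/* Hedged sketch; the real VFIO cleanup recycles its load threads here. */
static void example_load_cleanup(void)
{
    assert(bql_locked());  /* callers always hold the BQL by convention */

    bql_unlock();          /* let load threads progress on address space ops */
    /* ... join / recycle the load threads here ... */
    bql_lock();            /* retake unconditionally before returning */
}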
@@ -2891,6 +2904,8 @@ static bool migration_switchover_start(MigrationState *s, Error **errp)

    precopy_notify_complete();

+    qemu_savevm_maybe_send_switchover_start(s->to_dst_file);
+
    return true;
}
@@ -43,6 +43,7 @@
#define MIGRATION_THREAD_DST_PREEMPT "mig/dst/preempt"

struct PostcopyBlocktimeContext;
+typedef struct ThreadPool ThreadPool;

#define MIGRATION_RESUME_ACK_VALUE (1)
@@ -187,6 +188,10 @@ struct MigrationIncomingState {
    Coroutine *colo_incoming_co;
    QemuSemaphore colo_incoming_sem;

+    /* Optional load threads pool and its thread exit request flag */
+    ThreadPool *load_threads;
+    bool load_threads_abort;
+
    /*
     * PostcopyBlocktimeContext to keep information for postcopy
     * live migration, to calculate vCPU block time
@@ -400,6 +405,8 @@ struct MigrationState {
    bool send_configuration;
    /* Whether we send section footer during migration */
    bool send_section_footer;
+    /* Whether we send switchover start notification during migration */
+    bool send_switchover_start;

    /* Needed by postcopy-pause state */
    QemuSemaphore postcopy_pause_sem;

migration/multifd-device-state.c (new file, 212 lines)
@@ -0,0 +1,212 @@
/*
 * Multifd device state migration
 *
 * Copyright (C) 2024,2025 Oracle and/or its affiliates.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu/lockable.h"
#include "block/thread-pool.h"
#include "migration.h"
#include "migration/misc.h"
#include "multifd.h"
#include "options.h"

static struct {
    QemuMutex queue_job_mutex;

    MultiFDSendData *send_data;

    ThreadPool *threads;
    bool threads_abort;
} *multifd_send_device_state;

void multifd_device_state_send_setup(void)
{
    assert(!multifd_send_device_state);
    multifd_send_device_state = g_malloc(sizeof(*multifd_send_device_state));

    qemu_mutex_init(&multifd_send_device_state->queue_job_mutex);

    multifd_send_device_state->send_data = multifd_send_data_alloc();

    multifd_send_device_state->threads = thread_pool_new();
    multifd_send_device_state->threads_abort = false;
}

void multifd_device_state_send_cleanup(void)
{
    g_clear_pointer(&multifd_send_device_state->threads, thread_pool_free);
    g_clear_pointer(&multifd_send_device_state->send_data,
                    multifd_send_data_free);

    qemu_mutex_destroy(&multifd_send_device_state->queue_job_mutex);

    g_clear_pointer(&multifd_send_device_state, g_free);
}

void multifd_send_data_clear_device_state(MultiFDDeviceState_t *device_state)
{
    g_clear_pointer(&device_state->idstr, g_free);
    g_clear_pointer(&device_state->buf, g_free);
}

static void multifd_device_state_fill_packet(MultiFDSendParams *p)
{
    MultiFDDeviceState_t *device_state = &p->data->u.device_state;
    MultiFDPacketDeviceState_t *packet = p->packet_device_state;

    packet->hdr.flags = cpu_to_be32(p->flags);
    strncpy(packet->idstr, device_state->idstr, sizeof(packet->idstr) - 1);
    packet->idstr[sizeof(packet->idstr) - 1] = 0;
    packet->instance_id = cpu_to_be32(device_state->instance_id);
    packet->next_packet_size = cpu_to_be32(p->next_packet_size);
}

static void multifd_prepare_header_device_state(MultiFDSendParams *p)
{
    p->iov[0].iov_len = sizeof(*p->packet_device_state);
    p->iov[0].iov_base = p->packet_device_state;
    p->iovs_num++;
}

void multifd_device_state_send_prepare(MultiFDSendParams *p)
{
    MultiFDDeviceState_t *device_state = &p->data->u.device_state;

    assert(multifd_payload_device_state(p->data));

    multifd_prepare_header_device_state(p);

    assert(!(p->flags & MULTIFD_FLAG_SYNC));

    p->next_packet_size = device_state->buf_len;
    if (p->next_packet_size > 0) {
        p->iov[p->iovs_num].iov_base = device_state->buf;
        p->iov[p->iovs_num].iov_len = p->next_packet_size;
        p->iovs_num++;
    }

    p->flags |= MULTIFD_FLAG_NOCOMP | MULTIFD_FLAG_DEVICE_STATE;

    multifd_device_state_fill_packet(p);
}

bool multifd_queue_device_state(char *idstr, uint32_t instance_id,
                                char *data, size_t len)
{
    /* Device state submissions can come from multiple threads */
    QEMU_LOCK_GUARD(&multifd_send_device_state->queue_job_mutex);
    MultiFDDeviceState_t *device_state;

    assert(multifd_payload_empty(multifd_send_device_state->send_data));

    multifd_set_payload_type(multifd_send_device_state->send_data,
                             MULTIFD_PAYLOAD_DEVICE_STATE);
    device_state = &multifd_send_device_state->send_data->u.device_state;
    device_state->idstr = g_strdup(idstr);
    device_state->instance_id = instance_id;
    device_state->buf = g_memdup2(data, len);
    device_state->buf_len = len;

    if (!multifd_send(&multifd_send_device_state->send_data)) {
        multifd_send_data_clear(multifd_send_device_state->send_data);
        return false;
    }

    return true;
}

bool multifd_device_state_supported(void)
{
    return migrate_multifd() && !migrate_mapped_ram() &&
        migrate_multifd_compression() == MULTIFD_COMPRESSION_NONE;
}

static void multifd_device_state_save_thread_data_free(void *opaque)
{
    SaveLiveCompletePrecopyThreadData *data = opaque;

    g_clear_pointer(&data->idstr, g_free);
    g_free(data);
}

static int multifd_device_state_save_thread(void *opaque)
{
    SaveLiveCompletePrecopyThreadData *data = opaque;
    g_autoptr(Error) local_err = NULL;

    if (!data->hdlr(data, &local_err)) {
        MigrationState *s = migrate_get_current();

        /*
         * Can't call abort_device_state_save_threads() here since new
         * save threads could still be in process of being launched
         * (if, for example, the very first save thread launched exited
         * with an error very quickly).
         */

        assert(local_err);

        /*
         * In case of multiple save threads failing, which thread's error
         * we end up setting is purely arbitrary.
         */
        migrate_set_error(s, local_err);
    }

    return 0;
}

bool multifd_device_state_save_thread_should_exit(void)
{
    return qatomic_read(&multifd_send_device_state->threads_abort);
}

void
multifd_spawn_device_state_save_thread(SaveLiveCompletePrecopyThreadHandler hdlr,
                                       char *idstr, uint32_t instance_id,
                                       void *opaque)
{
    SaveLiveCompletePrecopyThreadData *data;

    assert(multifd_device_state_supported());
    assert(multifd_send_device_state);

    assert(!qatomic_read(&multifd_send_device_state->threads_abort));

    data = g_new(SaveLiveCompletePrecopyThreadData, 1);
    data->hdlr = hdlr;
    data->idstr = g_strdup(idstr);
    data->instance_id = instance_id;
    data->handler_opaque = opaque;

    thread_pool_submit_immediate(multifd_send_device_state->threads,
                                 multifd_device_state_save_thread,
                                 data,
                                 multifd_device_state_save_thread_data_free);
}

void multifd_abort_device_state_save_threads(void)
{
    assert(multifd_device_state_supported());

    qatomic_set(&multifd_send_device_state->threads_abort, true);
}

bool multifd_join_device_state_save_threads(void)
{
    MigrationState *s = migrate_get_current();

    assert(multifd_device_state_supported());

    thread_pool_wait(multifd_send_device_state->threads);

    return !migrate_has_error(s);
}
@@ -14,6 +14,7 @@
#include "exec/ramblock.h"
#include "exec/target_page.h"
#include "file.h"
+#include "migration-stats.h"
#include "multifd.h"
#include "options.h"
#include "qapi/error.h"
@@ -24,15 +25,14 @@

static MultiFDSendData *multifd_ram_send;

-size_t multifd_ram_payload_size(void)
+void multifd_ram_payload_alloc(MultiFDPages_t *pages)
{
-    uint32_t n = multifd_ram_page_count();
+    pages->offset = g_new0(ram_addr_t, multifd_ram_page_count());
+}

-    /*
-     * We keep an array of page offsets at the end of MultiFDPages_t,
-     * add space for it in the allocation.
-     */
-    return sizeof(MultiFDPages_t) + n * sizeof(ram_addr_t);
+void multifd_ram_payload_free(MultiFDPages_t *pages)
+{
+    g_clear_pointer(&pages->offset, g_free);
}

void multifd_ram_save_setup(void)
@@ -42,8 +42,7 @@ void multifd_ram_save_setup(void)

void multifd_ram_save_cleanup(void)
{
-    g_free(multifd_ram_send);
-    multifd_ram_send = NULL;
+    g_clear_pointer(&multifd_ram_send, multifd_send_data_free);
}

static void multifd_set_file_bitmap(MultiFDSendParams *p)
@@ -86,6 +85,13 @@ static void multifd_nocomp_send_cleanup(MultiFDSendParams *p, Error **errp)
    return;
}

+static void multifd_ram_prepare_header(MultiFDSendParams *p)
+{
+    p->iov[0].iov_len = p->packet_len;
+    p->iov[0].iov_base = p->packet;
+    p->iovs_num++;
+}
+
static void multifd_send_prepare_iovs(MultiFDSendParams *p)
{
    MultiFDPages_t *pages = &p->data->u.ram;
@@ -119,7 +125,7 @@ static int multifd_nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
         * Only !zerocopy needs the header in IOV; zerocopy will
         * send it separately.
         */
-        multifd_send_prepare_header(p);
+        multifd_ram_prepare_header(p);
    }

    multifd_send_prepare_iovs(p);
@@ -134,6 +140,8 @@ static int multifd_nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
        if (ret != 0) {
            return -1;
        }
+
+        stat64_add(&mig_stats.multifd_bytes, p->packet_len);
    }

    return 0;
@@ -432,7 +440,7 @@ int multifd_ram_flush_and_sync(QEMUFile *f)
bool multifd_send_prepare_common(MultiFDSendParams *p)
{
    MultiFDPages_t *pages = &p->data->u.ram;
-    multifd_send_prepare_header(p);
+    multifd_ram_prepare_header(p);
    multifd_send_zero_page_detect(p);

    if (!pages->normal_num) {
@@ -12,6 +12,7 @@

#include "qemu/osdep.h"
#include "qemu/cutils.h"
+#include "qemu/iov.h"
#include "qemu/rcu.h"
#include "exec/target_page.h"
#include "system/system.h"
@@ -19,8 +20,10 @@
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "file.h"
+#include "migration/misc.h"
#include "migration.h"
#include "migration-stats.h"
+#include "savevm.h"
#include "socket.h"
#include "tls.h"
#include "qemu-file.h"
@@ -49,6 +52,10 @@ typedef struct {

struct {
    MultiFDSendParams *params;
+
+    /* multifd_send() body is not thread safe, needs serialization */
+    QemuMutex multifd_send_mutex;
+
    /*
     * Global number of generated multifd packets.
     *
|
|||
|
||||
MultiFDSendData *multifd_send_data_alloc(void)
|
||||
{
|
||||
size_t max_payload_size, size_minus_payload;
|
||||
MultiFDSendData *new = g_new0(MultiFDSendData, 1);
|
||||
|
||||
/*
|
||||
* MultiFDPages_t has a flexible array at the end, account for it
|
||||
* when allocating MultiFDSendData. Use max() in case other types
|
||||
* added to the union in the future are larger than
|
||||
* (MultiFDPages_t + flex array).
|
||||
*/
|
||||
max_payload_size = MAX(multifd_ram_payload_size(), sizeof(MultiFDPayload));
|
||||
multifd_ram_payload_alloc(&new->u.ram);
|
||||
/* Device state allocates its payload on-demand */
|
||||
|
||||
/*
|
||||
* Account for any holes the compiler might insert. We can't pack
|
||||
* the structure because that misaligns the members and triggers
|
||||
* Waddress-of-packed-member.
|
||||
*/
|
||||
size_minus_payload = sizeof(MultiFDSendData) - sizeof(MultiFDPayload);
|
||||
return new;
|
||||
}
|
||||
|
||||
return g_malloc0(size_minus_payload + max_payload_size);
|
||||
void multifd_send_data_clear(MultiFDSendData *data)
|
||||
{
|
||||
if (multifd_payload_empty(data)) {
|
||||
return;
|
||||
}
|
||||
|
||||
switch (data->type) {
|
||||
case MULTIFD_PAYLOAD_DEVICE_STATE:
|
||||
multifd_send_data_clear_device_state(&data->u.device_state);
|
||||
break;
|
||||
default:
|
||||
/* Nothing to do */
|
||||
break;
|
||||
}
|
||||
|
||||
data->type = MULTIFD_PAYLOAD_NONE;
|
||||
}
|
||||
|
||||
void multifd_send_data_free(MultiFDSendData *data)
|
||||
{
|
||||
if (!data) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* This also free's device state payload */
|
||||
multifd_send_data_clear(data);
|
||||
|
||||
multifd_ram_payload_free(&data->u.ram);
|
||||
|
||||
g_free(data);
|
||||
}
|
||||
|
||||
static bool multifd_use_packets(void)
|
||||
|
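A lifecycle sketch for the reworked MultiFDSendData (illustrative, not a call site from the patch): the RAM offset array is preallocated, while the device-state payload only materializes when that payload type is set:

/* Hedged sketch of alloc / fill / clear / free with the new helpers. */
static void example_send_data_lifecycle(void)
{
    MultiFDSendData *data = multifd_send_data_alloc();

    multifd_set_payload_type(data, MULTIFD_PAYLOAD_DEVICE_STATE);
    data->u.device_state.idstr = g_strdup("example-device"); /* illustrative */
    data->u.device_state.buf = g_memdup2("state", 5);
    data->u.device_state.buf_len = 5;

    multifd_send_data_clear(data); /* frees idstr/buf, resets type to NONE */
    multifd_send_data_free(data);  /* clears again, frees RAM offsets + data */
}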
@@ -201,6 +228,7 @@ static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
    return msg.id;
}

+/* Fills a RAM multifd packet */
void multifd_send_fill_packet(MultiFDSendParams *p)
{
    MultiFDPacket_t *packet = p->packet;
@@ -209,10 +237,10 @@ void multifd_send_fill_packet(MultiFDSendParams *p)

    memset(packet, 0, p->packet_len);

-    packet->magic = cpu_to_be32(MULTIFD_MAGIC);
-    packet->version = cpu_to_be32(MULTIFD_VERSION);
+    packet->hdr.magic = cpu_to_be32(MULTIFD_MAGIC);
+    packet->hdr.version = cpu_to_be32(MULTIFD_VERSION);

-    packet->flags = cpu_to_be32(p->flags);
+    packet->hdr.flags = cpu_to_be32(p->flags);
    packet->next_packet_size = cpu_to_be32(p->next_packet_size);

    packet_num = qatomic_fetch_inc(&multifd_send_state->packet_num);
@@ -228,12 +256,12 @@ void multifd_send_fill_packet(MultiFDSendParams *p)
                       p->flags, p->next_packet_size);
}

-static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
+static int multifd_recv_unfill_packet_header(MultiFDRecvParams *p,
+                                             const MultiFDPacketHdr_t *hdr,
+                                             Error **errp)
{
-    const MultiFDPacket_t *packet = p->packet;
-    uint32_t magic = be32_to_cpu(packet->magic);
-    uint32_t version = be32_to_cpu(packet->version);
-    int ret = 0;
+    uint32_t magic = be32_to_cpu(hdr->magic);
+    uint32_t version = be32_to_cpu(hdr->version);

    if (magic != MULTIFD_MAGIC) {
        error_setg(errp, "multifd: received packet magic %x, expected %x",
@@ -247,10 +275,29 @@ static int multifd_recv_unfill_packet_header(MultiFDRecvParams *p,
        return -1;
    }

-    p->flags = be32_to_cpu(packet->flags);
+    p->flags = be32_to_cpu(hdr->flags);

    return 0;
}

+static int multifd_recv_unfill_packet_device_state(MultiFDRecvParams *p,
+                                                   Error **errp)
+{
+    MultiFDPacketDeviceState_t *packet = p->packet_dev_state;
+
+    packet->instance_id = be32_to_cpu(packet->instance_id);
+    p->next_packet_size = be32_to_cpu(packet->next_packet_size);
+
+    return 0;
+}
+
+static int multifd_recv_unfill_packet_ram(MultiFDRecvParams *p, Error **errp)
+{
+    const MultiFDPacket_t *packet = p->packet;
+    int ret = 0;
+
    p->next_packet_size = be32_to_cpu(packet->next_packet_size);
    p->packet_num = be64_to_cpu(packet->packet_num);
-    p->packets_recved++;

    /* Always unfill, old QEMUs (<9.0) send data along with SYNC */
    ret = multifd_ram_unfill_packet(p, errp);
@@ -261,6 +308,17 @@ static int multifd_recv_unfill_packet_ram(MultiFDRecvParams *p, Error **errp)
    return ret;
}

+static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
+{
+    p->packets_recved++;
+
+    if (p->flags & MULTIFD_FLAG_DEVICE_STATE) {
+        return multifd_recv_unfill_packet_device_state(p, errp);
+    }
+
+    return multifd_recv_unfill_packet_ram(p, errp);
+}
+
static bool multifd_send_should_exit(void)
{
    return qatomic_read(&multifd_send_state->exiting);
@@ -308,6 +366,8 @@ bool multifd_send(MultiFDSendData **send_data)
        return false;
    }

+    QEMU_LOCK_GUARD(&multifd_send_state->multifd_send_mutex);
+
    /* We wait here, until at least one channel is ready */
    qemu_sem_wait(&multifd_send_state->channels_ready);
@@ -459,9 +519,9 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp)
    qemu_sem_destroy(&p->sem_sync);
    g_free(p->name);
    p->name = NULL;
-    g_free(p->data);
-    p->data = NULL;
+    g_clear_pointer(&p->data, multifd_send_data_free);
    p->packet_len = 0;
+    g_clear_pointer(&p->packet_device_state, g_free);
    g_free(p->packet);
    p->packet = NULL;
    multifd_send_state->ops->send_cleanup(p, errp);
@@ -474,8 +534,10 @@ static void multifd_send_cleanup_state(void)
{
    file_cleanup_outgoing_migration();
    socket_cleanup_outgoing_migration();
+    multifd_device_state_send_cleanup();
    qemu_sem_destroy(&multifd_send_state->channels_created);
    qemu_sem_destroy(&multifd_send_state->channels_ready);
+    qemu_mutex_destroy(&multifd_send_state->multifd_send_mutex);
    g_free(multifd_send_state->params);
    multifd_send_state->params = NULL;
    g_free(multifd_send_state);
@@ -631,16 +693,32 @@ static void *multifd_send_thread(void *opaque)
         * qatomic_store_release() in multifd_send().
         */
        if (qatomic_load_acquire(&p->pending_job)) {
+            bool is_device_state = multifd_payload_device_state(p->data);
+            size_t total_size;
+
            p->flags = 0;
            p->iovs_num = 0;
            assert(!multifd_payload_empty(p->data));

-            ret = multifd_send_state->ops->send_prepare(p, &local_err);
-            if (ret != 0) {
-                break;
+            if (is_device_state) {
+                multifd_device_state_send_prepare(p);
+            } else {
+                ret = multifd_send_state->ops->send_prepare(p, &local_err);
+                if (ret != 0) {
+                    break;
+                }
            }

+            /*
+             * The packet header in the zerocopy RAM case is accounted for
+             * in multifd_nocomp_send_prepare() - where it is actually
+             * being sent.
+             */
+            total_size = iov_size(p->iov, p->iovs_num);
+
            if (migrate_mapped_ram()) {
+                assert(!is_device_state);
+
                ret = file_write_ramblock_iov(p->c, p->iov, p->iovs_num,
                                              &p->data->u.ram, &local_err);
            } else {
@@ -653,11 +731,10 @@ static void *multifd_send_thread(void *opaque)
                break;
            }

-            stat64_add(&mig_stats.multifd_bytes,
-                       (uint64_t)p->next_packet_size + p->packet_len);
+            stat64_add(&mig_stats.multifd_bytes, total_size);

            p->next_packet_size = 0;
-            multifd_set_payload_type(p->data, MULTIFD_PAYLOAD_NONE);
+            multifd_send_data_clear(p->data);

            /*
             * Making sure p->data is published before saying "we're
@@ -856,6 +933,7 @@ bool multifd_send_setup(void)
    thread_count = migrate_multifd_channels();
    multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
    multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
+    qemu_mutex_init(&multifd_send_state->multifd_send_mutex);
    qemu_sem_init(&multifd_send_state->channels_created, 0);
    qemu_sem_init(&multifd_send_state->channels_ready, 0);
    qatomic_set(&multifd_send_state->exiting, 0);
@@ -874,6 +952,9 @@ bool multifd_send_setup(void)
            p->packet_len = sizeof(MultiFDPacket_t)
                          + sizeof(uint64_t) * page_count;
            p->packet = g_malloc0(p->packet_len);
+            p->packet_device_state = g_malloc0(sizeof(*p->packet_device_state));
+            p->packet_device_state->hdr.magic = cpu_to_be32(MULTIFD_MAGIC);
+            p->packet_device_state->hdr.version = cpu_to_be32(MULTIFD_VERSION);
        }
        p->name = g_strdup_printf(MIGRATION_THREAD_SRC_MULTIFD, i);
        p->write_flags = 0;
@@ -909,6 +990,8 @@ bool multifd_send_setup(void)
        assert(p->iov);
    }

+    multifd_device_state_send_setup();
+
    return true;

err:
@@ -1048,6 +1131,7 @@ static void multifd_recv_cleanup_channel(MultiFDRecvParams *p)
    p->packet_len = 0;
    g_free(p->packet);
    p->packet = NULL;
+    g_clear_pointer(&p->packet_dev_state, g_free);
    g_free(p->normal);
    p->normal = NULL;
    g_free(p->zero);
@@ -1149,6 +1233,34 @@ void multifd_recv_sync_main(void)
    trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
}

+static int multifd_device_state_recv(MultiFDRecvParams *p, Error **errp)
+{
+    g_autofree char *dev_state_buf = NULL;
+    int ret;
+
+    dev_state_buf = g_malloc(p->next_packet_size);
+
+    ret = qio_channel_read_all(p->c, dev_state_buf, p->next_packet_size, errp);
+    if (ret != 0) {
+        return ret;
+    }
+
+    if (p->packet_dev_state->idstr[sizeof(p->packet_dev_state->idstr) - 1]
+        != 0) {
+        error_setg(errp, "unterminated multifd device state idstr");
+        return -1;
+    }
+
+    if (!qemu_loadvm_load_state_buffer(p->packet_dev_state->idstr,
+                                       p->packet_dev_state->instance_id,
+                                       dev_state_buf, p->next_packet_size,
+                                       errp)) {
+        ret = -1;
+    }
+
+    return ret;
+}
+
static void *multifd_recv_thread(void *opaque)
{
    MigrationState *s = migrate_get_current();
@@ -1165,14 +1277,19 @@ static void *multifd_recv_thread(void *opaque)
    }

    while (true) {
+        MultiFDPacketHdr_t hdr;
        uint32_t flags = 0;
+        bool is_device_state = false;
        bool has_data = false;
+        uint8_t *pkt_buf;
+        size_t pkt_len;
+
        p->normal_num = 0;

        if (use_packets) {
            struct iovec iov = {
-                .iov_base = (void *)p->packet,
-                .iov_len = p->packet_len
+                .iov_base = (void *)&hdr,
+                .iov_len = sizeof(hdr)
            };

            if (multifd_recv_should_exit()) {
@@ -1191,6 +1308,32 @@ static void *multifd_recv_thread(void *opaque)
                break;
            }

+            ret = multifd_recv_unfill_packet_header(p, &hdr, &local_err);
+            if (ret) {
+                break;
+            }
+
+            is_device_state = p->flags & MULTIFD_FLAG_DEVICE_STATE;
+            if (is_device_state) {
+                pkt_buf = (uint8_t *)p->packet_dev_state + sizeof(hdr);
+                pkt_len = sizeof(*p->packet_dev_state) - sizeof(hdr);
+            } else {
+                pkt_buf = (uint8_t *)p->packet + sizeof(hdr);
+                pkt_len = p->packet_len - sizeof(hdr);
+            }
+
+            ret = qio_channel_read_all_eof(p->c, (char *)pkt_buf, pkt_len,
+                                           &local_err);
+            if (!ret) {
+                /* EOF */
+                error_setg(&local_err,
+                           "multifd: unexpected EOF after packet header");
+                break;
+            }
+
+            if (ret == -1) {
+                break;
+            }
+
            qemu_mutex_lock(&p->mutex);
            ret = multifd_recv_unfill_packet(p, &local_err);
            if (ret) {
@@ -1202,12 +1345,17 @@ static void *multifd_recv_thread(void *opaque)
            /* recv methods don't know how to handle the SYNC flag */
            p->flags &= ~MULTIFD_FLAG_SYNC;

-            /*
-             * Even if it's a SYNC packet, this needs to be set
-             * because older QEMUs (<9.0) still send data along with
-             * the SYNC packet.
-             */
-            has_data = p->normal_num || p->zero_num;
+            if (is_device_state) {
+                has_data = p->next_packet_size > 0;
+            } else {
+                /*
+                 * Even if it's a SYNC packet, this needs to be set
+                 * because older QEMUs (<9.0) still send data along with
+                 * the SYNC packet.
+                 */
+                has_data = p->normal_num || p->zero_num;
+            }

            qemu_mutex_unlock(&p->mutex);
        } else {
            /*
@@ -1236,14 +1384,29 @@ static void *multifd_recv_thread(void *opaque)
        }

        if (has_data) {
-            ret = multifd_recv_state->ops->recv(p, &local_err);
+            if (is_device_state) {
+                assert(use_packets);
+                ret = multifd_device_state_recv(p, &local_err);
+            } else {
+                ret = multifd_recv_state->ops->recv(p, &local_err);
+            }
            if (ret != 0) {
                break;
            }
+        } else if (is_device_state) {
+            error_setg(&local_err,
+                       "multifd: received empty device state packet");
+            break;
        }

        if (use_packets) {
            if (flags & MULTIFD_FLAG_SYNC) {
+                if (is_device_state) {
+                    error_setg(&local_err,
+                               "multifd: received SYNC device state packet");
+                    break;
+                }
+
                qemu_sem_post(&multifd_recv_state->sem_sync);
                qemu_sem_wait(&p->sem_sync);
            }
@ -1312,6 +1475,7 @@ int multifd_recv_setup(Error **errp)
|
|||
p->packet_len = sizeof(MultiFDPacket_t)
|
||||
+ sizeof(uint64_t) * page_count;
|
||||
p->packet = g_malloc0(p->packet_len);
|
||||
p->packet_dev_state = g_malloc0(sizeof(*p->packet_dev_state));
|
||||
}
|
||||
p->name = g_strdup_printf(MIGRATION_THREAD_DST_MULTIFD, i);
|
||||
p->normal = g_new0(ram_addr_t, page_count);
|
||||
|
|
|
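The receive path above is the heart of the change: each channel first reads only the fixed-size MultiFDPacketHdr_t, checks MULTIFD_FLAG_DEVICE_STATE in its flags, and only then reads the tail of whichever packet type the header announced. A minimal standalone sketch of that two-phase read (simplified names, no byte-order handling or QIOChannel plumbing, so not QEMU code; read_exact() is an assumed helper):

    #include <stdint.h>
    #include <string.h>

    typedef struct { uint32_t magic, version, flags; } Hdr;

    #define FLAG_DEVICE_STATE (32 << 1)

    /* Stand-in for qio_channel_read_all_eof(); assumed helper. */
    extern int read_exact(int fd, void *buf, size_t len);

    static int recv_one_packet(int fd, void *ram_pkt, size_t ram_len,
                               void *dev_pkt, size_t dev_len, int *is_dev)
    {
        Hdr hdr;
        uint8_t *tail;
        size_t tail_len;

        /* Phase 1: the common header, identical for both packet types */
        if (read_exact(fd, &hdr, sizeof(hdr)) < 0) {
            return -1;
        }

        /* Phase 2: pick the buffer and size for the type-specific tail */
        *is_dev = !!(hdr.flags & FLAG_DEVICE_STATE);
        if (*is_dev) {
            tail = (uint8_t *)dev_pkt + sizeof(hdr);
            tail_len = dev_len - sizeof(hdr);
        } else {
            tail = (uint8_t *)ram_pkt + sizeof(hdr);
            tail_len = ram_len - sizeof(hdr);
        }
        /* keep the header together with the rest of the packet */
        memcpy(tail - sizeof(hdr), &hdr, sizeof(hdr));

        return read_exact(fd, tail, tail_len);
    }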
--- a/migration/multifd.h
+++ b/migration/multifd.h
@@ -62,6 +62,12 @@ MultiFDRecvData *multifd_get_recv_data(void);
 #define MULTIFD_FLAG_UADK (8 << 1)
 #define MULTIFD_FLAG_QATZIP (16 << 1)
 
+/*
+ * If set it means that this packet contains device state
+ * (MultiFDPacketDeviceState_t), not RAM data (MultiFDPacket_t).
+ */
+#define MULTIFD_FLAG_DEVICE_STATE (32 << 1)
+
 /* This value needs to be a multiple of qemu_target_page_size() */
 #define MULTIFD_PACKET_SIZE (512 * 1024)
 
@@ -69,6 +75,11 @@ typedef struct {
     uint32_t magic;
     uint32_t version;
     uint32_t flags;
+} __attribute__((packed)) MultiFDPacketHdr_t;
+
+typedef struct {
+    MultiFDPacketHdr_t hdr;
+
     /* maximum number of allocated pages */
     uint32_t pages_alloc;
     /* non zero pages */
@@ -89,14 +100,28 @@ typedef struct {
     uint64_t offset[];
 } __attribute__((packed)) MultiFDPacket_t;
 
+typedef struct {
+    MultiFDPacketHdr_t hdr;
+
+    char idstr[256];
+    uint32_t instance_id;
+
+    /* size of the next packet that contains the actual data */
+    uint32_t next_packet_size;
+} __attribute__((packed)) MultiFDPacketDeviceState_t;
+
 typedef struct {
     /* number of used pages */
     uint32_t num;
     /* number of normal pages */
     uint32_t normal_num;
+    /*
+     * Pointer to the ramblock. NOTE: it's caller's responsibility to make
+     * sure the pointer is always valid!
+     */
     RAMBlock *block;
-    /* offset of each page */
-    ram_addr_t offset[];
+    /* offset array of each page, managed by multifd */
+    ram_addr_t *offset;
 } MultiFDPages_t;
 
 struct MultiFDRecvData {
@@ -106,13 +131,22 @@ struct MultiFDRecvData {
     off_t file_offset;
 };
 
+typedef struct {
+    char *idstr;
+    uint32_t instance_id;
+    char *buf;
+    size_t buf_len;
+} MultiFDDeviceState_t;
+
 typedef enum {
     MULTIFD_PAYLOAD_NONE,
     MULTIFD_PAYLOAD_RAM,
+    MULTIFD_PAYLOAD_DEVICE_STATE,
 } MultiFDPayloadType;
 
-typedef union MultiFDPayload {
+typedef struct MultiFDPayload {
     MultiFDPages_t ram;
+    MultiFDDeviceState_t device_state;
 } MultiFDPayload;
 
 struct MultiFDSendData {
@@ -125,9 +159,17 @@ static inline bool multifd_payload_empty(MultiFDSendData *data)
     return data->type == MULTIFD_PAYLOAD_NONE;
 }
 
+static inline bool multifd_payload_device_state(MultiFDSendData *data)
+{
+    return data->type == MULTIFD_PAYLOAD_DEVICE_STATE;
+}
+
 static inline void multifd_set_payload_type(MultiFDSendData *data,
                                             MultiFDPayloadType type)
 {
+    assert(multifd_payload_empty(data));
+    assert(type != MULTIFD_PAYLOAD_NONE);
+
     data->type = type;
 }
 
@@ -174,8 +216,9 @@ typedef struct {
 
     /* thread local variables. No locking required */
 
-    /* pointer to the packet */
+    /* pointers to the possible packet types */
     MultiFDPacket_t *packet;
+    MultiFDPacketDeviceState_t *packet_device_state;
     /* size of the next packet that contains pages */
     uint32_t next_packet_size;
     /* packets sent through this channel */
@@ -222,8 +265,9 @@ typedef struct {
 
     /* thread local variables. No locking required */
 
-    /* pointer to the packet */
+    /* pointers to the possible packet types */
     MultiFDPacket_t *packet;
+    MultiFDPacketDeviceState_t *packet_dev_state;
     /* size of the next packet that contains pages */
     uint32_t next_packet_size;
     /* packets received through this channel */
@@ -333,16 +377,11 @@ bool multifd_send_prepare_common(MultiFDSendParams *p);
 void multifd_send_zero_page_detect(MultiFDSendParams *p);
 void multifd_recv_zero_page_process(MultiFDRecvParams *p);
 
-static inline void multifd_send_prepare_header(MultiFDSendParams *p)
-{
-    p->iov[0].iov_len = p->packet_len;
-    p->iov[0].iov_base = p->packet;
-    p->iovs_num++;
-}
-
 void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc);
 bool multifd_send(MultiFDSendData **send_data);
 MultiFDSendData *multifd_send_data_alloc(void);
+void multifd_send_data_clear(MultiFDSendData *data);
 void multifd_send_data_free(MultiFDSendData *data);
 
 static inline uint32_t multifd_ram_page_size(void)
 {
@@ -359,7 +398,16 @@ void multifd_ram_save_cleanup(void);
 int multifd_ram_flush_and_sync(QEMUFile *f);
 bool multifd_ram_sync_per_round(void);
 bool multifd_ram_sync_per_section(void);
 size_t multifd_ram_payload_size(void);
+void multifd_ram_payload_alloc(MultiFDPages_t *pages);
+void multifd_ram_payload_free(MultiFDPages_t *pages);
 void multifd_ram_fill_packet(MultiFDSendParams *p);
 int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp);
 
+void multifd_send_data_clear_device_state(MultiFDDeviceState_t *device_state);
+
+void multifd_device_state_send_setup(void);
+void multifd_device_state_send_cleanup(void);
+
+void multifd_device_state_send_prepare(MultiFDSendParams *p);
+
 #endif
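For reference, the send side fills the new MultiFDPacketDeviceState_t just like a common header plus a fixed payload. A hedged sketch with local stand-ins for the QEMU constants and cpu_to_be32() (htonl() used instead; the struct layout is copied from the header above, the magic/version values are illustrative):

    #include <stdint.h>
    #include <string.h>
    #include <arpa/inet.h>  /* htonl() as a stand-in for cpu_to_be32() */

    typedef struct { uint32_t magic, version, flags; }
        __attribute__((packed)) Hdr;
    typedef struct {
        Hdr hdr;
        char idstr[256];
        uint32_t instance_id;
        uint32_t next_packet_size;
    } __attribute__((packed)) DevStatePkt;

    /* Illustrative values; the real constants live in multifd.h. */
    #define PKT_MAGIC   0x11223344u
    #define PKT_VERSION 1u
    #define FLAG_DEVICE_STATE (32 << 1)

    static void fill_device_state_packet(DevStatePkt *pkt, const char *idstr,
                                         uint32_t instance_id,
                                         uint32_t data_len)
    {
        memset(pkt, 0, sizeof(*pkt));
        pkt->hdr.magic = htonl(PKT_MAGIC);
        pkt->hdr.version = htonl(PKT_VERSION);
        pkt->hdr.flags = htonl(FLAG_DEVICE_STATE);

        /* idstr is a fixed 256-byte field; keep it NUL-terminated */
        strncpy(pkt->idstr, idstr, sizeof(pkt->idstr) - 1);
        pkt->instance_id = htonl(instance_id);
        /* size of the follow-up packet carrying the actual device data */
        pkt->next_packet_size = htonl(data_len);
    }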
--- a/migration/options.c
+++ b/migration/options.c
@@ -93,6 +93,8 @@ const Property migration_properties[] = {
                      send_configuration, true),
     DEFINE_PROP_BOOL("send-section-footer", MigrationState,
                      send_section_footer, true),
+    DEFINE_PROP_BOOL("send-switchover-start", MigrationState,
+                     send_switchover_start, true),
     DEFINE_PROP_BOOL("multifd-flush-after-each-section", MigrationState,
                      multifd_flush_after_each_section, false),
     DEFINE_PROP_UINT8("x-clear-bitmap-shift", MigrationState,
@@ -209,6 +211,13 @@ bool migrate_auto_converge(void)
     return s->capabilities[MIGRATION_CAPABILITY_AUTO_CONVERGE];
 }
 
+bool migrate_send_switchover_start(void)
+{
+    MigrationState *s = migrate_get_current();
+
+    return s->send_switchover_start;
+}
+
 bool migrate_background_snapshot(void)
 {
     MigrationState *s = migrate_get_current();
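migrate_send_switchover_start() gates the new MIG_CMD_SWITCHOVER_START command on the wire, and the property defaults to true, so migrating to a QEMU that predates the command needs it pinned off for older machine types. A sketch of what such a compat entry would look like (the array name below is an assumption for illustration; the exact compat array is not part of this diff):

    #include "hw/qdev-core.h"   /* GlobalProperty */

    /* Assumed example: pre-10.0 machine types keep the old wire format
     * by forcing the new property off via machine compat properties. */
    static GlobalProperty compat_example[] = {
        { "migration", "send-switchover-start", "off" },
    };

The same effect can be had by hand with -global migration.send-switchover-start=off when testing against an older destination.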
--- a/migration/qemu-file.h
+++ b/migration/qemu-file.h
@@ -33,6 +33,8 @@ QEMUFile *qemu_file_new_input(QIOChannel *ioc);
 QEMUFile *qemu_file_new_output(QIOChannel *ioc);
 int qemu_fclose(QEMUFile *f);
 
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(QEMUFile, qemu_fclose)
+
 /*
  * qemu_file_transferred:
  *
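With the cleanup function registered above, any pointer declared g_autoptr(QEMUFile) is qemu_fclose()'d automatically when it goes out of scope, which removes manual close-on-error paths. A minimal sketch (the helper itself is illustrative, not from this diff):

    #include "qapi/error.h"
    #include "migration/qemu-file.h"
    #include "migration/savevm.h"   /* QEMU_VM_FILE_MAGIC */

    /* Hypothetical helper: every return path below closes the file. */
    static bool check_file_magic(QIOChannel *ioc, Error **errp)
    {
        g_autoptr(QEMUFile) f = qemu_file_new_input(ioc);

        if (qemu_get_be32(f) != QEMU_VM_FILE_MAGIC) {
            error_setg(errp, "not a migration stream");
            return false;   /* qemu_fclose(f) runs here... */
        }
        return true;        /* ...and here */
    }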
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -37,6 +37,7 @@
 #include "migration/register.h"
 #include "migration/global_state.h"
 #include "migration/channel-block.h"
+#include "multifd.h"
 #include "ram.h"
 #include "qemu-file.h"
 #include "savevm.h"
@@ -54,6 +55,7 @@
 #include "qemu/job.h"
 #include "qemu/main-loop.h"
 #include "block/snapshot.h"
+#include "block/thread-pool.h"
 #include "qemu/cutils.h"
 #include "io/channel-buffer.h"
 #include "io/channel-file.h"
@@ -90,6 +92,7 @@ enum qemu_vm_cmd {
     MIG_CMD_ENABLE_COLO, /* Enable COLO */
     MIG_CMD_POSTCOPY_RESUME, /* resume postcopy on dest */
     MIG_CMD_RECV_BITMAP, /* Request for recved bitmap on dst */
+    MIG_CMD_SWITCHOVER_START, /* Switchover start notification */
     MIG_CMD_MAX
 };
 
@@ -109,6 +112,7 @@ static struct mig_cmd_args {
     [MIG_CMD_POSTCOPY_RESUME] = { .len = 0, .name = "POSTCOPY_RESUME" },
     [MIG_CMD_PACKAGED] = { .len = 4, .name = "PACKAGED" },
     [MIG_CMD_RECV_BITMAP] = { .len = -1, .name = "RECV_BITMAP" },
+    [MIG_CMD_SWITCHOVER_START] = { .len = 0, .name = "SWITCHOVER_START" },
     [MIG_CMD_MAX] = { .len = -1, .name = "MAX" },
 };
 
@@ -129,6 +133,35 @@ static struct mig_cmd_args {
  * generic extendable format with an exception for two old entities.
  */
 
+/***********************************************************/
+/* Optional load threads pool support */
+
+static void qemu_loadvm_thread_pool_create(MigrationIncomingState *mis)
+{
+    assert(!mis->load_threads);
+    mis->load_threads = thread_pool_new();
+    mis->load_threads_abort = false;
+}
+
+static void qemu_loadvm_thread_pool_destroy(MigrationIncomingState *mis)
+{
+    qatomic_set(&mis->load_threads_abort, true);
+
+    bql_unlock(); /* Load threads might be waiting for BQL */
+    g_clear_pointer(&mis->load_threads, thread_pool_free);
+    bql_lock();
+}
+
+static bool qemu_loadvm_thread_pool_wait(MigrationState *s,
+                                         MigrationIncomingState *mis)
+{
+    bql_unlock(); /* Let load threads do work requiring BQL */
+    thread_pool_wait(mis->load_threads);
+    bql_lock();
+
+    return !migrate_has_error(s);
+}
+
 /***********************************************************/
 /* savevm/loadvm support */
@@ -1201,6 +1234,19 @@ void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name)
     qemu_savevm_command_send(f, MIG_CMD_RECV_BITMAP, len + 1, (uint8_t *)buf);
 }
 
+static void qemu_savevm_send_switchover_start(QEMUFile *f)
+{
+    trace_savevm_send_switchover_start();
+    qemu_savevm_command_send(f, MIG_CMD_SWITCHOVER_START, 0, NULL);
+}
+
+void qemu_savevm_maybe_send_switchover_start(QEMUFile *f)
+{
+    if (migrate_send_switchover_start()) {
+        qemu_savevm_send_switchover_start(f);
+    }
+}
+
 bool qemu_savevm_state_blocked(Error **errp)
 {
     SaveStateEntry *se;
@@ -1482,6 +1528,24 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy)
     int64_t start_ts_each, end_ts_each;
     SaveStateEntry *se;
     int ret;
+    bool multifd_device_state = multifd_device_state_supported();
+
+    if (multifd_device_state) {
+        QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+            SaveLiveCompletePrecopyThreadHandler hdlr;
+
+            if (!se->ops || (in_postcopy && se->ops->has_postcopy &&
+                             se->ops->has_postcopy(se->opaque)) ||
+                !se->ops->save_live_complete_precopy_thread) {
+                continue;
+            }
+
+            hdlr = se->ops->save_live_complete_precopy_thread;
+            multifd_spawn_device_state_save_thread(hdlr,
+                                                   se->idstr, se->instance_id,
+                                                   se->opaque);
+        }
+    }
 
     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
         if (!se->ops ||
@@ -1507,16 +1571,35 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy)
         save_section_footer(f, se);
         if (ret < 0) {
             qemu_file_set_error(f, ret);
-            return -1;
+            goto ret_fail_abort_threads;
         }
         end_ts_each = qemu_clock_get_us(QEMU_CLOCK_REALTIME);
         trace_vmstate_downtime_save("iterable", se->idstr, se->instance_id,
                                     end_ts_each - start_ts_each);
     }
 
+    if (multifd_device_state) {
+        if (migrate_has_error(migrate_get_current())) {
+            multifd_abort_device_state_save_threads();
+        }
+
+        if (!multifd_join_device_state_save_threads()) {
+            qemu_file_set_error(f, -EINVAL);
+            return -1;
+        }
+    }
+
     trace_vmstate_downtime_checkpoint("src-iterable-saved");
 
     return 0;
+
+ret_fail_abort_threads:
+    if (multifd_device_state) {
+        multifd_abort_device_state_save_threads();
+        multifd_join_device_state_save_threads();
+    }
+
+    return -1;
 }
 
 int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
@@ -1687,6 +1770,7 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
 
     ret = qemu_file_get_error(f);
     if (ret == 0) {
+        qemu_savevm_maybe_send_switchover_start(f);
         qemu_savevm_state_complete_precopy(f, false);
         ret = qemu_file_get_error(f);
     }
@@ -1970,6 +2054,8 @@ static void *postcopy_ram_listen_thread(void *opaque)
      * in qemu_file, and thus we must be blocking now.
      */
     qemu_file_set_blocking(f, true);
 
     /* TODO: sanity check that only postcopiable data will be loaded here */
     load_res = qemu_loadvm_state_main(f, mis);
 
     /*
@@ -2030,7 +2116,9 @@ static void *postcopy_ram_listen_thread(void *opaque)
      * (If something broke then qemu will have to exit anyway since it's
      * got a bad migration state).
      */
+    bql_lock();
     migration_incoming_state_destroy();
+    bql_unlock();
 
     rcu_unregister_thread();
     mis->have_listen_thread = false;
@@ -2383,6 +2471,26 @@ static int loadvm_process_enable_colo(MigrationIncomingState *mis)
     return ret;
 }
 
+static int loadvm_postcopy_handle_switchover_start(void)
+{
+    SaveStateEntry *se;
+
+    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+        int ret;
+
+        if (!se->ops || !se->ops->switchover_start) {
+            continue;
+        }
+
+        ret = se->ops->switchover_start(se->opaque);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    return 0;
+}
+
 /*
  * Process an incoming 'QEMU_VM_COMMAND'
  * 0           just a normal return
@@ -2481,6 +2589,9 @@ static int loadvm_process_command(QEMUFile *f)
 
     case MIG_CMD_ENABLE_COLO:
         return loadvm_process_enable_colo(mis);
+
+    case MIG_CMD_SWITCHOVER_START:
+        return loadvm_postcopy_handle_switchover_start();
     }
 
     return 0;
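On the destination, loadvm_postcopy_handle_switchover_start() simply walks the handlers and invokes the new optional switchover_start hook, so a device opts in by filling that slot in its SaveVMHandlers. A hedged sketch of such a hook (the device type, semaphore field and handler-struct name are assumptions for illustration):

    #include "migration/register.h"   /* SaveVMHandlers */
    #include "qemu/thread.h"

    typedef struct {
        QemuSemaphore switchover_sem;   /* assumed: a load thread waits on it */
    } MyDevState;

    /* Matches the se->ops->switchover_start(se->opaque) call above. */
    static int mydev_switchover_start(void *opaque)
    {
        MyDevState *dev = opaque;

        qemu_sem_post(&dev->switchover_sem);
        return 0;
    }

    static const SaveVMHandlers mydev_savevm_handlers = {
        .switchover_start = mydev_switchover_start,
        /* ...the rest of the device's handlers... */
    };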
@@ -2740,16 +2851,68 @@ static int qemu_loadvm_state_setup(QEMUFile *f, Error **errp)
     return 0;
 }
 
-void qemu_loadvm_state_cleanup(void)
+struct LoadThreadData {
+    MigrationLoadThread function;
+    void *opaque;
+};
+
+static int qemu_loadvm_load_thread(void *thread_opaque)
+{
+    struct LoadThreadData *data = thread_opaque;
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    g_autoptr(Error) local_err = NULL;
+
+    if (!data->function(data->opaque, &mis->load_threads_abort, &local_err)) {
+        MigrationState *s = migrate_get_current();
+
+        /*
+         * Can't set load_threads_abort here since processing of main migration
+         * channel data could still be happening, resulting in launching of new
+         * load threads.
+         */
+
+        assert(local_err);
+
+        /*
+         * In case of multiple load threads failing, which thread's error
+         * we end up reporting is purely arbitrary.
+         */
+        migrate_set_error(s, local_err);
+    }
+
+    return 0;
+}
+
+void qemu_loadvm_start_load_thread(MigrationLoadThread function,
+                                   void *opaque)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    struct LoadThreadData *data;
+
+    /* We only set it from this thread so it's okay to read it directly */
+    assert(!mis->load_threads_abort);
+
+    data = g_new(struct LoadThreadData, 1);
+    data->function = function;
+    data->opaque = opaque;
+
+    thread_pool_submit_immediate(mis->load_threads, qemu_loadvm_load_thread,
+                                 data, g_free);
+}
+
+void qemu_loadvm_state_cleanup(MigrationIncomingState *mis)
 {
     SaveStateEntry *se;
 
     trace_loadvm_state_cleanup();
 
     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
         if (se->ops && se->ops->load_cleanup) {
             se->ops->load_cleanup(se->opaque);
         }
     }
+
+    qemu_loadvm_thread_pool_destroy(mis);
 }
 
 /* Return true if we should continue the migration, or false. */
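qemu_loadvm_start_load_thread() wraps the caller's function in a pool work item; the function receives the shared load_threads_abort flag as its should_quit pointer and reports failure through errp, which qemu_loadvm_load_thread() then propagates via migrate_set_error(). A hedged sketch of a worker matching that contract (the queue type and helper are assumptions):

    #include "qapi/error.h"
    #include "qemu/atomic.h"
    #include "migration/misc.h"   /* MigrationLoadThread */

    typedef struct MyDevQueue MyDevQueue;   /* assumed queue type */
    /* Assumed helper: >0 = applied one buffer, 0 = done, <0 = error (sets errp) */
    extern int mydev_apply_next_buffer(MyDevQueue *q, Error **errp);

    static bool mydev_load_thread(void *opaque, bool *should_quit, Error **errp)
    {
        MyDevQueue *q = opaque;

        while (!qatomic_read(should_quit)) {
            int ret = mydev_apply_next_buffer(q, errp);

            if (ret < 0) {
                return false;   /* migrate_set_error() happens in the wrapper */
            }
            if (ret == 0) {
                return true;    /* all buffers applied */
            }
        }
        return true;            /* told to quit; not an error of our own */
    }

Such a worker would be started from the device's load path with qemu_loadvm_start_load_thread(mydev_load_thread, q); the abort flag set unconditionally in qemu_loadvm_state() guarantees no new threads start once the main stream has finished.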
@@ -2900,6 +3063,7 @@ out:
 
 int qemu_loadvm_state(QEMUFile *f)
 {
+    MigrationState *s = migrate_get_current();
     MigrationIncomingState *mis = migration_incoming_get_current();
     Error *local_err = NULL;
     int ret;
@@ -2909,6 +3073,8 @@ int qemu_loadvm_state(QEMUFile *f)
         return -EINVAL;
     }
 
+    qemu_loadvm_thread_pool_create(mis);
+
     ret = qemu_loadvm_state_header(f);
     if (ret) {
         return ret;
@@ -2940,12 +3106,18 @@ int qemu_loadvm_state(QEMUFile *f)
 
     /* When reaching here, it must be precopy */
     if (ret == 0) {
-        if (migrate_has_error(migrate_get_current())) {
+        if (migrate_has_error(migrate_get_current()) ||
+            !qemu_loadvm_thread_pool_wait(s, mis)) {
             ret = -EINVAL;
         } else {
             ret = qemu_file_get_error(f);
         }
     }
+    /*
+     * Set this flag unconditionally so we'll catch further attempts to
+     * start additional threads via an appropriate assert()
+     */
+    qatomic_set(&mis->load_threads_abort, true);
 
     /*
      * Try to read in the VMDESC section as well, so that dumping tools that
@@ -3021,6 +3193,29 @@ int qemu_loadvm_approve_switchover(void)
     return migrate_send_rp_switchover_ack(mis);
 }
 
+bool qemu_loadvm_load_state_buffer(const char *idstr, uint32_t instance_id,
+                                   char *buf, size_t len, Error **errp)
+{
+    SaveStateEntry *se;
+
+    se = find_se(idstr, instance_id);
+    if (!se) {
+        error_setg(errp,
+                   "Unknown idstr %s or instance id %u for load state buffer",
+                   idstr, instance_id);
+        return false;
+    }
+
+    if (!se->ops || !se->ops->load_state_buffer) {
+        error_setg(errp,
+                   "idstr %s / instance %u has no load state buffer operation",
+                   idstr, instance_id);
+        return false;
+    }
+
+    return se->ops->load_state_buffer(se->opaque, buf, len, errp);
+}
+
 bool save_snapshot(const char *name, bool overwrite, const char *vmstate,
                    bool has_devices, strList *devices, Error **errp)
 {
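qemu_loadvm_load_state_buffer() is the multifd entry point into a device: each received device-state buffer is routed by (idstr, instance_id) to the owning handler's load_state_buffer op. A hedged sketch of an implementation that just queues the buffer for a load thread (the queue type and push helper are assumptions; a real handler must copy buf, since the caller may reuse it):

    #include "qapi/error.h"
    #include "migration/register.h"   /* SaveVMHandlers */

    typedef struct MyDevQueue MyDevQueue;   /* assumed queue type */
    extern bool mydev_queue_push(MyDevQueue *q, const char *buf, size_t len);

    /* Matches se->ops->load_state_buffer(se->opaque, buf, len, errp) above. */
    static bool mydev_load_state_buffer(void *opaque, char *buf, size_t len,
                                        Error **errp)
    {
        MyDevQueue *q = opaque;

        if (!mydev_queue_push(q, buf, len)) {   /* assumed: copies buf */
            error_setg(errp, "mydev: device state queue full");
            return false;
        }
        return true;   /* consumed asynchronously by the load thread */
    }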
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -53,6 +53,7 @@ void qemu_savevm_send_postcopy_listen(QEMUFile *f);
 void qemu_savevm_send_postcopy_run(QEMUFile *f);
 void qemu_savevm_send_postcopy_resume(QEMUFile *f);
 void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name);
+void qemu_savevm_maybe_send_switchover_start(QEMUFile *f);
 
 void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
                                            uint16_t len,
@@ -63,11 +64,14 @@ void qemu_savevm_live_state(QEMUFile *f);
 int qemu_save_device_state(QEMUFile *f);
 
 int qemu_loadvm_state(QEMUFile *f);
-void qemu_loadvm_state_cleanup(void);
+void qemu_loadvm_state_cleanup(MigrationIncomingState *mis);
 int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
 int qemu_load_device_state(QEMUFile *f);
 int qemu_loadvm_approve_switchover(void);
 int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
                                                     bool in_postcopy);
 
+bool qemu_loadvm_load_state_buffer(const char *idstr, uint32_t instance_id,
+                                   char *buf, size_t len, Error **errp);
+
 #endif
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -39,6 +39,7 @@ savevm_send_postcopy_run(void) ""
 savevm_send_postcopy_resume(void) ""
 savevm_send_colo_enable(void) ""
 savevm_send_recv_bitmap(char *name) "%s"
+savevm_send_switchover_start(void) ""
 savevm_state_setup(void) ""
 savevm_state_resume_prepare(void) ""
 savevm_state_header(void) ""
--- a/scripts/analyze-migration.py
+++ b/scripts/analyze-migration.py
@@ -620,7 +620,9 @@ class MigrationDump(object):
     QEMU_VM_SUBSECTION = 0x05
     QEMU_VM_VMDESCRIPTION = 0x06
     QEMU_VM_CONFIGURATION = 0x07
+    QEMU_VM_COMMAND = 0x08
     QEMU_VM_SECTION_FOOTER= 0x7e
+    QEMU_MIG_CMD_SWITCHOVER_START = 0x0b
 
     def __init__(self, filename):
         self.section_classes = {
@@ -685,6 +687,15 @@ class MigrationDump(object):
         elif section_type == self.QEMU_VM_SECTION_PART or section_type == self.QEMU_VM_SECTION_END:
             section_id = file.read32()
             self.sections[section_id].read()
+        elif section_type == self.QEMU_VM_COMMAND:
+            command_type = file.read16()
+            command_data_len = file.read16()
+            if command_type != self.QEMU_MIG_CMD_SWITCHOVER_START:
+                raise Exception("Unknown QEMU_VM_COMMAND: %x" %
+                                (command_type))
+            if command_data_len != 0:
+                raise Exception("Invalid SWITCHOVER_START length: %x" %
+                                (command_data_len))
         elif section_type == self.QEMU_VM_SECTION_FOOTER:
             read_section_id = file.read32()
             if read_section_id != section_id:
--- a/tests/unit/test-thread-pool.c
+++ b/tests/unit/test-thread-pool.c
@@ -43,10 +43,10 @@ static void done_cb(void *opaque, int ret)
     active--;
 }
 
-static void test_submit(void)
+static void test_submit_no_complete(void)
 {
     WorkerTestData data = { .n = 0 };
-    thread_pool_submit(worker_cb, &data);
+    thread_pool_submit_aio(worker_cb, &data, NULL, NULL);
     while (data.n == 0) {
         aio_poll(ctx, true);
     }
@@ -236,7 +236,7 @@ int main(int argc, char **argv)
     ctx = qemu_get_current_aio_context();
 
     g_test_init(&argc, &argv, NULL);
-    g_test_add_func("/thread-pool/submit", test_submit);
+    g_test_add_func("/thread-pool/submit-no-complete", test_submit_no_complete);
     g_test_add_func("/thread-pool/submit-aio", test_submit_aio);
     g_test_add_func("/thread-pool/submit-co", test_submit_co);
     g_test_add_func("/thread-pool/submit-many", test_submit_many);
--- a/util/async.c
+++ b/util/async.c
@@ -369,7 +369,7 @@ aio_ctx_finalize(GSource *source)
     QEMUBH *bh;
     unsigned flags;
 
-    thread_pool_free(ctx->thread_pool);
+    thread_pool_free_aio(ctx->thread_pool);
 
 #ifdef CONFIG_LINUX_AIO
     if (ctx->linux_aio) {
@@ -435,10 +435,10 @@ GSource *aio_get_g_source(AioContext *ctx)
     return &ctx->source;
 }
 
-ThreadPool *aio_get_thread_pool(AioContext *ctx)
+ThreadPoolAio *aio_get_thread_pool(AioContext *ctx)
 {
     if (!ctx->thread_pool) {
-        ctx->thread_pool = thread_pool_new(ctx);
+        ctx->thread_pool = thread_pool_new_aio(ctx);
     }
     return ctx->thread_pool;
 }
--- a/util/thread-pool.c
+++ b/util/thread-pool.c
@@ -23,9 +23,9 @@
 #include "block/thread-pool.h"
 #include "qemu/main-loop.h"
 
-static void do_spawn_thread(ThreadPool *pool);
+static void do_spawn_thread(ThreadPoolAio *pool);
 
-typedef struct ThreadPoolElement ThreadPoolElement;
+typedef struct ThreadPoolElementAio ThreadPoolElementAio;
 
 enum ThreadState {
     THREAD_QUEUED,
@@ -33,9 +33,9 @@ enum ThreadState {
     THREAD_DONE,
 };
 
-struct ThreadPoolElement {
+struct ThreadPoolElementAio {
     BlockAIOCB common;
-    ThreadPool *pool;
+    ThreadPoolAio *pool;
     ThreadPoolFunc *func;
     void *arg;
 
@@ -47,13 +47,13 @@ struct ThreadPoolElement {
     int ret;
 
     /* Access to this list is protected by lock. */
-    QTAILQ_ENTRY(ThreadPoolElement) reqs;
+    QTAILQ_ENTRY(ThreadPoolElementAio) reqs;
 
     /* This list is only written by the thread pool's mother thread. */
-    QLIST_ENTRY(ThreadPoolElement) all;
+    QLIST_ENTRY(ThreadPoolElementAio) all;
 };
 
-struct ThreadPool {
+struct ThreadPoolAio {
     AioContext *ctx;
     QEMUBH *completion_bh;
     QemuMutex lock;
@@ -62,10 +62,10 @@ struct ThreadPool {
     QEMUBH *new_thread_bh;
 
     /* The following variables are only accessed from one AioContext. */
-    QLIST_HEAD(, ThreadPoolElement) head;
+    QLIST_HEAD(, ThreadPoolElementAio) head;
 
     /* The following variables are protected by lock. */
-    QTAILQ_HEAD(, ThreadPoolElement) request_list;
+    QTAILQ_HEAD(, ThreadPoolElementAio) request_list;
     int cur_threads;
     int idle_threads;
     int new_threads; /* backlog of threads we need to create */
@@ -76,14 +76,14 @@ struct ThreadPool {
 
 static void *worker_thread(void *opaque)
 {
-    ThreadPool *pool = opaque;
+    ThreadPoolAio *pool = opaque;
 
     qemu_mutex_lock(&pool->lock);
     pool->pending_threads--;
     do_spawn_thread(pool);
 
     while (pool->cur_threads <= pool->max_threads) {
-        ThreadPoolElement *req;
+        ThreadPoolElementAio *req;
         int ret;
 
         if (QTAILQ_EMPTY(&pool->request_list)) {
@@ -131,7 +131,7 @@ static void *worker_thread(void *opaque)
     return NULL;
 }
 
-static void do_spawn_thread(ThreadPool *pool)
+static void do_spawn_thread(ThreadPoolAio *pool)
 {
     QemuThread t;
 
@@ -148,14 +148,14 @@ static void do_spawn_thread(ThreadPool *pool)
 
 static void spawn_thread_bh_fn(void *opaque)
 {
-    ThreadPool *pool = opaque;
+    ThreadPoolAio *pool = opaque;
 
     qemu_mutex_lock(&pool->lock);
     do_spawn_thread(pool);
     qemu_mutex_unlock(&pool->lock);
 }
 
-static void spawn_thread(ThreadPool *pool)
+static void spawn_thread(ThreadPoolAio *pool)
 {
     pool->cur_threads++;
     pool->new_threads++;
@@ -173,8 +173,8 @@ static void spawn_thread(ThreadPool *pool)
 
 static void thread_pool_completion_bh(void *opaque)
 {
-    ThreadPool *pool = opaque;
-    ThreadPoolElement *elem, *next;
+    ThreadPoolAio *pool = opaque;
+    ThreadPoolElementAio *elem, *next;
 
     defer_call_begin(); /* cb() may use defer_call() to coalesce work */
 
@@ -184,8 +184,8 @@ restart:
             continue;
         }
 
-        trace_thread_pool_complete(pool, elem, elem->common.opaque,
-                                   elem->ret);
+        trace_thread_pool_complete_aio(pool, elem, elem->common.opaque,
+                                       elem->ret);
         QLIST_REMOVE(elem, all);
 
         if (elem->common.cb) {
@@ -217,10 +217,10 @@ restart:
 
 static void thread_pool_cancel(BlockAIOCB *acb)
 {
-    ThreadPoolElement *elem = (ThreadPoolElement *)acb;
-    ThreadPool *pool = elem->pool;
+    ThreadPoolElementAio *elem = (ThreadPoolElementAio *)acb;
+    ThreadPoolAio *pool = elem->pool;
 
-    trace_thread_pool_cancel(elem, elem->common.opaque);
+    trace_thread_pool_cancel_aio(elem, elem->common.opaque);
 
     QEMU_LOCK_GUARD(&pool->lock);
     if (elem->state == THREAD_QUEUED) {
@@ -234,16 +234,16 @@ static void thread_pool_cancel(BlockAIOCB *acb)
 }
 
 static const AIOCBInfo thread_pool_aiocb_info = {
-    .aiocb_size = sizeof(ThreadPoolElement),
+    .aiocb_size = sizeof(ThreadPoolElementAio),
     .cancel_async = thread_pool_cancel,
 };
 
 BlockAIOCB *thread_pool_submit_aio(ThreadPoolFunc *func, void *arg,
                                    BlockCompletionFunc *cb, void *opaque)
 {
-    ThreadPoolElement *req;
+    ThreadPoolElementAio *req;
     AioContext *ctx = qemu_get_current_aio_context();
-    ThreadPool *pool = aio_get_thread_pool(ctx);
+    ThreadPoolAio *pool = aio_get_thread_pool(ctx);
 
     /* Assert that the thread submitting work is the same running the pool */
     assert(pool->ctx == qemu_get_current_aio_context());
@@ -256,7 +256,7 @@ BlockAIOCB *thread_pool_submit_aio(ThreadPoolFunc *func, void *arg,
 
     QLIST_INSERT_HEAD(&pool->head, req, all);
 
-    trace_thread_pool_submit(pool, req, arg);
+    trace_thread_pool_submit_aio(pool, req, arg);
 
     qemu_mutex_lock(&pool->lock);
     if (pool->idle_threads == 0 && pool->cur_threads < pool->max_threads) {
@@ -290,12 +290,7 @@ int coroutine_fn thread_pool_submit_co(ThreadPoolFunc *func, void *arg)
     return tpc.ret;
 }
 
-void thread_pool_submit(ThreadPoolFunc *func, void *arg)
-{
-    thread_pool_submit_aio(func, arg, NULL, NULL);
-}
-
-void thread_pool_update_params(ThreadPool *pool, AioContext *ctx)
+void thread_pool_update_params(ThreadPoolAio *pool, AioContext *ctx)
 {
     qemu_mutex_lock(&pool->lock);
 
@@ -322,7 +317,7 @@ void thread_pool_update_params(ThreadPool *pool, AioContext *ctx)
     qemu_mutex_unlock(&pool->lock);
 }
 
-static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx)
+static void thread_pool_init_one(ThreadPoolAio *pool, AioContext *ctx)
 {
     if (!ctx) {
         ctx = qemu_get_aio_context();
@@ -342,14 +337,14 @@ static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx)
     thread_pool_update_params(pool, ctx);
 }
 
-ThreadPool *thread_pool_new(AioContext *ctx)
+ThreadPoolAio *thread_pool_new_aio(AioContext *ctx)
 {
-    ThreadPool *pool = g_new(ThreadPool, 1);
+    ThreadPoolAio *pool = g_new(ThreadPoolAio, 1);
     thread_pool_init_one(pool, ctx);
     return pool;
 }
 
-void thread_pool_free(ThreadPool *pool)
+void thread_pool_free_aio(ThreadPoolAio *pool)
 {
     if (!pool) {
         return;
@@ -379,3 +374,122 @@ void thread_pool_free(ThreadPool *pool)
     qemu_mutex_destroy(&pool->lock);
     g_free(pool);
 }
+
+struct ThreadPool {
+    GThreadPool *t;
+    size_t cur_work;
+    QemuMutex cur_work_lock;
+    QemuCond all_finished_cond;
+};
+
+typedef struct {
+    ThreadPoolFunc *func;
+    void *opaque;
+    GDestroyNotify opaque_destroy;
+} ThreadPoolElement;
+
+static void thread_pool_func(gpointer data, gpointer user_data)
+{
+    ThreadPool *pool = user_data;
+    g_autofree ThreadPoolElement *el = data;
+
+    el->func(el->opaque);
+
+    if (el->opaque_destroy) {
+        el->opaque_destroy(el->opaque);
+    }
+
+    QEMU_LOCK_GUARD(&pool->cur_work_lock);
+
+    assert(pool->cur_work > 0);
+    pool->cur_work--;
+
+    if (pool->cur_work == 0) {
+        qemu_cond_signal(&pool->all_finished_cond);
+    }
+}
+
+ThreadPool *thread_pool_new(void)
+{
+    ThreadPool *pool = g_new(ThreadPool, 1);
+
+    pool->cur_work = 0;
+    qemu_mutex_init(&pool->cur_work_lock);
+    qemu_cond_init(&pool->all_finished_cond);
+
+    pool->t = g_thread_pool_new(thread_pool_func, pool, 0, TRUE, NULL);
+    /*
+     * g_thread_pool_new() can only return errors if initial thread(s)
+     * creation fails but we ask for 0 initial threads above.
+     */
+    assert(pool->t);
+
+    return pool;
+}
+
+void thread_pool_free(ThreadPool *pool)
+{
+    /*
+     * With _wait = TRUE this effectively waits for all
+     * previously submitted work to complete first.
+     */
+    g_thread_pool_free(pool->t, FALSE, TRUE);
+
+    qemu_cond_destroy(&pool->all_finished_cond);
+    qemu_mutex_destroy(&pool->cur_work_lock);
+
+    g_free(pool);
+}
+
+void thread_pool_submit(ThreadPool *pool, ThreadPoolFunc *func,
+                        void *opaque, GDestroyNotify opaque_destroy)
+{
+    ThreadPoolElement *el = g_new(ThreadPoolElement, 1);
+
+    el->func = func;
+    el->opaque = opaque;
+    el->opaque_destroy = opaque_destroy;
+
+    WITH_QEMU_LOCK_GUARD(&pool->cur_work_lock) {
+        pool->cur_work++;
+    }
+
+    /*
+     * Ignore the return value since this function can only return errors
+     * if creation of an additional thread fails but even in this case the
+     * provided work is still getting queued (just for the existing threads).
+     */
+    g_thread_pool_push(pool->t, el, NULL);
+}
+
+void thread_pool_submit_immediate(ThreadPool *pool, ThreadPoolFunc *func,
+                                  void *opaque, GDestroyNotify opaque_destroy)
+{
+    thread_pool_submit(pool, func, opaque, opaque_destroy);
+    thread_pool_adjust_max_threads_to_work(pool);
+}
+
+void thread_pool_wait(ThreadPool *pool)
+{
+    QEMU_LOCK_GUARD(&pool->cur_work_lock);
+
+    while (pool->cur_work > 0) {
+        qemu_cond_wait(&pool->all_finished_cond,
+                       &pool->cur_work_lock);
+    }
+}
+
+bool thread_pool_set_max_threads(ThreadPool *pool,
+                                 int max_threads)
+{
+    assert(max_threads > 0);
+
+    return g_thread_pool_set_max_threads(pool->t, max_threads, NULL);
+}
+
+bool thread_pool_adjust_max_threads_to_work(ThreadPool *pool)
+{
+    QEMU_LOCK_GUARD(&pool->cur_work_lock);
+
+    return thread_pool_set_max_threads(pool, pool->cur_work);
+}
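The new non-AIO pool above has deliberately simple semantics: cur_work counts submitted-but-unfinished items under cur_work_lock, thread_pool_wait() blocks until it drains to zero, and thread_pool_free() itself waits for queued work before tearing the pool down. A minimal usage sketch (the worker is illustrative):

    #include <stdio.h>
    #include "block/thread-pool.h"

    /* ThreadPoolFunc returns int; the generic pool ignores the value. */
    static int say_hello(void *opaque)
    {
        printf("hello from %s\n", (const char *)opaque);
        return 0;
    }

    static void demo(void)
    {
        ThreadPool *pool = thread_pool_new();

        /* _immediate also bumps max threads so the item needn't queue */
        thread_pool_submit_immediate(pool, say_hello, (void *)"pool", NULL);
        thread_pool_wait(pool);   /* returns once cur_work drops to 0 */
        thread_pool_free(pool);   /* waits for stragglers, then frees */
    }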
--- a/util/trace-events
+++ b/util/trace-events
@@ -14,9 +14,9 @@ aio_co_schedule_bh_cb(void *ctx, void *co) "ctx %p co %p"
 reentrant_aio(void *ctx, const char *name) "ctx %p name %s"
 
 # thread-pool.c
-thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p opaque %p"
-thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d"
-thread_pool_cancel(void *req, void *opaque) "req %p opaque %p"
+thread_pool_submit_aio(void *pool, void *req, void *opaque) "pool %p req %p opaque %p"
+thread_pool_complete_aio(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d"
+thread_pool_cancel_aio(void *req, void *opaque) "req %p opaque %p"
 
 # buffer.c
 buffer_resize(const char *buf, size_t olen, size_t len) "%s: old %zd, new %zd"