pcie_sriov: Allow user to create SR-IOV device

A user can create an SR-IOV device by specifying the PF with the
sriov-pf property of the VFs. The VFs must be added before the PF.

A user-creatable VF must have PCIDeviceClass::sriov_vf_user_creatable
set. Such a VF cannot refer to the PF because it is created before the
PF.

A PF to which user-creatable VFs can be attached calls
pcie_sriov_pf_init_from_user_created_vfs() during realization and
pcie_sriov_pf_exit() when exiting.

Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
Message-Id: <20250314-sriov-v9-5-57dae8ae3ab5@daynix.com>
Tested-by: Yui Washizu <yui.washidu@gmail.com>
Tested-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
This commit is contained in:
Akihiko Odaki 2025-03-14 15:14:54 +09:00 committed by Michael S. Tsirkin
parent d2f5bb7849
commit 19e55471d4
4 changed files with 291 additions and 83 deletions

View file

@ -101,6 +101,7 @@ static const Property pci_props[] = {
QEMU_PCIE_ARI_NEXTFN_1_BITNR, false), QEMU_PCIE_ARI_NEXTFN_1_BITNR, false),
DEFINE_PROP_SIZE32("x-max-bounce-buffer-size", PCIDevice, DEFINE_PROP_SIZE32("x-max-bounce-buffer-size", PCIDevice,
max_bounce_buffer_size, DEFAULT_MAX_BOUNCE_BUFFER_SIZE), max_bounce_buffer_size, DEFAULT_MAX_BOUNCE_BUFFER_SIZE),
DEFINE_PROP_STRING("sriov-pf", PCIDevice, sriov_pf),
DEFINE_PROP_BIT("x-pcie-ext-tag", PCIDevice, cap_present, DEFINE_PROP_BIT("x-pcie-ext-tag", PCIDevice, cap_present,
QEMU_PCIE_EXT_TAG_BITNR, true), QEMU_PCIE_EXT_TAG_BITNR, true),
{ .name = "busnr", .info = &prop_pci_busnr }, { .name = "busnr", .info = &prop_pci_busnr },
@ -1112,13 +1113,8 @@ static void pci_init_multifunction(PCIBus *bus, PCIDevice *dev, Error **errp)
dev->config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION; dev->config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
} }
/* /* SR/IOV is not handled here. */
* With SR/IOV and ARI, a device at function 0 need not be a multifunction if (pci_is_vf(dev)) {
* device, as it may just be a VF that ended up with function 0 in
* the legacy PCI interpretation. Avoid failing in such cases:
*/
if (pci_is_vf(dev) &&
dev->exp.sriov_vf.pf->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
return; return;
} }
@ -1151,7 +1147,8 @@ static void pci_init_multifunction(PCIBus *bus, PCIDevice *dev, Error **errp)
} }
/* function 0 indicates single function, so function > 0 must be NULL */ /* function 0 indicates single function, so function > 0 must be NULL */
for (func = 1; func < PCI_FUNC_MAX; ++func) { for (func = 1; func < PCI_FUNC_MAX; ++func) {
if (bus->devices[PCI_DEVFN(slot, func)]) { PCIDevice *device = bus->devices[PCI_DEVFN(slot, func)];
if (device && !pci_is_vf(device)) {
error_setg(errp, "PCI: %x.0 indicates single function, " error_setg(errp, "PCI: %x.0 indicates single function, "
"but %x.%x is already populated.", "but %x.%x is already populated.",
slot, slot, func); slot, slot, func);
@ -1439,6 +1436,7 @@ static void pci_qdev_unrealize(DeviceState *dev)
pci_unregister_io_regions(pci_dev); pci_unregister_io_regions(pci_dev);
pci_del_option_rom(pci_dev); pci_del_option_rom(pci_dev);
pcie_sriov_unregister_device(pci_dev);
if (pc->exit) { if (pc->exit) {
pc->exit(pci_dev); pc->exit(pci_dev);
@ -1470,7 +1468,6 @@ void pci_register_bar(PCIDevice *pci_dev, int region_num,
pcibus_t size = memory_region_size(memory); pcibus_t size = memory_region_size(memory);
uint8_t hdr_type; uint8_t hdr_type;
assert(!pci_is_vf(pci_dev)); /* VFs must use pcie_sriov_vf_register_bar */
assert(region_num >= 0); assert(region_num >= 0);
assert(region_num < PCI_NUM_REGIONS); assert(region_num < PCI_NUM_REGIONS);
assert(is_power_of_2(size)); assert(is_power_of_2(size));
@ -1482,7 +1479,6 @@ void pci_register_bar(PCIDevice *pci_dev, int region_num,
r = &pci_dev->io_regions[region_num]; r = &pci_dev->io_regions[region_num];
assert(!r->size); assert(!r->size);
r->addr = PCI_BAR_UNMAPPED;
r->size = size; r->size = size;
r->type = type; r->type = type;
r->memory = memory; r->memory = memory;
@ -1490,6 +1486,18 @@ void pci_register_bar(PCIDevice *pci_dev, int region_num,
? pci_get_bus(pci_dev)->address_space_io ? pci_get_bus(pci_dev)->address_space_io
: pci_get_bus(pci_dev)->address_space_mem; : pci_get_bus(pci_dev)->address_space_mem;
if (pci_is_vf(pci_dev)) {
PCIDevice *pf = pci_dev->exp.sriov_vf.pf;
assert(!pf || type == pf->exp.sriov_pf.vf_bar_type[region_num]);
r->addr = pci_bar_address(pci_dev, region_num, r->type, r->size);
if (r->addr != PCI_BAR_UNMAPPED) {
memory_region_add_subregion_overlap(r->address_space,
r->addr, r->memory, 1);
}
} else {
r->addr = PCI_BAR_UNMAPPED;
wmask = ~(size - 1); wmask = ~(size - 1);
if (region_num == PCI_ROM_SLOT) { if (region_num == PCI_ROM_SLOT) {
/* ROM enable bit is writable */ /* ROM enable bit is writable */
@ -1508,6 +1516,7 @@ void pci_register_bar(PCIDevice *pci_dev, int region_num,
pci_set_long(pci_dev->cmask + addr, 0xffffffff); pci_set_long(pci_dev->cmask + addr, 0xffffffff);
} }
} }
}
static void pci_update_vga(PCIDevice *pci_dev) static void pci_update_vga(PCIDevice *pci_dev)
{ {
@ -2272,6 +2281,11 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp)
} }
} }
if (!pcie_sriov_register_device(pci_dev, errp)) {
pci_qdev_unrealize(DEVICE(pci_dev));
return;
}
/* /*
* A PCIe Downstream Port that do not have ARI Forwarding enabled must * A PCIe Downstream Port that do not have ARI Forwarding enabled must
* associate only Device 0 with the device attached to the bus * associate only Device 0 with the device attached to the bus

View file

@ -15,11 +15,12 @@
#include "hw/pci/pcie.h" #include "hw/pci/pcie.h"
#include "hw/pci/pci_bus.h" #include "hw/pci/pci_bus.h"
#include "hw/qdev-properties.h" #include "hw/qdev-properties.h"
#include "qemu/error-report.h"
#include "qemu/range.h" #include "qemu/range.h"
#include "qapi/error.h" #include "qapi/error.h"
#include "trace.h" #include "trace.h"
static GHashTable *pfs;
static void unparent_vfs(PCIDevice *dev, uint16_t total_vfs) static void unparent_vfs(PCIDevice *dev, uint16_t total_vfs)
{ {
for (uint16_t i = 0; i < total_vfs; i++) { for (uint16_t i = 0; i < total_vfs; i++) {
@ -31,13 +32,43 @@ static void unparent_vfs(PCIDevice *dev, uint16_t total_vfs)
dev->exp.sriov_pf.vf = NULL; dev->exp.sriov_pf.vf = NULL;
} }
bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset, static void register_vfs(PCIDevice *dev)
const char *vfname, uint16_t vf_dev_id, {
uint16_t init_vfs, uint16_t total_vfs, uint16_t num_vfs;
uint16_t vf_offset, uint16_t vf_stride, uint16_t i;
Error **errp) uint16_t sriov_cap = dev->exp.sriov_cap;
assert(sriov_cap > 0);
num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF);
trace_sriov_register_vfs(dev->name, PCI_SLOT(dev->devfn),
PCI_FUNC(dev->devfn), num_vfs);
for (i = 0; i < num_vfs; i++) {
pci_set_enabled(dev->exp.sriov_pf.vf[i], true);
}
pci_set_word(dev->wmask + sriov_cap + PCI_SRIOV_NUM_VF, 0);
}
static void unregister_vfs(PCIDevice *dev)
{
uint8_t *cfg = dev->config + dev->exp.sriov_cap;
uint16_t i;
trace_sriov_unregister_vfs(dev->name, PCI_SLOT(dev->devfn),
PCI_FUNC(dev->devfn));
for (i = 0; i < pci_get_word(cfg + PCI_SRIOV_TOTAL_VF); i++) {
pci_set_enabled(dev->exp.sriov_pf.vf[i], false);
}
pci_set_word(dev->wmask + dev->exp.sriov_cap + PCI_SRIOV_NUM_VF, 0xffff);
}
static bool pcie_sriov_pf_init_common(PCIDevice *dev, uint16_t offset,
uint16_t vf_dev_id, uint16_t init_vfs,
uint16_t total_vfs, uint16_t vf_offset,
uint16_t vf_stride, Error **errp)
{ {
BusState *bus = qdev_get_parent_bus(&dev->qdev);
int32_t devfn = dev->devfn + vf_offset; int32_t devfn = dev->devfn + vf_offset;
uint8_t *cfg = dev->config + offset; uint8_t *cfg = dev->config + offset;
uint8_t *wmask; uint8_t *wmask;
@ -94,6 +125,28 @@ bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset,
qdev_prop_set_bit(&dev->qdev, "multifunction", true); qdev_prop_set_bit(&dev->qdev, "multifunction", true);
return true;
}
bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset,
const char *vfname, uint16_t vf_dev_id,
uint16_t init_vfs, uint16_t total_vfs,
uint16_t vf_offset, uint16_t vf_stride,
Error **errp)
{
BusState *bus = qdev_get_parent_bus(&dev->qdev);
int32_t devfn = dev->devfn + vf_offset;
if (pfs && g_hash_table_contains(pfs, dev->qdev.id)) {
error_setg(errp, "attaching user-created SR-IOV VF unsupported");
return false;
}
if (!pcie_sriov_pf_init_common(dev, offset, vf_dev_id, init_vfs,
total_vfs, vf_offset, vf_stride, errp)) {
return false;
}
dev->exp.sriov_pf.vf = g_new(PCIDevice *, total_vfs); dev->exp.sriov_pf.vf = g_new(PCIDevice *, total_vfs);
for (uint16_t i = 0; i < total_vfs; i++) { for (uint16_t i = 0; i < total_vfs; i++) {
@ -123,8 +176,23 @@ void pcie_sriov_pf_exit(PCIDevice *dev)
{ {
uint8_t *cfg = dev->config + dev->exp.sriov_cap; uint8_t *cfg = dev->config + dev->exp.sriov_cap;
if (dev->exp.sriov_pf.vf_user_created) {
uint16_t ven_id = pci_get_word(dev->config + PCI_VENDOR_ID);
uint16_t total_vfs = pci_get_word(dev->config + PCI_SRIOV_TOTAL_VF);
uint16_t vf_dev_id = pci_get_word(dev->config + PCI_SRIOV_VF_DID);
unregister_vfs(dev);
for (uint16_t i = 0; i < total_vfs; i++) {
dev->exp.sriov_pf.vf[i]->exp.sriov_vf.pf = NULL;
pci_config_set_vendor_id(dev->exp.sriov_pf.vf[i]->config, ven_id);
pci_config_set_device_id(dev->exp.sriov_pf.vf[i]->config, vf_dev_id);
}
} else {
unparent_vfs(dev, pci_get_word(cfg + PCI_SRIOV_TOTAL_VF)); unparent_vfs(dev, pci_get_word(cfg + PCI_SRIOV_TOTAL_VF));
} }
}
void pcie_sriov_pf_init_vf_bar(PCIDevice *dev, int region_num, void pcie_sriov_pf_init_vf_bar(PCIDevice *dev, int region_num,
uint8_t type, dma_addr_t size) uint8_t type, dma_addr_t size)
@ -156,69 +224,173 @@ void pcie_sriov_pf_init_vf_bar(PCIDevice *dev, int region_num,
void pcie_sriov_vf_register_bar(PCIDevice *dev, int region_num, void pcie_sriov_vf_register_bar(PCIDevice *dev, int region_num,
MemoryRegion *memory) MemoryRegion *memory)
{ {
PCIIORegion *r;
PCIBus *bus = pci_get_bus(dev);
uint8_t type; uint8_t type;
pcibus_t size = memory_region_size(memory);
assert(pci_is_vf(dev)); /* PFs must use pci_register_bar */ assert(dev->exp.sriov_vf.pf);
assert(region_num >= 0);
assert(region_num < PCI_NUM_REGIONS);
type = dev->exp.sriov_vf.pf->exp.sriov_pf.vf_bar_type[region_num]; type = dev->exp.sriov_vf.pf->exp.sriov_pf.vf_bar_type[region_num];
if (!is_power_of_2(size)) { return pci_register_bar(dev, region_num, type, memory);
error_report("%s: PCI region size must be a power"
" of two - type=0x%x, size=0x%"FMT_PCIBUS,
__func__, type, size);
exit(1);
} }
r = &dev->io_regions[region_num]; static gint compare_vf_devfns(gconstpointer a, gconstpointer b)
r->memory = memory;
r->address_space =
type & PCI_BASE_ADDRESS_SPACE_IO
? bus->address_space_io
: bus->address_space_mem;
r->size = size;
r->type = type;
r->addr = pci_bar_address(dev, region_num, r->type, r->size);
if (r->addr != PCI_BAR_UNMAPPED) {
memory_region_add_subregion_overlap(r->address_space,
r->addr, r->memory, 1);
}
}
static void register_vfs(PCIDevice *dev)
{ {
uint16_t num_vfs; return (*(PCIDevice **)a)->devfn - (*(PCIDevice **)b)->devfn;
uint16_t i;
uint16_t sriov_cap = dev->exp.sriov_cap;
assert(sriov_cap > 0);
num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF);
trace_sriov_register_vfs(dev->name, PCI_SLOT(dev->devfn),
PCI_FUNC(dev->devfn), num_vfs);
for (i = 0; i < num_vfs; i++) {
pci_set_enabled(dev->exp.sriov_pf.vf[i], true);
} }
pci_set_word(dev->wmask + sriov_cap + PCI_SRIOV_NUM_VF, 0); int16_t pcie_sriov_pf_init_from_user_created_vfs(PCIDevice *dev,
} uint16_t offset,
Error **errp)
static void unregister_vfs(PCIDevice *dev)
{ {
uint8_t *cfg = dev->config + dev->exp.sriov_cap; GPtrArray *pf;
PCIDevice **vfs;
BusState *bus = qdev_get_parent_bus(DEVICE(dev));
uint16_t ven_id = pci_get_word(dev->config + PCI_VENDOR_ID);
uint16_t vf_dev_id;
uint16_t vf_offset;
uint16_t vf_stride;
uint16_t i; uint16_t i;
trace_sriov_unregister_vfs(dev->name, PCI_SLOT(dev->devfn), if (!pfs || !dev->qdev.id) {
PCI_FUNC(dev->devfn)); return 0;
for (i = 0; i < pci_get_word(cfg + PCI_SRIOV_TOTAL_VF); i++) {
pci_set_enabled(dev->exp.sriov_pf.vf[i], false);
} }
pci_set_word(dev->wmask + dev->exp.sriov_cap + PCI_SRIOV_NUM_VF, 0xffff); pf = g_hash_table_lookup(pfs, dev->qdev.id);
if (!pf) {
return 0;
}
if (pf->len > UINT16_MAX) {
error_setg(errp, "too many VFs");
return -1;
}
g_ptr_array_sort(pf, compare_vf_devfns);
vfs = (void *)pf->pdata;
if (vfs[0]->devfn <= dev->devfn) {
error_setg(errp, "a VF function number is less than the PF function number");
return -1;
}
vf_dev_id = pci_get_word(vfs[0]->config + PCI_DEVICE_ID);
vf_offset = vfs[0]->devfn - dev->devfn;
vf_stride = pf->len < 2 ? 0 : vfs[1]->devfn - vfs[0]->devfn;
for (i = 0; i < pf->len; i++) {
if (bus != qdev_get_parent_bus(&vfs[i]->qdev)) {
error_setg(errp, "SR-IOV VF parent bus mismatches with PF");
return -1;
}
if (ven_id != pci_get_word(vfs[i]->config + PCI_VENDOR_ID)) {
error_setg(errp, "SR-IOV VF vendor ID mismatches with PF");
return -1;
}
if (vf_dev_id != pci_get_word(vfs[i]->config + PCI_DEVICE_ID)) {
error_setg(errp, "inconsistent SR-IOV VF device IDs");
return -1;
}
for (size_t j = 0; j < PCI_NUM_REGIONS; j++) {
if (vfs[i]->io_regions[j].size != vfs[0]->io_regions[j].size ||
vfs[i]->io_regions[j].type != vfs[0]->io_regions[j].type) {
error_setg(errp, "inconsistent SR-IOV BARs");
return -1;
}
}
if (vfs[i]->devfn - vfs[0]->devfn != vf_stride * i) {
error_setg(errp, "inconsistent SR-IOV stride");
return -1;
}
}
if (!pcie_sriov_pf_init_common(dev, offset, vf_dev_id, pf->len,
pf->len, vf_offset, vf_stride, errp)) {
return -1;
}
for (i = 0; i < pf->len; i++) {
vfs[i]->exp.sriov_vf.pf = dev;
vfs[i]->exp.sriov_vf.vf_number = i;
/* set vid/did according to sr/iov spec - they are not used */
pci_config_set_vendor_id(vfs[i]->config, 0xffff);
pci_config_set_device_id(vfs[i]->config, 0xffff);
}
dev->exp.sriov_pf.vf = vfs;
dev->exp.sriov_pf.vf_user_created = true;
for (i = 0; i < PCI_NUM_REGIONS; i++) {
PCIIORegion *region = &vfs[0]->io_regions[i];
if (region->size) {
pcie_sriov_pf_init_vf_bar(dev, i, region->type, region->size);
}
}
return PCI_EXT_CAP_SRIOV_SIZEOF;
}
/*
 * Record a PCI device in the user-created SR-IOV bookkeeping.
 *
 * Called from pci_qdev_realize(); when this returns false the caller
 * unrealizes the device again.
 *
 * @dev:  the device being realized.
 * @errp: pointer to Error*, set when false is returned.
 *
 * If @dev has its "sriov-pf" property set (dev->sriov_pf), it is treated as a
 * user-created VF and appended to the list kept in the file-scope "pfs" hash
 * table under the ID of the PF it names.
 *
 * Returns: true on success (including when there is nothing to record),
 *          false with @errp set otherwise.
 */
bool pcie_sriov_register_device(PCIDevice *dev, Error **errp)
{
/*
 * Some VF already named this device's ID as its PF, yet the device has no
 * VF array attached -- presumably it did not go through
 * pcie_sriov_pf_init_from_user_created_vfs(), which is what sets
 * exp.sriov_pf.vf for user-created VFs. Refuse rather than leave the VFs
 * orphaned.
 */
if (!dev->exp.sriov_pf.vf && dev->qdev.id &&
pfs && g_hash_table_contains(pfs, dev->qdev.id)) {
error_setg(errp, "attaching user-created SR-IOV VF unsupported");
return false;
}
/* Only devices that name a PF via "sriov-pf" are recorded as VFs. */
if (dev->sriov_pf) {
PCIDevice *pci_pf;
GPtrArray *pf;
/* The device class must opt in to being created as a VF by the user. */
if (!PCI_DEVICE_GET_CLASS(dev)->sriov_vf_user_creatable) {
error_setg(errp, "user cannot create SR-IOV VF with this device type");
return false;
}
if (!pci_is_express(dev)) {
error_setg(errp, "PCI Express is required for SR-IOV VF");
return false;
}
/*
 * NOTE(review): a zero return from pci_qdev_find_device() is taken to mean
 * the lookup succeeded, i.e. the PF is already present -- confirm against
 * its definition. VFs have to be added before their PF, so an existing PF
 * is an error. pci_pf is only an output slot here and is not used further.
 */
if (!pci_qdev_find_device(dev->sriov_pf, &pci_pf)) {
error_setg(errp, "PCI device specified as SR-IOV PF already exists");
return false;
}
/*
 * Create the table lazily on the first user-created VF. Keys are owned
 * copies of the PF ID (released with g_free); no value destructor is
 * installed for the GPtrArray values.
 */
if (!pfs) {
pfs = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL);
}
/* Find, or create on first use, the VF list for this PF ID. */
pf = g_hash_table_lookup(pfs, dev->sriov_pf);
if (!pf) {
pf = g_ptr_array_new();
g_hash_table_insert(pfs, g_strdup(dev->sriov_pf), pf);
}
g_ptr_array_add(pf, dev);
}
return true;
}
void pcie_sriov_unregister_device(PCIDevice *dev)
{
if (dev->sriov_pf && pfs) {
GPtrArray *pf = g_hash_table_lookup(pfs, dev->sriov_pf);
if (pf) {
g_ptr_array_remove_fast(pf, dev);
if (!pf->len) {
g_hash_table_remove(pfs, dev->sriov_pf);
g_ptr_array_free(pf, FALSE);
}
}
}
} }
void pcie_sriov_config_write(PCIDevice *dev, uint32_t address, void pcie_sriov_config_write(PCIDevice *dev, uint32_t address,
@ -314,7 +486,7 @@ void pcie_sriov_pf_add_sup_pgsize(PCIDevice *dev, uint16_t opt_sup_pgsize)
uint16_t pcie_sriov_vf_number(PCIDevice *dev) uint16_t pcie_sriov_vf_number(PCIDevice *dev)
{ {
assert(pci_is_vf(dev)); assert(dev->exp.sriov_vf.pf);
return dev->exp.sriov_vf.vf_number; return dev->exp.sriov_vf.vf_number;
} }

View file

@ -38,6 +38,8 @@ struct PCIDeviceClass {
uint16_t subsystem_id; /* only for header type = 0 */ uint16_t subsystem_id; /* only for header type = 0 */
const char *romfile; /* rom bar */ const char *romfile; /* rom bar */
bool sriov_vf_user_creatable;
}; };
enum PCIReqIDType { enum PCIReqIDType {
@ -177,6 +179,8 @@ struct PCIDevice {
* realizing the device. * realizing the device.
*/ */
uint32_t max_bounce_buffer_size; uint32_t max_bounce_buffer_size;
char *sriov_pf;
}; };
static inline int pci_intx(PCIDevice *pci_dev) static inline int pci_intx(PCIDevice *pci_dev)
@ -209,7 +213,7 @@ static inline int pci_is_express_downstream_port(const PCIDevice *d)
static inline int pci_is_vf(const PCIDevice *d) static inline int pci_is_vf(const PCIDevice *d)
{ {
return d->exp.sriov_vf.pf != NULL; return d->sriov_pf || d->exp.sriov_vf.pf != NULL;
} }
static inline uint32_t pci_config_size(const PCIDevice *d) static inline uint32_t pci_config_size(const PCIDevice *d)

View file

@ -18,6 +18,7 @@
typedef struct PCIESriovPF { typedef struct PCIESriovPF {
uint8_t vf_bar_type[PCI_NUM_REGIONS]; /* Store type for each VF bar */ uint8_t vf_bar_type[PCI_NUM_REGIONS]; /* Store type for each VF bar */
PCIDevice **vf; /* Pointer to an array of num_vfs VF devices */ PCIDevice **vf; /* Pointer to an array of num_vfs VF devices */
bool vf_user_created; /* If VFs are created by user */
} PCIESriovPF; } PCIESriovPF;
typedef struct PCIESriovVF { typedef struct PCIESriovVF {
@ -40,6 +41,23 @@ void pcie_sriov_pf_init_vf_bar(PCIDevice *dev, int region_num,
void pcie_sriov_vf_register_bar(PCIDevice *dev, int region_num, void pcie_sriov_vf_register_bar(PCIDevice *dev, int region_num,
MemoryRegion *memory); MemoryRegion *memory);
/**
* pcie_sriov_pf_init_from_user_created_vfs() - Initialize PF with user-created
* VFs.
* @dev: A PCIe device being realized.
* @offset: The offset of the SR-IOV capability.
* @errp: pointer to Error*, to store an error if it happens.
*
* Return: The size of added capability. 0 if the user did not create VFs.
* -1 if failed.
*/
int16_t pcie_sriov_pf_init_from_user_created_vfs(PCIDevice *dev,
uint16_t offset,
Error **errp);
bool pcie_sriov_register_device(PCIDevice *dev, Error **errp);
void pcie_sriov_unregister_device(PCIDevice *dev);
/* /*
* Default (minimal) page size support values * Default (minimal) page size support values
* as required by the SR/IOV standard: * as required by the SR/IOV standard: