vfio queue:

* Support for non 64b IOVA space
 * Introduction of a PCIIOMMUOps callback structure to ease future
   extensions
 * Fix for a buffer overrun when writing the VF token
 * PPC cleanups preparing ground for IOMMUFD support
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCAAdFiEEoPZlSPBIlev+awtgUaNDx8/77KEFAmVI+bIACgkQUaNDx8/7
 7KHW4g/9FmgX0k2Elm1BAul3slJtuBT8/iHKfK19rhXICxhxS5xBWJA8FmosTWAT
 91YqQJhOHARxLd9VROfv8Fq8sAo+Ys8bP3PTXh5satjY5gR9YtmMSVqvsAVLn7lv
 a/0xp7wPJt2UeKzvRNUqFXNr7yHPwxFxbJbmmAJbNte8p+TfE2qvojbJnu7BjJbg
 sTtS/vFWNJwtuNYTkMRoiZaUKEoEZ8LnslOqKUjgeO59g4i3Dq8e2JCmHANPFWUK
 cWmr7AqcXgXEnLSDWTtfN53bjcSCYkFVb4WV4Wv1/7hUF5jQ4UR0l3B64xWe0M3/
 Prak3bWOM/o7JwLBsgaWPngXA9V0WFBTXVF4x5qTwhuR1sSV8MxUvTKxI+qqiEzA
 FjU89oSZ+zXId/hEUuTL6vn1Th8/6mwD0L9ORchNOQUKzCjBzI4MVPB09nM3AdPC
 LGThlufsZktdoU2KjMHpc+gMIXQYsxkgvm07K5iZTZ5eJ4tV5KB0aPvTZppGUxe1
 YY9og9F3hxjDHQtEuSY2rzBQI7nrUpd1ZI5ut/3ZgDWkqD6aGRtMme4n4GsGsYb2
 Ht9+d2RL9S8uPUh+7rV8K/N3+vXgXRaEYTuAScKtflEbA7YnZA5nUdMng8x0kMTQ
 Y73XCd4UGWDfSSZsgaIHGkM/MRIHgmlrfcwPkWqWW9vF+92O6Hw=
 =/Du0
 -----END PGP SIGNATURE-----

Merge tag 'pull-vfio-20231106' of https://github.com/legoater/qemu into staging

vfio queue:

* Support for non 64b IOVA space
* Introduction of a PCIIOMMUOps callback structure to ease future
  extensions
* Fix for a buffer overrun when writing the VF token
* PPC cleanups preparing ground for IOMMUFD support

# -----BEGIN PGP SIGNATURE-----
#
# iQIzBAABCAAdFiEEoPZlSPBIlev+awtgUaNDx8/77KEFAmVI+bIACgkQUaNDx8/7
# 7KHW4g/9FmgX0k2Elm1BAul3slJtuBT8/iHKfK19rhXICxhxS5xBWJA8FmosTWAT
# 91YqQJhOHARxLd9VROfv8Fq8sAo+Ys8bP3PTXh5satjY5gR9YtmMSVqvsAVLn7lv
# a/0xp7wPJt2UeKzvRNUqFXNr7yHPwxFxbJbmmAJbNte8p+TfE2qvojbJnu7BjJbg
# sTtS/vFWNJwtuNYTkMRoiZaUKEoEZ8LnslOqKUjgeO59g4i3Dq8e2JCmHANPFWUK
# cWmr7AqcXgXEnLSDWTtfN53bjcSCYkFVb4WV4Wv1/7hUF5jQ4UR0l3B64xWe0M3/
# Prak3bWOM/o7JwLBsgaWPngXA9V0WFBTXVF4x5qTwhuR1sSV8MxUvTKxI+qqiEzA
# FjU89oSZ+zXId/hEUuTL6vn1Th8/6mwD0L9ORchNOQUKzCjBzI4MVPB09nM3AdPC
# LGThlufsZktdoU2KjMHpc+gMIXQYsxkgvm07K5iZTZ5eJ4tV5KB0aPvTZppGUxe1
# YY9og9F3hxjDHQtEuSY2rzBQI7nrUpd1ZI5ut/3ZgDWkqD6aGRtMme4n4GsGsYb2
# Ht9+d2RL9S8uPUh+7rV8K/N3+vXgXRaEYTuAScKtflEbA7YnZA5nUdMng8x0kMTQ
# Y73XCd4UGWDfSSZsgaIHGkM/MRIHgmlrfcwPkWqWW9vF+92O6Hw=
# =/Du0
# -----END PGP SIGNATURE-----
# gpg: Signature made Mon 06 Nov 2023 22:35:30 HKT
# gpg:                using RSA key A0F66548F04895EBFE6B0B6051A343C7CFFBECA1
# gpg: Good signature from "Cédric Le Goater <clg@redhat.com>" [unknown]
# gpg:                 aka "Cédric Le Goater <clg@kaod.org>" [unknown]
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg:          There is no indication that the signature belongs to the owner.
# Primary key fingerprint: A0F6 6548 F048 95EB FE6B  0B60 51A3 43C7 CFFB ECA1

* tag 'pull-vfio-20231106' of https://github.com/legoater/qemu: (22 commits)
  vfio/common: Move vfio_host_win_add/del into spapr.c
  vfio/spapr: Make vfio_spapr_create/remove_window static
  vfio/container: Move spapr specific init/deinit into spapr.c
  vfio/container: Move vfio_container_add/del_section_window into spapr.c
  vfio/container: Move IBM EEH related functions into spapr_pci_vfio.c
  util/uuid: Define UUID_STR_LEN from UUID_NONE string
  util/uuid: Remove UUID_FMT_LEN
  vfio/pci: Fix buffer overrun when writing the VF token
  util/uuid: Add UUID_STR_LEN definition
  hw/pci: modify pci_setup_iommu() to set PCIIOMMUOps
  test: Add some tests for range and resv-mem helpers
  virtio-iommu: Consolidate host reserved regions and property set ones
  virtio-iommu: Implement set_iova_ranges() callback
  virtio-iommu: Record whether a probe request has been issued
  range: Introduce range_inverse_array()
  virtio-iommu: Introduce per IOMMUDevice reserved regions
  util/reserved-region: Add new ReservedRegion helpers
  range: Make range_compare() public
  virtio-iommu: Rename reserved_regions into prop_resv_regions
  vfio: Collect container iova range info
  ...

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
This commit is contained in:
Stefan Hajnoczi 2023-11-07 09:41:52 +08:00
commit bb59f3548f
52 changed files with 1304 additions and 453 deletions

View file

@ -738,6 +738,10 @@ static AddressSpace *typhoon_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn)
return &s->pchip.iommu_as;
}
static const PCIIOMMUOps typhoon_iommu_ops = {
.get_address_space = typhoon_pci_dma_iommu,
};
static void typhoon_set_irq(void *opaque, int irq, int level)
{
TyphoonState *s = opaque;
@ -897,7 +901,7 @@ PCIBus *typhoon_init(MemoryRegion *ram, qemu_irq *p_isa_irq,
"iommu-typhoon", UINT64_MAX);
address_space_init(&s->pchip.iommu_as, MEMORY_REGION(&s->pchip.iommu),
"pchip0-pci");
pci_setup_iommu(b, typhoon_pci_dma_iommu, s);
pci_setup_iommu(b, &typhoon_iommu_ops, s);
/* Pchip0 PCI special/interrupt acknowledge, 0x801.F800.0000, 64MB. */
memory_region_init_io(&s->pchip.reg_iack, OBJECT(s), &alpha_pci_iack_ops,

View file

@ -605,6 +605,10 @@ static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn)
return &sdev->as;
}
static const PCIIOMMUOps smmu_ops = {
.get_address_space = smmu_find_add_as,
};
IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid)
{
uint8_t bus_n, devfn;
@ -661,7 +665,7 @@ static void smmu_base_realize(DeviceState *dev, Error **errp)
s->smmu_pcibus_by_busptr = g_hash_table_new(NULL, NULL);
if (s->primary_bus) {
pci_setup_iommu(s->primary_bus, smmu_find_add_as, s);
pci_setup_iommu(s->primary_bus, &smmu_ops, s);
} else {
error_setg(errp, "SMMU is not attached to any PCI bus!");
}

View file

@ -705,7 +705,7 @@ static void get_reserved_region(Object *obj, Visitor *v, const char *name,
int rc;
rc = snprintf(buffer, sizeof(buffer), "0x%"PRIx64":0x%"PRIx64":%u",
rr->low, rr->high, rr->type);
range_lob(&rr->range), range_upb(&rr->range), rr->type);
assert(rc < sizeof(buffer));
visit_type_str(v, name, &p, errp);
@ -717,6 +717,7 @@ static void set_reserved_region(Object *obj, Visitor *v, const char *name,
Property *prop = opaque;
ReservedRegion *rr = object_field_prop_ptr(obj, prop);
const char *endptr;
uint64_t lob, upb;
char *str;
int ret;
@ -724,7 +725,7 @@ static void set_reserved_region(Object *obj, Visitor *v, const char *name,
return;
}
ret = qemu_strtou64(str, &endptr, 16, &rr->low);
ret = qemu_strtou64(str, &endptr, 16, &lob);
if (ret) {
error_setg(errp, "start address of '%s'"
" must be a hexadecimal integer", name);
@ -734,7 +735,7 @@ static void set_reserved_region(Object *obj, Visitor *v, const char *name,
goto separator_error;
}
ret = qemu_strtou64(endptr + 1, &endptr, 16, &rr->high);
ret = qemu_strtou64(endptr + 1, &endptr, 16, &upb);
if (ret) {
error_setg(errp, "end address of '%s'"
" must be a hexadecimal integer", name);
@ -744,6 +745,8 @@ static void set_reserved_region(Object *obj, Visitor *v, const char *name,
goto separator_error;
}
range_set_bounds(&rr->range, lob, upb);
ret = qemu_strtoui(endptr + 1, &endptr, 10, &rr->type);
if (ret) {
error_setg(errp, "type of '%s'"
@ -1111,7 +1114,7 @@ static void get_uuid(Object *obj, Visitor *v, const char *name, void *opaque,
{
Property *prop = opaque;
QemuUUID *uuid = object_field_prop_ptr(obj, prop);
char buffer[UUID_FMT_LEN + 1];
char buffer[UUID_STR_LEN];
char *p = buffer;
qemu_uuid_unparse(uuid, buffer);

View file

@ -2271,7 +2271,7 @@ static void vmbus_dev_realize(DeviceState *dev, Error **errp)
VMBus *vmbus = VMBUS(qdev_get_parent_bus(dev));
BusChild *child;
Error *err = NULL;
char idstr[UUID_FMT_LEN + 1];
char idstr[UUID_STR_LEN];
assert(!qemu_uuid_is_null(&vdev->instanceid));
@ -2467,7 +2467,7 @@ static char *vmbus_get_dev_path(DeviceState *dev)
static char *vmbus_get_fw_dev_path(DeviceState *dev)
{
VMBusDevice *vdev = VMBUS_DEVICE(dev);
char uuid[UUID_FMT_LEN + 1];
char uuid[UUID_STR_LEN];
qemu_uuid_unparse(&vdev->instanceid, uuid);
return g_strdup_printf("%s@%s", qdev_fw_name(dev), uuid);

View file

@ -1450,6 +1450,10 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
return &iommu_as[devfn]->as;
}
static const PCIIOMMUOps amdvi_iommu_ops = {
.get_address_space = amdvi_host_dma_iommu,
};
static const MemoryRegionOps mmio_mem_ops = {
.read = amdvi_mmio_read,
.write = amdvi_mmio_write,
@ -1581,7 +1585,7 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp)
AMDVI_MMIO_SIZE);
memory_region_add_subregion(get_system_memory(), AMDVI_BASE_ADDR,
&s->mmio);
pci_setup_iommu(bus, amdvi_host_dma_iommu, s);
pci_setup_iommu(bus, &amdvi_iommu_ops, s);
amdvi_init(s);
}

View file

@ -4088,6 +4088,10 @@ static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
return &vtd_as->as;
}
static PCIIOMMUOps vtd_iommu_ops = {
.get_address_space = vtd_host_dma_iommu,
};
static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
{
X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
@ -4210,7 +4214,7 @@ static void vtd_realize(DeviceState *dev, Error **errp)
s->vtd_address_spaces = g_hash_table_new_full(vtd_as_hash, vtd_as_equal,
g_free, g_free);
vtd_init(s);
pci_setup_iommu(bus, vtd_host_dma_iommu, dev);
pci_setup_iommu(bus, &vtd_iommu_ops, dev);
/* Pseudo address space under root PCI bus. */
x86ms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC);
qemu_add_machine_init_done_notifier(&vtd_machine_done_notify);

View file

@ -345,6 +345,10 @@ static AddressSpace *elroy_pcihost_set_iommu(PCIBus *bus, void *opaque,
return &s->astro->iommu_as;
}
static const PCIIOMMUOps elroy_pcihost_iommu_ops = {
.get_address_space = elroy_pcihost_set_iommu,
};
/*
* Encoding in IOSAPIC:
* base_addr == 0xfffa0000, we want to get 0xa0ff0000.
@ -834,7 +838,7 @@ static void astro_realize(DeviceState *obj, Error **errp)
&elroy->pci_io);
/* Host memory as seen from the PCI side, via the IOMMU. */
pci_setup_iommu(PCI_HOST_BRIDGE(elroy)->bus, elroy_pcihost_set_iommu,
pci_setup_iommu(PCI_HOST_BRIDGE(elroy)->bus, &elroy_pcihost_iommu_ops,
elroy);
}
}

View file

@ -663,6 +663,10 @@ static AddressSpace *designware_pcie_host_set_iommu(PCIBus *bus, void *opaque,
return &s->pci.address_space;
}
static const PCIIOMMUOps designware_iommu_ops = {
.get_address_space = designware_pcie_host_set_iommu,
};
static void designware_pcie_host_realize(DeviceState *dev, Error **errp)
{
PCIHostState *pci = PCI_HOST_BRIDGE(dev);
@ -705,7 +709,7 @@ static void designware_pcie_host_realize(DeviceState *dev, Error **errp)
address_space_init(&s->pci.address_space,
&s->pci.address_space_root,
"pcie-bus-address-space");
pci_setup_iommu(pci->bus, designware_pcie_host_set_iommu, s);
pci_setup_iommu(pci->bus, &designware_iommu_ops, s);
qdev_realize(DEVICE(&s->root), BUS(pci->bus), &error_fatal);
}

View file

@ -354,6 +354,10 @@ static AddressSpace *dino_pcihost_set_iommu(PCIBus *bus, void *opaque,
return &s->bm_as;
}
static const PCIIOMMUOps dino_iommu_ops = {
.get_address_space = dino_pcihost_set_iommu,
};
/*
* Dino interrupts are connected as shown on Page 78, Table 23
* (Little-endian bit numbers)
@ -481,7 +485,7 @@ static void dino_pcihost_init(Object *obj)
g_free(name);
}
pci_setup_iommu(phb->bus, dino_pcihost_set_iommu, s);
pci_setup_iommu(phb->bus, &dino_iommu_ops, s);
sysbus_init_mmio(sbd, &s->this_mem);

View file

@ -968,6 +968,10 @@ static AddressSpace *pnv_phb3_dma_iommu(PCIBus *bus, void *opaque, int devfn)
return &ds->dma_as;
}
static PCIIOMMUOps pnv_phb3_iommu_ops = {
.get_address_space = pnv_phb3_dma_iommu,
};
static void pnv_phb3_instance_init(Object *obj)
{
PnvPHB3 *phb = PNV_PHB3(obj);
@ -1012,7 +1016,7 @@ void pnv_phb3_bus_init(DeviceState *dev, PnvPHB3 *phb)
object_property_set_int(OBJECT(pci->bus), "chip-id", phb->chip_id,
&error_abort);
pci_setup_iommu(pci->bus, pnv_phb3_dma_iommu, phb);
pci_setup_iommu(pci->bus, &pnv_phb3_iommu_ops, phb);
}
static void pnv_phb3_realize(DeviceState *dev, Error **errp)

View file

@ -1518,6 +1518,10 @@ static void pnv_phb4_xscom_realize(PnvPHB4 *phb)
&phb->phb_regs_mr);
}
static PCIIOMMUOps pnv_phb4_iommu_ops = {
.get_address_space = pnv_phb4_dma_iommu,
};
static void pnv_phb4_instance_init(Object *obj)
{
PnvPHB4 *phb = PNV_PHB4(obj);
@ -1557,7 +1561,7 @@ void pnv_phb4_bus_init(DeviceState *dev, PnvPHB4 *phb)
object_property_set_int(OBJECT(pci->bus), "chip-id", phb->chip_id,
&error_abort);
pci_setup_iommu(pci->bus, pnv_phb4_dma_iommu, phb);
pci_setup_iommu(pci->bus, &pnv_phb4_iommu_ops, phb);
pci->bus->flags |= PCI_BUS_EXTENDED_CONFIG_SPACE;
}

View file

@ -435,6 +435,10 @@ static AddressSpace *e500_pcihost_set_iommu(PCIBus *bus, void *opaque,
return &s->bm_as;
}
static const PCIIOMMUOps ppce500_iommu_ops = {
.get_address_space = e500_pcihost_set_iommu,
};
static void e500_pcihost_realize(DeviceState *dev, Error **errp)
{
SysBusDevice *sbd = SYS_BUS_DEVICE(dev);
@ -469,7 +473,7 @@ static void e500_pcihost_realize(DeviceState *dev, Error **errp)
memory_region_init(&s->bm, OBJECT(s), "bm-e500", UINT64_MAX);
memory_region_add_subregion(&s->bm, 0x0, &s->busmem);
address_space_init(&s->bm_as, &s->bm, "pci-bm");
pci_setup_iommu(b, e500_pcihost_set_iommu, s);
pci_setup_iommu(b, &ppce500_iommu_ops, s);
pci_create_simple(b, 0, "e500-host-bridge");

View file

@ -223,6 +223,10 @@ static AddressSpace *raven_pcihost_set_iommu(PCIBus *bus, void *opaque,
return &s->bm_as;
}
static const PCIIOMMUOps raven_iommu_ops = {
.get_address_space = raven_pcihost_set_iommu,
};
static void raven_change_gpio(void *opaque, int n, int level)
{
PREPPCIState *s = opaque;
@ -320,7 +324,7 @@ static void raven_pcihost_initfn(Object *obj)
memory_region_add_subregion(&s->bm, 0 , &s->bm_pci_memory_alias);
memory_region_add_subregion(&s->bm, 0x80000000, &s->bm_ram_alias);
address_space_init(&s->bm_as, &s->bm, "raven-bm");
pci_setup_iommu(&s->pci_bus, raven_pcihost_set_iommu, s);
pci_setup_iommu(&s->pci_bus, &raven_iommu_ops, s);
h->bus = &s->pci_bus;

View file

@ -112,6 +112,10 @@ static AddressSpace *sabre_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn)
return &is->iommu_as;
}
static const PCIIOMMUOps sabre_iommu_ops = {
.get_address_space = sabre_pci_dma_iommu,
};
static void sabre_config_write(void *opaque, hwaddr addr,
uint64_t val, unsigned size)
{
@ -384,7 +388,7 @@ static void sabre_realize(DeviceState *dev, Error **errp)
/* IOMMU */
memory_region_add_subregion_overlap(&s->sabre_config, 0x200,
sysbus_mmio_get_region(SYS_BUS_DEVICE(s->iommu), 0), 1);
pci_setup_iommu(phb->bus, sabre_pci_dma_iommu, s->iommu);
pci_setup_iommu(phb->bus, &sabre_iommu_ops, s->iommu);
/* APB secondary busses */
pci_dev = pci_new_multifunction(PCI_DEVFN(1, 0), TYPE_SIMBA_PCI_BRIDGE);

View file

@ -2678,7 +2678,7 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
PCIBus *iommu_bus = bus;
uint8_t devfn = dev->devfn;
while (iommu_bus && !iommu_bus->iommu_fn && iommu_bus->parent_dev) {
while (iommu_bus && !iommu_bus->iommu_ops && iommu_bus->parent_dev) {
PCIBus *parent_bus = pci_get_bus(iommu_bus->parent_dev);
/*
@ -2717,15 +2717,23 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
iommu_bus = parent_bus;
}
if (!pci_bus_bypass_iommu(bus) && iommu_bus && iommu_bus->iommu_fn) {
return iommu_bus->iommu_fn(bus, iommu_bus->iommu_opaque, devfn);
if (!pci_bus_bypass_iommu(bus) && iommu_bus->iommu_ops) {
return iommu_bus->iommu_ops->get_address_space(bus,
iommu_bus->iommu_opaque, devfn);
}
return &address_space_memory;
}
void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque)
void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *ops, void *opaque)
{
bus->iommu_fn = fn;
/*
* If called, pci_setup_iommu() should provide a minimum set of
* useful callbacks for the bus.
*/
assert(ops);
assert(ops->get_address_space);
bus->iommu_ops = ops;
bus->iommu_opaque = opaque;
}

View file

@ -449,6 +449,10 @@ static AddressSpace *ppc440_pcix_set_iommu(PCIBus *b, void *opaque, int devfn)
return &s->bm_as;
}
static const PCIIOMMUOps ppc440_iommu_ops = {
.get_address_space = ppc440_pcix_set_iommu,
};
/*
* Some guests on sam460ex write all kinds of garbage here such as
* missing enable bit and low bits set and still expect this to work
@ -503,7 +507,7 @@ static void ppc440_pcix_realize(DeviceState *dev, Error **errp)
memory_region_init(&s->bm, OBJECT(s), "bm-ppc440-pcix", UINT64_MAX);
memory_region_add_subregion(&s->bm, 0x0, &s->busmem);
address_space_init(&s->bm_as, &s->bm, "pci-bm");
pci_setup_iommu(h->bus, ppc440_pcix_set_iommu, s);
pci_setup_iommu(h->bus, &ppc440_iommu_ops, s);
memory_region_init(&s->container, OBJECT(s), "pci-container", PCI_ALL_SIZE);
memory_region_init_io(&h->conf_mem, OBJECT(s), &ppc440_pcix_host_conf_ops,

View file

@ -780,6 +780,10 @@ static AddressSpace *spapr_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn)
return &phb->iommu_as;
}
static const PCIIOMMUOps spapr_iommu_ops = {
.get_address_space = spapr_pci_dma_iommu,
};
static char *spapr_phb_vfio_get_loc_code(SpaprPhbState *sphb, PCIDevice *pdev)
{
g_autofree char *path = NULL;
@ -1978,7 +1982,7 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
memory_region_add_subregion(&sphb->iommu_root, SPAPR_PCI_MSI_WINDOW,
&sphb->msiwindow);
pci_setup_iommu(bus, spapr_pci_dma_iommu, sphb);
pci_setup_iommu(bus, &spapr_iommu_ops, sphb);
pci_bus_set_route_irq_fn(bus, spapr_route_intx_pin_to_irq);

View file

@ -18,14 +18,112 @@
*/
#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include <linux/vfio.h>
#include "hw/ppc/spapr.h"
#include "hw/pci-host/spapr.h"
#include "hw/pci/msix.h"
#include "hw/pci/pci_device.h"
#include "hw/vfio/vfio.h"
#include "hw/vfio/vfio-common.h"
#include "qemu/error-report.h"
/*
* Interfaces for IBM EEH (Enhanced Error Handling)
*/
/*
 * Return true if EEH operations may be safely issued on @container.
 *
 * The container qualifies only when its group_list holds exactly one
 * VFIO group (see the kernel-limitation comment below).
 */
static bool vfio_eeh_container_ok(VFIOContainer *container)
{
    /*
     * As of 2016-03-04 (linux-4.5) the host kernel EEH/VFIO
     * implementation is broken if there are multiple groups in a
     * container. The hardware works in units of Partitionable
     * Endpoints (== IOMMU groups) and the EEH operations naively
     * iterate across all groups in the container, without any logic
     * to make sure the groups have their state synchronized. For
     * certain operations (ENABLE) that might be ok, until an error
     * occurs, but for others (GET_STATE) it's clearly broken.
     */

    /*
     * XXX Once fixed kernels exist, test for them here
     */

    /* Empty list: no group at all, nothing to operate on. */
    if (QLIST_EMPTY(&container->group_list)) {
        return false;
    }

    /* A successor to the first element means more than one group. */
    if (QLIST_NEXT(QLIST_FIRST(&container->group_list), container_next)) {
        return false;
    }

    return true;
}
/*
 * Issue the EEH PE operation @op on @container via the
 * VFIO_EEH_PE_OP ioctl.
 *
 * Returns the (non-negative) ioctl result on success, -EPERM when the
 * container does not hold exactly one group (kernel limitation, see
 * vfio_eeh_container_ok()), or -errno if the ioctl itself fails.
 */
static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op)
{
    struct vfio_eeh_pe_op pe_op = {
        .argsz = sizeof(pe_op),
        .op = op,
    };
    int ret;

    if (!vfio_eeh_container_ok(container)) {
        error_report("vfio/eeh: EEH_PE_OP 0x%x: "
                     "kernel requires a container with exactly one group", op);
        return -EPERM;
    }

    ret = ioctl(container->fd, VFIO_EEH_PE_OP, &pe_op);
    if (ret < 0) {
        /* %m expands to strerror(errno) (GNU extension). */
        error_report("vfio/eeh: EEH_PE_OP 0x%x failed: %m", op);
        return -errno;
    }

    return ret;
}
/*
 * Find the single VFIO container backing address space @as.
 *
 * Returns NULL when the address space has no container, or when it has
 * more than one (EEH state cannot yet be synchronized across multiple
 * containers).  The VFIOAddressSpace reference taken by
 * vfio_get_address_space() is dropped before returning.
 */
static VFIOContainer *vfio_eeh_as_container(AddressSpace *as)
{
    VFIOAddressSpace *space = vfio_get_address_space(as);
    VFIOContainer *container = NULL;

    if (QLIST_EMPTY(&space->containers)) {
        /* No containers to act on */
        goto out;
    }

    container = QLIST_FIRST(&space->containers);

    if (QLIST_NEXT(container, next)) {
        /*
         * We don't yet have logic to synchronize EEH state across
         * multiple containers
         */
        container = NULL;
        goto out;
    }

out:
    vfio_put_address_space(space);
    return container;
}
/*
 * Return true when address space @as is backed by exactly one VFIO
 * container on which EEH operations are permitted.
 */
static bool vfio_eeh_as_ok(AddressSpace *as)
{
    VFIOContainer *container = vfio_eeh_as_container(as);

    if (!container) {
        return false;
    }

    return vfio_eeh_container_ok(container);
}
/*
 * Run EEH PE operation @op against the container backing @as.
 *
 * Returns -ENODEV when no suitable (single) container exists, otherwise
 * the result of vfio_eeh_container_op().
 */
static int vfio_eeh_as_op(AddressSpace *as, uint32_t op)
{
    VFIOContainer *container = vfio_eeh_as_container(as);

    if (!container) {
        return -ENODEV;
    }
    return vfio_eeh_container_op(container, op);
}
bool spapr_phb_eeh_available(SpaprPhbState *sphb)
{
return vfio_eeh_as_ok(&sphb->iommu_as);

View file

@ -100,6 +100,10 @@ static void remote_iommu_finalize(Object *obj)
iommu->elem_by_devfn = NULL;
}
static const PCIIOMMUOps remote_iommu_ops = {
.get_address_space = remote_iommu_find_add_as,
};
void remote_iommu_setup(PCIBus *pci_bus)
{
RemoteIommu *iommu = NULL;
@ -108,7 +112,7 @@ void remote_iommu_setup(PCIBus *pci_bus)
iommu = REMOTE_IOMMU(object_new(TYPE_REMOTE_IOMMU));
pci_setup_iommu(pci_bus, remote_iommu_find_add_as, iommu);
pci_setup_iommu(pci_bus, &remote_iommu_ops, iommu);
object_property_add_child(OBJECT(pci_bus), "remote-iommu", OBJECT(iommu));

View file

@ -652,6 +652,10 @@ static AddressSpace *s390_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn)
return &iommu->as;
}
static const PCIIOMMUOps s390_iommu_ops = {
.get_address_space = s390_pci_dma_iommu,
};
static uint8_t set_ind_atomic(uint64_t ind_loc, uint8_t to_be_set)
{
uint8_t expected, actual;
@ -839,7 +843,7 @@ static void s390_pcihost_realize(DeviceState *dev, Error **errp)
b = pci_register_root_bus(dev, NULL, s390_pci_set_irq, s390_pci_map_irq,
NULL, get_system_memory(), get_system_io(), 0,
64, TYPE_PCI_BUS);
pci_setup_iommu(b, s390_pci_dma_iommu, s);
pci_setup_iommu(b, &s390_iommu_ops, s);
bus = BUS(b);
qbus_set_hotplug_handler(bus, OBJECT(dev));
@ -1058,7 +1062,7 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
pdev = PCI_DEVICE(dev);
pci_bridge_map_irq(pb, dev->id, s390_pci_map_irq);
pci_setup_iommu(&pb->sec_bus, s390_pci_dma_iommu, s);
pci_setup_iommu(&pb->sec_bus, &s390_iommu_ops, s);
qbus_set_hotplug_handler(BUS(&pb->sec_bus), OBJECT(s));

View file

@ -14,7 +14,6 @@
#include <linux/vfio.h>
#include <sys/ioctl.h>
#include "qapi/error.h"
#include "hw/vfio/vfio.h"
#include "hw/vfio/vfio-common.h"
#include "hw/s390x/ap-device.h"
#include "qemu/error-report.h"

View file

@ -20,7 +20,6 @@
#include <sys/ioctl.h>
#include "qapi/error.h"
#include "hw/vfio/vfio.h"
#include "hw/vfio/vfio-common.h"
#include "hw/s390x/s390-ccw.h"
#include "hw/s390x/vfio-ccw.h"

View file

@ -26,7 +26,6 @@
#include <linux/vfio.h>
#include "hw/vfio/vfio-common.h"
#include "hw/vfio/vfio.h"
#include "hw/vfio/pci.h"
#include "exec/address-spaces.h"
#include "exec/memory.h"
@ -246,44 +245,6 @@ bool vfio_devices_all_running_and_mig_active(VFIOContainer *container)
return true;
}
/*
 * Register a host DMA window [@min_iova, @max_iova] (inclusive bounds,
 * as shown by the "+ 1" length computation below) with supported page
 * sizes @iova_pgsizes on @container.
 *
 * Aborts via hw_error() if the new window overlaps any window already
 * on the container's hostwin_list; overlapping windows are not
 * supported.
 */
void vfio_host_win_add(VFIOContainer *container, hwaddr min_iova,
                       hwaddr max_iova, uint64_t iova_pgsizes)
{
    VFIOHostDMAWindow *hostwin;

    /* Reject any candidate window that intersects an existing one. */
    QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
        if (ranges_overlap(hostwin->min_iova,
                           hostwin->max_iova - hostwin->min_iova + 1,
                           min_iova,
                           max_iova - min_iova + 1)) {
            hw_error("%s: Overlapped IOMMU are not enabled", __func__);
        }
    }

    hostwin = g_malloc0(sizeof(*hostwin));

    hostwin->min_iova = min_iova;
    hostwin->max_iova = max_iova;
    hostwin->iova_pgsizes = iova_pgsizes;
    QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next);
}
/*
 * Remove and free the host DMA window whose bounds exactly equal
 * [@min_iova, @max_iova] from @container's hostwin_list.
 *
 * Returns 0 on success, -1 if no window with those exact bounds exists.
 */
int vfio_host_win_del(VFIOContainer *container,
                      hwaddr min_iova, hwaddr max_iova)
{
    VFIOHostDMAWindow *hostwin;

    QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
        /* Only an exact bounds match removes the window. */
        if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) {
            QLIST_REMOVE(hostwin, hostwin_next);
            g_free(hostwin);
            return 0;
        }
    }

    return -1;
}
static bool vfio_listener_skipped_section(MemoryRegionSection *section)
{
return (!memory_region_is_ram(section->mr) &&
@ -532,22 +493,6 @@ static void vfio_unregister_ram_discard_listener(VFIOContainer *container,
g_free(vrdl);
}
/*
 * Find the host DMA window on @container that fully contains the
 * IOVA range [@iova, @end] (both bounds inclusive, per the <= / <=
 * comparison below).
 *
 * Returns the matching window, or NULL if none contains the range.
 */
static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container,
                                            hwaddr iova, hwaddr end)
{
    VFIOHostDMAWindow *hostwin;
    bool hostwin_found = false;

    QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
        if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
            hostwin_found = true;
            break;
        }
    }

    return hostwin_found ? hostwin : NULL;
}
static bool vfio_known_safe_misalignment(MemoryRegionSection *section)
{
MemoryRegion *mr = section->mr;
@ -626,7 +571,6 @@ static void vfio_listener_region_add(MemoryListener *listener,
Int128 llend, llsize;
void *vaddr;
int ret;
VFIOHostDMAWindow *hostwin;
Error *err = NULL;
if (!vfio_listener_valid_section(section, "region_add")) {
@ -648,13 +592,6 @@ static void vfio_listener_region_add(MemoryListener *listener,
goto fail;
}
hostwin = vfio_find_hostwin(container, iova, end);
if (!hostwin) {
error_setg(&err, "Container %p can't map guest IOVA region"
" 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end);
goto fail;
}
memory_region_ref(section->mr);
if (memory_region_is_iommu(section->mr)) {
@ -693,6 +630,15 @@ static void vfio_listener_region_add(MemoryListener *listener,
goto fail;
}
if (container->iova_ranges) {
ret = memory_region_iommu_set_iova_ranges(giommu->iommu_mr,
container->iova_ranges, &err);
if (ret) {
g_free(giommu);
goto fail;
}
}
ret = memory_region_register_iommu_notifier(section->mr, &giommu->n,
&err);
if (ret) {
@ -726,7 +672,7 @@ static void vfio_listener_region_add(MemoryListener *listener,
llsize = int128_sub(llend, int128_make64(iova));
if (memory_region_is_ram_device(section->mr)) {
hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
hwaddr pgmask = (1ULL << ctz64(container->pgsizes)) - 1;
if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
trace_vfio_listener_region_add_no_dma_map(
@ -825,12 +771,8 @@ static void vfio_listener_region_del(MemoryListener *listener,
if (memory_region_is_ram_device(section->mr)) {
hwaddr pgmask;
VFIOHostDMAWindow *hostwin;
hostwin = vfio_find_hostwin(container, iova, end);
assert(hostwin); /* or region_add() would have failed */
pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
pgmask = (1ULL << ctz64(container->pgsizes)) - 1;
try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
} else if (memory_region_has_ram_discard_manager(section->mr)) {
vfio_unregister_ram_discard_listener(container, section);

View file

@ -20,20 +20,15 @@
#include "qemu/osdep.h"
#include <sys/ioctl.h>
#ifdef CONFIG_KVM
#include <linux/kvm.h>
#endif
#include <linux/vfio.h>
#include "hw/vfio/vfio-common.h"
#include "hw/vfio/vfio.h"
#include "exec/address-spaces.h"
#include "exec/memory.h"
#include "exec/ram_addr.h"
#include "hw/hw.h"
#include "qemu/error-report.h"
#include "qemu/range.h"
#include "sysemu/kvm.h"
#include "sysemu/reset.h"
#include "trace.h"
#include "qapi/error.h"
@ -205,92 +200,6 @@ int vfio_dma_map(VFIOContainer *container, hwaddr iova,
return -errno;
}
/*
 * Create a SPAPR TCE DMA window covering @section and register it as a
 * host DMA window on @container.
 *
 * No-op (returns 0) unless the container uses VFIO_SPAPR_TCE_v2_IOMMU.
 * Returns 0 on success or a negative errno value, setting @errp on
 * failure.  Under CONFIG_KVM, also attaches each group's fd to the
 * in-kernel TCE acceleration table when the IOMMU region exposes one.
 */
int vfio_container_add_section_window(VFIOContainer *container,
                                      MemoryRegionSection *section,
                                      Error **errp)
{
    VFIOHostDMAWindow *hostwin;
    hwaddr pgsize = 0;
    int ret;

    if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
        return 0;
    }

    /* For now intersections are not allowed, we may relax this later */
    QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
        if (ranges_overlap(hostwin->min_iova,
                           hostwin->max_iova - hostwin->min_iova + 1,
                           section->offset_within_address_space,
                           int128_get64(section->size))) {
            error_setg(errp,
                "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing"
                "host DMA window [0x%"PRIx64",0x%"PRIx64"]",
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(section->size) - 1,
                hostwin->min_iova, hostwin->max_iova);
            return -EINVAL;
        }
    }

    /* Ask the kernel to create the TCE window; it reports the page size. */
    ret = vfio_spapr_create_window(container, section, &pgsize);
    if (ret) {
        error_setg_errno(errp, -ret, "Failed to create SPAPR window");
        return ret;
    }

    /* Track the new window so later mappings can be validated against it. */
    vfio_host_win_add(container, section->offset_within_address_space,
                      section->offset_within_address_space +
                      int128_get64(section->size) - 1, pgsize);
#ifdef CONFIG_KVM
    if (kvm_enabled()) {
        VFIOGroup *group;
        IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
        struct kvm_vfio_spapr_tce param;
        struct kvm_device_attr attr = {
            .group = KVM_DEV_VFIO_GROUP,
            .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
            .addr = (uint64_t)(unsigned long)&param,
        };

        /*
         * If the IOMMU region exposes a TCE table fd, wire every group
         * in the container to it for in-kernel TCE handling.
         */
        if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD,
                                          &param.tablefd)) {
            QLIST_FOREACH(group, &container->group_list, container_next) {
                param.groupfd = group->fd;
                if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
                    error_setg_errno(errp, errno,
                                     "vfio: failed GROUP_SET_SPAPR_TCE for "
                                     "KVM VFIO device %d and group fd %d",
                                     param.tablefd, param.groupfd);
                    return -errno;
                }
                trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
            }
        }
    }
#endif

    return 0;
}
/*
 * Tear down the SPAPR TCE DMA window that was created for @section and
 * drop its host DMA window record from @container.
 *
 * No-op unless the container uses VFIO_SPAPR_TCE_v2_IOMMU.  Aborts via
 * hw_error() if the bookkeeping entry is missing, since region_add must
 * have registered it.
 */
void vfio_container_del_section_window(VFIOContainer *container,
                                       MemoryRegionSection *section)
{
    if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
        return;
    }

    vfio_spapr_remove_window(container,
                             section->offset_within_address_space);
    if (vfio_host_win_del(container,
                          section->offset_within_address_space,
                          section->offset_within_address_space +
                          int128_get64(section->size) - 1) < 0) {
        hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
                 __func__, section->offset_within_address_space);
    }
}
int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
{
int ret;
@ -355,14 +264,6 @@ int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap,
return ret;
}
/*
 * Unregister @container's memory listener(s): always the main listener,
 * plus the pre-registration listener used only by the SPAPR TCE v2
 * IOMMU type.
 */
static void vfio_listener_release(VFIOContainer *container)
{
    memory_listener_unregister(&container->listener);
    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
        memory_listener_unregister(&container->prereg_listener);
    }
}
static struct vfio_info_cap_header *
vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
{
@ -382,7 +283,7 @@ bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
/* If the capability cannot be found, assume no DMA limiting */
hdr = vfio_get_iommu_type1_info_cap(info,
VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL);
if (hdr == NULL) {
if (!hdr) {
return false;
}
@ -394,6 +295,32 @@ bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
return true;
}
/*
 * Parse the VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE capability out of
 * @info and record each advertised IOVA range on
 * @container->iova_ranges (inserted via range_list_insert()).
 *
 * Returns true if the capability was found and parsed, false when the
 * kernel did not report it (container->iova_ranges is left untouched).
 */
static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info,
                                     VFIOContainer *container)
{
    struct vfio_info_cap_header *hdr;
    struct vfio_iommu_type1_info_cap_iova_range *cap;

    hdr = vfio_get_iommu_type1_info_cap(info,
                                        VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE);
    if (!hdr) {
        return false;
    }

    /* The capability payload follows immediately after its header. */
    cap = (void *)hdr;

    for (int i = 0; i < cap->nr_iovas; i++) {
        /* Each Range is heap-allocated; freed in vfio_free_container(). */
        Range *range = g_new(Range, 1);

        range_set_bounds(range, cap->iova_ranges[i].start,
                         cap->iova_ranges[i].end);
        container->iova_ranges =
            range_list_insert(container->iova_ranges, range);
    }

    return true;
}
static void vfio_kvm_device_add_group(VFIOGroup *group)
{
Error *err = NULL;
@ -535,6 +462,12 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container,
}
}
/*
 * Free @container, including the Range elements accumulated on its
 * iova_ranges list by vfio_get_info_iova_range().
 */
static void vfio_free_container(VFIOContainer *container)
{
    g_list_free_full(container->iova_ranges, g_free);
    g_free(container);
}
static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
Error **errp)
{
@ -616,8 +549,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
container->error = NULL;
container->dirty_pages_supported = false;
container->dma_max_mappings = 0;
container->iova_ranges = NULL;
QLIST_INIT(&container->giommu_list);
QLIST_INIT(&container->hostwin_list);
QLIST_INIT(&container->vrdl_list);
ret = vfio_init_container(container, group->fd, errp);
@ -652,84 +585,21 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
if (!vfio_get_info_dma_avail(info, &container->dma_max_mappings)) {
container->dma_max_mappings = 65535;
}
vfio_get_info_iova_range(info, container);
vfio_get_iommu_info_migration(container, info);
g_free(info);
/*
* FIXME: We should parse VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE
* information to get the actual window extent rather than assume
* a 64-bit IOVA address space.
*/
vfio_host_win_add(container, 0, (hwaddr)-1, container->pgsizes);
break;
}
case VFIO_SPAPR_TCE_v2_IOMMU:
case VFIO_SPAPR_TCE_IOMMU:
{
struct vfio_iommu_spapr_tce_info info;
bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;
/*
* The host kernel code implementing VFIO_IOMMU_DISABLE is called
* when container fd is closed so we do not call it explicitly
* in this file.
*/
if (!v2) {
ret = ioctl(fd, VFIO_IOMMU_ENABLE);
if (ret) {
error_setg_errno(errp, errno, "failed to enable container");
ret = -errno;
goto enable_discards_exit;
}
} else {
container->prereg_listener = vfio_prereg_listener;
memory_listener_register(&container->prereg_listener,
&address_space_memory);
if (container->error) {
memory_listener_unregister(&container->prereg_listener);
ret = -1;
error_propagate_prepend(errp, container->error,
"RAM memory listener initialization failed: ");
goto enable_discards_exit;
}
}
info.argsz = sizeof(info);
ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
ret = vfio_spapr_container_init(container, errp);
if (ret) {
error_setg_errno(errp, errno,
"VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
ret = -errno;
if (v2) {
memory_listener_unregister(&container->prereg_listener);
}
goto enable_discards_exit;
}
if (v2) {
container->pgsizes = info.ddw.pgsizes;
/*
* There is a default window in just created container.
* To make region_add/del simpler, we better remove this
* window now and let those iommu_listener callbacks
* create/remove them when needed.
*/
ret = vfio_spapr_remove_window(container, info.dma32_window_start);
if (ret) {
error_setg_errno(errp, -ret,
"failed to remove existing window");
goto enable_discards_exit;
}
} else {
/* The default table uses 4K pages */
container->pgsizes = 0x1000;
vfio_host_win_add(container, info.dma32_window_start,
info.dma32_window_start +
info.dma32_window_size - 1,
0x1000);
}
break;
}
}
@ -759,13 +629,17 @@ listener_release_exit:
QLIST_REMOVE(group, container_next);
QLIST_REMOVE(container, next);
vfio_kvm_device_del_group(group);
vfio_listener_release(container);
memory_listener_unregister(&container->listener);
if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU ||
container->iommu_type == VFIO_SPAPR_TCE_IOMMU) {
vfio_spapr_container_deinit(container);
}
enable_discards_exit:
vfio_ram_block_discard_disable(container, false);
free_container_exit:
g_free(container);
vfio_free_container(container);
close_fd_exit:
close(fd);
@ -789,7 +663,11 @@ static void vfio_disconnect_container(VFIOGroup *group)
* group.
*/
if (QLIST_EMPTY(&container->group_list)) {
vfio_listener_release(container);
memory_listener_unregister(&container->listener);
if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU ||
container->iommu_type == VFIO_SPAPR_TCE_IOMMU) {
vfio_spapr_container_deinit(container);
}
}
if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
@ -800,7 +678,6 @@ static void vfio_disconnect_container(VFIOGroup *group)
if (QLIST_EMPTY(&container->group_list)) {
VFIOAddressSpace *space = container->space;
VFIOGuestIOMMU *giommu, *tmp;
VFIOHostDMAWindow *hostwin, *next;
QLIST_REMOVE(container, next);
@ -811,15 +688,9 @@ static void vfio_disconnect_container(VFIOGroup *group)
g_free(giommu);
}
QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next,
next) {
QLIST_REMOVE(hostwin, hostwin_next);
g_free(hostwin);
}
trace_vfio_disconnect_container(container->fd);
close(container->fd);
g_free(container);
vfio_free_container(container);
vfio_put_address_space(space);
}
@ -975,103 +846,6 @@ static void vfio_put_base_device(VFIODevice *vbasedev)
close(vbasedev->fd);
}
/*
* Interfaces for IBM EEH (Enhanced Error Handling)
*/
/*
 * Check whether EEH operations can be trusted on this container.
 *
 * As of 2016-03-04 (linux-4.5) the host kernel EEH/VFIO implementation is
 * broken if there are multiple groups in a container.  The hardware works
 * in units of Partitionable Endpoints (== IOMMU groups) and the EEH
 * operations naively iterate across all groups in the container, without
 * any logic to make sure the groups have their state synchronized.  For
 * certain operations (ENABLE) that might be ok, until an error occurs,
 * but for others (GET_STATE) it's clearly broken.
 *
 * XXX Once fixed kernels exist, test for them here.
 */
static bool vfio_eeh_container_ok(VFIOContainer *container)
{
    VFIOGroup *first = QLIST_FIRST(&container->group_list);

    /* Usable only when the container holds exactly one group. */
    return first != NULL && QLIST_NEXT(first, container_next) == NULL;
}
/*
 * Issue the EEH PE operation @op on @container via VFIO_EEH_PE_OP.
 *
 * Returns the (non-negative) ioctl result on success, -EPERM when the
 * container does not hold exactly one group (kernel limitation, see
 * vfio_eeh_container_ok()), or -errno when the ioctl fails.
 */
static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op)
{
    struct vfio_eeh_pe_op pe_op = {
        .argsz = sizeof(pe_op),
        .op = op,
    };
    int ret;

    if (!vfio_eeh_container_ok(container)) {
        error_report("vfio/eeh: EEH_PE_OP 0x%x: "
                     "kernel requires a container with exactly one group", op);
        return -EPERM;
    }

    ret = ioctl(container->fd, VFIO_EEH_PE_OP, &pe_op);
    if (ret < 0) {
        error_report("vfio/eeh: EEH_PE_OP 0x%x failed: %m", op);
        return -errno;
    }

    return ret;
}
/*
 * Find the single container backing @as for EEH purposes.
 *
 * Returns NULL when the address space has no container, or more than one
 * (EEH state cannot yet be synchronized across multiple containers).
 */
static VFIOContainer *vfio_eeh_as_container(AddressSpace *as)
{
    VFIOAddressSpace *space = vfio_get_address_space(as);
    VFIOContainer *container = NULL;

    if (QLIST_EMPTY(&space->containers)) {
        /* No containers to act on */
        goto out;
    }

    container = QLIST_FIRST(&space->containers);

    if (QLIST_NEXT(container, next)) {
        /*
         * We don't yet have logic to synchronize EEH state across
         * multiple containers
         */
        container = NULL;
        goto out;
    }

out:
    /* Balance the vfio_get_address_space() call above. */
    vfio_put_address_space(space);
    return container;
}
/*
 * Report whether EEH operations can safely target the (single) container
 * backing @as.
 */
bool vfio_eeh_as_ok(AddressSpace *as)
{
    VFIOContainer *container = vfio_eeh_as_container(as);

    if (!container) {
        return false;
    }
    return vfio_eeh_container_ok(container);
}
/*
 * Perform EEH PE operation @op on the container backing @as.
 * Returns -ENODEV when no suitable container exists, otherwise the
 * result of vfio_eeh_container_op().
 */
int vfio_eeh_as_op(AddressSpace *as, uint32_t op)
{
    VFIOContainer *container = vfio_eeh_as_container(as);

    return container ? vfio_eeh_container_op(container, op) : -ENODEV;
}
static int vfio_device_groupid(VFIODevice *vbasedev, Error **errp)
{
char *tmp, group_path[PATH_MAX], *group_name;

View file

@ -23,7 +23,6 @@
#include <sys/ioctl.h>
#include "hw/vfio/vfio-common.h"
#include "hw/vfio/vfio.h"
#include "hw/hw.h"
#include "trace.h"
#include "qapi/error.h"

View file

@ -3081,7 +3081,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
struct stat st;
int i, ret;
bool is_mdev;
char uuid[UUID_FMT_LEN];
char uuid[UUID_STR_LEN];
char *name;
if (!vbasedev->sysfsdev) {

View file

@ -11,6 +11,11 @@
#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include <linux/vfio.h>
#ifdef CONFIG_KVM
#include <linux/kvm.h>
#endif
#include "sysemu/kvm.h"
#include "exec/address-spaces.h"
#include "hw/vfio/vfio-common.h"
#include "hw/hw.h"
@ -135,15 +140,90 @@ static void vfio_prereg_listener_region_del(MemoryListener *listener,
trace_vfio_prereg_unregister(reg.vaddr, reg.size, ret ? -errno : 0);
}
const MemoryListener vfio_prereg_listener = {
static const MemoryListener vfio_prereg_listener = {
.name = "vfio-pre-reg",
.region_add = vfio_prereg_listener_region_add,
.region_del = vfio_prereg_listener_region_del,
};
int vfio_spapr_create_window(VFIOContainer *container,
MemoryRegionSection *section,
hwaddr *pgsize)
/*
 * Register a host DMA window [@min_iova, @max_iova] with supported page
 * sizes @iova_pgsizes on @container.
 *
 * Aborts via hw_error() if the new window overlaps an already registered
 * one: overlapping host windows are not supported.
 */
static void vfio_host_win_add(VFIOContainer *container, hwaddr min_iova,
                              hwaddr max_iova, uint64_t iova_pgsizes)
{
    VFIOHostDMAWindow *hostwin;

    QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
        if (ranges_overlap(hostwin->min_iova,
                           hostwin->max_iova - hostwin->min_iova + 1,
                           min_iova,
                           max_iova - min_iova + 1)) {
            hw_error("%s: Overlapped IOMMU are not enabled", __func__);
        }
    }

    hostwin = g_malloc0(sizeof(*hostwin));

    hostwin->min_iova = min_iova;
    hostwin->max_iova = max_iova;
    hostwin->iova_pgsizes = iova_pgsizes;
    QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next);
}
/*
 * Remove the host DMA window exactly matching [@min_iova, @max_iova]
 * from @container.  Returns 0 on success, -1 when no such window is
 * registered.
 */
static int vfio_host_win_del(VFIOContainer *container,
                             hwaddr min_iova, hwaddr max_iova)
{
    VFIOHostDMAWindow *win;

    QLIST_FOREACH(win, &container->hostwin_list, hostwin_next) {
        if (win->min_iova != min_iova || win->max_iova != max_iova) {
            continue;
        }
        QLIST_REMOVE(win, hostwin_next);
        g_free(win);
        return 0;
    }

    return -1;
}
/*
 * Look up the host DMA window that fully contains [@iova, @end].
 * Returns NULL when no registered window covers the range.
 */
static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container,
                                            hwaddr iova, hwaddr end)
{
    VFIOHostDMAWindow *win;

    QLIST_FOREACH(win, &container->hostwin_list, hostwin_next) {
        if (win->min_iova <= iova && end <= win->max_iova) {
            return win;
        }
    }

    return NULL;
}
/*
 * Ask the kernel to remove the sPAPR TCE (DMA) window starting at
 * @offset_within_address_space.  Returns 0 on success, -errno on ioctl
 * failure.
 */
static int vfio_spapr_remove_window(VFIOContainer *container,
                                    hwaddr offset_within_address_space)
{
    struct vfio_iommu_spapr_tce_remove remove = {
        .argsz = sizeof(remove),
        .start_addr = offset_within_address_space,
    };
    int ret;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
    if (ret) {
        error_report("Failed to remove window at %"PRIx64,
                     (uint64_t)remove.start_addr);
        return -errno;
    }

    trace_vfio_spapr_remove_window(offset_within_address_space);

    return 0;
}
static int vfio_spapr_create_window(VFIOContainer *container,
MemoryRegionSection *section,
hwaddr *pgsize)
{
int ret = 0;
IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
@ -233,23 +313,195 @@ int vfio_spapr_create_window(VFIOContainer *container,
return 0;
}
int vfio_spapr_remove_window(VFIOContainer *container,
hwaddr offset_within_address_space)
int vfio_container_add_section_window(VFIOContainer *container,
MemoryRegionSection *section,
Error **errp)
{
struct vfio_iommu_spapr_tce_remove remove = {
.argsz = sizeof(remove),
.start_addr = offset_within_address_space,
};
VFIOHostDMAWindow *hostwin;
hwaddr pgsize = 0;
int ret;
ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
if (ret) {
error_report("Failed to remove window at %"PRIx64,
(uint64_t)remove.start_addr);
return -errno;
/*
* VFIO_SPAPR_TCE_IOMMU supports a single host window between
* [dma32_window_start, dma32_window_size), we need to ensure
* the section fall in this range.
*/
if (container->iommu_type == VFIO_SPAPR_TCE_IOMMU) {
hwaddr iova, end;
iova = section->offset_within_address_space;
end = iova + int128_get64(section->size) - 1;
if (!vfio_find_hostwin(container, iova, end)) {
error_setg(errp, "Container %p can't map guest IOVA region"
" 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container,
iova, end);
return -EINVAL;
}
return 0;
}
trace_vfio_spapr_remove_window(offset_within_address_space);
if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
return 0;
}
/* For now intersections are not allowed, we may relax this later */
QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
if (ranges_overlap(hostwin->min_iova,
hostwin->max_iova - hostwin->min_iova + 1,
section->offset_within_address_space,
int128_get64(section->size))) {
error_setg(errp,
"region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing"
"host DMA window [0x%"PRIx64",0x%"PRIx64"]",
section->offset_within_address_space,
section->offset_within_address_space +
int128_get64(section->size) - 1,
hostwin->min_iova, hostwin->max_iova);
return -EINVAL;
}
}
ret = vfio_spapr_create_window(container, section, &pgsize);
if (ret) {
error_setg_errno(errp, -ret, "Failed to create SPAPR window");
return ret;
}
vfio_host_win_add(container, section->offset_within_address_space,
section->offset_within_address_space +
int128_get64(section->size) - 1, pgsize);
#ifdef CONFIG_KVM
if (kvm_enabled()) {
VFIOGroup *group;
IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
struct kvm_vfio_spapr_tce param;
struct kvm_device_attr attr = {
.group = KVM_DEV_VFIO_GROUP,
.attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
.addr = (uint64_t)(unsigned long)&param,
};
if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD,
&param.tablefd)) {
QLIST_FOREACH(group, &container->group_list, container_next) {
param.groupfd = group->fd;
if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
error_setg_errno(errp, errno,
"vfio: failed GROUP_SET_SPAPR_TCE for "
"KVM VFIO device %d and group fd %d",
param.tablefd, param.groupfd);
return -errno;
}
trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
}
}
}
#endif
return 0;
}
/*
 * Tear down the DMA window backing @section: remove the kernel TCE window
 * and drop the matching host window bookkeeping entry.  Only meaningful
 * for the sPAPR TCE v2 IOMMU; a no-op for every other IOMMU type.
 *
 * Aborts via hw_error() if the bookkeeping entry is missing, since that
 * indicates add/del imbalance.
 */
void vfio_container_del_section_window(VFIOContainer *container,
                                       MemoryRegionSection *section)
{
    if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
        return;
    }

    vfio_spapr_remove_window(container,
                             section->offset_within_address_space);
    if (vfio_host_win_del(container,
                          section->offset_within_address_space,
                          section->offset_within_address_space +
                          int128_get64(section->size) - 1) < 0) {
        hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
                 __func__, section->offset_within_address_space);
    }
}
/*
 * sPAPR-specific container initialization.
 *
 * v1 (VFIO_SPAPR_TCE_IOMMU): enable the container and record the fixed
 * 32-bit DMA window reported by VFIO_IOMMU_SPAPR_TCE_GET_INFO as a host
 * window (the default table uses 4K pages).
 *
 * v2 (VFIO_SPAPR_TCE_v2_IOMMU): register the RAM pre-registration
 * listener and remove the kernel's default window so the iommu_listener
 * callbacks can create/remove windows on demand.
 *
 * Returns 0 on success; on failure returns a negative value with @errp
 * set and the v2 pre-registration listener unregistered again.
 */
int vfio_spapr_container_init(VFIOContainer *container, Error **errp)
{
    struct vfio_iommu_spapr_tce_info info;
    bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;
    int ret, fd = container->fd;

    QLIST_INIT(&container->hostwin_list);

    /*
     * The host kernel code implementing VFIO_IOMMU_DISABLE is called
     * when container fd is closed so we do not call it explicitly
     * in this file.
     */
    if (!v2) {
        ret = ioctl(fd, VFIO_IOMMU_ENABLE);
        if (ret) {
            error_setg_errno(errp, errno, "failed to enable container");
            return -errno;
        }
    } else {
        /* Track guest RAM regions for pre-registration with the kernel. */
        container->prereg_listener = vfio_prereg_listener;
        memory_listener_register(&container->prereg_listener,
                                 &address_space_memory);
        if (container->error) {
            ret = -1;
            error_propagate_prepend(errp, container->error,
                    "RAM memory listener initialization failed: ");
            goto listener_unregister_exit;
        }
    }

    info.argsz = sizeof(info);
    ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
    if (ret) {
        error_setg_errno(errp, errno,
                         "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
        ret = -errno;
        goto listener_unregister_exit;
    }

    if (v2) {
        container->pgsizes = info.ddw.pgsizes;
        /*
         * There is a default window in just created container.
         * To make region_add/del simpler, we better remove this
         * window now and let those iommu_listener callbacks
         * create/remove them when needed.
         */
        ret = vfio_spapr_remove_window(container, info.dma32_window_start);
        if (ret) {
            error_setg_errno(errp, -ret,
                             "failed to remove existing window");
            goto listener_unregister_exit;
        }
    } else {
        /* The default table uses 4K pages */
        container->pgsizes = 0x1000;
        vfio_host_win_add(container, info.dma32_window_start,
                          info.dma32_window_start +
                          info.dma32_window_size - 1,
                          0x1000);
    }

    return 0;

listener_unregister_exit:
    if (v2) {
        memory_listener_unregister(&container->prereg_listener);
    }
    return ret;
}
/*
 * sPAPR-specific container teardown: unregister the v2 RAM
 * pre-registration listener and free every host window entry added via
 * vfio_host_win_add().
 */
void vfio_spapr_container_deinit(VFIOContainer *container)
{
    VFIOHostDMAWindow *hostwin, *next;

    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
        memory_listener_unregister(&container->prereg_listener);
    }

    QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next,
                       next) {
        QLIST_REMOVE(hostwin, hostwin_next);
        g_free(hostwin);
    }
}

View file

@ -135,6 +135,7 @@ virtio_iommu_notify_flag_add(const char *name) "add notifier to mr %s"
virtio_iommu_notify_flag_del(const char *name) "del notifier from mr %s"
virtio_iommu_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)"
virtio_iommu_freeze_granule(uint64_t page_size_mask) "granule set to 0x%"PRIx64
virtio_iommu_host_resv_regions(const char *name, uint32_t index, uint64_t lob, uint64_t upb) "mr=%s host-resv-reg[%d] = [0x%"PRIx64",0x%"PRIx64"]"
# virtio-mem.c
virtio_mem_send_response(uint16_t type) "type=%" PRIu16

View file

@ -37,7 +37,7 @@ struct VirtIOIOMMUPCI {
static Property virtio_iommu_pci_properties[] = {
DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0),
DEFINE_PROP_ARRAY("reserved-regions", VirtIOIOMMUPCI,
vdev.nb_reserved_regions, vdev.reserved_regions,
vdev.nr_prop_resv_regions, vdev.prop_resv_regions,
qdev_prop_reserved_region, ReservedRegion),
DEFINE_PROP_END_OF_LIST(),
};
@ -54,9 +54,9 @@ static void virtio_iommu_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
"for the virtio-iommu-pci device");
return;
}
for (int i = 0; i < s->nb_reserved_regions; i++) {
if (s->reserved_regions[i].type != VIRTIO_IOMMU_RESV_MEM_T_RESERVED &&
s->reserved_regions[i].type != VIRTIO_IOMMU_RESV_MEM_T_MSI) {
for (int i = 0; i < s->nr_prop_resv_regions; i++) {
if (s->prop_resv_regions[i].type != VIRTIO_IOMMU_RESV_MEM_T_RESERVED &&
s->prop_resv_regions[i].type != VIRTIO_IOMMU_RESV_MEM_T_MSI) {
error_setg(errp, "reserved region %d has an invalid type", i);
error_append_hint(errp, "Valid values are 0 and 1\n");
return;

View file

@ -20,12 +20,15 @@
#include "qemu/osdep.h"
#include "qemu/log.h"
#include "qemu/iov.h"
#include "qemu/range.h"
#include "qemu/reserved-region.h"
#include "exec/target_page.h"
#include "hw/qdev-properties.h"
#include "hw/virtio/virtio.h"
#include "sysemu/kvm.h"
#include "sysemu/reset.h"
#include "sysemu/sysemu.h"
#include "qemu/reserved-region.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "trace.h"
@ -378,6 +381,19 @@ static void virtio_iommu_put_domain(gpointer data)
g_free(domain);
}
/*
 * Append the machine-property reserved regions of @sdev's IOMMU to the
 * device's per-device reserved-region list.
 */
static void add_prop_resv_regions(IOMMUDevice *sdev)
{
    VirtIOIOMMU *s = sdev->viommu;
    int i;

    for (i = 0; i < s->nr_prop_resv_regions; i++) {
        ReservedRegion *reg = g_new0(ReservedRegion, 1);

        /* Deep-copy so sdev->resv_regions owns (and can free) its entries. */
        *reg = s->prop_resv_regions[i];
        sdev->resv_regions = resv_region_list_insert(sdev->resv_regions, reg);
    }
}
static AddressSpace *virtio_iommu_find_add_as(PCIBus *bus, void *opaque,
int devfn)
{
@ -408,6 +424,7 @@ static AddressSpace *virtio_iommu_find_add_as(PCIBus *bus, void *opaque,
memory_region_init(&sdev->root, OBJECT(s), name, UINT64_MAX);
address_space_init(&sdev->as, &sdev->root, TYPE_VIRTIO_IOMMU);
add_prop_resv_regions(sdev);
/*
* Build the IOMMU disabled container with aliases to the
@ -444,6 +461,10 @@ static AddressSpace *virtio_iommu_find_add_as(PCIBus *bus, void *opaque,
return &sdev->as;
}
/* PCI IOMMU callbacks handed to pci_setup_iommu() at realize time. */
static const PCIIOMMUOps virtio_iommu_ops = {
    .get_address_space = virtio_iommu_find_add_as,
};
static int virtio_iommu_attach(VirtIOIOMMU *s,
struct virtio_iommu_req_attach *req)
{
@ -624,29 +645,30 @@ static int virtio_iommu_unmap(VirtIOIOMMU *s,
return ret;
}
static ssize_t virtio_iommu_fill_resv_mem_prop(VirtIOIOMMU *s, uint32_t ep,
static ssize_t virtio_iommu_fill_resv_mem_prop(IOMMUDevice *sdev, uint32_t ep,
uint8_t *buf, size_t free)
{
struct virtio_iommu_probe_resv_mem prop = {};
size_t size = sizeof(prop), length = size - sizeof(prop.head), total;
int i;
total = size * s->nb_reserved_regions;
GList *l;
total = size * g_list_length(sdev->resv_regions);
if (total > free) {
return -ENOSPC;
}
for (i = 0; i < s->nb_reserved_regions; i++) {
unsigned subtype = s->reserved_regions[i].type;
for (l = sdev->resv_regions; l; l = l->next) {
ReservedRegion *reg = l->data;
unsigned subtype = reg->type;
Range *range = &reg->range;
assert(subtype == VIRTIO_IOMMU_RESV_MEM_T_RESERVED ||
subtype == VIRTIO_IOMMU_RESV_MEM_T_MSI);
prop.head.type = cpu_to_le16(VIRTIO_IOMMU_PROBE_T_RESV_MEM);
prop.head.length = cpu_to_le16(length);
prop.subtype = subtype;
prop.start = cpu_to_le64(s->reserved_regions[i].low);
prop.end = cpu_to_le64(s->reserved_regions[i].high);
prop.start = cpu_to_le64(range_lob(range));
prop.end = cpu_to_le64(range_upb(range));
memcpy(buf, &prop, size);
@ -666,19 +688,27 @@ static int virtio_iommu_probe(VirtIOIOMMU *s,
uint8_t *buf)
{
uint32_t ep_id = le32_to_cpu(req->endpoint);
IOMMUMemoryRegion *iommu_mr = virtio_iommu_mr(s, ep_id);
size_t free = VIOMMU_PROBE_SIZE;
IOMMUDevice *sdev;
ssize_t count;
if (!virtio_iommu_mr(s, ep_id)) {
if (!iommu_mr) {
return VIRTIO_IOMMU_S_NOENT;
}
count = virtio_iommu_fill_resv_mem_prop(s, ep_id, buf, free);
sdev = container_of(iommu_mr, IOMMUDevice, iommu_mr);
if (!sdev) {
return -EINVAL;
}
count = virtio_iommu_fill_resv_mem_prop(sdev, ep_id, buf, free);
if (count < 0) {
return VIRTIO_IOMMU_S_INVAL;
}
buf += count;
free -= count;
sdev->probe_done = true;
return VIRTIO_IOMMU_S_OK;
}
@ -856,7 +886,7 @@ static IOMMUTLBEntry virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr,
bool bypass_allowed;
int granule;
bool found;
int i;
GList *l;
interval.low = addr;
interval.high = addr + 1;
@ -894,10 +924,10 @@ static IOMMUTLBEntry virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr,
goto unlock;
}
for (i = 0; i < s->nb_reserved_regions; i++) {
ReservedRegion *reg = &s->reserved_regions[i];
for (l = sdev->resv_regions; l; l = l->next) {
ReservedRegion *reg = l->data;
if (addr >= reg->low && addr <= reg->high) {
if (range_contains(&reg->range, addr)) {
switch (reg->type) {
case VIRTIO_IOMMU_RESV_MEM_T_MSI:
entry.perm = flag;
@ -1131,6 +1161,106 @@ static int virtio_iommu_set_page_size_mask(IOMMUMemoryRegion *mr,
return 0;
}
/**
 * rebuild_resv_regions: rebuild resv regions with both the
 * info of host resv ranges and property set resv ranges
 *
 * The previous list is freed and rebuilt: host-provided ranges first
 * (all tagged VIRTIO_IOMMU_RESV_MEM_T_RESERVED), then the machine
 * property regions on top.  Always returns 0.
 */
static int rebuild_resv_regions(IOMMUDevice *sdev)
{
    GList *l;
    int i = 0;

    /* free the existing list and rebuild it from scratch */
    g_list_free_full(sdev->resv_regions, g_free);
    sdev->resv_regions = NULL;

    /* First add host reserved regions if any, all tagged as RESERVED */
    for (l = sdev->host_resv_ranges; l; l = l->next) {
        ReservedRegion *reg = g_new0(ReservedRegion, 1);
        Range *r = (Range *)l->data;

        reg->type = VIRTIO_IOMMU_RESV_MEM_T_RESERVED;
        range_set_bounds(&reg->range, range_lob(r), range_upb(r));
        sdev->resv_regions = resv_region_list_insert(sdev->resv_regions, reg);
        trace_virtio_iommu_host_resv_regions(sdev->iommu_mr.parent_obj.name, i,
                                             range_lob(&reg->range),
                                             range_upb(&reg->range));
        i++;
    }
    /*
     * then add higher priority reserved regions set by the machine
     * through properties
     */
    add_prop_resv_regions(sdev);
    return 0;
}
/**
 * virtio_iommu_set_iova_ranges: Conveys the usable IOVA ranges
 *
 * The function turns those into reserved ranges. Once some
 * reserved ranges have been set, new reserved regions cannot be
 * added outside of the original ones.
 *
 * @mr: IOMMU MR
 * @iova_ranges: list of usable IOVA ranges
 * @errp: error handle
 *
 * Returns 0 on success, -EINVAL (with @errp set) when a newly computed
 * reserved range is not contained in one recorded earlier.
 */
static int virtio_iommu_set_iova_ranges(IOMMUMemoryRegion *mr,
                                        GList *iova_ranges,
                                        Error **errp)
{
    IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr);
    GList *current_ranges = sdev->host_resv_ranges;
    GList *l, *tmp, *new_ranges = NULL;
    int ret = -EINVAL;

    /* check that each new resv region is included in an existing one */
    if (sdev->host_resv_ranges) {
        /* Complement the usable ranges over [0, UINT64_MAX] => reserved. */
        range_inverse_array(iova_ranges,
                            &new_ranges,
                            0, UINT64_MAX);

        for (tmp = new_ranges; tmp; tmp = tmp->next) {
            Range *newr = (Range *)tmp->data;
            bool included = false;

            for (l = current_ranges; l; l = l->next) {
                Range * r = (Range *)l->data;

                if (range_contains_range(r, newr)) {
                    included = true;
                    break;
                }
            }
            if (!included) {
                goto error;
            }
        }
        /* all new reserved ranges are included in existing ones */
        ret = 0;
        goto out;
    }

    if (sdev->probe_done) {
        warn_report("%s: Notified about new host reserved regions after probe",
                    mr->parent_obj.name);
    }

    /* First notification: record the reserved ranges and rebuild the list. */
    range_inverse_array(iova_ranges,
                        &sdev->host_resv_ranges,
                        0, UINT64_MAX);
    rebuild_resv_regions(sdev);

    return 0;
error:
    error_setg(errp, "IOMMU mr=%s Conflicting host reserved ranges set!",
               mr->parent_obj.name);
out:
    g_list_free_full(new_ranges, g_free);
    return ret;
}
static void virtio_iommu_system_reset(void *opaque)
{
VirtIOIOMMU *s = opaque;
@ -1206,7 +1336,7 @@ static void virtio_iommu_device_realize(DeviceState *dev, Error **errp)
s->as_by_busptr = g_hash_table_new_full(NULL, NULL, NULL, g_free);
if (s->primary_bus) {
pci_setup_iommu(s->primary_bus, virtio_iommu_find_add_as, s);
pci_setup_iommu(s->primary_bus, &virtio_iommu_ops, s);
} else {
error_setg(errp, "VIRTIO-IOMMU is not attached to any PCI bus!");
}
@ -1426,6 +1556,7 @@ static void virtio_iommu_memory_region_class_init(ObjectClass *klass,
imrc->replay = virtio_iommu_replay;
imrc->notify_flag_changed = virtio_iommu_notify_flag_changed;
imrc->iommu_set_page_size_mask = virtio_iommu_set_page_size_mask;
imrc->iommu_set_iova_ranges = virtio_iommu_set_iova_ranges;
}
static const TypeInfo virtio_iommu_info = {