Merge tag 'for_upstream' of https://git.kernel.org/pub/scm/virt/kvm/mst/qemu into staging

virtio,pc,pci: features, cleanups, fixes

more memslots support in libvhost-user
support PCIe Gen5/Gen6 link speeds in pcie
more traces in vdpa
network simulation devices support in vdpa
SMBIOS type 9 descriptor implementation
Bump max_cpus to 4096 vcpus in q35
aw-bits and granule options in VIRTIO-IOMMU
Support reporting NUMA nodes for device memory using GI in acpi
Beginning of shutdown event support in pvpanic

fixes, cleanups all over the place.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

# -----BEGIN PGP SIGNATURE-----
#
# iQFDBAABCAAtFiEEXQn9CHHI+FuUyooNKB8NuNKNVGkFAmXw0TMPHG1zdEByZWRo
# YXQuY29tAAoJECgfDbjSjVRp8x4H+gLMoGwaGAX7gDGPgn2Ix4j/3kO77ZJ9X9k/
# 1KqZu/9eMS1j2Ei+vZqf05w7qRjxxhwDq3ilEXF/+UFqgAehLqpRRB8j5inqvzYt
# +jv0DbL11PBp/oFjWcytm5CbiVsvq8KlqCF29VNzc162XdtcduUOWagL96y8lJfZ
# uPrOoyeR7SMH9lp3LLLHWgu+9W4nOS03RroZ6Umj40y5B7yR0Rrppz8lMw5AoQtr
# 0gMRnFhYXeiW6CXdz+Tzcr7XfvkkYDi/j7ibiNSURLBfOpZa6Y8+kJGKxz5H1K1G
# 6ZY4PBcOpQzl+NMrktPHogczgJgOK10t+1i/R3bGZYw2Qn/93Eg=
# =C0UU
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 12 Mar 2024 22:03:31 GMT
# gpg:                using RSA key 5D09FD0871C8F85B94CA8A0D281F0DB8D28D5469
# gpg:                issuer "mst@redhat.com"
# gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>" [full]
# gpg:                 aka "Michael S. Tsirkin <mst@redhat.com>" [full]
# Primary key fingerprint: 0270 606B 6F3C DF3D 0B17  0970 C350 3912 AFBE 8E67
#      Subkey fingerprint: 5D09 FD08 71C8 F85B 94CA  8A0D 281F 0DB8 D28D 5469

* tag 'for_upstream' of https://git.kernel.org/pub/scm/virt/kvm/mst/qemu: (68 commits)
  docs/specs/pvpanic: document shutdown event
  hw/cxl: Fix missing reserved data in CXL Device DVSEC
  hmat acpi: Fix out of bounds access due to missing use of indirection
  hmat acpi: Do not add Memory Proximity Domain Attributes Structure targetting non existent memory.
  qemu-options.hx: Document the virtio-iommu-pci aw-bits option
  hw/arm/virt: Set virtio-iommu aw-bits default value to 48
  hw/i386/q35: Set virtio-iommu aw-bits default value to 39
  virtio-iommu: Add an option to define the input range width
  virtio-iommu: Trace domain range limits as unsigned int
  qemu-options.hx: Document the virtio-iommu-pci granule option
  virtio-iommu: Change the default granule to the host page size
  virtio-iommu: Add a granule property
  hw/i386/acpi-build: Add support for SRAT Generic Initiator structures
  hw/acpi: Implement the SRAT GI affinity structure
  qom: new object to associate device to NUMA node
  hw/i386/pc: Inline pc_cmos_init() into pc_cmos_init_late() and remove it
  hw/i386/pc: Set "normal" boot device order in pc_basic_device_init()
  hw/i386/pc: Avoid one use of the current_machine global
  hw/i386/pc: Remove "rtc_state" link again
  Revert "hw/i386/pc: Confine system flash handling to pc_sysfw"
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>

# Conflicts:
#	hw/core/machine.c
commit 6fc6931231
Peter Maydell <peter.maydell@linaro.org>, 2024-03-13 15:11:53 +00:00
56 changed files with 1428 additions and 384 deletions


@@ -2370,6 +2370,11 @@ F: hw/virtio/vhost-user-scmi*
 F: include/hw/virtio/vhost-user-scmi.h
 F: tests/qtest/libqos/virtio-scmi.*
 
+vdpa-net
+M: Hao Chen <chenh@yusur.tech>
+S: Maintained
+F: docs/system/devices/vdpa-net.rst
+
 virtio-crypto
 M: Gonglei <arei.gonglei@huawei.com>
 S: Supported


@@ -1839,7 +1839,9 @@ is sent by the front-end.
 When the ``VHOST_USER_PROTOCOL_F_SHARED_OBJECT`` protocol
 feature has been successfully negotiated, this message can be submitted
 by the backend to remove themselves from to the virtio-dmabuf shared
-table API. The shared table will remove the back-end device associated with
+table API. Only the back-end owning the entry (i.e., the one that first added
+it) will have permission to remove it. Otherwise, the message is ignored.
+The shared table will remove the back-end device associated with
 the UUID. If ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is negotiated, and the
 back-end sets the ``VHOST_USER_NEED_REPLY`` flag, the front-end must respond
 with zero when operation is successfully completed, or non-zero otherwise.


@@ -29,6 +29,8 @@ bit 1
   a guest panic has happened and will be handled by the guest;
   the host should record it or report it, but should not affect
   the execution of the guest.
+bit 2
+  a regular guest shutdown has happened and should be processed by the host
 
 PCI Interface
 -------------
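For reference, a minimal guest-side sketch of raising the new event (this assumes the ISA pvpanic device at its default I/O port 0x505; the macro names here are illustrative, the bit values follow the spec text above):

#include <sys/io.h>

#define PVPANIC_PORT          0x505
#define PVPANIC_PANICKED      (1 << 0)  /* bit 0: host handles the panic */
#define PVPANIC_CRASH_LOADED  (1 << 1)  /* bit 1: guest handles the panic */
#define PVPANIC_SHUTDOWN      (1 << 2)  /* bit 2: regular guest shutdown */

static void pvpanic_notify_shutdown(void)
{
    /* caller needs I/O privileges, e.g. ioperm(PVPANIC_PORT, 1, 1) */
    outb(PVPANIC_SHUTDOWN, PVPANIC_PORT);
}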


@@ -99,3 +99,4 @@ Emulated Devices
    devices/canokey.rst
    devices/usb-u2f.rst
    devices/igb.rst
+   devices/vdpa-net.rst


@@ -0,0 +1,121 @@
+vdpa net
+============
+
+This document explains the setup and usage of the vdpa network device.
+The vdpa network device is a paravirtualized vdpa emulated device.
+
+Description
+-----------
+
+vdpa net devices support dirty page bitmap marking and vring state
+saving and recovery. Users can use this vdpa device for live migration
+simulation testing in a nested virtualization environment.
+
+Registers layout
+----------------
+
+The vdpa device adds a live-migration register layout as follows::
+
+  Offset       Register Name         Bitwidth   Associated vq
+  0x0          LM_LOGGING_CTRL       4bits
+  0x10         LM_BASE_ADDR_LOW      32bits
+  0x14         LM_BASE_ADDR_HIGH     32bits
+  0x18         LM_END_ADDR_LOW       32bits
+  0x1c         LM_END_ADDR_HIGH      32bits
+  0x20         LM_RING_STATE_OFFSET  32bits     vq0
+  0x24         LM_RING_STATE_OFFSET  32bits     vq1
+  0x28         LM_RING_STATE_OFFSET  32bits     vq2
+  ......
+  0x20+1023*4  LM_RING_STATE_OFFSET  32bits     vq1023
+
+These registers are appended at the end of the notify BAR space.
+
+Architecture diagram
+--------------------
+::
+
+  |-------------------------------------------------------------------------|
+  | guest-L1-user-space                                                     |
+  |                                                                         |
+  |                     |----------------------------------------|          |
+  |                     |       [virtio-net driver]              |          |
+  |                     |              ^  guest-L2-src(iommu=on) |          |
+  |                     |--------------|-------------------------|          |
+  |                     |              |  qemu-L2-src(viommu)    |          |
+  |   [dpdk-vdpa]<->[vhost socket]<-+->[vhost-user backend(iommu=on)]       |
+  ---------------------------------------------------------------------------
+  ---------------------------------------------------------------------------
+  |                  ^                      guest-L1-kernel-space           |
+  |                  |                                                      |
+  |                [VFIO]                                                   |
+  |                  ^                                                      |
+  |                  |                      guest-L1-src(iommu=on)          |
+  ---------|-----------------------------------------------------------------
+  ---------|-----------------------------------------------------------------
+  |  [vdpa net device(iommu=on)]      [manager nic device]                  |
+  |         |                               |                               |
+  |         |                               |                               |
+  |  [tap device]     qemu-L1-src(viommu)   |                               |
+  ------------------------------------------+--------------------------------
+                                            |
+                                            |
+           ---------------------            |
+           | kernel net bridge |<-----------+
+           |       virbr0      |<----------------------------------+
+           ---------------------                                   |
+                                                                    |
+                                                                    |
+  --------------------------------------------------------------------------+
+  | guest-L1-user-space                                                     |
+  |                                                                         |
+  |                     |----------------------------------------|          |
+  |                     |       [virtio-net driver]              |          |
+  |                     |              ^  guest-L2-dst(iommu=on) |          |
+  |                     |--------------|-------------------------|          |
+  |                     |              |  qemu-L2-dst(viommu)    |          |
+  |   [dpdk-vdpa]<->[vhost socket]<-+->[vhost-user backend(iommu=on)]       |
+  --------------------------------------------------------------------------+
+  --------------------------------------------------------------------------+
+  |                  ^                      guest-L1-kernel-space           |
+  |                  |                                                      |
+  |                [VFIO]                                                   |
+  |                  ^                                                      |
+  |                  |                      guest-L1-dst(iommu=on)          |
+  ---------|----------------------------------------------------------------+
+  ---------|----------------------------------------------------------------+
+  |  [vdpa net device(iommu=on)]      [manager nic device]------------------+
+  |         |                                                               |
+  |         |                                                               |
+  |  [tap device]     qemu-L1-dst(viommu)                                   |
+  ---------------------------------------------------------------------------
+
+Device properties
+-----------------
+
+The virtio vdpa device can be configured with the following property:
+
+* ``vdpa=on`` enables the emulated vdpa device.
+
+Usages
+------
+
+This device adds virtio SR-IOV support and vdpa live-migration support.
+vdpa can be enabled from a libvirt domain XML file as follows::
+
+  <qemu:commandline xmlns:qemu='http://libvirt.org/schemas/domain/qemu/1.0'>
+    <qemu:arg value='-device'/>
+    <qemu:arg value='intel-iommu,intremap=on,device-iotlb=on,aw-bits=48'/>
+    <qemu:arg value='-netdev'/>
+    <qemu:arg value='tap,id=hostnet1,script=no,downscript=no,vhost=off'/>
+    <qemu:arg value='-device'/>
+    <qemu:arg value='virtio-net-pci,netdev=hostnet1,id=net1,mac=56:4a:b7:4f:4d:a9,bus=pci.6,addr=0x0,iommu_platform=on,ats=on,vdpa=on'/>
+  </qemu:commandline>
+
+Limitations
+-----------
+
+1. Depends on a tap device with ``vhost=off``.
+2. The nested virtualization environment only supports ``q35`` machines.
+3. Currently only split-vring live migration is supported.
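As a cross-check of the register layout documented above, the per-vq ring-state register offset works out as in this small sketch (names illustrative, not taken from the QEMU source):

#include <stdint.h>

#define LM_RING_STATE_BASE 0x20  /* first LM_RING_STATE_OFFSET register */

/* vq0 -> 0x20, vq1 -> 0x24, ..., vq1023 -> 0x20 + 1023 * 4 = 0x101c */
static inline uint32_t lm_ring_state_offset(uint16_t vq)
{
    return LM_RING_STATE_BASE + (uint32_t)vq * 4;
}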


@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include "qemu/osdep.h"
+#include "hw/acpi/acpi_generic_initiator.h"
+#include "hw/acpi/aml-build.h"
+#include "hw/boards.h"
+#include "hw/pci/pci_device.h"
+#include "qemu/error-report.h"
+
+typedef struct AcpiGenericInitiatorClass {
+    ObjectClass parent_class;
+} AcpiGenericInitiatorClass;
+
+OBJECT_DEFINE_TYPE_WITH_INTERFACES(AcpiGenericInitiator, acpi_generic_initiator,
+                                   ACPI_GENERIC_INITIATOR, OBJECT,
+                                   { TYPE_USER_CREATABLE },
+                                   { NULL })
+
+OBJECT_DECLARE_SIMPLE_TYPE(AcpiGenericInitiator, ACPI_GENERIC_INITIATOR)
+
+static void acpi_generic_initiator_init(Object *obj)
+{
+    AcpiGenericInitiator *gi = ACPI_GENERIC_INITIATOR(obj);
+
+    gi->node = MAX_NODES;
+    gi->pci_dev = NULL;
+}
+
+static void acpi_generic_initiator_finalize(Object *obj)
+{
+    AcpiGenericInitiator *gi = ACPI_GENERIC_INITIATOR(obj);
+
+    g_free(gi->pci_dev);
+}
+
+static void acpi_generic_initiator_set_pci_device(Object *obj, const char *val,
+                                                  Error **errp)
+{
+    AcpiGenericInitiator *gi = ACPI_GENERIC_INITIATOR(obj);
+
+    gi->pci_dev = g_strdup(val);
+}
+
+static void acpi_generic_initiator_set_node(Object *obj, Visitor *v,
+                                            const char *name, void *opaque,
+                                            Error **errp)
+{
+    AcpiGenericInitiator *gi = ACPI_GENERIC_INITIATOR(obj);
+    MachineState *ms = MACHINE(qdev_get_machine());
+    uint32_t value;
+
+    if (!visit_type_uint32(v, name, &value, errp)) {
+        return;
+    }
+
+    if (value >= MAX_NODES) {
+        error_printf("%s: Invalid NUMA node specified\n",
+                     TYPE_ACPI_GENERIC_INITIATOR);
+        exit(1);
+    }
+
+    gi->node = value;
+    ms->numa_state->nodes[gi->node].has_gi = true;
+}
+
+static void acpi_generic_initiator_class_init(ObjectClass *oc, void *data)
+{
+    object_class_property_add_str(oc, "pci-dev", NULL,
+        acpi_generic_initiator_set_pci_device);
+    object_class_property_add(oc, "node", "int", NULL,
+        acpi_generic_initiator_set_node, NULL, NULL);
+}
+
+/*
+ * ACPI 6.3:
+ * Table 5-78 Generic Initiator Affinity Structure
+ */
+static void
+build_srat_generic_pci_initiator_affinity(GArray *table_data, int node,
+                                          PCIDeviceHandle *handle)
+{
+    uint8_t index;
+
+    build_append_int_noprefix(table_data, 5, 1);    /* Type */
+    build_append_int_noprefix(table_data, 32, 1);   /* Length */
+    build_append_int_noprefix(table_data, 0, 1);    /* Reserved */
+    build_append_int_noprefix(table_data, 1, 1);    /* Device Handle Type: PCI */
+    build_append_int_noprefix(table_data, node, 4); /* Proximity Domain */
+
+    /* Device Handle - PCI */
+    build_append_int_noprefix(table_data, handle->segment, 2);
+    build_append_int_noprefix(table_data, handle->bdf, 2);
+    for (index = 0; index < 12; index++) {
+        build_append_int_noprefix(table_data, 0, 1);
+    }
+
+    build_append_int_noprefix(table_data, GEN_AFFINITY_ENABLED, 4); /* Flags */
+    build_append_int_noprefix(table_data, 0, 4);    /* Reserved */
+}
+
+static int build_all_acpi_generic_initiators(Object *obj, void *opaque)
+{
+    MachineState *ms = MACHINE(qdev_get_machine());
+    AcpiGenericInitiator *gi;
+    GArray *table_data = opaque;
+    PCIDeviceHandle dev_handle;
+    PCIDevice *pci_dev;
+    Object *o;
+
+    if (!object_dynamic_cast(obj, TYPE_ACPI_GENERIC_INITIATOR)) {
+        return 0;
+    }
+
+    gi = ACPI_GENERIC_INITIATOR(obj);
+    if (gi->node >= ms->numa_state->num_nodes) {
+        error_printf("%s: Specified node %d is invalid.\n",
+                     TYPE_ACPI_GENERIC_INITIATOR, gi->node);
+        exit(1);
+    }
+
+    o = object_resolve_path_type(gi->pci_dev, TYPE_PCI_DEVICE, NULL);
+    if (!o) {
+        error_printf("%s: Specified device must be a PCI device.\n",
+                     TYPE_ACPI_GENERIC_INITIATOR);
+        exit(1);
+    }
+
+    pci_dev = PCI_DEVICE(o);
+
+    dev_handle.segment = 0;
+    dev_handle.bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)),
+                                   pci_dev->devfn);
+    build_srat_generic_pci_initiator_affinity(table_data,
+                                              gi->node, &dev_handle);
+
+    return 0;
+}
+
+void build_srat_generic_pci_initiator(GArray *table_data)
+{
+    object_child_foreach_recursive(object_get_root(),
+                                   build_all_acpi_generic_initiators,
+                                   table_data);
+}
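The Device Handle written by build_srat_generic_pci_initiator_affinity() packs the PCI address the same way as QEMU's PCI_BUILD_BDF(): bus number in the high byte, devfn (device << 3 | function) in the low byte. A standalone sketch of that packing:

#include <stdint.h>
#include <stdio.h>

static uint16_t build_bdf(uint8_t bus, uint8_t dev, uint8_t fn)
{
    uint8_t devfn = (uint8_t)((dev << 3) | (fn & 0x7));
    return (uint16_t)((bus << 8) | devfn);
}

int main(void)
{
    /* e.g. a device at 0000:61:00.0 -> segment 0, bdf 0x6100 */
    printf("bdf=0x%04x\n", build_bdf(0x61, 0, 0));
    return 0;
}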


@@ -78,6 +78,7 @@ static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb,
                           uint32_t *initiator_list)
 {
     int i, index;
+    uint32_t initiator_to_index[MAX_NODES] = {};
     HMAT_LB_Data *lb_data;
     uint16_t *entry_list;
     uint32_t base;
@@ -121,6 +122,8 @@ static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb,
     /* Initiator Proximity Domain List */
     for (i = 0; i < num_initiator; i++) {
         build_append_int_noprefix(table_data, initiator_list[i], 4);
+        /* Reverse mapping for array possitions */
+        initiator_to_index[initiator_list[i]] = i;
     }
 
     /* Target Proximity Domain List */
@@ -132,7 +135,8 @@ static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb,
     entry_list = g_new0(uint16_t, num_initiator * num_target);
     for (i = 0; i < hmat_lb->list->len; i++) {
         lb_data = &g_array_index(hmat_lb->list, HMAT_LB_Data, i);
-        index = lb_data->initiator * num_target + lb_data->target;
+        index = initiator_to_index[lb_data->initiator] * num_target +
+                lb_data->target;
         entry_list[index] = (uint16_t)(lb_data->data / hmat_lb->base);
     }
 
@@ -204,6 +208,13 @@ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state)
     build_append_int_noprefix(table_data, 0, 4); /* Reserved */
 
     for (i = 0; i < numa_state->num_nodes; i++) {
+        /*
+         * Linux rejects whole HMAT table if a node with no memory
+         * has one of these structures listing it as a target.
+         */
+        if (!numa_state->nodes[i].node_mem) {
+            continue;
+        }
         flags = 0;
 
         if (numa_state->nodes[i].initiator < MAX_NODES) {
@@ -214,7 +225,7 @@ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state)
     }
 
     for (i = 0; i < numa_state->num_nodes; i++) {
-        if (numa_state->nodes[i].has_cpu) {
+        if (numa_state->nodes[i].has_cpu || numa_state->nodes[i].has_gi) {
             initiator_list[num_initiator++] = i;
         }
     }
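The indexing fix above matters because HMAT latency/bandwidth entries are indexed by an initiator's position in the Initiator Proximity Domain List, not by its proximity-domain number. A standalone sketch of the corrected indexing (the MAX_NODES value is an assumption here):

#include <stdint.h>

#define MAX_NODES 128  /* assumed for the sketch */

static uint32_t initiator_to_index[MAX_NODES];

static void build_reverse_map(const uint32_t *initiator_list, int num_initiator)
{
    for (int i = 0; i < num_initiator; i++) {
        initiator_to_index[initiator_list[i]] = i;
    }
}

/* with initiator_list = {0, 3}, initiator domain 3 maps to row 1, not row 3 */
static int entry_index(uint32_t initiator, uint32_t target, int num_target)
{
    return initiator_to_index[initiator] * num_target + target;
}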


@@ -1,5 +1,6 @@
 acpi_ss = ss.source_set()
 acpi_ss.add(files(
+  'acpi_generic_initiator.c',
   'acpi_interface.c',
   'aml-build.c',
   'bios-linker-loader.c',


@@ -57,6 +57,7 @@
 #include "migration/vmstate.h"
 #include "hw/acpi/ghes.h"
 #include "hw/acpi/viot.h"
+#include "hw/acpi/acpi_generic_initiator.h"
 #include "hw/virtio/virtio-acpi.h"
 #include "target/arm/multiprocessing.h"
 
@@ -504,6 +505,8 @@ build_srat(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
         }
     }
 
+    build_srat_generic_pci_initiator(table_data);
+
     if (ms->nvdimms_state->is_enabled) {
         nvdimm_build_srat(table_data);
     }


@@ -85,11 +85,28 @@
 #include "hw/char/pl011.h"
 #include "qemu/guest-random.h"
 
+static GlobalProperty arm_virt_compat[] = {
+    { TYPE_VIRTIO_IOMMU_PCI, "aw-bits", "48" },
+};
+static const size_t arm_virt_compat_len = G_N_ELEMENTS(arm_virt_compat);
+
+/*
+ * This cannot be called from the virt_machine_class_init() because
+ * TYPE_VIRT_MACHINE is abstract and mc->compat_props g_ptr_array_new()
+ * only is called on virt non abstract class init.
+ */
+static void arm_virt_compat_set(MachineClass *mc)
+{
+    compat_props_add(mc->compat_props, arm_virt_compat,
+                     arm_virt_compat_len);
+}
+
 #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \
     static void virt_##major##_##minor##_class_init(ObjectClass *oc, \
                                                     void *data) \
     { \
         MachineClass *mc = MACHINE_CLASS(oc); \
+        arm_virt_compat_set(mc); \
         virt_machine_##major##_##minor##_options(mc); \
         mc->desc = "QEMU " # major "." # minor " ARM Virtual Machine"; \
         if (latest) { \


@@ -243,12 +243,13 @@ static void virtio_snd_handle_pcm_info(VirtIOSound *s,
         memset(&pcm_info[i].padding, 0, 5);
     }
 
+    cmd->payload_size = sizeof(virtio_snd_pcm_info) * count;
     cmd->resp.code = cpu_to_le32(VIRTIO_SND_S_OK);
     iov_from_buf(cmd->elem->in_sg,
                  cmd->elem->in_num,
                  sizeof(virtio_snd_hdr),
                  pcm_info,
-                 sizeof(virtio_snd_pcm_info) * count);
+                 cmd->payload_size);
 }
 
 /*
@@ -749,7 +750,8 @@ process_cmd(VirtIOSound *s, virtio_snd_ctrl_command *cmd)
                  0,
                  &cmd->resp,
                  sizeof(virtio_snd_hdr));
-    virtqueue_push(cmd->vq, cmd->elem, sizeof(virtio_snd_hdr));
+    virtqueue_push(cmd->vq, cmd->elem,
+                   sizeof(virtio_snd_hdr) + cmd->payload_size);
     virtio_notify(VIRTIO_DEVICE(s), cmd->vq);
 }
 
@@ -808,6 +810,7 @@ static void virtio_snd_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq)
         cmd->elem = elem;
         cmd->vq = vq;
         cmd->resp.code = cpu_to_le32(VIRTIO_SND_S_OK);
+        /* implicit cmd->payload_size = 0; */
         QTAILQ_INSERT_TAIL(&s->cmdq, cmd, next);
         elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
     }


@@ -30,10 +30,13 @@
 #include "exec/confidential-guest-support.h"
 #include "hw/virtio/virtio-pci.h"
 #include "hw/virtio/virtio-net.h"
+#include "hw/virtio/virtio-iommu.h"
 #include "audio/audio.h"
 
 GlobalProperty hw_compat_8_2[] = {
     { "migration", "zero-page-detection", "legacy"},
+    { TYPE_VIRTIO_IOMMU_PCI, "granule", "4k" },
+    { TYPE_VIRTIO_IOMMU_PCI, "aw-bits", "64" },
 };
 const size_t hw_compat_8_2_len = G_N_ELEMENTS(hw_compat_8_2);
 


@@ -227,7 +227,8 @@ void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node,
                    node->target, numa_state->num_nodes);
         return;
     }
-    if (!numa_info[node->initiator].has_cpu) {
+    if (!numa_info[node->initiator].has_cpu &&
+        !numa_info[node->initiator].has_gi) {
         error_setg(errp, "Invalid initiator=%d, it isn't an "
                    "initiator proximity domain", node->initiator);
         return;


@@ -966,7 +966,7 @@ const PropertyInfo qdev_prop_off_auto_pcibar = {
     .set_default_value = qdev_propinfo_set_default_value_enum,
 };
 
-/* --- PCIELinkSpeed 2_5/5/8/16 -- */
+/* --- PCIELinkSpeed 2_5/5/8/16/32/64 -- */
 
 static void get_prop_pcielinkspeed(Object *obj, Visitor *v, const char *name,
                                    void *opaque, Error **errp)
@@ -988,6 +988,12 @@ static void get_prop_pcielinkspeed(Object *obj, Visitor *v, const char *name,
     case QEMU_PCI_EXP_LNK_16GT:
         speed = PCIE_LINK_SPEED_16;
         break;
+    case QEMU_PCI_EXP_LNK_32GT:
+        speed = PCIE_LINK_SPEED_32;
+        break;
+    case QEMU_PCI_EXP_LNK_64GT:
+        speed = PCIE_LINK_SPEED_64;
+        break;
     default:
         /* Unreachable */
         abort();
@@ -1021,6 +1027,12 @@ static void set_prop_pcielinkspeed(Object *obj, Visitor *v, const char *name,
     case PCIE_LINK_SPEED_16:
         *p = QEMU_PCI_EXP_LNK_16GT;
         break;
+    case PCIE_LINK_SPEED_32:
+        *p = QEMU_PCI_EXP_LNK_32GT;
+        break;
+    case PCIE_LINK_SPEED_64:
+        *p = QEMU_PCI_EXP_LNK_64GT;
+        break;
     default:
         /* Unreachable */
         abort();
@@ -1029,7 +1041,7 @@ static void set_prop_pcielinkspeed(Object *obj, Visitor *v, const char *name,
 const PropertyInfo qdev_prop_pcie_link_speed = {
     .name = "PCIELinkSpeed",
-    .description = "2_5/5/8/16",
+    .description = "2_5/5/8/16/32/64",
     .enum_table = &PCIELinkSpeed_lookup,
     .get = get_prop_pcielinkspeed,
     .set = set_prop_pcielinkspeed,
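For orientation, the property values added above correspond to PCIe generations as in this sketch (the string values come from the diff; the generation/rate mapping follows the PCIe spec):

/* value accepted by the property, PCIe generation, signaling rate */
static const struct {
    const char *value;
    int gen;
    const char *rate;
} speeds[] = {
    { "2_5", 1, "2.5 GT/s" },
    { "5",   2, "5 GT/s"   },
    { "8",   3, "8 GT/s"   },
    { "16",  4, "16 GT/s"  },
    { "32",  5, "32 GT/s"  },  /* new in this series */
    { "64",  6, "64 GT/s"  },  /* new in this series */
};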


@@ -297,6 +297,7 @@ void cxl_component_register_init_common(uint32_t *reg_state,
         caps = 3;
         break;
     case CXL2_ROOT_PORT:
+    case CXL2_RC:
         /* + Extended Security, + Snoop */
         caps = 5;
         break;
@@ -326,8 +327,19 @@ void cxl_component_register_init_common(uint32_t *reg_state,
                           CXL_##reg##_REGISTERS_OFFSET);                \
     } while (0)
 
-    init_cap_reg(RAS, 2, CXL_RAS_CAPABILITY_VERSION);
-    ras_init_common(reg_state, write_msk);
+    switch (type) {
+    case CXL2_DEVICE:
+    case CXL2_TYPE3_DEVICE:
+    case CXL2_LOGICAL_DEVICE:
+    case CXL2_ROOT_PORT:
+    case CXL2_UPSTREAM_PORT:
+    case CXL2_DOWNSTREAM_PORT:
+        init_cap_reg(RAS, 2, CXL_RAS_CAPABILITY_VERSION);
+        ras_init_common(reg_state, write_msk);
+        break;
+    default:
+        break;
+    }
 
     init_cap_reg(LINK, 4, CXL_LINK_CAPABILITY_VERSION);
 
@@ -335,9 +347,10 @@ void cxl_component_register_init_common(uint32_t *reg_state,
         return;
     }
 
-    init_cap_reg(HDM, 5, CXL_HDM_CAPABILITY_VERSION);
-    hdm_init_common(reg_state, write_msk, type);
+    if (type != CXL2_ROOT_PORT) {
+        init_cap_reg(HDM, 5, CXL_HDM_CAPABILITY_VERSION);
+        hdm_init_common(reg_state, write_msk, type);
+    }
 
     if (caps < 5) {
         return;
     }


@@ -68,6 +68,7 @@
 #include "hw/acpi/utils.h"
 #include "hw/acpi/pci.h"
 #include "hw/acpi/cxl.h"
+#include "hw/acpi/acpi_generic_initiator.h"
 #include "qom/qom-qobject.h"
 #include "hw/i386/amd_iommu.h"
 
@@ -2046,6 +2047,8 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
         build_srat_memory(table_data, 0, 0, 0, MEM_AFFINITY_NOFLAGS);
     }
 
+    build_srat_generic_pci_initiator(table_data);
+
     /*
      * Entry is required for Windows to enable memory hotplug in OS
      * and for Linux to enable SWIOTLB when booted with less than


@@ -425,9 +425,10 @@ static void set_boot_dev(PCMachineState *pcms, MC146818RtcState *s,
 
 static void pc_boot_set(void *opaque, const char *boot_device, Error **errp)
 {
-    PCMachineState *pcms = PC_MACHINE(current_machine);
+    PCMachineState *pcms = opaque;
+    X86MachineState *x86ms = X86_MACHINE(pcms);
 
-    set_boot_dev(pcms, opaque, boot_device, errp);
+    set_boot_dev(pcms, MC146818_RTC(x86ms->rtc), boot_device, errp);
 }
 
 static void pc_cmos_init_floppy(MC146818RtcState *rtc_state, ISADevice *floppy)
@@ -569,14 +570,6 @@ static void pc_cmos_init_late(PCMachineState *pcms)
     mc146818rtc_set_cmos_data(s, 0x39, val);
 
     pc_cmos_init_floppy(s, pc_find_fdc0());
-}
-
-void pc_cmos_init(PCMachineState *pcms,
-                  ISADevice *rtc)
-{
-    int val;
-    X86MachineState *x86ms = X86_MACHINE(pcms);
-    MC146818RtcState *s = MC146818_RTC(rtc);
 
     /* various important CMOS locations needed by PC/Bochs bios */
@@ -613,22 +606,10 @@
     mc146818rtc_set_cmos_data(s, 0x5c, val >> 8);
     mc146818rtc_set_cmos_data(s, 0x5d, val >> 16);
 
-    object_property_add_link(OBJECT(pcms), "rtc_state",
-                             TYPE_ISA_DEVICE,
-                             (Object **)&x86ms->rtc,
-                             object_property_allow_set_link,
-                             OBJ_PROP_LINK_STRONG);
-    object_property_set_link(OBJECT(pcms), "rtc_state", OBJECT(s),
-                             &error_abort);
-
-    set_boot_dev(pcms, s, MACHINE(pcms)->boot_config.order, &error_fatal);
-
     val = 0;
     val |= 0x02; /* FPU is there */
     val |= 0x04; /* PS/2 mouse installed */
     mc146818rtc_set_cmos_data(s, REG_EQUIPMENT_BYTE, val);
-
-    /* hard drives and FDC are handled by pc_cmos_init_late() */
 }
@@ -1261,7 +1242,9 @@ void pc_basic_device_init(struct PCMachineState *pcms,
     }
 #endif
 
-    qemu_register_boot_set(pc_boot_set, rtc_state);
+    qemu_register_boot_set(pc_boot_set, pcms);
+    set_boot_dev(pcms, MC146818_RTC(rtc_state),
+                 MACHINE(pcms)->boot_config.order, &error_fatal);
 
     if (!xen_enabled() &&
         (x86ms->pit == ON_OFF_AUTO_AUTO || x86ms->pit == ON_OFF_AUTO_ON)) {
@@ -1751,6 +1734,7 @@ static void pc_machine_initfn(Object *obj)
     pcms->fd_bootchk = true;
     pcms->default_bus_bypass_iommu = false;
 
+    pc_system_flash_create(pcms);
     pcms->pcspk = isa_new(TYPE_PC_SPEAKER);
     object_property_add_alias(OBJECT(pcms), "pcspk-audiodev",
                               OBJECT(pcms->pcspk), "audiodev");


@@ -228,6 +228,7 @@ static void pc_init1(MachineState *machine, const char *pci_type)
     assert(machine->ram_size == x86ms->below_4g_mem_size +
                                 x86ms->above_4g_mem_size);
 
+    pc_system_flash_cleanup_unused(pcms);
     if (machine->kernel_filename != NULL) {
         /* For xen HVM direct kernel boot, load linux here */
         xen_load_linux(pcms);
@@ -343,8 +344,6 @@ static void pc_init1(MachineState *machine, const char *pci_type)
     }
 #endif
 
-    pc_cmos_init(pcms, x86ms->rtc);
-
     if (piix4_pm) {
         smi_irq = qemu_allocate_irq(pc_acpi_smi_interrupt, first_cpu, 0);


@@ -45,6 +45,7 @@
 #include "hw/i386/pc.h"
 #include "hw/i386/amd_iommu.h"
 #include "hw/i386/intel_iommu.h"
+#include "hw/virtio/virtio-iommu.h"
 #include "hw/display/ramfb.h"
 #include "hw/ide/pci.h"
 #include "hw/ide/ahci-pci.h"
@@ -63,6 +64,12 @@
 /* ICH9 AHCI has 6 ports */
 #define MAX_SATA_PORTS     6
 
+static GlobalProperty pc_q35_compat_defaults[] = {
+    { TYPE_VIRTIO_IOMMU_PCI, "aw-bits", "39" },
+};
+static const size_t pc_q35_compat_defaults_len =
+    G_N_ELEMENTS(pc_q35_compat_defaults);
+
 struct ehci_companions {
     const char *name;
     int func;
@@ -311,8 +318,6 @@ static void pc_q35_init(MachineState *machine)
         smbus_eeprom_init(pcms->smbus, 8, NULL, 0);
     }
 
-    pc_cmos_init(pcms, x86ms->rtc);
-
     /* the rest devices to which pci devfn is automatically assigned */
     pc_vga_init(isa_bus, pcms->pcibus);
     pc_nic_init(pcmc, isa_bus, pcms->pcibus);
@@ -350,12 +355,14 @@ static void pc_q35_machine_options(MachineClass *m)
     m->default_nic = "e1000e";
     m->default_kernel_irqchip_split = false;
     m->no_floppy = 1;
-    m->max_cpus = 1024;
+    m->max_cpus = 4096;
     m->no_parallel = !module_object_class_by_name(TYPE_ISA_PARALLEL);
     machine_class_allow_dynamic_sysbus_dev(m, TYPE_AMD_IOMMU_DEVICE);
     machine_class_allow_dynamic_sysbus_dev(m, TYPE_INTEL_IOMMU_DEVICE);
     machine_class_allow_dynamic_sysbus_dev(m, TYPE_RAMFB_DEVICE);
     machine_class_allow_dynamic_sysbus_dev(m, TYPE_VMBUS_BRIDGE);
+    compat_props_add(m->compat_props,
+                     pc_q35_compat_defaults, pc_q35_compat_defaults_len);
 }
 
 static void pc_q35_9_0_machine_options(MachineClass *m)
@@ -371,6 +378,7 @@ static void pc_q35_8_2_machine_options(MachineClass *m)
 {
     pc_q35_9_0_machine_options(m);
     m->alias = NULL;
+    m->max_cpus = 1024;
     compat_props_add(m->compat_props, hw_compat_8_2, hw_compat_8_2_len);
     compat_props_add(m->compat_props, pc_compat_8_2, pc_compat_8_2_len);
 }


@@ -91,7 +91,19 @@ static PFlashCFI01 *pc_pflash_create(PCMachineState *pcms,
     return PFLASH_CFI01(dev);
 }
 
-static void pc_system_flash_cleanup_unused(PCMachineState *pcms)
+void pc_system_flash_create(PCMachineState *pcms)
+{
+    PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
+
+    if (pcmc->pci_enabled) {
+        pcms->flash[0] = pc_pflash_create(pcms, "system.flash0",
+                                          "pflash0");
+        pcms->flash[1] = pc_pflash_create(pcms, "system.flash1",
+                                          "pflash1");
+    }
+}
+
+void pc_system_flash_cleanup_unused(PCMachineState *pcms)
 {
     char *prop_name;
     int i;
@@ -198,9 +210,6 @@ void pc_system_firmware_init(PCMachineState *pcms,
         return;
     }
 
-    pcms->flash[0] = pc_pflash_create(pcms, "system.flash0", "pflash0");
-    pcms->flash[1] = pc_pflash_create(pcms, "system.flash1", "pflash1");
-
     /* Map legacy -drive if=pflash to machine properties */
     for (i = 0; i < ARRAY_SIZE(pcms->flash); i++) {
         pflash_cfi01_legacy_drive(pcms->flash[i],


@@ -488,12 +488,10 @@ static void igb_pci_uninit(PCIDevice *pci_dev)
 
 static void igb_qdev_reset_hold(Object *obj)
 {
-    PCIDevice *d = PCI_DEVICE(obj);
     IGBState *s = IGB(obj);
 
     trace_e1000e_cb_qdev_reset_hold();
 
-    pcie_sriov_pf_disable_vfs(d);
     igb_core_reset(&s->core);
 }
 


@@ -2039,6 +2039,22 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf,
             goto err;
         }
 
+        /* Mark dirty page's bitmap of guest memory */
+        if (vdev->lm_logging_ctrl == LM_ENABLE) {
+            uint64_t chunk = elem->in_addr[i] / VHOST_LOG_CHUNK;
+            /* Get chunk index */
+            BitmapMemoryRegionCaches *caches = qatomic_rcu_read(&vdev->caches);
+            uint64_t index = chunk / 8;
+            uint64_t shift = chunk % 8;
+            uint8_t val = 0;
+            address_space_read_cached(&caches->bitmap, index, &val,
+                                      sizeof(val));
+            val |= 1 << shift;
+            address_space_write_cached(&caches->bitmap, index, &val,
+                                       sizeof(val));
+            address_space_cache_invalidate(&caches->bitmap, index, sizeof(val));
+        }
+
         elems[i] = elem;
         lens[i] = total;
         i++;
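A standalone sketch of the bitmap arithmetic used above: one bit per VHOST_LOG_CHUNK-sized region of guest memory, eight chunks per bitmap byte (the 4 KiB chunk size here is an assumption for the sketch; the real value comes from the vhost headers):

#include <stdint.h>

#define VHOST_LOG_CHUNK 4096ULL  /* assumed for the sketch */

static void mark_dirty(uint8_t *bitmap, uint64_t guest_addr)
{
    uint64_t chunk = guest_addr / VHOST_LOG_CHUNK;      /* which bit */
    bitmap[chunk / 8] |= (uint8_t)(1u << (chunk % 8));  /* set it */
}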


@@ -7126,10 +7126,6 @@ static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst)
             sctrl = &n->sec_ctrl_list.sec[i];
             nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
         }
-
-        if (rst != NVME_RESET_CONTROLLER) {
-            pcie_sriov_pf_disable_vfs(pci_dev);
-        }
     }
 
     if (rst != NVME_RESET_CONTROLLER) {
@@ -8509,36 +8505,26 @@ static void nvme_pci_reset(DeviceState *qdev)
     nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
 }
 
-static void nvme_sriov_pre_write_ctrl(PCIDevice *dev, uint32_t address,
-                                      uint32_t val, int len)
+static void nvme_sriov_post_write_config(PCIDevice *dev, uint16_t old_num_vfs)
 {
     NvmeCtrl *n = NVME(dev);
     NvmeSecCtrlEntry *sctrl;
-    uint16_t sriov_cap = dev->exp.sriov_cap;
-    uint32_t off = address - sriov_cap;
-    int i, num_vfs;
+    int i;
 
-    if (!sriov_cap) {
-        return;
-    }
-
-    if (range_covers_byte(off, len, PCI_SRIOV_CTRL)) {
-        if (!(val & PCI_SRIOV_CTRL_VFE)) {
-            num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF);
-            for (i = 0; i < num_vfs; i++) {
-                sctrl = &n->sec_ctrl_list.sec[i];
-                nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
-            }
-        }
+    for (i = pcie_sriov_num_vfs(dev); i < old_num_vfs; i++) {
+        sctrl = &n->sec_ctrl_list.sec[i];
+        nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
     }
 }
 
 static void nvme_pci_write_config(PCIDevice *dev, uint32_t address,
                                   uint32_t val, int len)
 {
-    nvme_sriov_pre_write_ctrl(dev, address, val, len);
+    uint16_t old_num_vfs = pcie_sriov_num_vfs(dev);
+
     pci_default_write_config(dev, address, val, len);
     pcie_cap_flr_write_config(dev, address, val, len);
+    nvme_sriov_post_write_config(dev, old_num_vfs);
 }
 
 static const VMStateDescription nvme_vmstate = {


@@ -290,7 +290,7 @@ static void pxb_cxl_dev_reset(DeviceState *dev)
     uint32_t *write_msk = cxl_cstate->crb.cache_mem_regs_write_mask;
     int dsp_count = 0;
 
-    cxl_component_register_init_common(reg_state, write_msk, CXL2_ROOT_PORT);
+    cxl_component_register_init_common(reg_state, write_msk, CXL2_RC);
     /*
      * The CXL specification allows for host bridges with no HDM decoders
      * if they only have a single root port.


@@ -409,6 +409,7 @@ static void pci_do_device_reset(PCIDevice *dev)
 
     msi_reset(dev);
     msix_reset(dev);
+    pcie_sriov_pf_reset(dev);
 }
 
 /*


@@ -171,6 +171,14 @@ static void pcie_cap_fill_slot_lnk(PCIDevice *dev)
             pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2,
                                        PCI_EXP_LNKCAP2_SLS_16_0GB);
         }
+        if (s->speed > QEMU_PCI_EXP_LNK_16GT) {
+            pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2,
+                                       PCI_EXP_LNKCAP2_SLS_32_0GB);
+        }
+        if (s->speed > QEMU_PCI_EXP_LNK_32GT) {
+            pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2,
+                                       PCI_EXP_LNKCAP2_SLS_64_0GB);
+        }
     }
 }
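The cascading if blocks above exist because LNKCAP2's Supported Link Speeds Vector is cumulative: a device advertising 64 GT/s must also advertise every lower rate. A sketch of the resulting vector (bit 1 = 2.5 GT/s ... bit 6 = 64 GT/s, per the PCIe spec; the helper name is illustrative):

#include <stdint.h>

static uint32_t lnkcap2_sls_vector(int max_gen)  /* max_gen in 1..6 */
{
    uint32_t v = 0;

    for (int gen = 1; gen <= max_gen; gen++) {
        v |= 1u << gen;  /* SLS vector occupies the low bits from bit 1 up */
    }
    return v;  /* e.g. max_gen = 6 -> 0x7e: all six rates advertised */
}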


@@ -176,6 +176,9 @@ static void register_vfs(PCIDevice *dev)
 
     assert(sriov_cap > 0);
     num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF);
+    if (num_vfs > pci_get_word(dev->config + sriov_cap + PCI_SRIOV_TOTAL_VF)) {
+        return;
+    }
 
     dev->exp.sriov_pf.vf = g_new(PCIDevice *, num_vfs);
 
@@ -212,7 +215,6 @@ static void unregister_vfs(PCIDevice *dev)
     g_free(dev->exp.sriov_pf.vf);
     dev->exp.sriov_pf.vf = NULL;
     dev->exp.sriov_pf.num_vfs = 0;
-    pci_set_word(dev->config + dev->exp.sriov_cap + PCI_SRIOV_NUM_VF, 0);
 }
 
 void pcie_sriov_config_write(PCIDevice *dev, uint32_t address,
@@ -246,16 +248,28 @@ void pcie_sriov_config_write(PCIDevice *dev, uint32_t address,
     }
 }
 
-/* Reset SR/IOV VF Enable bit to trigger an unregister of all VFs */
-void pcie_sriov_pf_disable_vfs(PCIDevice *dev)
+/* Reset SR/IOV */
+void pcie_sriov_pf_reset(PCIDevice *dev)
 {
     uint16_t sriov_cap = dev->exp.sriov_cap;
-    if (sriov_cap) {
-        uint32_t val = pci_get_byte(dev->config + sriov_cap + PCI_SRIOV_CTRL);
-        if (val & PCI_SRIOV_CTRL_VFE) {
-            val &= ~PCI_SRIOV_CTRL_VFE;
-            pcie_sriov_config_write(dev, sriov_cap + PCI_SRIOV_CTRL, val, 1);
-        }
+    if (!sriov_cap) {
+        return;
+    }
+
+    pci_set_word(dev->config + sriov_cap + PCI_SRIOV_CTRL, 0);
+    unregister_vfs(dev);
+
+    pci_set_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF, 0);
+
+    /*
+     * Default is to use 4K pages, software can modify it
+     * to any of the supported bits
+     */
+    pci_set_word(dev->config + sriov_cap + PCI_SRIOV_SYS_PGSIZE, 0x1);
+
+    for (uint16_t i = 0; i < PCI_NUM_REGIONS; i++) {
+        pci_set_quad(dev->config + sriov_cap + PCI_SRIOV_BAR + i * 4,
+                     dev->exp.sriov_pf.vf_bar_type[i]);
     }
 }


@@ -121,6 +121,16 @@ struct type8_instance {
 };
 static QTAILQ_HEAD(, type8_instance) type8 = QTAILQ_HEAD_INITIALIZER(type8);
 
+/* type 9 instance for parsing */
+struct type9_instance {
+    const char *slot_designation, *pcidev;
+    uint8_t slot_type, slot_data_bus_width, current_usage, slot_length,
+            slot_characteristics1, slot_characteristics2;
+    uint16_t slot_id;
+    QTAILQ_ENTRY(type9_instance) next;
+};
+static QTAILQ_HEAD(, type9_instance) type9 = QTAILQ_HEAD_INITIALIZER(type9);
+
 static struct {
     size_t nvalues;
     char **values;
@@ -380,6 +390,59 @@ static const QemuOptDesc qemu_smbios_type8_opts[] = {
     { /* end of list */ }
 };
 
+static const QemuOptDesc qemu_smbios_type9_opts[] = {
+    {
+        .name = "type",
+        .type = QEMU_OPT_NUMBER,
+        .help = "SMBIOS element type",
+    },
+    {
+        .name = "slot_designation",
+        .type = QEMU_OPT_STRING,
+        .help = "string number for reference designation",
+    },
+    {
+        .name = "slot_type",
+        .type = QEMU_OPT_NUMBER,
+        .help = "connector type",
+    },
+    {
+        .name = "slot_data_bus_width",
+        .type = QEMU_OPT_NUMBER,
+        .help = "port type",
+    },
+    {
+        .name = "current_usage",
+        .type = QEMU_OPT_NUMBER,
+        .help = "current usage",
+    },
+    {
+        .name = "slot_length",
+        .type = QEMU_OPT_NUMBER,
+        .help = "system slot length",
+    },
+    {
+        .name = "slot_id",
+        .type = QEMU_OPT_NUMBER,
+        .help = "system slot id",
+    },
+    {
+        .name = "slot_characteristics1",
+        .type = QEMU_OPT_NUMBER,
+        .help = "slot characteristics1, see the spec",
+    },
+    {
+        .name = "slot_characteristics2",
+        .type = QEMU_OPT_NUMBER,
+        .help = "slot characteristics2, see the spec",
+    },
+    {
+        .name = "pci_device",
+        .type = QEMU_OPT_STRING,
+        .help = "PCI device, if provided."
+    }
+};
+
 static const QemuOptDesc qemu_smbios_type11_opts[] = {
     {
         .name = "type",
@@ -609,6 +672,7 @@ bool smbios_skip_table(uint8_t type, bool required_table)
 #define T2_BASE 0x200
 #define T3_BASE 0x300
 #define T4_BASE 0x400
+#define T9_BASE 0x900
 #define T11_BASE 0xe00
 #define T16_BASE 0x1000
 
@@ -807,6 +871,65 @@ static void smbios_build_type_8_table(void)
     }
 }
 
+static void smbios_build_type_9_table(Error **errp)
+{
+    unsigned instance = 0;
+    struct type9_instance *t9;
+
+    QTAILQ_FOREACH(t9, &type9, next) {
+        SMBIOS_BUILD_TABLE_PRE(9, T9_BASE + instance, true);
+
+        SMBIOS_TABLE_SET_STR(9, slot_designation, t9->slot_designation);
+        t->slot_type = t9->slot_type;
+        t->slot_data_bus_width = t9->slot_data_bus_width;
+        t->current_usage = t9->current_usage;
+        t->slot_length = t9->slot_length;
+        t->slot_id = t9->slot_id;
+        t->slot_characteristics1 = t9->slot_characteristics1;
+        t->slot_characteristics2 = t9->slot_characteristics2;
+
+        if (t9->pcidev) {
+            PCIDevice *pdev = NULL;
+            int rc = pci_qdev_find_device(t9->pcidev, &pdev);
+            if (rc != 0) {
+                error_setg(errp,
+                           "No PCI device %s for SMBIOS type 9 entry %s",
+                           t9->pcidev, t9->slot_designation);
+                return;
+            }
+            /*
+             * We only handle the case were the device is attached to
+             * the PCI root bus. The general case is more complex as
+             * bridges are enumerated later and the table would need
+             * to be updated at this moment.
+             */
+            if (!pci_bus_is_root(pci_get_bus(pdev))) {
+                error_setg(errp,
+                           "Cannot create type 9 entry for PCI device %s: "
+                           "not attached to the root bus",
+                           t9->pcidev);
+                return;
+            }
+            t->segment_group_number = cpu_to_le16(0);
+            t->bus_number = pci_dev_bus_num(pdev);
+            t->device_number = pdev->devfn;
+        } else {
+            /*
+             * Per SMBIOS spec, For slots that are not of the PCI, AGP, PCI-X,
+             * or PCI-Express type that do not have bus/device/function
+             * information, 0FFh should be populated in the fields of Segment
+             * Group Number, Bus Number, Device/Function Number.
+             */
+            t->segment_group_number = 0xff;
+            t->bus_number = 0xff;
+            t->device_number = 0xff;
+        }
+
+        SMBIOS_BUILD_TABLE_POST;
+        instance++;
+    }
+}
+
 static void smbios_build_type_11_table(void)
 {
     char count_str[128];
@@ -1126,6 +1249,7 @@ void smbios_get_tables(MachineState *ms,
         }
 
         smbios_build_type_8_table();
+        smbios_build_type_9_table(errp);
         smbios_build_type_11_table();
 
 #define MAX_DIMM_SZ (16 * GiB)
@@ -1460,6 +1584,24 @@ void smbios_entry_add(QemuOpts *opts, Error **errp)
             t8_i->port_type = qemu_opt_get_number(opts, "port_type", 0);
             QTAILQ_INSERT_TAIL(&type8, t8_i, next);
             return;
+        case 9: {
+            if (!qemu_opts_validate(opts, qemu_smbios_type9_opts, errp)) {
+                return;
+            }
+            struct type9_instance *t;
+            t = g_new0(struct type9_instance, 1);
+            save_opt(&t->slot_designation, opts, "slot_designation");
+            t->slot_type = qemu_opt_get_number(opts, "slot_type", 0);
+            t->slot_data_bus_width = qemu_opt_get_number(opts, "slot_data_bus_width", 0);
+            t->current_usage = qemu_opt_get_number(opts, "current_usage", 0);
+            t->slot_length = qemu_opt_get_number(opts, "slot_length", 0);
+            t->slot_id = qemu_opt_get_number(opts, "slot_id", 0);
+            t->slot_characteristics1 = qemu_opt_get_number(opts, "slot_characteristics1", 0);
+            t->slot_characteristics2 = qemu_opt_get_number(opts, "slot_characteristics2", 0);
+            save_opt(&t->pcidev, opts, "pcidev");
+            QTAILQ_INSERT_TAIL(&type9, t, next);
+            return;
+        }
         case 11:
            if (!qemu_opts_validate(opts, qemu_smbios_type11_opts, errp)) {
                 return;

@@ -30,6 +30,7 @@ vhost_user_write(uint32_t req, uint32_t flags) "req:%d flags:0x%"PRIx32""
 vhost_user_create_notifier(int idx, void *n) "idx:%d n:%p"
 
 # vhost-vdpa.c
+vhost_vdpa_skipped_memory_section(int is_ram, int is_iommu, int is_protected, int is_ram_device, uint64_t first, uint64_t last, int page_mask) "is_ram=%d, is_iommu=%d, is_protected=%d, is_ram_device=%d iova_min=0x%"PRIx64" iova_last=0x%"PRIx64" page_mask=0x%x"
 vhost_vdpa_dma_map(void *vdpa, int fd, uint32_t msg_type, uint32_t asid, uint64_t iova, uint64_t size, uint64_t uaddr, uint8_t perm, uint8_t type) "vdpa_shared:%p fd: %d msg_type: %"PRIu32" asid: %"PRIu32" iova: 0x%"PRIx64" size: 0x%"PRIx64" uaddr: 0x%"PRIx64" perm: 0x%"PRIx8" type: %"PRIu8
 vhost_vdpa_dma_unmap(void *vdpa, int fd, uint32_t msg_type, uint32_t asid, uint64_t iova, uint64_t size, uint8_t type) "vdpa_shared:%p fd: %d msg_type: %"PRIu32" asid: %"PRIu32" iova: 0x%"PRIx64" size: 0x%"PRIx64" type: %"PRIu8
 vhost_vdpa_listener_begin_batch(void *v, int fd, uint32_t msg_type, uint8_t type) "vdpa_shared:%p fd: %d msg_type: %"PRIu32" type: %"PRIu8
@@ -57,8 +58,8 @@ vhost_vdpa_dev_start(void *dev, bool started) "dev: %p started: %d"
 vhost_vdpa_set_log_base(void *dev, uint64_t base, unsigned long long size, int refcnt, int fd, void *log) "dev: %p base: 0x%"PRIx64" size: %llu refcnt: %d fd: %d log: %p"
 vhost_vdpa_set_vring_addr(void *dev, unsigned int index, unsigned int flags, uint64_t desc_user_addr, uint64_t used_user_addr, uint64_t avail_user_addr, uint64_t log_guest_addr) "dev: %p index: %u flags: 0x%x desc_user_addr: 0x%"PRIx64" used_user_addr: 0x%"PRIx64" avail_user_addr: 0x%"PRIx64" log_guest_addr: 0x%"PRIx64
 vhost_vdpa_set_vring_num(void *dev, unsigned int index, unsigned int num) "dev: %p index: %u num: %u"
-vhost_vdpa_set_vring_base(void *dev, unsigned int index, unsigned int num) "dev: %p index: %u num: %u"
-vhost_vdpa_get_vring_base(void *dev, unsigned int index, unsigned int num) "dev: %p index: %u num: %u"
+vhost_vdpa_set_dev_vring_base(void *dev, unsigned int index, unsigned int num, bool svq) "dev: %p index: %u num: %u svq: %d"
+vhost_vdpa_get_vring_base(void *dev, unsigned int index, unsigned int num, bool svq) "dev: %p index: %u num: %u svq: %d"
 vhost_vdpa_set_vring_kick(void *dev, unsigned int index, int fd) "dev: %p index: %u fd: %d"
 vhost_vdpa_set_vring_call(void *dev, unsigned int index, int fd) "dev: %p index: %u fd: %d"
 vhost_vdpa_get_features(void *dev, uint64_t features) "dev: %p features: 0x%"PRIx64
@@ -111,7 +112,7 @@ virtio_iommu_device_reset(void) "reset!"
 virtio_iommu_system_reset(void) "system reset!"
 virtio_iommu_get_features(uint64_t features) "device supports features=0x%"PRIx64
 virtio_iommu_device_status(uint8_t status) "driver status = %d"
-virtio_iommu_get_config(uint64_t page_size_mask, uint64_t start, uint64_t end, uint32_t domain_start, uint32_t domain_end, uint32_t probe_size, uint8_t bypass) "page_size_mask=0x%"PRIx64" input range start=0x%"PRIx64" input range end=0x%"PRIx64" domain range start=%d domain range end=%d probe_size=0x%x bypass=0x%x"
+virtio_iommu_get_config(uint64_t page_size_mask, uint64_t start, uint64_t end, uint32_t domain_start, uint32_t domain_end, uint32_t probe_size, uint8_t bypass) "page_size_mask=0x%"PRIx64" input range start=0x%"PRIx64" input range end=0x%"PRIx64" domain range start=%u domain range end=%u probe_size=0x%x bypass=0x%x"
 virtio_iommu_set_config(uint8_t bypass) "bypass=0x%x"
 virtio_iommu_attach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d"
 virtio_iommu_detach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d"


@@ -1610,11 +1610,27 @@ vhost_user_backend_handle_shared_object_add(struct vhost_dev *dev,
 }
 
 static int
-vhost_user_backend_handle_shared_object_remove(VhostUserShared *object)
+vhost_user_backend_handle_shared_object_remove(struct vhost_dev *dev,
+                                               VhostUserShared *object)
 {
     QemuUUID uuid;
 
     memcpy(uuid.data, object->uuid, sizeof(object->uuid));
+    switch (virtio_object_type(&uuid)) {
+    case TYPE_VHOST_DEV:
+    {
+        struct vhost_dev *owner = virtio_lookup_vhost_device(&uuid);
+        if (dev != owner) {
+            /* Not allowed to remove non-owned entries */
+            return 0;
+        }
+        break;
+    }
+    default:
+        /* Not allowed to remove non-owned entries */
+        return 0;
+    }
+
     return virtio_remove_resource(&uuid);
 }
 
@@ -1793,7 +1809,8 @@ static gboolean backend_read(QIOChannel *ioc, GIOCondition condition,
         ret = vhost_user_backend_handle_shared_object_add(dev, &payload.object);
         break;
     case VHOST_USER_BACKEND_SHARED_OBJECT_REMOVE:
-        ret = vhost_user_backend_handle_shared_object_remove(&payload.object);
+        ret = vhost_user_backend_handle_shared_object_remove(dev,
+                                                             &payload.object);
         break;
     case VHOST_USER_BACKEND_SHARED_OBJECT_LOOKUP:
         ret = vhost_user_backend_handle_shared_object_lookup(dev->opaque, ioc,


@@ -47,12 +47,17 @@ static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
                                                 int page_mask)
 {
     Int128 llend;
+    bool is_ram = memory_region_is_ram(section->mr);
+    bool is_iommu = memory_region_is_iommu(section->mr);
+    bool is_protected = memory_region_is_protected(section->mr);
 
-    if ((!memory_region_is_ram(section->mr) &&
-         !memory_region_is_iommu(section->mr)) ||
-        memory_region_is_protected(section->mr) ||
-        /* vhost-vDPA doesn't allow MMIO to be mapped  */
-        memory_region_is_ram_device(section->mr)) {
+    /* vhost-vDPA doesn't allow MMIO to be mapped  */
+    bool is_ram_device = memory_region_is_ram_device(section->mr);
+
+    if ((!is_ram && !is_iommu) || is_protected || is_ram_device) {
+        trace_vhost_vdpa_skipped_memory_section(is_ram, is_iommu, is_protected,
+                                                is_ram_device, iova_min,
+                                                iova_max, page_mask);
         return true;
     }
 
@@ -69,7 +74,7 @@ static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
      * size that maps to the kernel
      */
 
-    if (!memory_region_is_iommu(section->mr)) {
+    if (!is_iommu) {
         llend = vhost_vdpa_section_end(section, page_mask);
         if (int128_gt(llend, int128_make64(iova_max))) {
             error_report("RAM section out of device range (max=0x%" PRIx64
@@ -555,6 +560,11 @@ static bool vhost_vdpa_first_dev(struct vhost_dev *dev)
     return v->index == 0;
 }
 
+static bool vhost_vdpa_last_dev(struct vhost_dev *dev)
+{
+    return dev->vq_index + dev->nvqs == dev->vq_index_end;
+}
+
 static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
                                        uint64_t *features)
 {
@@ -965,7 +975,10 @@ static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
 static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
                                          struct vhost_vring_state *ring)
 {
-    trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
+    struct vhost_vdpa *v = dev->opaque;
+
+    trace_vhost_vdpa_set_dev_vring_base(dev, ring->index, ring->num,
+                                        v->shadow_vqs_enabled);
     return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
 }
 
@@ -1315,7 +1328,7 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
         vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
     }
 
-    if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
+    if (!vhost_vdpa_last_dev(dev)) {
         return 0;
     }
 
@@ -1337,7 +1350,7 @@ static void vhost_vdpa_reset_status(struct vhost_dev *dev)
 {
     struct vhost_vdpa *v = dev->opaque;
 
-    if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
+    if (!vhost_vdpa_last_dev(dev)) {
         return;
     }
 
@@ -1407,6 +1420,7 @@ static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
 
     if (v->shadow_vqs_enabled) {
         ring->num = virtio_queue_get_last_avail_idx(dev->vdev, ring->index);
+        trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num, true);
         return 0;
     }
 
@@ -1419,7 +1433,7 @@ static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
     }
 
     ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
-    trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num);
+    trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num, false);
     return ret;
 }
 
@@ -1447,7 +1461,15 @@ static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
         /* Remember last call fd because we can switch to SVQ anytime. */
         vhost_svq_set_svq_call_fd(svq, file->fd);
-        if (v->shadow_vqs_enabled) {
+        /*
+         * When SVQ is transitioning to off, shadow_vqs_enabled has
+         * not been set back to false yet, but the underlying call fd
+         * will have to switch back to the guest notifier to signal the
+         * passthrough virtqueues. In other situations, SVQ's own call
+         * fd shall be used to signal the device model.
+         */
+        if (v->shadow_vqs_enabled &&
+            v->shared->svq_switching != SVQ_TSTATE_DISABLING) {
             return 0;
         }


@@ -29,6 +29,7 @@
 #include "sysemu/reset.h"
 #include "sysemu/sysemu.h"
 #include "qemu/reserved-region.h"
+#include "qemu/units.h"
 #include "qapi/error.h"
 #include "qemu/error-report.h"
 #include "trace.h"
@@ -1115,8 +1116,8 @@ static int virtio_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu_mr,
 }
 
 /*
- * The default mask (TARGET_PAGE_MASK) is the smallest supported guest granule,
- * for example 0xfffffffffffff000. When an assigned device has page size
+ * The default mask depends on the "granule" property. For example, with
+ * 4k granule, it is -(4 * KiB). When an assigned device has page size
  * restrictions due to the hardware IOMMU configuration, apply this restriction
  * to the mask.
  */
@@ -1313,8 +1314,32 @@ static void virtio_iommu_device_realize(DeviceState *dev, Error **errp)
      * in vfio realize
      */
     s->config.bypass = s->boot_bypass;
-    s->config.page_size_mask = qemu_target_page_mask();
-    s->config.input_range.end = UINT64_MAX;
+    if (s->aw_bits < 32 || s->aw_bits > 64) {
+        error_setg(errp, "aw-bits must be within [32,64]");
+        return;
+    }
+    s->config.input_range.end =
+        s->aw_bits == 64 ? UINT64_MAX : BIT_ULL(s->aw_bits) - 1;
+
+    switch (s->granule_mode) {
+    case GRANULE_MODE_4K:
+        s->config.page_size_mask = -(4 * KiB);
+        break;
+    case GRANULE_MODE_8K:
+        s->config.page_size_mask = -(8 * KiB);
+        break;
+    case GRANULE_MODE_16K:
+        s->config.page_size_mask = -(16 * KiB);
+        break;
+    case GRANULE_MODE_64K:
+        s->config.page_size_mask = -(64 * KiB);
+        break;
+    case GRANULE_MODE_HOST:
+        s->config.page_size_mask = qemu_real_host_page_mask();
+        break;
+    default:
+        error_setg(errp, "Unsupported granule mode");
+    }
     s->config.domain_range.end = UINT32_MAX;
     s->config.probe_size = VIOMMU_PROBE_SIZE;
@@ -1522,6 +1547,9 @@ static Property virtio_iommu_properties[] = {
     DEFINE_PROP_LINK("primary-bus", VirtIOIOMMU, primary_bus,
                      TYPE_PCI_BUS, PCIBus *),
     DEFINE_PROP_BOOL("boot-bypass", VirtIOIOMMU, boot_bypass, true),
+    DEFINE_PROP_GRANULE_MODE("granule", VirtIOIOMMU, granule_mode,
+                             GRANULE_MODE_HOST),
+    DEFINE_PROP_UINT8("aw-bits", VirtIOIOMMU, aw_bits, 64),
     DEFINE_PROP_END_OF_LIST(),
 };
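
    An editorial aside, not part of the patch: the negated-size masks and
    the aw-bits range above reduce to plain bit arithmetic, as this
    self-contained sketch shows.

    #include <stdint.h>
    #include <stdio.h>

    #define KiB 1024ULL
    #define BIT_ULL(n) (1ULL << (n))

    int main(void)
    {
        /* -(4 * KiB) in a 64-bit field clears the low 12 bits. */
        uint64_t mask_4k = -(4 * KiB);          /* 0xfffffffffffff000 */
        uint64_t mask_64k = -(64 * KiB);        /* 0xffffffffffff0000 */
        /* aw-bits=39 gives a 39-bit IOVA space, i.e. end = 2^39 - 1. */
        uint64_t end_39 = BIT_ULL(39) - 1;      /* 0x7fffffffff */

        printf("%016llx %016llx %llx\n",
               (unsigned long long)mask_4k,
               (unsigned long long)mask_64k,
               (unsigned long long)end_39);
        return 0;
    }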


@@ -1442,6 +1442,155 @@ int virtio_pci_add_shm_cap(VirtIOPCIProxy *proxy,
     return virtio_pci_add_mem_cap(proxy, &cap.cap);
 }
 
+/* Called within call_rcu(). */
+static void bitmap_free_region_cache(BitmapMemoryRegionCaches *caches)
+{
+    assert(caches != NULL);
+    address_space_cache_destroy(&caches->bitmap);
+    g_free(caches);
+}
+
+static void lm_disable(VirtIODevice *vdev)
+{
+    BitmapMemoryRegionCaches *caches;
+
+    caches = qatomic_read(&vdev->caches);
+    qatomic_rcu_set(&vdev->caches, NULL);
+    if (caches) {
+        call_rcu(caches, bitmap_free_region_cache, rcu);
+    }
+}
+
+static void lm_enable(VirtIODevice *vdev)
+{
+    BitmapMemoryRegionCaches *old = vdev->caches;
+    BitmapMemoryRegionCaches *new = NULL;
+    hwaddr addr, end, size;
+    int64_t len;
+
+    addr = vdev->lm_base_addr_low | ((hwaddr)(vdev->lm_base_addr_high) << 32);
+    end = vdev->lm_end_addr_low | ((hwaddr)(vdev->lm_end_addr_high) << 32);
+    size = end - addr;
+    if (size <= 0) {
+        error_report("Invalid lm size.");
+        return;
+    }
+
+    new = g_new0(BitmapMemoryRegionCaches, 1);
+    len = address_space_cache_init(&new->bitmap, vdev->dma_as, addr, size,
+                                   true);
+    if (len < size) {
+        virtio_error(vdev, "Cannot map bitmap");
+        goto err_bitmap;
+    }
+    qatomic_rcu_set(&vdev->caches, new);
+
+    if (old) {
+        call_rcu(old, bitmap_free_region_cache, rcu);
+    }
+
+    return;
+
+err_bitmap:
+    address_space_cache_destroy(&new->bitmap);
+    g_free(new);
+}
+
+static uint64_t virtio_pci_lm_read(void *opaque, hwaddr addr,
+                                   unsigned size)
+{
+    VirtIOPCIProxy *proxy = opaque;
+    VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+    hwaddr offset_end = LM_VRING_STATE_OFFSET +
+        virtio_pci_queue_mem_mult(proxy) * VIRTIO_QUEUE_MAX;
+    uint32_t val;
+    int qid;
+
+    if (vdev == NULL) {
+        return UINT64_MAX;
+    }
+    switch (addr) {
+    case LM_LOGGING_CTRL:
+        val = vdev->lm_logging_ctrl;
+        break;
+    case LM_BASE_ADDR_LOW:
+        val = vdev->lm_base_addr_low;
+        break;
+    case LM_BASE_ADDR_HIGH:
+        val = vdev->lm_base_addr_high;
+        break;
+    case LM_END_ADDR_LOW:
+        val = vdev->lm_end_addr_low;
+        break;
+    case LM_END_ADDR_HIGH:
+        val = vdev->lm_end_addr_high;
+        break;
+    default:
+        if (addr >= LM_VRING_STATE_OFFSET && addr <= offset_end) {
+            qid = (addr - LM_VRING_STATE_OFFSET) /
+                virtio_pci_queue_mem_mult(proxy);
+            val = virtio_queue_get_vring_states(vdev, qid);
+        } else
+            val = 0;
+        break;
+    }
+
+    return val;
+}
+
+static void virtio_pci_lm_write(void *opaque, hwaddr addr,
+                                uint64_t val, unsigned size)
+{
+    VirtIOPCIProxy *proxy = opaque;
+    VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+    hwaddr offset_end = LM_VRING_STATE_OFFSET +
+        virtio_pci_queue_mem_mult(proxy) * VIRTIO_QUEUE_MAX;
+    int qid;
+
+    if (vdev == NULL) {
+        return;
+    }
+
+    switch (addr) {
+    case LM_LOGGING_CTRL:
+        vdev->lm_logging_ctrl = val;
+        switch (val) {
+        case LM_DISABLE:
+            lm_disable(vdev);
+            break;
+        case LM_ENABLE:
+            lm_enable(vdev);
+            break;
+        default:
+            virtio_error(vdev, "Unsupported LM_LOGGING_CTRL value: %"PRIx64,
+                         val);
+            break;
+        };
+        break;
+    case LM_BASE_ADDR_LOW:
+        vdev->lm_base_addr_low = val;
+        break;
+    case LM_BASE_ADDR_HIGH:
+        vdev->lm_base_addr_high = val;
+        break;
+    case LM_END_ADDR_LOW:
+        vdev->lm_end_addr_low = val;
+        break;
+    case LM_END_ADDR_HIGH:
+        vdev->lm_end_addr_high = val;
+        break;
+    default:
+        if (addr >= LM_VRING_STATE_OFFSET && addr <= offset_end) {
+            qid = (addr - LM_VRING_STATE_OFFSET) /
+                virtio_pci_queue_mem_mult(proxy);
+            virtio_queue_set_vring_states(vdev, qid, val);
+        } else
+            virtio_error(vdev, "Unsupported addr: %"PRIx64, addr);
+        break;
+    }
+}
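
    An editorial sketch, not part of the patch, of the order in which a
    driver-side agent might program this window; mmio_write32() is a
    hypothetical stand-in for a real 32-bit MMIO write, and only the LM_*
    offsets and LM_ENABLE come from the patch.

    #include <stdint.h>
    #include <stdio.h>

    #define LM_LOGGING_CTRL   0
    #define LM_BASE_ADDR_LOW  4
    #define LM_BASE_ADDR_HIGH 8
    #define LM_END_ADDR_LOW   12
    #define LM_END_ADDR_HIGH  16
    #define LM_ENABLE         0x01

    /* Stub standing in for a real 32-bit MMIO write. */
    static void mmio_write32(uint32_t off, uint32_t val)
    {
        printf("write32 off=%u val=0x%08x\n", off, val);
    }

    /* Program the window, enabling logging only after base/end are set. */
    static void lm_start_logging(uint64_t base, uint64_t end)
    {
        mmio_write32(LM_BASE_ADDR_LOW,  (uint32_t)base);
        mmio_write32(LM_BASE_ADDR_HIGH, (uint32_t)(base >> 32));
        mmio_write32(LM_END_ADDR_LOW,   (uint32_t)end);
        mmio_write32(LM_END_ADDR_HIGH,  (uint32_t)(end >> 32));
        mmio_write32(LM_LOGGING_CTRL, LM_ENABLE);
    }

    int main(void)
    {
        lm_start_logging(0x100000000ULL, 0x100010000ULL);
        return 0;
    }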
 static uint64_t virtio_pci_common_read(void *opaque, hwaddr addr,
                                        unsigned size)
 {
@@ -1823,6 +1972,15 @@ static void virtio_pci_modern_regions_init(VirtIOPCIProxy *proxy,
         },
         .endianness = DEVICE_LITTLE_ENDIAN,
     };
+    static const MemoryRegionOps lm_ops = {
+        .read = virtio_pci_lm_read,
+        .write = virtio_pci_lm_write,
+        .impl = {
+            .min_access_size = 1,
+            .max_access_size = 4,
+        },
+        .endianness = DEVICE_LITTLE_ENDIAN,
+    };
     g_autoptr(GString) name = g_string_new(NULL);
 
     g_string_printf(name, "virtio-pci-common-%s", vdev_name);
@@ -1859,6 +2017,14 @@ static void virtio_pci_modern_regions_init(VirtIOPCIProxy *proxy,
                           proxy,
                           name->str,
                           proxy->notify_pio.size);
+
+    if (proxy->flags & VIRTIO_PCI_FLAG_VDPA) {
+        g_string_printf(name, "virtio-pci-lm-%s", vdev_name);
+        memory_region_init_io(&proxy->lm.mr, OBJECT(proxy),
+                              &lm_ops,
+                              proxy,
+                              name->str,
+                              proxy->lm.size);
+    }
 }
 static void virtio_pci_modern_region_map(VirtIOPCIProxy *proxy,
@@ -2021,6 +2187,10 @@ static void virtio_pci_device_plugged(DeviceState *d, Error **errp)
     virtio_pci_modern_mem_region_map(proxy, &proxy->isr, &cap);
     virtio_pci_modern_mem_region_map(proxy, &proxy->device, &cap);
     virtio_pci_modern_mem_region_map(proxy, &proxy->notify, &notify.cap);
+    if (proxy->flags & VIRTIO_PCI_FLAG_VDPA) {
+        memory_region_add_subregion(&proxy->modern_bar,
+                                    proxy->lm.offset, &proxy->lm.mr);
+    }
     if (modern_pio) {
         memory_region_init(&proxy->io_bar, OBJECT(proxy),
@@ -2090,6 +2260,9 @@ static void virtio_pci_device_unplugged(DeviceState *d)
     virtio_pci_modern_mem_region_unmap(proxy, &proxy->isr);
     virtio_pci_modern_mem_region_unmap(proxy, &proxy->device);
     virtio_pci_modern_mem_region_unmap(proxy, &proxy->notify);
+    if (proxy->flags & VIRTIO_PCI_FLAG_VDPA) {
+        memory_region_del_subregion(&proxy->modern_bar, &proxy->lm.mr);
+    }
     if (modern_pio) {
         virtio_pci_modern_io_region_unmap(proxy, &proxy->notify_pio);
     }
@@ -2144,9 +2317,17 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp)
     proxy->notify_pio.type = VIRTIO_PCI_CAP_NOTIFY_CFG;
 
     /* subclasses can enforce modern, so do this unconditionally */
-    memory_region_init(&proxy->modern_bar, OBJECT(proxy), "virtio-pci",
-                       /* PCI BAR regions must be powers of 2 */
-                       pow2ceil(proxy->notify.offset + proxy->notify.size));
+    if (!(proxy->flags & VIRTIO_PCI_FLAG_VDPA)) {
+        memory_region_init(&proxy->modern_bar, OBJECT(proxy), "virtio-pci",
+                           /* PCI BAR regions must be powers of 2 */
+                           pow2ceil(proxy->notify.offset + proxy->notify.size));
+    } else {
+        proxy->lm.offset = proxy->notify.offset + proxy->notify.size;
+        proxy->lm.size = 0x20 + VIRTIO_QUEUE_MAX * 4;
+        memory_region_init(&proxy->modern_bar, OBJECT(proxy), "virtio-pci",
+                           /* PCI BAR regions must be powers of 2 */
+                           pow2ceil(proxy->lm.offset + proxy->lm.size));
+    }
 
     if (proxy->disable_legacy == ON_OFF_AUTO_AUTO) {
         proxy->disable_legacy = pcie_port ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
@@ -2301,6 +2482,8 @@ static Property virtio_pci_properties[] = {
                     VIRTIO_PCI_FLAG_INIT_FLR_BIT, true),
     DEFINE_PROP_BIT("aer", VirtIOPCIProxy, flags,
                     VIRTIO_PCI_FLAG_AER_BIT, false),
+    DEFINE_PROP_BIT("vdpa", VirtIOPCIProxy, flags,
+                    VIRTIO_PCI_FLAG_VDPA_BIT, false),
     DEFINE_PROP_END_OF_LIST(),
 };


@@ -3368,6 +3368,18 @@ static uint16_t virtio_queue_split_get_last_avail_idx(VirtIODevice *vdev,
     return vdev->vq[n].last_avail_idx;
 }
+static uint32_t virtio_queue_split_get_vring_states(VirtIODevice *vdev,
+                                                    int n)
+{
+    struct VirtQueue *vq = &vdev->vq[n];
+    uint16_t avail, used;
+
+    avail = vq->last_avail_idx;
+    used = vq->used_idx;
+
+    return avail | (uint32_t)used << 16;
+}
 unsigned int virtio_queue_get_last_avail_idx(VirtIODevice *vdev, int n)
 {
     if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) {
@@ -3377,6 +3389,33 @@ unsigned int virtio_queue_get_last_avail_idx(VirtIODevice *vdev, int n)
     }
 }
+unsigned int virtio_queue_get_vring_states(VirtIODevice *vdev, int n)
+{
+    if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) {
+        return -1;
+    } else {
+        return virtio_queue_split_get_vring_states(vdev, n);
+    }
+}
+
+static void virtio_queue_split_set_vring_states(VirtIODevice *vdev,
+                                                int n, uint32_t idx)
+{
+    struct VirtQueue *vq = &vdev->vq[n];
+
+    vq->last_avail_idx = (uint16_t)(idx & 0xffff);
+    vq->shadow_avail_idx = (uint16_t)(idx & 0xffff);
+    vq->used_idx = (uint16_t)(idx >> 16);
+}
+
+void virtio_queue_set_vring_states(VirtIODevice *vdev, int n, uint32_t idx)
+{
+    if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) {
+        return;
+    } else {
+        virtio_queue_split_set_vring_states(vdev, n, idx);
+    }
+}
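
    A small editorial example (not part of the patch) of the 16/16 packing
    these helpers rely on: the avail index lives in the low half of the
    state word and the used index in the high half.

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint16_t avail = 0x1234, used = 0x00ab;
        /* Pack as the split-ring state word: avail in bits 0-15, used in 16-31. */
        uint32_t state = avail | (uint32_t)used << 16;
        assert(state == 0x00ab1234);
        /* Unpack the way virtio_queue_split_set_vring_states() does. */
        assert((uint16_t)(state & 0xffff) == avail);
        assert((uint16_t)(state >> 16) == used);
        return 0;
    }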
 static void virtio_queue_packed_set_last_avail_idx(VirtIODevice *vdev,
                                                    int n, unsigned int idx)
 {


@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#ifndef ACPI_GENERIC_INITIATOR_H
+#define ACPI_GENERIC_INITIATOR_H
+
+#include "qom/object_interfaces.h"
+
+#define TYPE_ACPI_GENERIC_INITIATOR "acpi-generic-initiator"
+
+typedef struct AcpiGenericInitiator {
+    /* private */
+    Object parent;
+
+    /* public */
+    char *pci_dev;
+    uint16_t node;
+} AcpiGenericInitiator;
+
+/*
+ * ACPI 6.3:
+ * Table 5-81 Flags - Generic Initiator Affinity Structure
+ */
+typedef enum {
+    /*
+     * If clear, the OSPM ignores the contents of the Generic
+     * Initiator/Port Affinity Structure. This allows system firmware
+     * to populate the SRAT with a static number of structures, but only
+     * enable them as necessary.
+     */
+    GEN_AFFINITY_ENABLED = (1 << 0),
+} GenericAffinityFlags;
+
+/*
+ * ACPI 6.3:
+ * Table 5-80 Device Handle - PCI
+ */
+typedef struct PCIDeviceHandle {
+    uint16_t segment;
+    uint16_t bdf;
+} PCIDeviceHandle;
+
+void build_srat_generic_pci_initiator(GArray *table_data);
+
+#endif
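
    A usage sketch (editorial, not part of the patch; dev0 is assumed to be
    the id of an existing PCI device and NUMA node 1 to be defined elsewhere
    on the command line), matching the pci-dev/node properties documented in
    the QAPI hunk later in this series:

    -object acpi-generic-initiator,id=gi0,pci-dev=dev0,node=1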


@@ -230,6 +230,7 @@ struct virtio_snd_ctrl_command {
     VirtQueue *vq;
     virtio_snd_hdr ctrl;
     virtio_snd_hdr resp;
+    size_t payload_size;
     QTAILQ_ENTRY(virtio_snd_ctrl_command) next;
 };
 #endif


@@ -25,6 +25,7 @@ enum reg_type {
     CXL2_TYPE3_DEVICE,
     CXL2_LOGICAL_DEVICE,
     CXL2_ROOT_PORT,
+    CXL2_RC,
     CXL2_UPSTREAM_PORT,
     CXL2_DOWNSTREAM_PORT,
     CXL3_SWITCH_MAILBOX_CCI,


@@ -92,8 +92,9 @@ typedef struct CXLDVSECDevice {
     uint32_t range2_base_hi;
     uint32_t range2_base_lo;
     uint16_t cap3;
+    uint16_t resv;
 } QEMU_PACKED CXLDVSECDevice;
-QEMU_BUILD_BUG_ON(sizeof(CXLDVSECDevice) != 0x3A);
+QEMU_BUILD_BUG_ON(sizeof(CXLDVSECDevice) != PCIE_CXL_DEVICE_DVSEC_LENGTH);
 
 /*
  * CXL r3.1 Section 8.1.5: CXL Extensions DVSEC for Ports

@@ -211,6 +211,23 @@ struct smbios_type_8 {
     uint8_t port_type;
 } QEMU_PACKED;
 
+/* SMBIOS type 9 - System Slots (v2.1+) */
+struct smbios_type_9 {
+    struct smbios_structure_header header;
+    uint8_t slot_designation;
+    uint8_t slot_type;
+    uint8_t slot_data_bus_width;
+    uint8_t current_usage;
+    uint8_t slot_length;
+    uint16_t slot_id;
+    uint8_t slot_characteristics1;
+    uint8_t slot_characteristics2;
+    /* SMBIOS spec v2.6+ */
+    uint16_t segment_group_number;
+    uint8_t bus_number;
+    uint8_t device_number;
+} QEMU_PACKED;
+
 /* SMBIOS type 11 - OEM strings */
 struct smbios_type_11 {
     struct smbios_structure_header header;


@@ -178,8 +178,6 @@ void pc_basic_device_init(struct PCMachineState *pcms,
                           ISADevice *rtc_state,
                           bool create_fdctrl,
                           uint32_t hpet_irqs);
-void pc_cmos_init(PCMachineState *pcms,
-                  ISADevice *s);
 void pc_nic_init(PCMachineClass *pcmc, ISABus *isa_bus, PCIBus *pci_bus);
 void pc_i8259_create(ISABus *isa_bus, qemu_irq *i8259_irqs);
@@ -190,6 +188,8 @@ void pc_i8259_create(ISABus *isa_bus, qemu_irq *i8259_irqs);
 #define TYPE_PORT92 "port92"
 
 /* pc_sysfw.c */
+void pc_system_flash_create(PCMachineState *pcms);
+void pc_system_flash_cleanup_unused(PCMachineState *pcms);
 void pc_system_firmware_init(PCMachineState *pcms, MemoryRegion *rom_memory);
 bool pc_system_ovmf_table_find(const char *entry, uint8_t **data,
                                int *data_len);


@@ -39,6 +39,8 @@ typedef enum PCIExpLinkSpeed {
     QEMU_PCI_EXP_LNK_5GT,
     QEMU_PCI_EXP_LNK_8GT,
     QEMU_PCI_EXP_LNK_16GT,
+    QEMU_PCI_EXP_LNK_32GT,
+    QEMU_PCI_EXP_LNK_64GT,
 } PCIExpLinkSpeed;
 
 #define QEMU_PCI_EXP_LNKCAP_MLS(speed)        (speed)
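
    An illustrative use of the new speeds (editorial sketch; this assumes
    the experimental x-speed/x-width link-configuration properties that
    pcie-root-port exposes, which accept PCIELinkSpeed values):

    -device pcie-root-port,id=rp0,chassis=1,x-speed=32,x-width=16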


@@ -58,8 +58,8 @@ void pcie_sriov_pf_add_sup_pgsize(PCIDevice *dev, uint16_t opt_sup_pgsize);
 void pcie_sriov_config_write(PCIDevice *dev, uint32_t address,
                              uint32_t val, int len);
 
-/* Reset SR/IOV VF Enable bit to unregister all VFs */
-void pcie_sriov_pf_disable_vfs(PCIDevice *dev);
+/* Reset SR/IOV */
+void pcie_sriov_pf_reset(PCIDevice *dev);
 
 /* Get logical VF number of a VF - only valid for VFs */
 uint16_t pcie_sriov_vf_number(PCIDevice *dev);


@@ -30,6 +30,12 @@ typedef struct VhostVDPAHostNotifier {
     void *addr;
 } VhostVDPAHostNotifier;
 
+typedef enum SVQTransitionState {
+    SVQ_TSTATE_DISABLING = -1,
+    SVQ_TSTATE_DONE,
+    SVQ_TSTATE_ENABLING
+} SVQTransitionState;
+
 /* Info shared by all vhost_vdpa device models */
 typedef struct vhost_vdpa_shared {
     int device_fd;
@@ -47,6 +53,9 @@ typedef struct vhost_vdpa_shared {
     /* Vdpa must send shadow addresses as IOTLB key for data queues, not GPA */
     bool shadow_data;
 
+    /* SVQ switching is in progress, or already completed? */
+    SVQTransitionState svq_switching;
 } VhostVDPAShared;
 
 typedef struct vhost_vdpa {

@@ -24,6 +24,7 @@
 #include "hw/virtio/virtio.h"
 #include "hw/pci/pci.h"
 #include "qom/object.h"
+#include "qapi/qapi-types-virtio.h"
 
 #define TYPE_VIRTIO_IOMMU "virtio-iommu-device"
 #define TYPE_VIRTIO_IOMMU_PCI "virtio-iommu-pci"
@@ -66,6 +67,8 @@ struct VirtIOIOMMU {
     bool boot_bypass;
     Notifier machine_done;
     bool granule_frozen;
+    GranuleMode granule_mode;
+    uint8_t aw_bits;
 };
 
 #endif


@@ -43,6 +43,7 @@ enum {
     VIRTIO_PCI_FLAG_INIT_FLR_BIT,
     VIRTIO_PCI_FLAG_AER_BIT,
     VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED_BIT,
+    VIRTIO_PCI_FLAG_VDPA_BIT,
 };
 
 /* Need to activate work-arounds for buggy guests at vmstate load. */
@@ -89,6 +90,9 @@ enum {
 #define VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED \
     (1 << VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED_BIT)
 
+/* VDPA supported flags */
+#define VIRTIO_PCI_FLAG_VDPA   (1 << VIRTIO_PCI_FLAG_VDPA_BIT)
+
 typedef struct {
     MSIMessage msg;
     int virq;
@@ -140,6 +144,7 @@ struct VirtIOPCIProxy {
         };
         VirtIOPCIRegion regs[5];
     };
+    VirtIOPCIRegion lm;
     MemoryRegion modern_bar;
     MemoryRegion io_bar;
     uint32_t legacy_io_bar_idx;


@@ -35,6 +35,9 @@
                       (0x1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | \
                       (0x1ULL << VIRTIO_F_ANY_LAYOUT))
 
+#define LM_DISABLE 0x00
+#define LM_ENABLE  0x01
+
 struct VirtQueue;
 
 static inline hwaddr vring_align(hwaddr addr,
@@ -95,6 +98,11 @@ enum virtio_device_endian {
     VIRTIO_DEVICE_ENDIAN_BIG,
 };
 
+typedef struct BitmapMemoryRegionCaches {
+    struct rcu_head rcu;
+    MemoryRegionCache bitmap;
+} BitmapMemoryRegionCaches;
+
 /**
  * struct VirtIODevice - common VirtIO structure
  * @name: name of the device
@@ -128,6 +136,14 @@ struct VirtIODevice
     uint32_t generation;
     int nvectors;
     VirtQueue *vq;
+    uint8_t lm_logging_ctrl;
+    uint32_t lm_base_addr_low;
+    uint32_t lm_base_addr_high;
+    uint32_t lm_end_addr_low;
+    uint32_t lm_end_addr_high;
+
+    BitmapMemoryRegionCaches *caches;
+
     MemoryListener listener;
     uint16_t device_id;
     /* @vm_running: current VM running state via virtio_vmstate_change() */
@@ -379,8 +395,11 @@ hwaddr virtio_queue_get_desc_size(VirtIODevice *vdev, int n);
 hwaddr virtio_queue_get_avail_size(VirtIODevice *vdev, int n);
 hwaddr virtio_queue_get_used_size(VirtIODevice *vdev, int n);
 unsigned int virtio_queue_get_last_avail_idx(VirtIODevice *vdev, int n);
+unsigned int virtio_queue_get_vring_states(VirtIODevice *vdev, int n);
 void virtio_queue_set_last_avail_idx(VirtIODevice *vdev, int n,
                                      unsigned int idx);
+void virtio_queue_set_vring_states(VirtIODevice *vdev, int n,
+                                   unsigned int idx);
 void virtio_queue_restore_last_avail_idx(VirtIODevice *vdev, int n);
 void virtio_queue_invalidate_signalled_used(VirtIODevice *vdev, int n);
 void virtio_queue_update_used_idx(VirtIODevice *vdev, int n);


@@ -221,6 +221,13 @@ struct virtio_pci_cfg_cap {
 #define VIRTIO_PCI_COMMON_ADM_Q_IDX     60
 #define VIRTIO_PCI_COMMON_ADM_Q_NUM     62
 
+#define LM_LOGGING_CTRL         0
+#define LM_BASE_ADDR_LOW        4
+#define LM_BASE_ADDR_HIGH       8
+#define LM_END_ADDR_LOW         12
+#define LM_END_ADDR_HIGH        16
+
+#define LM_VRING_STATE_OFFSET   0x20
+
 #endif /* VIRTIO_PCI_NO_MODERN */
 
 /* Admin command status. */


@@ -41,6 +41,7 @@ struct NodeInfo {
     struct HostMemoryBackend *node_memdev;
     bool present;
     bool has_cpu;
+    bool has_gi;
     uint8_t lb_info_provided;
     uint16_t initiator;
     uint8_t distance[MAX_NODES];


@@ -23,3 +23,9 @@ colo_compare_tcp_info(const char *pkt, uint32_t seq, uint32_t ack, int hdlen, in
 # filter-rewriter.c
 colo_filter_rewriter_pkt_info(const char *func, const char *src, const char *dst, uint32_t seq, uint32_t ack, uint32_t flag) "%s: src/dst: %s/%s p: seq/ack=%u/%u flags=0x%x"
 colo_filter_rewriter_conn_offset(uint32_t offset) ": offset=%u"
+
+# vhost-vdpa.c
+vhost_vdpa_set_address_space_id(void *v, unsigned vq_group, unsigned asid_num) "vhost_vdpa: %p vq_group: %u asid: %u"
+vhost_vdpa_net_load_cmd(void *s, uint8_t class, uint8_t cmd, int data_num, int data_size) "vdpa state: %p class: %u cmd: %u sg_num: %d size: %d"
+vhost_vdpa_net_load_cmd_retval(void *s, uint8_t class, uint8_t cmd, int r) "vdpa state: %p class: %u cmd: %u retval: %d"
+vhost_vdpa_net_load_mq(void *s, int ncurqps) "vdpa state: %p current_qpairs: %d"


@@ -28,6 +28,7 @@
 #include "monitor/monitor.h"
 #include "migration/misc.h"
 #include "hw/virtio/vhost.h"
+#include "trace.h"
 
 /* Todo:need to add the multiqueue support here */
 typedef struct VhostVDPAState {
@@ -286,6 +287,21 @@ static ssize_t vhost_vdpa_receive(NetClientState *nc, const uint8_t *buf,
     return size;
 }
 
+/** From any vdpa net client, get the netclient of the i-th queue pair */
+static VhostVDPAState *vhost_vdpa_net_get_nc_vdpa(VhostVDPAState *s, int i)
+{
+    NICState *nic = qemu_get_nic(s->nc.peer);
+    NetClientState *nc_i = qemu_get_peer(nic->ncs, i);
+
+    return DO_UPCAST(VhostVDPAState, nc, nc_i);
+}
+
+static VhostVDPAState *vhost_vdpa_net_first_nc_vdpa(VhostVDPAState *s)
+{
+    return vhost_vdpa_net_get_nc_vdpa(s, 0);
+}
+
 static void vhost_vdpa_net_log_global_enable(VhostVDPAState *s, bool enable)
 {
     struct vhost_vdpa *v = &s->vhost_vdpa;
@@ -307,6 +323,8 @@ static void vhost_vdpa_net_log_global_enable(VhostVDPAState *s, bool enable)
     data_queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
     cvq = virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) ?
                                   n->max_ncs - n->max_queue_pairs : 0;
+    v->shared->svq_switching = enable ?
+        SVQ_TSTATE_ENABLING : SVQ_TSTATE_DISABLING;
     /*
      * TODO: vhost_net_stop does suspend, get_base and reset. We can be smarter
      * in the future and resume the device if read-only operations between
@@ -319,6 +337,7 @@ static void vhost_vdpa_net_log_global_enable(VhostVDPAState *s, bool enable)
     if (unlikely(r < 0)) {
         error_report("unable to start vhost net: %s(%d)", g_strerror(-r), -r);
     }
+    v->shared->svq_switching = SVQ_TSTATE_DONE;
 }
 
 static int vdpa_net_migration_state_notifier(NotifierWithReturn *notifier,
@@ -444,6 +463,8 @@ static int vhost_vdpa_set_address_space_id(struct vhost_vdpa *v,
     };
     int r;
 
+    trace_vhost_vdpa_set_address_space_id(v, vq_group, asid_num);
+
     r = ioctl(v->shared->device_fd, VHOST_VDPA_SET_GROUP_ASID, &asid);
     if (unlikely(r < 0)) {
         error_report("Can't set vq group %u asid %u, errno=%d (%s)",
@@ -510,7 +531,7 @@ dma_map_err:
 
 static int vhost_vdpa_net_cvq_start(NetClientState *nc)
 {
-    VhostVDPAState *s;
+    VhostVDPAState *s, *s0;
     struct vhost_vdpa *v;
     int64_t cvq_group;
     int r;
@@ -521,7 +542,8 @@ static int vhost_vdpa_net_cvq_start(NetClientState *nc)
     s = DO_UPCAST(VhostVDPAState, nc, nc);
     v = &s->vhost_vdpa;
 
-    v->shadow_vqs_enabled = v->shared->shadow_data;
+    s0 = vhost_vdpa_net_first_nc_vdpa(s);
+    v->shadow_vqs_enabled = s0->vhost_vdpa.shadow_vqs_enabled;
     s->vhost_vdpa.address_space_id = VHOST_VDPA_GUEST_PA_ASID;
 
     if (v->shared->shadow_data) {
@@ -695,6 +717,7 @@ static ssize_t vhost_vdpa_net_load_cmd(VhostVDPAState *s,
     assert(data_size < vhost_vdpa_net_cvq_cmd_page_len() - sizeof(ctrl));
 
     cmd_size = sizeof(ctrl) + data_size;
+    trace_vhost_vdpa_net_load_cmd(s, class, cmd, data_num, data_size);
     if (vhost_svq_available_slots(svq) < 2 ||
         iov_size(out_cursor, 1) < cmd_size) {
         /*
@@ -726,6 +749,7 @@ static ssize_t vhost_vdpa_net_load_cmd(VhostVDPAState *s,
     r = vhost_vdpa_net_cvq_add(s, &out, 1, &in, 1);
     if (unlikely(r < 0)) {
+        trace_vhost_vdpa_net_load_cmd_retval(s, class, cmd, r);
         return r;
     }
@@ -917,6 +941,8 @@ static int vhost_vdpa_net_load_mq(VhostVDPAState *s,
         return 0;
     }
 
+    trace_vhost_vdpa_net_load_mq(s, n->curr_queue_pairs);
+
     mq.virtqueue_pairs = cpu_to_le16(n->curr_queue_pairs);
     const struct iovec data = {
         .iov_base = &mq,

@@ -107,10 +107,14 @@
 #
 # @16: 16.0GT/s
 #
+# @32: 32.0GT/s
+#
+# @64: 64.0GT/s
+#
 # Since: 4.0
 ##
 { 'enum': 'PCIELinkSpeed',
-  'data': [ '2_5', '5', '8', '16' ] }
+  'data': [ '2_5', '5', '8', '16', '32', '64' ] }
 
 ##
 # @PCIELinkWidth:


@@ -811,6 +811,21 @@
 { 'struct': 'IOMMUFDProperties',
   'data': { '*fd': 'str' } }
 
+##
+# @AcpiGenericInitiatorProperties:
+#
+# Properties for acpi-generic-initiator objects.
+#
+# @pci-dev: PCI device ID to be associated with the node
+#
+# @node: NUMA node associated with the PCI device
+#
+# Since: 9.0
+##
+{ 'struct': 'AcpiGenericInitiatorProperties',
+  'data': { 'pci-dev': 'str',
+            'node': 'uint32' } }
+
 ##
 # @RngProperties:
 #
@@ -928,6 +943,7 @@
 ##
 { 'enum': 'ObjectType',
   'data': [
+    'acpi-generic-initiator',
     'authz-list',
     'authz-listfile',
     'authz-pam',
@@ -999,6 +1015,7 @@
             'id': 'str' },
   'discriminator': 'qom-type',
   'data': {
+      'acpi-generic-initiator': 'AcpiGenericInitiatorProperties',
       'authz-list': 'AuthZListProperties',
       'authz-listfile': 'AuthZListFileProperties',
       'authz-pam': 'AuthZPAMProperties',


@@ -1172,6 +1172,17 @@ SRST
     Please also refer to the wiki page for general scenarios of VT-d
     emulation in QEMU: https://wiki.qemu.org/Features/VT-d.
 
+``-device virtio-iommu-pci[,option=...]``
+    This is only supported by ``-machine q35`` (x86_64) and ``-machine virt`` (ARM).
+    It supports the following options:
+
+    ``granule=val`` (possible values are 4k, 8k, 16k, 64k and host; default: host)
+        This decides the default granule to be exposed by the
+        virtio-iommu. If host, the granule matches the host page size.
+
+    ``aw-bits=val`` (val between 32 and 64, default depends on machine)
+        This decides the address width of the IOVA address space.
+
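    An illustrative invocation combining both options (editorial example,
    not part of the patch):

    ::

        qemu-system-x86_64 -machine q35 -device virtio-iommu-pci,granule=4k,aw-bits=48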
 ERST
 
 DEF("name", HAS_ARG, QEMU_OPTION_name,
@@ -2718,6 +2729,9 @@ SRST
 ``-smbios type=4[,sock_pfx=str][,manufacturer=str][,version=str][,serial=str][,asset=str][,part=str][,processor-family=%d][,processor-id=%d]``
     Specify SMBIOS type 4 fields
 
+``-smbios type=9[,slot_designation=str][,slot_type=%d][,slot_data_bus_width=%d][,current_usage=%d][,slot_length=%d][,slot_id=%d][,slot_characteristics1=%d][,slot_characteristics2=%d][,pci_device=str]``
+    Specify SMBIOS type 9 fields
+
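    For example (editorial illustration; ``dev0`` is assumed to be the id
    of a PCI device defined elsewhere, and the numeric values follow the
    SMBIOS specification for the corresponding fields):

    ::

        -smbios type=9,slot_designation=Slot1,slot_type=123,slot_id=1,pci_device=dev0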
 ``-smbios type=11[,value=str][,path=filename]``
     Specify SMBIOS type 11 fields


@@ -43,6 +43,8 @@
 #include <fcntl.h>
 #include <sys/ioctl.h>
 #include <linux/vhost.h>
+#include <sys/vfs.h>
+#include <linux/magic.h>
 
 #ifdef __NR_userfaultfd
 #include <linux/userfaultfd.h>
@@ -195,30 +197,58 @@ vu_panic(VuDev *dev, const char *msg, ...)
      */
 }
+/* Search for a memory region that covers this guest physical address. */
+static VuDevRegion *
+vu_gpa_to_mem_region(VuDev *dev, uint64_t guest_addr)
+{
+    int low = 0;
+    int high = dev->nregions - 1;
+
+    /*
+     * Memory regions cannot overlap in guest physical address space. Each
+     * GPA belongs to exactly one memory region, so there can only be one
+     * match.
+     *
+     * We store our memory regions ordered by GPA and can simply perform a
+     * binary search.
+     */
+    while (low <= high) {
+        unsigned int mid = low + (high - low) / 2;
+        VuDevRegion *cur = &dev->regions[mid];
+
+        if (guest_addr >= cur->gpa && guest_addr < cur->gpa + cur->size) {
+            return cur;
+        }
+        if (guest_addr >= cur->gpa + cur->size) {
+            low = mid + 1;
+        }
+        if (guest_addr < cur->gpa) {
+            high = mid - 1;
+        }
+    }
+    return NULL;
+}
+
 /* Translate guest physical address to our virtual address.  */
 void *
 vu_gpa_to_va(VuDev *dev, uint64_t *plen, uint64_t guest_addr)
 {
-    unsigned int i;
+    VuDevRegion *r;
 
     if (*plen == 0) {
         return NULL;
     }
 
-    /* Find matching memory region. */
-    for (i = 0; i < dev->nregions; i++) {
-        VuDevRegion *r = &dev->regions[i];
-
-        if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) {
-            if ((guest_addr + *plen) > (r->gpa + r->size)) {
-                *plen = r->gpa + r->size - guest_addr;
-            }
-            return (void *)(uintptr_t)
-                guest_addr - r->gpa + r->mmap_addr + r->mmap_offset;
-        }
-    }
-
-    return NULL;
+    r = vu_gpa_to_mem_region(dev, guest_addr);
+    if (!r) {
+        return NULL;
+    }
+
+    if ((guest_addr + *plen) > (r->gpa + r->size)) {
+        *plen = r->gpa + r->size - guest_addr;
+    }
+    return (void *)(uintptr_t)guest_addr - r->gpa + r->mmap_addr +
+           r->mmap_offset;
 }
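
    An editorial worked example (not part of the patch) of the clamping
    rule vu_gpa_to_va() applies when a request crosses the end of its
    region:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t gpa = 0x1000, size = 0x2000;        /* region [0x1000, 0x3000) */
        uint64_t guest_addr = 0x2800, plen = 0x1000; /* request runs past the end */

        if (guest_addr + plen > gpa + size) {
            plen = gpa + size - guest_addr;          /* clamped to 0x800 */
        }
        assert(plen == 0x800);
        return 0;
    }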
 /* Translate qemu virtual address to our virtual address.  */
@@ -240,6 +270,221 @@ qva_to_va(VuDev *dev, uint64_t qemu_addr)
     return NULL;
 }
+static void
+vu_remove_all_mem_regs(VuDev *dev)
+{
+    unsigned int i;
+
+    for (i = 0; i < dev->nregions; i++) {
+        VuDevRegion *r = &dev->regions[i];
+
+        munmap((void *)(uintptr_t)r->mmap_addr, r->size + r->mmap_offset);
+    }
+    dev->nregions = 0;
+}
+
+static bool
+map_ring(VuDev *dev, VuVirtq *vq)
+{
+    vq->vring.desc = qva_to_va(dev, vq->vra.desc_user_addr);
+    vq->vring.used = qva_to_va(dev, vq->vra.used_user_addr);
+    vq->vring.avail = qva_to_va(dev, vq->vra.avail_user_addr);
+
+    DPRINT("Setting virtq addresses:\n");
+    DPRINT("    vring_desc  at %p\n", vq->vring.desc);
+    DPRINT("    vring_used  at %p\n", vq->vring.used);
+    DPRINT("    vring_avail at %p\n", vq->vring.avail);
+
+    return !(vq->vring.desc && vq->vring.used && vq->vring.avail);
+}
+
+static bool
+vu_is_vq_usable(VuDev *dev, VuVirtq *vq)
+{
+    if (unlikely(dev->broken)) {
+        return false;
+    }
+
+    if (likely(vq->vring.avail)) {
+        return true;
+    }
+
+    /*
+     * In corner cases, we might temporarily remove a memory region that
+     * mapped a ring. When removing a memory region we make sure to
+     * unmap any rings that would be impacted. Let's try to remap if we
+     * already succeeded mapping this ring once.
+     */
+    if (!vq->vra.desc_user_addr || !vq->vra.used_user_addr ||
+        !vq->vra.avail_user_addr) {
+        return false;
+    }
+    if (map_ring(dev, vq)) {
+        vu_panic(dev, "remapping queue on access");
+        return false;
+    }
+    return true;
+}
+
+static void
+unmap_rings(VuDev *dev, VuDevRegion *r)
+{
+    int i;
+
+    for (i = 0; i < dev->max_queues; i++) {
+        VuVirtq *vq = &dev->vq[i];
+        const uintptr_t desc = (uintptr_t)vq->vring.desc;
+        const uintptr_t used = (uintptr_t)vq->vring.used;
+        const uintptr_t avail = (uintptr_t)vq->vring.avail;
+
+        if (desc < r->mmap_addr || desc >= r->mmap_addr + r->size) {
+            continue;
+        }
+        if (used < r->mmap_addr || used >= r->mmap_addr + r->size) {
+            continue;
+        }
+        if (avail < r->mmap_addr || avail >= r->mmap_addr + r->size) {
+            continue;
+        }
+
+        DPRINT("Unmapping rings of queue %d\n", i);
+        vq->vring.desc = NULL;
+        vq->vring.used = NULL;
+        vq->vring.avail = NULL;
+    }
+}
+
+static size_t
+get_fd_hugepagesize(int fd)
+{
+#if defined(__linux__)
+    struct statfs fs;
+    int ret;
+
+    do {
+        ret = fstatfs(fd, &fs);
+    } while (ret != 0 && errno == EINTR);
+
+    if (!ret && (unsigned int)fs.f_type == HUGETLBFS_MAGIC) {
+        return fs.f_bsize;
+    }
+#endif
+    return 0;
+}
+
+static void
+_vu_add_mem_reg(VuDev *dev, VhostUserMemoryRegion *msg_region, int fd)
+{
+    const uint64_t start_gpa = msg_region->guest_phys_addr;
+    const uint64_t end_gpa = start_gpa + msg_region->memory_size;
+    int prot = PROT_READ | PROT_WRITE;
+    uint64_t mmap_offset, fd_offset;
+    size_t hugepagesize;
+    VuDevRegion *r;
+    void *mmap_addr;
+    int low = 0;
+    int high = dev->nregions - 1;
+    unsigned int idx;
+
+    DPRINT("Adding region %d\n", dev->nregions);
+    DPRINT("    guest_phys_addr: 0x%016"PRIx64"\n",
+           msg_region->guest_phys_addr);
+    DPRINT("    memory_size:     0x%016"PRIx64"\n",
+           msg_region->memory_size);
+    DPRINT("    userspace_addr:  0x%016"PRIx64"\n",
+           msg_region->userspace_addr);
+    DPRINT("    old mmap_offset: 0x%016"PRIx64"\n",
+           msg_region->mmap_offset);
+
+    if (dev->postcopy_listening) {
+        /*
+         * In postcopy we're using PROT_NONE here to catch anyone
+         * accessing it before we userfault
+         */
+        prot = PROT_NONE;
+    }
+
+    /*
+     * We will add memory regions into the array sorted by GPA. Perform a
+     * binary search to locate the insertion point: it will be at the low
+     * index.
+     */
+    while (low <= high) {
+        unsigned int mid = low + (high - low) / 2;
+        VuDevRegion *cur = &dev->regions[mid];
+
+        /* Overlap of GPA addresses. */
+        if (start_gpa < cur->gpa + cur->size && cur->gpa < end_gpa) {
+            vu_panic(dev, "regions with overlapping guest physical addresses");
+            return;
+        }
+        if (start_gpa >= cur->gpa + cur->size) {
+            low = mid + 1;
+        }
+        if (start_gpa < cur->gpa) {
+            high = mid - 1;
+        }
+    }
+    idx = low;
+
+    /*
+     * Convert most of msg_region->mmap_offset to fd_offset. In almost all
+     * cases, this will leave us with mmap_offset == 0, mmap()'ing only
+     * what we really need. Only if a memory region would partially cover
+     * hugetlb pages, we'd get mmap_offset != 0, which usually doesn't happen
+     * anymore (i.e., modern QEMU).
+     *
+     * Note that mmap() with hugetlb would fail if the offset into the file
+     * is not aligned to the huge page size.
+     */
+    hugepagesize = get_fd_hugepagesize(fd);
+    if (hugepagesize) {
+        fd_offset = ALIGN_DOWN(msg_region->mmap_offset, hugepagesize);
+        mmap_offset = msg_region->mmap_offset - fd_offset;
+    } else {
+        fd_offset = msg_region->mmap_offset;
+        mmap_offset = 0;
+    }
+
+    DPRINT("    fd_offset:       0x%016"PRIx64"\n",
+           fd_offset);
+    DPRINT("    new mmap_offset: 0x%016"PRIx64"\n",
+           mmap_offset);
+
+    mmap_addr = mmap(0, msg_region->memory_size + mmap_offset,
+                     prot, MAP_SHARED | MAP_NORESERVE, fd, fd_offset);
+    if (mmap_addr == MAP_FAILED) {
+        vu_panic(dev, "region mmap error: %s", strerror(errno));
+        return;
+    }
+    DPRINT("    mmap_addr:       0x%016"PRIx64"\n",
+           (uint64_t)(uintptr_t)mmap_addr);
+
+#if defined(__linux__)
+    /* Don't include all guest memory in a coredump. */
+    madvise(mmap_addr, msg_region->memory_size + mmap_offset,
+            MADV_DONTDUMP);
+#endif
+
+    /* Shift all affected entries by 1 to open a hole at idx. */
+    r = &dev->regions[idx];
+    memmove(r + 1, r, sizeof(VuDevRegion) * (dev->nregions - idx));
+    r->gpa = msg_region->guest_phys_addr;
+    r->size = msg_region->memory_size;
+    r->qva = msg_region->userspace_addr;
+    r->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
+    r->mmap_offset = mmap_offset;
+    dev->nregions++;
+
+    if (dev->postcopy_listening) {
+        /*
+         * Return the address to QEMU so that it can translate the ufd
+         * fault addresses back.
+         */
+        msg_region->userspace_addr = r->mmap_addr + r->mmap_offset;
+    }
+}
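
    An editorial worked example (not part of the patch) of the
    fd_offset/mmap_offset split above, assuming a 2 MiB huge page size;
    ALIGN_DOWN is redefined locally so the sketch is self-contained:

    #include <assert.h>
    #include <stdint.h>

    #define ALIGN_DOWN(n, m) ((n) / (m) * (m))

    int main(void)
    {
        /* A region starting 0x201000 bytes into a 2 MiB hugetlbfs file. */
        uint64_t msg_mmap_offset = 0x201000;
        uint64_t hugepagesize = 2 * 1024 * 1024;

        uint64_t fd_offset = ALIGN_DOWN(msg_mmap_offset, hugepagesize);
        uint64_t mmap_offset = msg_mmap_offset - fd_offset;

        assert(fd_offset == 0x200000);  /* huge-page aligned, usable with mmap() */
        assert(mmap_offset == 0x1000);  /* small remainder mapped additionally */
        return 0;
    }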
 static void
 vmsg_close_fds(VhostUserMsg *vmsg)
 {
@@ -612,21 +857,6 @@ vu_reset_device_exec(VuDev *dev, VhostUserMsg *vmsg)
     return false;
 }
-static bool
-map_ring(VuDev *dev, VuVirtq *vq)
-{
-    vq->vring.desc = qva_to_va(dev, vq->vra.desc_user_addr);
-    vq->vring.used = qva_to_va(dev, vq->vra.used_user_addr);
-    vq->vring.avail = qva_to_va(dev, vq->vra.avail_user_addr);
-
-    DPRINT("Setting virtq addresses:\n");
-    DPRINT("    vring_desc  at %p\n", vq->vring.desc);
-    DPRINT("    vring_used  at %p\n", vq->vring.used);
-    DPRINT("    vring_avail at %p\n", vq->vring.avail);
-
-    return !(vq->vring.desc && vq->vring.used && vq->vring.avail);
-}
-
 static bool
 generate_faults(VuDev *dev) {
     unsigned int i;
@@ -710,11 +940,7 @@ generate_faults(VuDev *dev) {
 
 static bool
 vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) {
-    int i;
-    bool track_ramblocks = dev->postcopy_listening;
     VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m;
-    VuDevRegion *dev_region = &dev->regions[dev->nregions];
-    void *mmap_addr;
 
     if (vmsg->fd_num != 1) {
         vmsg_close_fds(vmsg);
@@ -744,84 +970,24 @@ vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) {
      * we know all the postcopy client bases have been received, and we
      * should start generating faults.
      */
-    if (track_ramblocks &&
+    if (dev->postcopy_listening &&
         vmsg->size == sizeof(vmsg->payload.u64) &&
         vmsg->payload.u64 == 0) {
         (void)generate_faults(dev);
         return false;
     }
-    DPRINT("Adding region: %u\n", dev->nregions);
-    DPRINT("    guest_phys_addr: 0x%016"PRIx64"\n",
-           msg_region->guest_phys_addr);
-    DPRINT("    memory_size:     0x%016"PRIx64"\n",
-           msg_region->memory_size);
-    DPRINT("    userspace_addr   0x%016"PRIx64"\n",
-           msg_region->userspace_addr);
-    DPRINT("    mmap_offset      0x%016"PRIx64"\n",
-           msg_region->mmap_offset);
-
-    dev_region->gpa = msg_region->guest_phys_addr;
-    dev_region->size = msg_region->memory_size;
-    dev_region->qva = msg_region->userspace_addr;
-    dev_region->mmap_offset = msg_region->mmap_offset;
-
-    /*
-     * We don't use offset argument of mmap() since the
-     * mapped address has to be page aligned, and we use huge
-     * pages.
-     */
-    if (track_ramblocks) {
-        /*
-         * In postcopy we're using PROT_NONE here to catch anyone
-         * accessing it before we userfault.
-         */
-        mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
-                         PROT_NONE, MAP_SHARED | MAP_NORESERVE,
-                         vmsg->fds[0], 0);
-    } else {
-        mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
-                         PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE,
-                         vmsg->fds[0], 0);
-    }
-
-    if (mmap_addr == MAP_FAILED) {
-        vu_panic(dev, "region mmap error: %s", strerror(errno));
-    } else {
-        dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
-        DPRINT("    mmap_addr:       0x%016"PRIx64"\n",
-               dev_region->mmap_addr);
-    }
-
+    _vu_add_mem_reg(dev, msg_region, vmsg->fds[0]);
     close(vmsg->fds[0]);
 
-    if (track_ramblocks) {
-        /*
-         * Return the address to QEMU so that it can translate the ufd
-         * fault addresses back.
-         */
-        msg_region->userspace_addr = (uintptr_t)(mmap_addr +
-                                                 dev_region->mmap_offset);
-
+    if (dev->postcopy_listening) {
         /* Send the message back to qemu with the addresses filled in. */
         vmsg->fd_num = 0;
         DPRINT("Successfully added new region in postcopy\n");
-        dev->nregions++;
         return true;
-    } else {
-        for (i = 0; i < dev->max_queues; i++) {
-            if (dev->vq[i].vring.desc) {
-                if (map_ring(dev, &dev->vq[i])) {
-                    vu_panic(dev, "remapping queue %d for new memory region",
-                             i);
-                }
-            }
-        }
-
-        DPRINT("Successfully added new region\n");
-        dev->nregions++;
-        return false;
     }
+    DPRINT("Successfully added new region\n");
+    return false;
 }
static inline bool reg_equal(VuDevRegion *vudev_reg, static inline bool reg_equal(VuDevRegion *vudev_reg,
 static inline bool reg_equal(VuDevRegion *vudev_reg,
@@ -839,8 +1005,8 @@ static inline bool reg_equal(VuDevRegion *vudev_reg,
 static bool
 vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) {
     VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m;
-    unsigned int i;
-    bool found = false;
+    unsigned int idx;
+    VuDevRegion *r;
 
     if (vmsg->fd_num > 1) {
         vmsg_close_fds(vmsg);
@@ -867,36 +1033,32 @@ vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) {
     DPRINT("    mmap_offset      0x%016"PRIx64"\n",
            msg_region->mmap_offset);
 
-    for (i = 0; i < dev->nregions; i++) {
-        if (reg_equal(&dev->regions[i], msg_region)) {
-            VuDevRegion *r = &dev->regions[i];
-            void *ma = (void *) (uintptr_t) r->mmap_addr;
-
-            if (ma) {
-                munmap(ma, r->size + r->mmap_offset);
-            }
-
-            /*
-             * Shift all affected entries by 1 to close the hole at index i and
-             * zero out the last entry.
-             */
-            memmove(dev->regions + i, dev->regions + i + 1,
-                    sizeof(VuDevRegion) * (dev->nregions - i - 1));
-            memset(dev->regions + dev->nregions - 1, 0, sizeof(VuDevRegion));
-            DPRINT("Successfully removed a region\n");
-            dev->nregions--;
-            i--;
-
-            found = true;
-
-            /* Continue the search for eventual duplicates. */
-        }
-    }
-
-    if (!found) {
+    r = vu_gpa_to_mem_region(dev, msg_region->guest_phys_addr);
+    if (!r || !reg_equal(r, msg_region)) {
+        vmsg_close_fds(vmsg);
         vu_panic(dev, "Specified region not found\n");
+        return false;
     }
 
+    /*
+     * There might be valid cases where we temporarily remove memory regions
+     * to readd them again, or remove memory regions and don't use the rings
+     * anymore before we set the ring addresses and restart the device.
+     *
+     * Unmap all affected rings, remapping them on demand later. This should
+     * be a corner case.
+     */
+    unmap_rings(dev, r);
+
+    munmap((void *)(uintptr_t)r->mmap_addr, r->size + r->mmap_offset);
+
+    idx = r - dev->regions;
+    assert(idx < dev->nregions);
+    /* Shift all affected entries by 1 to close the hole. */
+    memmove(r, r + 1, sizeof(VuDevRegion) * (dev->nregions - idx - 1));
+    DPRINT("Successfully removed a region\n");
+    dev->nregions--;
+
     vmsg_close_fds(vmsg);
 
     return false;
@@ -920,140 +1082,42 @@ vu_get_shared_object(VuDev *dev, VhostUserMsg *vmsg)
     return true;
 }
-static bool
-vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg)
-{
-    unsigned int i;
-    VhostUserMemory m = vmsg->payload.memory, *memory = &m;
-
-    dev->nregions = memory->nregions;
-
-    DPRINT("Nregions: %u\n", memory->nregions);
-    for (i = 0; i < dev->nregions; i++) {
-        void *mmap_addr;
-        VhostUserMemoryRegion *msg_region = &memory->regions[i];
-        VuDevRegion *dev_region = &dev->regions[i];
-
-        DPRINT("Region %d\n", i);
-        DPRINT("    guest_phys_addr: 0x%016"PRIx64"\n",
-               msg_region->guest_phys_addr);
-        DPRINT("    memory_size:     0x%016"PRIx64"\n",
-               msg_region->memory_size);
-        DPRINT("    userspace_addr   0x%016"PRIx64"\n",
-               msg_region->userspace_addr);
-        DPRINT("    mmap_offset      0x%016"PRIx64"\n",
-               msg_region->mmap_offset);
-
-        dev_region->gpa = msg_region->guest_phys_addr;
-        dev_region->size = msg_region->memory_size;
-        dev_region->qva = msg_region->userspace_addr;
-        dev_region->mmap_offset = msg_region->mmap_offset;
-
-        /* We don't use offset argument of mmap() since the
-         * mapped address has to be page aligned, and we use huge
-         * pages.
-         * In postcopy we're using PROT_NONE here to catch anyone
-         * accessing it before we userfault
-         */
-        mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
-                         PROT_NONE, MAP_SHARED | MAP_NORESERVE,
-                         vmsg->fds[i], 0);
-
-        if (mmap_addr == MAP_FAILED) {
-            vu_panic(dev, "region mmap error: %s", strerror(errno));
-        } else {
-            dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
-            DPRINT("    mmap_addr:       0x%016"PRIx64"\n",
-                   dev_region->mmap_addr);
-        }
-
-        /* Return the address to QEMU so that it can translate the ufd
-         * fault addresses back.
-         */
-        msg_region->userspace_addr = (uintptr_t)(mmap_addr +
-                                                 dev_region->mmap_offset);
-        close(vmsg->fds[i]);
-    }
-
-    /* Send the message back to qemu with the addresses filled in */
-    vmsg->fd_num = 0;
-    if (!vu_send_reply(dev, dev->sock, vmsg)) {
-        vu_panic(dev, "failed to respond to set-mem-table for postcopy");
-        return false;
-    }
-
-    /* Wait for QEMU to confirm that it's registered the handler for the
-     * faults.
-     */
-    if (!dev->read_msg(dev, dev->sock, vmsg) ||
-        vmsg->size != sizeof(vmsg->payload.u64) ||
-        vmsg->payload.u64 != 0) {
-        vu_panic(dev, "failed to receive valid ack for postcopy set-mem-table");
-        return false;
-    }
-
-    /* OK, now we can go and register the memory and generate faults */
-    (void)generate_faults(dev);
-
-    return false;
-}
-
 static bool
 vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg)
 {
-    unsigned int i;
     VhostUserMemory m = vmsg->payload.memory, *memory = &m;
+    unsigned int i;
 
-    for (i = 0; i < dev->nregions; i++) {
-        VuDevRegion *r = &dev->regions[i];
-        void *ma = (void *) (uintptr_t) r->mmap_addr;
-
-        if (ma) {
-            munmap(ma, r->size + r->mmap_offset);
-        }
-    }
-    dev->nregions = memory->nregions;
-
-    if (dev->postcopy_listening) {
-        return vu_set_mem_table_exec_postcopy(dev, vmsg);
-    }
+    vu_remove_all_mem_regs(dev);
 
     DPRINT("Nregions: %u\n", memory->nregions);
-    for (i = 0; i < dev->nregions; i++) {
-        void *mmap_addr;
-        VhostUserMemoryRegion *msg_region = &memory->regions[i];
-        VuDevRegion *dev_region = &dev->regions[i];
-
-        DPRINT("Region %d\n", i);
-        DPRINT("    guest_phys_addr: 0x%016"PRIx64"\n",
-               msg_region->guest_phys_addr);
-        DPRINT("    memory_size:     0x%016"PRIx64"\n",
-               msg_region->memory_size);
-        DPRINT("    userspace_addr   0x%016"PRIx64"\n",
-               msg_region->userspace_addr);
-        DPRINT("    mmap_offset      0x%016"PRIx64"\n",
-               msg_region->mmap_offset);
-
-        dev_region->gpa = msg_region->guest_phys_addr;
-        dev_region->size = msg_region->memory_size;
-        dev_region->qva = msg_region->userspace_addr;
-        dev_region->mmap_offset = msg_region->mmap_offset;
-
-        /* We don't use offset argument of mmap() since the
-         * mapped address has to be page aligned, and we use huge
-         * pages. */
-        mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
-                         PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE,
-                         vmsg->fds[i], 0);
-
-        if (mmap_addr == MAP_FAILED) {
-            vu_panic(dev, "region mmap error: %s", strerror(errno));
-        } else {
-            dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
-            DPRINT("    mmap_addr:       0x%016"PRIx64"\n",
-                   dev_region->mmap_addr);
-        }
-
+    for (i = 0; i < memory->nregions; i++) {
+        _vu_add_mem_reg(dev, &memory->regions[i], vmsg->fds[i]);
         close(vmsg->fds[i]);
     }
 
+    if (dev->postcopy_listening) {
+        /* Send the message back to qemu with the addresses filled in */
+        vmsg->fd_num = 0;
+        if (!vu_send_reply(dev, dev->sock, vmsg)) {
+            vu_panic(dev, "failed to respond to set-mem-table for postcopy");
+            return false;
+        }
+
+        /*
+         * Wait for QEMU to confirm that it's registered the handler for the
+         * faults.
+         */
+        if (!dev->read_msg(dev, dev->sock, vmsg) ||
+            vmsg->size != sizeof(vmsg->payload.u64) ||
+            vmsg->payload.u64 != 0) {
+            vu_panic(dev, "failed to receive valid ack for postcopy set-mem-table");
+            return false;
+        }
+
+        /* OK, now we can go and register the memory and generate faults */
+        (void)generate_faults(dev);
+        return false;
+    }
     for (i = 0; i < dev->max_queues; i++) {
@@ -2112,14 +2176,7 @@ vu_deinit(VuDev *dev)
 {
     unsigned int i;
 
-    for (i = 0; i < dev->nregions; i++) {
-        VuDevRegion *r = &dev->regions[i];
-        void *m = (void *) (uintptr_t) r->mmap_addr;
-
-        if (m != MAP_FAILED) {
-            munmap(m, r->size + r->mmap_offset);
-        }
-    }
-    dev->nregions = 0;
+    vu_remove_all_mem_regs(dev);
 
     for (i = 0; i < dev->max_queues; i++) {
         VuVirtq *vq = &dev->vq[i];
@@ -2171,6 +2228,8 @@ vu_deinit(VuDev *dev)
     free(dev->vq);
     dev->vq = NULL;
+    free(dev->regions);
+    dev->regions = NULL;
 }
 bool
@@ -2205,9 +2264,17 @@ vu_init(VuDev *dev,
     dev->backend_fd = -1;
     dev->max_queues = max_queues;
 
+    dev->regions = malloc(VHOST_USER_MAX_RAM_SLOTS * sizeof(dev->regions[0]));
+    if (!dev->regions) {
+        DPRINT("%s: failed to malloc mem regions\n", __func__);
+        return false;
+    }
+
     dev->vq = malloc(max_queues * sizeof(dev->vq[0]));
     if (!dev->vq) {
         DPRINT("%s: failed to malloc virtqueues\n", __func__);
+        free(dev->regions);
+        dev->regions = NULL;
         return false;
     }
@@ -2374,8 +2441,7 @@ vu_queue_get_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int *in_bytes,
     idx = vq->last_avail_idx;
     total_bufs = in_total = out_total = 0;
-    if (unlikely(dev->broken) ||
-        unlikely(!vq->vring.avail)) {
+    if (!vu_is_vq_usable(dev, vq)) {
         goto done;
     }
@@ -2490,8 +2556,7 @@ vu_queue_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int in_bytes,
 bool
 vu_queue_empty(VuDev *dev, VuVirtq *vq)
 {
-    if (unlikely(dev->broken) ||
-        unlikely(!vq->vring.avail)) {
+    if (!vu_is_vq_usable(dev, vq)) {
         return true;
     }
@@ -2530,8 +2595,7 @@ vring_notify(VuDev *dev, VuVirtq *vq)
 static void _vu_queue_notify(VuDev *dev, VuVirtq *vq, bool sync)
 {
-    if (unlikely(dev->broken) ||
-        unlikely(!vq->vring.avail)) {
+    if (!vu_is_vq_usable(dev, vq)) {
         return;
     }
@@ -2856,8 +2920,7 @@ vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
     unsigned int head;
     VuVirtqElement *elem;
 
-    if (unlikely(dev->broken) ||
-        unlikely(!vq->vring.avail)) {
+    if (!vu_is_vq_usable(dev, vq)) {
         return NULL;
     }
@@ -3014,8 +3077,7 @@ vu_queue_fill(VuDev *dev, VuVirtq *vq,
 {
     struct vring_used_elem uelem;
 
-    if (unlikely(dev->broken) ||
-        unlikely(!vq->vring.avail)) {
+    if (!vu_is_vq_usable(dev, vq)) {
         return;
     }
@@ -3044,8 +3106,7 @@ vu_queue_flush(VuDev *dev, VuVirtq *vq, unsigned int count)
 {
     uint16_t old, new;
 
-    if (unlikely(dev->broken) ||
-        unlikely(!vq->vring.avail)) {
+    if (!vu_is_vq_usable(dev, vq)) {
         return;
     }


@@ -31,10 +31,12 @@
 #define VHOST_MEMORY_BASELINE_NREGIONS 8
 
 /*
- * Set a reasonable maximum number of ram slots, which will be supported by
- * any architecture.
+ * vhost in the kernel usually supports 509 mem slots. 509 used to be the
+ * KVM limit, it supported 512, but 3 were used for internal purposes. This
+ * limit is sufficient to support many DIMMs and virtio-mem in
+ * "dynamic-memslots" mode.
  */
-#define VHOST_USER_MAX_RAM_SLOTS 32
+#define VHOST_USER_MAX_RAM_SLOTS 509
 
 #define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
 
@@ -398,7 +400,7 @@ typedef struct VuDevInflightInfo {
 struct VuDev {
     int sock;
     uint32_t nregions;
-    VuDevRegion regions[VHOST_USER_MAX_RAM_SLOTS];
+    VuDevRegion *regions;
     VuVirtq *vq;
     VuDevInflightInfo inflight_info;
     int log_call_fd;


@@ -34,7 +34,7 @@ static void pci_config(void *obj, void *data, QGuestAllocator *t_alloc)
     uint8_t bypass = qvirtio_config_readb(dev, 36);
 
     g_assert_cmpint(input_range_start, ==, 0);
-    g_assert_cmphex(input_range_end, ==, UINT64_MAX);
+    g_assert_cmphex(input_range_end, >=, UINT32_MAX);
     g_assert_cmpint(domain_range_start, ==, 0);
     g_assert_cmpint(domain_range_end, ==, UINT32_MAX);
     g_assert_cmpint(bypass, ==, 1);