virtio, pci, pc: fixes, features

Bugfixes all over the place.
 HMAT support.
 New flags for vhost-user-blk utility.
 Auto-tuning of seg max for virtio storage.
 
 Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
 -----BEGIN PGP SIGNATURE-----
 
 iQFDBAABCAAtFiEEXQn9CHHI+FuUyooNKB8NuNKNVGkFAl4TaMEPHG1zdEByZWRo
 YXQuY29tAAoJECgfDbjSjVRpvzgH/2LyDAzCa9h93ikSJjmyUk5FUaqve38daEb3
 S3JYjwKxQx7u1ydooKhvBQnBCZ2i3S+k62gfYyKB+nBv8xvjs0Eg5D1YJ5E8hciy
 lf5OFGWWtX2iPDjZwQwT13kiJe0o3JRGxJJ6XqTEG+1EYOp7cky/FEv4PD030b9m
 I2wROZ/Am+onB9YJX8c0Vv1CG+AryuJNXnvwQzTXEjj4U7bEYUyJwVZaCRyAdWQ3
 uYXIZN9VwjVX6BFvy9ZAJbEsUVJvOM1/aQaDqcrLz+VlzRT7bRkKHi2G3vakrm1I
 r5OpgyLo84132awCncbSykKDH5o8WaxLaJBjGmuBfasMz9wPzAg=
 =uL1o
 -----END PGP SIGNATURE-----

Merge remote-tracking branch 'remotes/mst/tags/for_upstream' into staging

virtio, pci, pc: fixes, features

Bugfixes all over the place.
HMAT support.
New flags for vhost-user-blk utility.
Auto-tuning of seg max for virtio storage.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

# gpg: Signature made Mon 06 Jan 2020 17:05:05 GMT
# gpg:                using RSA key 5D09FD0871C8F85B94CA8A0D281F0DB8D28D5469
# gpg:                issuer "mst@redhat.com"
# gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>" [full]
# gpg:                 aka "Michael S. Tsirkin <mst@redhat.com>" [full]
# Primary key fingerprint: 0270 606B 6F3C DF3D 0B17  0970 C350 3912 AFBE 8E67
#      Subkey fingerprint: 5D09 FD08 71C8 F85B 94CA  8A0D 281F 0DB8 D28D 5469

* remotes/mst/tags/for_upstream: (32 commits)
  intel_iommu: add present bit check for pasid table entries
  intel_iommu: a fix to vtd_find_as_from_bus_num()
  virtio-net: delete also control queue when TX/RX deleted
  virtio: reset region cache when on queue deletion
  virtio-mmio: update queue size on guest write
  tests: add virtio-scsi and virtio-blk seg_max_adjust test
  virtio: make seg_max virtqueue size dependent
  hw: fix using 4.2 compat in 5.0 machine types for i440fx/q35
  vhost-user-scsi: reset the device if supported
  vhost-user: add VHOST_USER_RESET_DEVICE to reset devices
  hw/pci/pci_host: Let pci_data_[read/write] use unsigned 'size' argument
  hw/pci/pci_host: Remove redundant PCI_DPRINTF()
  virtio-mmio: Clear v2 transport state on soft reset
  ACPI: add expected files for HMAT tests (acpihmat)
  tests/bios-tables-test: add test cases for ACPI HMAT
  tests/numa: Add case for QMP build HMAT
  hmat acpi: Build Memory Side Cache Information Structure(s)
  hmat acpi: Build System Locality Latency and Bandwidth Information Structure(s)
  hmat acpi: Build Memory Proximity Domain Attributes Structure(s)
  numa: Extend CLI to provide memory side cache information
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This commit is contained in:
Peter Maydell 2020-01-07 16:25:00 +00:00
commit 973d306dd6
45 changed files with 1796 additions and 133 deletions

View file

@ -23,6 +23,7 @@
*/
#include "qemu/osdep.h"
#include "qemu/units.h"
#include "sysemu/hostmem.h"
#include "sysemu/numa.h"
#include "sysemu/sysemu.h"
@ -129,6 +130,29 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
numa_info[nodenr].node_mem = object_property_get_uint(o, "size", NULL);
numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
}
/*
* If the initiator is not set, default it to MAX_NODES. If HMAT is
* enabled and this node has no CPUs, QEMU will raise an error.
*/
numa_info[nodenr].initiator = MAX_NODES;
if (node->has_initiator) {
if (!ms->numa_state->hmat_enabled) {
error_setg(errp, "ACPI Heterogeneous Memory Attribute Table "
"(HMAT) is disabled, enable it with -machine hmat=on "
"before using any of hmat specific options");
return;
}
if (node->initiator >= MAX_NODES) {
error_report("The initiator id %" PRIu16 " expects an integer "
"between 0 and %d", node->initiator,
MAX_NODES - 1);
return;
}
numa_info[nodenr].initiator = node->initiator;
}
numa_info[nodenr].present = true;
max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
ms->numa_state->num_nodes++;
@ -171,6 +195,253 @@ void parse_numa_distance(MachineState *ms, NumaDistOptions *dist, Error **errp)
ms->numa_state->have_numa_distance = true;
}
void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node,
Error **errp)
{
int i, first_bit, last_bit;
uint64_t max_entry, temp_base, bitmap_copy;
NodeInfo *numa_info = numa_state->nodes;
HMAT_LB_Info *hmat_lb =
numa_state->hmat_lb[node->hierarchy][node->data_type];
HMAT_LB_Data lb_data = {};
HMAT_LB_Data *lb_temp;
/* Error checking */
if (node->initiator > numa_state->num_nodes) {
error_setg(errp, "Invalid initiator=%d, it should be less than %d",
node->initiator, numa_state->num_nodes);
return;
}
if (node->target > numa_state->num_nodes) {
error_setg(errp, "Invalid target=%d, it should be less than %d",
node->target, numa_state->num_nodes);
return;
}
if (!numa_info[node->initiator].has_cpu) {
error_setg(errp, "Invalid initiator=%d, it isn't an "
"initiator proximity domain", node->initiator);
return;
}
if (!numa_info[node->target].present) {
error_setg(errp, "The target=%d should point to an existing node",
node->target);
return;
}
if (!hmat_lb) {
hmat_lb = g_malloc0(sizeof(*hmat_lb));
numa_state->hmat_lb[node->hierarchy][node->data_type] = hmat_lb;
hmat_lb->list = g_array_new(false, true, sizeof(HMAT_LB_Data));
}
hmat_lb->hierarchy = node->hierarchy;
hmat_lb->data_type = node->data_type;
lb_data.initiator = node->initiator;
lb_data.target = node->target;
if (node->data_type <= HMATLB_DATA_TYPE_WRITE_LATENCY) {
/* Input latency data */
if (!node->has_latency) {
error_setg(errp, "Missing 'latency' option");
return;
}
if (node->has_bandwidth) {
error_setg(errp, "Invalid option 'bandwidth' since "
"the data type is latency");
return;
}
/* Detect duplicate configuration */
for (i = 0; i < hmat_lb->list->len; i++) {
lb_temp = &g_array_index(hmat_lb->list, HMAT_LB_Data, i);
if (node->initiator == lb_temp->initiator &&
node->target == lb_temp->target) {
error_setg(errp, "Duplicate configuration of the latency for "
"initiator=%d and target=%d", node->initiator,
node->target);
return;
}
}
hmat_lb->base = hmat_lb->base ? hmat_lb->base : UINT64_MAX;
if (node->latency) {
/* Calculate the temporary base and compressed latency */
max_entry = node->latency;
temp_base = 1;
while (QEMU_IS_ALIGNED(max_entry, 10)) {
max_entry /= 10;
temp_base *= 10;
}
/* Calculate the max compressed latency */
temp_base = MIN(hmat_lb->base, temp_base);
max_entry = node->latency / hmat_lb->base;
max_entry = MAX(hmat_lb->range_bitmap, max_entry);
/*
* For latency hmat_lb->range_bitmap record the max compressed
* latency which should be less than 0xFFFF (UINT16_MAX)
*/
if (max_entry >= UINT16_MAX) {
error_setg(errp, "Latency %" PRIu64 " between initiator=%d and "
"target=%d should not differ from previously entered "
"min or max values on more than %d", node->latency,
node->initiator, node->target, UINT16_MAX - 1);
return;
} else {
hmat_lb->base = temp_base;
hmat_lb->range_bitmap = max_entry;
}
/*
* Set lb_info_provided bit 0 as 1,
* latency information is provided
*/
numa_info[node->target].lb_info_provided |= BIT(0);
}
lb_data.data = node->latency;
} else if (node->data_type >= HMATLB_DATA_TYPE_ACCESS_BANDWIDTH) {
/* Input bandwidth data */
if (!node->has_bandwidth) {
error_setg(errp, "Missing 'bandwidth' option");
return;
}
if (node->has_latency) {
error_setg(errp, "Invalid option 'latency' since "
"the data type is bandwidth");
return;
}
if (!QEMU_IS_ALIGNED(node->bandwidth, MiB)) {
error_setg(errp, "Bandwidth %" PRIu64 " between initiator=%d and "
"target=%d should be 1MB aligned", node->bandwidth,
node->initiator, node->target);
return;
}
/* Detect duplicate configuration */
for (i = 0; i < hmat_lb->list->len; i++) {
lb_temp = &g_array_index(hmat_lb->list, HMAT_LB_Data, i);
if (node->initiator == lb_temp->initiator &&
node->target == lb_temp->target) {
error_setg(errp, "Duplicate configuration of the bandwidth for "
"initiator=%d and target=%d", node->initiator,
node->target);
return;
}
}
hmat_lb->base = hmat_lb->base ? hmat_lb->base : 1;
if (node->bandwidth) {
/* Keep bitmap unchanged when bandwidth out of range */
bitmap_copy = hmat_lb->range_bitmap;
bitmap_copy |= node->bandwidth;
first_bit = ctz64(bitmap_copy);
temp_base = UINT64_C(1) << first_bit;
max_entry = node->bandwidth / temp_base;
last_bit = 64 - clz64(bitmap_copy);
/*
* For bandwidth, first_bit record the base unit of bandwidth bits,
* last_bit record the last bit of the max bandwidth. The max
* compressed bandwidth should be less than 0xFFFF (UINT16_MAX)
*/
if ((last_bit - first_bit) > UINT16_BITS ||
max_entry >= UINT16_MAX) {
error_setg(errp, "Bandwidth %" PRIu64 " between initiator=%d "
"and target=%d should not differ from previously "
"entered values on more than %d", node->bandwidth,
node->initiator, node->target, UINT16_MAX - 1);
return;
} else {
hmat_lb->base = temp_base;
hmat_lb->range_bitmap = bitmap_copy;
}
/*
* Set lb_info_provided bit 1 as 1,
* bandwidth information is provided
*/
numa_info[node->target].lb_info_provided |= BIT(1);
}
lb_data.data = node->bandwidth;
} else {
assert(0);
}
g_array_append_val(hmat_lb->list, lb_data);
}
/*
 * Validate and record the memory side cache description @node for
 * ms->numa_state->hmat_cache[node_id][level].
 *
 * Checks performed, in order:
 *  - node_id must be below the current number of NUMA nodes;
 *  - both latency and bandwidth information (lb_info_provided bits 0 and 1)
 *    must already be recorded for that node;
 *  - level must be in [1, HMAT_LB_LEVELS - 1];
 *  - the (node_id, level) slot must not be configured yet;
 *  - cache sizes must grow strictly with the level number: the new size
 *    must be larger than the size of level - 1 and less than the size of
 *    level + 1, whenever those neighbours are already configured.
 *
 * On failure @errp is set and nothing is stored. On success a heap copy of
 * @node is stored in the slot.
 */
void parse_numa_hmat_cache(MachineState *ms, NumaHmatCacheOptions *node,
                           Error **errp)
{
    int nb_numa_nodes = ms->numa_state->num_nodes;
    NodeInfo *numa_info = ms->numa_state->nodes;
    NumaHmatCacheOptions *hmat_cache = NULL;
    NumaHmatCacheOptions *lower = NULL;
    NumaHmatCacheOptions *upper = NULL;

    if (node->node_id >= nb_numa_nodes) {
        error_setg(errp, "Invalid node-id=%" PRIu32 ", it should be less "
                   "than %d", node->node_id, nb_numa_nodes);
        return;
    }

    if (numa_info[node->node_id].lb_info_provided != (BIT(0) | BIT(1))) {
        error_setg(errp, "The latency and bandwidth information of "
                   "node-id=%" PRIu32 " should be provided before memory side "
                   "cache attributes", node->node_id);
        return;
    }

    if (node->level < 1 || node->level >= HMAT_LB_LEVELS) {
        error_setg(errp, "Invalid level=%" PRIu8 ", it should be larger than 0 "
                   "and less than or equal to %d", node->level,
                   HMAT_LB_LEVELS - 1);
        return;
    }

    assert(node->associativity < HMAT_CACHE_ASSOCIATIVITY__MAX);
    assert(node->policy < HMAT_CACHE_WRITE_POLICY__MAX);

    if (ms->numa_state->hmat_cache[node->node_id][node->level]) {
        error_setg(errp, "Duplicate configuration of the side cache for "
                   "node-id=%" PRIu32 " and level=%" PRIu8,
                   node->node_id, node->level);
        return;
    }

    /* Neighbouring levels, read only when the index is within range. */
    if (node->level > 1) {
        lower = ms->numa_state->hmat_cache[node->node_id][node->level - 1];
    }
    if (node->level < HMAT_LB_LEVELS - 1) {
        upper = ms->numa_state->hmat_cache[node->node_id][node->level + 1];
    }

    /* A level must be strictly larger than the level below it. */
    if (lower && node->size <= lower->size) {
        error_setg(errp, "Invalid size=%" PRIu64 ", the size of level=%" PRIu8
                   " should be larger than the size(%" PRIu64 ") of "
                   "level=%u", node->size, node->level,
                   lower->size, node->level - 1);
        return;
    }

    /* A level must be strictly smaller than the level above it. */
    if (upper && node->size >= upper->size) {
        error_setg(errp, "Invalid size=%" PRIu64 ", the size of level=%" PRIu8
                   " should be less than the size(%" PRIu64 ") of "
                   "level=%u", node->size, node->level,
                   upper->size, node->level + 1);
        return;
    }

    /* Keep a private copy; @node itself is not retained. */
    hmat_cache = g_malloc0(sizeof(*hmat_cache));
    memcpy(hmat_cache, node, sizeof(*hmat_cache));
    ms->numa_state->hmat_cache[node->node_id][node->level] = hmat_cache;
}
void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp)
{
Error *err = NULL;
@ -208,6 +479,32 @@ void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp)
machine_set_cpu_numa_node(ms, qapi_NumaCpuOptions_base(&object->u.cpu),
&err);
break;
case NUMA_OPTIONS_TYPE_HMAT_LB:
if (!ms->numa_state->hmat_enabled) {
error_setg(errp, "ACPI Heterogeneous Memory Attribute Table "
"(HMAT) is disabled, enable it with -machine hmat=on "
"before using any of hmat specific options");
return;
}
parse_numa_hmat_lb(ms->numa_state, &object->u.hmat_lb, &err);
if (err) {
goto end;
}
break;
case NUMA_OPTIONS_TYPE_HMAT_CACHE:
if (!ms->numa_state->hmat_enabled) {
error_setg(errp, "ACPI Heterogeneous Memory Attribute Table "
"(HMAT) is disabled, enable it with -machine hmat=on "
"before using any of hmat specific options");
return;
}
parse_numa_hmat_cache(ms, &object->u.hmat_cache, &err);
if (err) {
goto end;
}
break;
default:
abort();
}