virtio-scsi: add iothread-vq-mapping parameter

Allow virtio-scsi virtqueues to be assigned to different IOThreads. This
makes it possible to take advantage of host multi-queue block layer
scalability by assigning virtqueues that have affinity with vCPUs to
different IOThreads that have affinity with host CPUs. The same feature
was introduced for virtio-blk in the past:
https://developers.redhat.com/articles/2024/09/05/scaling-virtio-blk-disk-io-iothread-virtqueue-mapping

Here are fio randread 4k iodepth=64 results from a 4 vCPU guest with an
Intel P4800X SSD:
iothreads IOPS
------------------------------
1         189576
2         312698
4         346744

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20250311132616.1049687-12-stefanha@redhat.com>
Tested-by: Peter Krempa <pkrempa@redhat.com>
[kwolf: Updated 051 output, virtio-scsi can now use any iothread]
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Stefan Hajnoczi 2025-03-11 21:26:14 +08:00 committed by Kevin Wolf
parent b50629c335
commit 2e8e18c2e4
4 changed files with 108 additions and 52 deletions
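
For reference, the new property follows the same configuration scheme as virtio-blk's iothread-vq-mapping (see the article linked in the commit message): a list of IOThread ids, each optionally restricted to specific virtqueue indices via "vqs". Below is a minimal usage sketch; the IOThread ids, node names, and disk path are illustrative and do not come from this commit:

    # Hypothetical invocation: spread this device's virtqueues across two IOThreads.
    # Entries that omit the optional "vqs" index list have virtqueues assigned to
    # them round-robin.
    qemu-system-x86_64 \
        -object iothread,id=iothread0 \
        -object iothread,id=iothread1 \
        -blockdev file,filename=/path/to/disk.img,node-name=file0 \
        -blockdev raw,file=file0,node-name=disk0 \
        -device '{"driver": "virtio-scsi-pci", "id": "scsi0",
                  "iothread-vq-mapping": [{"iothread": "iothread0"},
                                          {"iothread": "iothread1"}]}' \
        -device scsi-hd,bus=scsi0.0,drive=disk0

As with virtio-blk, the mapping is applied by iothread_vq_mapping_apply(); for virtio-scsi it spans all virtqueues, including the fixed control and event queues (num_vqs = num_queues + VIRTIO_SCSI_VQ_NUM_FIXED in the setup code below), not just the request queues.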

--- a/hw/scsi/virtio-scsi-dataplane.c
+++ b/hw/scsi/virtio-scsi-dataplane.c

@@ -18,6 +18,7 @@
 #include "system/block-backend.h"
 #include "hw/scsi/scsi.h"
 #include "scsi/constants.h"
+#include "hw/virtio/iothread-vq-mapping.h"
 #include "hw/virtio/virtio-bus.h"
 
 /* Context: BQL held */
@@ -27,8 +28,16 @@ void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error **errp)
     VirtIODevice *vdev = VIRTIO_DEVICE(s);
     BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+    uint16_t num_vqs = vs->conf.num_queues + VIRTIO_SCSI_VQ_NUM_FIXED;
 
-    if (vs->conf.iothread) {
+    if (vs->conf.iothread && vs->conf.iothread_vq_mapping_list) {
+        error_setg(errp,
+                   "iothread and iothread-vq-mapping properties cannot be set "
+                   "at the same time");
+        return;
+    }
+
+    if (vs->conf.iothread || vs->conf.iothread_vq_mapping_list) {
         if (!k->set_guest_notifiers || !k->ioeventfd_assign) {
             error_setg(errp,
                        "device is incompatible with iothread "
@@ -39,15 +48,50 @@ void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error **errp)
             error_setg(errp, "ioeventfd is required for iothread");
             return;
         }
-        s->ctx = iothread_get_aio_context(vs->conf.iothread);
-    } else {
-        if (!virtio_device_ioeventfd_enabled(vdev)) {
+    }
+
+    s->vq_aio_context = g_new(AioContext *, num_vqs);
+
+    if (vs->conf.iothread_vq_mapping_list) {
+        if (!iothread_vq_mapping_apply(vs->conf.iothread_vq_mapping_list,
+                                       s->vq_aio_context, num_vqs, errp)) {
+            g_free(s->vq_aio_context);
+            s->vq_aio_context = NULL;
             return;
         }
-        s->ctx = qemu_get_aio_context();
+    } else if (vs->conf.iothread) {
+        AioContext *ctx = iothread_get_aio_context(vs->conf.iothread);
+        for (uint16_t i = 0; i < num_vqs; i++) {
+            s->vq_aio_context[i] = ctx;
+        }
+
+        /* Released in virtio_scsi_dataplane_cleanup() */
+        object_ref(OBJECT(vs->conf.iothread));
+    } else {
+        AioContext *ctx = qemu_get_aio_context();
+        for (unsigned i = 0; i < num_vqs; i++) {
+            s->vq_aio_context[i] = ctx;
+        }
     }
 }
 
+/* Context: BQL held */
+void virtio_scsi_dataplane_cleanup(VirtIOSCSI *s)
+{
+    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s);
+
+    if (vs->conf.iothread_vq_mapping_list) {
+        iothread_vq_mapping_cleanup(vs->conf.iothread_vq_mapping_list);
+    }
+
+    if (vs->conf.iothread) {
+        object_unref(OBJECT(vs->conf.iothread));
+    }
+
+    g_free(s->vq_aio_context);
+    s->vq_aio_context = NULL;
+}
+
 static int virtio_scsi_set_host_notifier(VirtIOSCSI *s, VirtQueue *vq, int n)
 {
     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(s)));
@@ -66,31 +110,20 @@ static int virtio_scsi_set_host_notifier(VirtIOSCSI *s, VirtQueue *vq, int n)
 }
 
 /* Context: BH in IOThread */
-static void virtio_scsi_dataplane_stop_bh(void *opaque)
+static void virtio_scsi_dataplane_stop_vq_bh(void *opaque)
 {
-    VirtIOSCSI *s = opaque;
-    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s);
+    AioContext *ctx = qemu_get_current_aio_context();
+    VirtQueue *vq = opaque;
     EventNotifier *host_notifier;
-    int i;
 
-    virtio_queue_aio_detach_host_notifier(vs->ctrl_vq, s->ctx);
-    host_notifier = virtio_queue_get_host_notifier(vs->ctrl_vq);
+    virtio_queue_aio_detach_host_notifier(vq, ctx);
+    host_notifier = virtio_queue_get_host_notifier(vq);
 
     /*
      * Test and clear notifier after disabling event, in case poll callback
      * didn't have time to run.
      */
     virtio_queue_host_notifier_read(host_notifier);
-
-    virtio_queue_aio_detach_host_notifier(vs->event_vq, s->ctx);
-    host_notifier = virtio_queue_get_host_notifier(vs->event_vq);
-    virtio_queue_host_notifier_read(host_notifier);
-
-    for (i = 0; i < vs->conf.num_queues; i++) {
-        virtio_queue_aio_detach_host_notifier(vs->cmd_vqs[i], s->ctx);
-        host_notifier = virtio_queue_get_host_notifier(vs->cmd_vqs[i]);
-        virtio_queue_host_notifier_read(host_notifier);
-    }
 }
 
 /* Context: BQL held */
@@ -154,11 +187,14 @@ int virtio_scsi_dataplane_start(VirtIODevice *vdev)
     smp_wmb(); /* paired with aio_notify_accept() */
 
     if (s->bus.drain_count == 0) {
-        virtio_queue_aio_attach_host_notifier(vs->ctrl_vq, s->ctx);
-        virtio_queue_aio_attach_host_notifier_no_poll(vs->event_vq, s->ctx);
+        virtio_queue_aio_attach_host_notifier(vs->ctrl_vq,
+                                              s->vq_aio_context[0]);
+        virtio_queue_aio_attach_host_notifier_no_poll(vs->event_vq,
+                                                      s->vq_aio_context[1]);
 
         for (i = 0; i < vs->conf.num_queues; i++) {
-            virtio_queue_aio_attach_host_notifier(vs->cmd_vqs[i], s->ctx);
+            AioContext *ctx = s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED + i];
+            virtio_queue_aio_attach_host_notifier(vs->cmd_vqs[i], ctx);
         }
     }
     return 0;
@@ -207,7 +243,11 @@ void virtio_scsi_dataplane_stop(VirtIODevice *vdev)
     s->dataplane_stopping = true;
 
     if (s->bus.drain_count == 0) {
-        aio_wait_bh_oneshot(s->ctx, virtio_scsi_dataplane_stop_bh, s);
+        for (i = 0; i < vs->conf.num_queues + VIRTIO_SCSI_VQ_NUM_FIXED; i++) {
+            VirtQueue *vq = virtio_get_queue(&vs->parent_obj, i);
+            AioContext *ctx = s->vq_aio_context[i];
+            aio_wait_bh_oneshot(ctx, virtio_scsi_dataplane_stop_vq_bh, vq);
+        }
     }
 
     blk_drain_all(); /* ensure there are no in-flight requests */

--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c

@@ -27,6 +27,7 @@
 #include "hw/qdev-properties.h"
 #include "hw/scsi/scsi.h"
 #include "scsi/constants.h"
+#include "hw/virtio/iothread-vq-mapping.h"
 #include "hw/virtio/virtio-bus.h"
 #include "hw/virtio/virtio-access.h"
 #include "trace.h"
@@ -318,13 +319,6 @@ static void virtio_scsi_cancel_notify(Notifier *notifier, void *data)
     g_free(n);
 }
 
-static inline void virtio_scsi_ctx_check(VirtIOSCSI *s, SCSIDevice *d)
-{
-    if (s->dataplane_started && d && blk_is_available(d->conf.blk)) {
-        assert(blk_get_aio_context(d->conf.blk) == s->ctx);
-    }
-}
-
 static void virtio_scsi_do_one_tmf_bh(VirtIOSCSIReq *req)
 {
     VirtIOSCSI *s = req->dev;
@@ -517,9 +511,11 @@ static void virtio_scsi_flush_defer_tmf_to_aio_context(VirtIOSCSI *s)
 
     assert(!s->dataplane_started);
 
-    if (s->ctx) {
+    for (uint32_t i = 0; i < s->parent_obj.conf.num_queues; i++) {
+        AioContext *ctx = s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED + i];
+
         /* Our BH only runs after previously scheduled BHs */
-        aio_wait_bh_oneshot(s->ctx, dummy_bh, NULL);
+        aio_wait_bh_oneshot(ctx, dummy_bh, NULL);
     }
 }
@@ -575,7 +571,6 @@
     AioContext *ctx;
     int ret = 0;
 
-    virtio_scsi_ctx_check(s, d);
     /* Here VIRTIO_SCSI_S_OK means "FUNCTION COMPLETE". */
     req->resp.tmf.response = VIRTIO_SCSI_S_OK;
@@ -639,6 +634,8 @@
     case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET:
     case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET: {
+        g_autoptr(GHashTable) aio_contexts = g_hash_table_new(NULL, NULL);
+
         if (!d) {
             goto fail;
         }
@@ -648,8 +645,15 @@
 
         qatomic_inc(&req->remaining);
 
-        ctx = s->ctx ?: qemu_get_aio_context();
-        virtio_scsi_defer_tmf_to_aio_context(req, ctx);
+        for (uint32_t i = 0; i < s->parent_obj.conf.num_queues; i++) {
+            ctx = s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED + i];
+
+            if (!g_hash_table_add(aio_contexts, ctx)) {
+                continue; /* skip previously added AioContext */
+            }
+
+            virtio_scsi_defer_tmf_to_aio_context(req, ctx);
+        }
+
         virtio_scsi_tmf_dec_remaining(req);
         ret = -EINPROGRESS;
@@ -770,9 +774,12 @@ static void virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq)
  */
 static bool virtio_scsi_defer_to_dataplane(VirtIOSCSI *s)
 {
-    if (!s->ctx || s->dataplane_started) {
+    if (s->dataplane_started) {
         return false;
     }
+    if (s->vq_aio_context[0] == qemu_get_aio_context()) {
+        return false; /* not using IOThreads */
+    }
 
     virtio_device_start_ioeventfd(&s->parent_obj.parent_obj);
     return !s->dataplane_fenced;
@@ -946,7 +953,6 @@ static int virtio_scsi_handle_cmd_req_prepare(VirtIOSCSI *s, VirtIOSCSIReq *req)
         virtio_scsi_complete_cmd_req(req);
         return -ENOENT;
     }
-    virtio_scsi_ctx_check(s, d);
     req->sreq = scsi_req_new(d, req->req.cmd.tag,
                              virtio_scsi_get_lun(req->req.cmd.lun),
                              req->req.cmd.cdb, vs->cdb_size, req);
@@ -1218,14 +1224,16 @@ static void virtio_scsi_hotplug(HotplugHandler *hotplug_dev, DeviceState *dev,
 {
     VirtIODevice *vdev = VIRTIO_DEVICE(hotplug_dev);
     VirtIOSCSI *s = VIRTIO_SCSI(vdev);
+    AioContext *ctx = s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED];
     SCSIDevice *sd = SCSI_DEVICE(dev);
-    int ret;
 
-    if (s->ctx && !s->dataplane_fenced) {
-        ret = blk_set_aio_context(sd->conf.blk, s->ctx, errp);
-        if (ret < 0) {
-            return;
-        }
+    if (ctx != qemu_get_aio_context() && !s->dataplane_fenced) {
+        /*
+         * Try to make the BlockBackend's AioContext match ours. Ignore failure
+         * because I/O will still work although block jobs and other users
+         * might be slower when multiple AioContexts use a BlockBackend.
+         */
+        blk_set_aio_context(sd->conf.blk, ctx, NULL);
     }
 
     if (virtio_vdev_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) {
@@ -1260,7 +1268,7 @@ static void virtio_scsi_hotunplug(HotplugHandler *hotplug_dev, DeviceState *dev,
 
     qdev_simple_device_unplug_cb(hotplug_dev, dev, errp);
 
-    if (s->ctx) {
+    if (s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED] != qemu_get_aio_context()) {
         /* If other users keep the BlockBackend in the iothread, that's ok */
         blk_set_aio_context(sd->conf.blk, qemu_get_aio_context(), NULL);
     }
@@ -1294,7 +1302,7 @@ static void virtio_scsi_drained_begin(SCSIBus *bus)
 
     for (uint32_t i = 0; i < total_queues; i++) {
         VirtQueue *vq = virtio_get_queue(vdev, i);
-        virtio_queue_aio_detach_host_notifier(vq, s->ctx);
+        virtio_queue_aio_detach_host_notifier(vq, s->vq_aio_context[i]);
     }
 }
@@ -1320,10 +1328,12 @@ static void virtio_scsi_drained_end(SCSIBus *bus)
 
     for (uint32_t i = 0; i < total_queues; i++) {
         VirtQueue *vq = virtio_get_queue(vdev, i);
+        AioContext *ctx = s->vq_aio_context[i];
+
         if (vq == vs->event_vq) {
-            virtio_queue_aio_attach_host_notifier_no_poll(vq, s->ctx);
+            virtio_queue_aio_attach_host_notifier_no_poll(vq, ctx);
         } else {
-            virtio_queue_aio_attach_host_notifier(vq, s->ctx);
+            virtio_queue_aio_attach_host_notifier(vq, ctx);
         }
     }
 }
@@ -1430,12 +1440,13 @@ void virtio_scsi_common_unrealize(DeviceState *dev)
     virtio_cleanup(vdev);
 }
 
-/* main loop */
 static void virtio_scsi_device_unrealize(DeviceState *dev)
 {
     VirtIOSCSI *s = VIRTIO_SCSI(dev);
 
     virtio_scsi_reset_tmf_bh(s);
+    virtio_scsi_dataplane_cleanup(s);
     qbus_set_hotplug_handler(BUS(&s->bus), NULL);
     virtio_scsi_common_unrealize(dev);
     qemu_mutex_destroy(&s->tmf_bh_lock);
@@ -1460,6 +1471,8 @@ static const Property virtio_scsi_properties[] = {
                       VIRTIO_SCSI_F_CHANGE, true),
     DEFINE_PROP_LINK("iothread", VirtIOSCSI, parent_obj.conf.iothread,
                      TYPE_IOTHREAD, IOThread *),
+    DEFINE_PROP_IOTHREAD_VQ_MAPPING_LIST("iothread-vq-mapping", VirtIOSCSI,
+                                         parent_obj.conf.iothread_vq_mapping_list),
 };
static const VMStateDescription vmstate_virtio_scsi = { static const VMStateDescription vmstate_virtio_scsi = {

--- a/include/hw/virtio/virtio-scsi.h
+++ b/include/hw/virtio/virtio-scsi.h

@@ -22,6 +22,7 @@
 #include "hw/virtio/virtio.h"
 #include "hw/scsi/scsi.h"
 #include "chardev/char-fe.h"
+#include "qapi/qapi-types-virtio.h"
 #include "system/iothread.h"
 
 #define TYPE_VIRTIO_SCSI_COMMON "virtio-scsi-common"
@@ -60,6 +61,7 @@ struct VirtIOSCSIConf {
     CharBackend chardev;
     uint32_t boot_tpgt;
     IOThread *iothread;
+    IOThreadVirtQueueMappingList *iothread_vq_mapping_list;
 };
 
 struct VirtIOSCSI;
@@ -97,7 +99,7 @@ struct VirtIOSCSI {
     QTAILQ_HEAD(, VirtIOSCSIReq) tmf_bh_list;
 
     /* Fields for dataplane below */
-    AioContext *ctx; /* one iothread per virtio-scsi-pci for now */
+    AioContext **vq_aio_context; /* per-virtqueue AioContext pointer */
 
     bool dataplane_started;
     bool dataplane_starting;
@@ -115,6 +117,7 @@ void virtio_scsi_common_realize(DeviceState *dev,
 void virtio_scsi_common_unrealize(DeviceState *dev);
 
 void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error **errp);
+void virtio_scsi_dataplane_cleanup(VirtIOSCSI *s);
 int virtio_scsi_dataplane_start(VirtIODevice *s);
 void virtio_scsi_dataplane_stop(VirtIODevice *s);

--- a/tests/qemu-iotests/051.pc.out
+++ b/tests/qemu-iotests/051.pc.out

@@ -181,7 +181,7 @@ QEMU X.Y.Z monitor - type 'help' for more information
 
 Testing: -drive file=TEST_DIR/t.qcow2,if=none,node-name=disk -object iothread,id=thread0 -device virtio-scsi,iothread=thread0,id=virtio-scsi0 -device scsi-hd,bus=virtio-scsi0.0,drive=disk,share-rw=on -device virtio-scsi,id=virtio-scsi1 -device scsi-hd,bus=virtio-scsi1.0,drive=disk,share-rw=on
 QEMU X.Y.Z monitor - type 'help' for more information
-(qemu) QEMU_PROG: -device scsi-hd,bus=virtio-scsi1.0,drive=disk,share-rw=on: Cannot change iothread of active block backend
+(qemu) quit
 
 Testing: -drive file=TEST_DIR/t.qcow2,if=none,node-name=disk -object iothread,id=thread0 -device virtio-scsi,iothread=thread0,id=virtio-scsi0 -device scsi-hd,bus=virtio-scsi0.0,drive=disk,share-rw=on -device virtio-blk-pci,drive=disk,iothread=thread0,share-rw=on
 QEMU X.Y.Z monitor - type 'help' for more information