Merge tag 'for-upstream' of https://repo.or.cz/qemu/kevin into staging

Block layer patches

- virtio-scsi: add iothread-vq-mapping parameter
- Improve writethrough performance
- Fix missing zero init in bdrv_snapshot_goto()
- Added scripts/qcow2-to-stdout.py
- Code cleanup and iotests fixes

# -----BEGIN PGP SIGNATURE-----
#
# iQJFBAABCAAvFiEE3D3rFZqa+V09dFb+fwmycsiPL9YFAmfTDysRHGt3b2xmQHJl
# ZGhhdC5jb20ACgkQfwmycsiPL9Yz6A//asOl37zjbtf9pYjY/gliH859TQOppPGD
# LB9IIr+nTDME0wfUkCOlag+CeEYZwkeo2PF+XeopsyzlJeBOk4tL7AkY57XYe3lZ
# M5hlnNrn6l3gb6iioMg60pEKSMrpKprB16vT3nAtyN6aEXsm9TvtPkWPFTCFGVeK
# W74VCr7wuXbfdEJcOGd8WhB9ZHIgwoWYnoL41tvCoefW2yNaMA6X0TLn98toXzOi
# il50ZnnchTQngns5R+n+1R1Ma995t393D+CArQcYVRzxKGOs5p0y4otz4gCkMhdp
# GVL09R7Ge4TteSJ2myxlN/EjYOxmdoMrVDajr4xPdHBw12MKzgk8i82h4/Es/Q5o
# 3Npgx74+jDyqlICb/czTVM5KJINpyO80vO3N3WpYUOQGyTCcYgv7pIpy8pB2o6Te
# RPlv0W9bHVSSgThFFLQ0Ud8WRGJe1K/ar8bdmiWN08Wez1avENWaYmsv5zGnFL24
# vD6cNXMR4mF7mzyeWda/5hGKv75djVgX+ZfzvWNT3qgizD56JBOA3RdCRwBZJOJb
# TvJkfi5RGyaji9BfKVCYBL3/iDELJEVDW8jxvIIUrS0aPcTHpAQ5gTO7VAokreqZ
# 5Smll11eeoEgPPvNLw8ikmOGTWOMkJGrmExP2K1ApANq3kSbBSU4jroEr0BG9PZT
# 6Y0hUdtFSdU=
# =w2Ri
# -----END PGP SIGNATURE-----
# gpg: Signature made Fri 14 Mar 2025 01:00:27 HKT
# gpg:                using RSA key DC3DEB159A9AF95D3D7456FE7F09B272C88F2FD6
# gpg:                issuer "kwolf@redhat.com"
# gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>" [full]
# Primary key fingerprint: DC3D EB15 9A9A F95D 3D74  56FE 7F09 B272 C88F 2FD6

* tag 'for-upstream' of https://repo.or.cz/qemu/kevin: (23 commits)
  scripts/qcow2-to-stdout.py: Add script to write qcow2 images to stdout
  virtio-scsi: only expose cmd vqs via iothread-vq-mapping
  virtio-scsi: handle ctrl virtqueue in main loop
  virtio-scsi: add iothread-vq-mapping parameter
  virtio: extract iothread-vq-mapping.h API
  virtio-blk: tidy up iothread_vq_mapping functions
  virtio-blk: extract cleanup_iothread_vq_mapping() function
  virtio-scsi: perform TMFs in appropriate AioContexts
  virtio-scsi: protect events_dropped field
  virtio-scsi: introduce event and ctrl virtqueue locks
  scsi: introduce requests_lock
  scsi: track per-SCSIRequest AioContext
  dma: use current AioContext for dma_blk_io()
  scsi-disk: drop unused SCSIDiskState->bh field
  iotests: Limit qsd-migrate to working formats
  aio-posix: Adjust polling time also for new handlers
  aio-posix: Separate AioPolledEvent per AioHandler
  aio-posix: Factor out adjust_polling_time()
  aio: Create AioPolledEvent
  block/io: Ignore FUA with cache.no-flush=on
  ...

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
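
For illustration only (not part of the upstream commit message): judging from the iothread-vq-mapping handling visible in the diff below, a virtio-scsi device can now be spread across several IOThreads roughly as follows. The iothread and device IDs are placeholders, and the JSON -device syntax is assumed because iothread-vq-mapping is a list property; when no explicit "vqs" lists are given, the command virtqueues are assigned round-robin across the listed IOThreads while the ctrl and event virtqueues stay in the main loop.

    qemu-system-x86_64 \
        -object iothread,id=iot0 \
        -object iothread,id=iot1 \
        -device '{"driver": "virtio-scsi-pci", "id": "scsi0", "num_queues": 4,
                  "iothread-vq-mapping": [{"iothread": "iot0"},
                                          {"iothread": "iot1"}]}'
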
commit 0462a32b4f
Stefan Hajnoczi, 2025-03-14 09:31:13 +08:00
30 changed files with 1306 additions and 531 deletions

@@ -2357,18 +2357,6 @@ void *blk_blockalign(BlockBackend *blk, size_t size)
     return qemu_blockalign(blk ? blk_bs(blk) : NULL, size);
 }
 
-bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp)
-{
-    BlockDriverState *bs = blk_bs(blk);
-    GLOBAL_STATE_CODE();
-    GRAPH_RDLOCK_GUARD_MAINLOOP();
-
-    if (!bs) {
-        return false;
-    }
-
-    return bdrv_op_is_blocked(bs, op, errp);
-}
-
 /**
  * Return BB's current AioContext. Note that this context may change


@@ -194,6 +194,7 @@ static int fd_open(BlockDriverState *bs)
 }
 
 static int64_t raw_getlength(BlockDriverState *bs);
+static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs);
 
 typedef struct RawPosixAIOData {
     BlockDriverState *bs;
@@ -804,6 +805,13 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
 #endif
 
     s->needs_alignment = raw_needs_alignment(bs);
+
+    bs->supported_write_flags = BDRV_REQ_FUA;
+    if (s->use_linux_aio && !laio_has_fua()) {
+        bs->supported_write_flags &= ~BDRV_REQ_FUA;
+    } else if (s->use_linux_io_uring && !luring_has_fua()) {
+        bs->supported_write_flags &= ~BDRV_REQ_FUA;
+    }
+
     bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
     if (S_ISREG(st.st_mode)) {
         /* When extending regular files, we get zeros from the OS */
@@ -2477,7 +2485,8 @@ static inline bool raw_check_linux_aio(BDRVRawState *s)
 #endif
 
 static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
-                                   uint64_t bytes, QEMUIOVector *qiov, int type)
+                                   uint64_t bytes, QEMUIOVector *qiov, int type,
+                                   int flags)
 {
     BDRVRawState *s = bs->opaque;
     RawPosixAIOData acb;
@@ -2508,13 +2517,13 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
 #ifdef CONFIG_LINUX_IO_URING
     } else if (raw_check_linux_io_uring(s)) {
         assert(qiov->size == bytes);
-        ret = luring_co_submit(bs, s->fd, offset, qiov, type);
+        ret = luring_co_submit(bs, s->fd, offset, qiov, type, flags);
         goto out;
 #endif
 #ifdef CONFIG_LINUX_AIO
     } else if (raw_check_linux_aio(s)) {
         assert(qiov->size == bytes);
-        ret = laio_co_submit(s->fd, offset, qiov, type,
+        ret = laio_co_submit(s->fd, offset, qiov, type, flags,
                              s->aio_max_batch);
         goto out;
 #endif
@@ -2534,6 +2543,10 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
     assert(qiov->size == bytes);
     ret = raw_thread_pool_submit(handle_aiocb_rw, &acb);
+    if (ret == 0 && (flags & BDRV_REQ_FUA)) {
+        /* TODO Use pwritev2() instead if it's available */
+        ret = raw_co_flush_to_disk(bs);
+    }
     goto out; /* Avoid the compiler err of unused label */
 
 out:
@@ -2571,14 +2584,14 @@ static int coroutine_fn raw_co_preadv(BlockDriverState *bs, int64_t offset,
                                       int64_t bytes, QEMUIOVector *qiov,
                                       BdrvRequestFlags flags)
 {
-    return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_READ);
+    return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_READ, flags);
 }
 
 static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset,
                                        int64_t bytes, QEMUIOVector *qiov,
                                        BdrvRequestFlags flags)
 {
-    return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_WRITE);
+    return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_WRITE, flags);
 }
 
 static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
@@ -2600,12 +2613,12 @@ static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
 
 #ifdef CONFIG_LINUX_IO_URING
     if (raw_check_linux_io_uring(s)) {
-        return luring_co_submit(bs, s->fd, 0, NULL, QEMU_AIO_FLUSH);
+        return luring_co_submit(bs, s->fd, 0, NULL, QEMU_AIO_FLUSH, 0);
     }
 #endif
 #ifdef CONFIG_LINUX_AIO
     if (s->has_laio_fdsync && raw_check_linux_aio(s)) {
-        return laio_co_submit(s->fd, 0, NULL, QEMU_AIO_FLUSH, 0);
+        return laio_co_submit(s->fd, 0, NULL, QEMU_AIO_FLUSH, 0, 0);
     }
 #endif
     return raw_thread_pool_submit(handle_aiocb_flush, &acb);
@@ -3540,7 +3553,7 @@ static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
     }
 
     trace_zbd_zone_append(bs, *offset >> BDRV_SECTOR_BITS);
-    return raw_co_prw(bs, offset, len, qiov, QEMU_AIO_ZONE_APPEND);
+    return raw_co_prw(bs, offset, len, qiov, QEMU_AIO_ZONE_APPEND, 0);
 }
 #endif
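
An assumption-based side note, not taken from the patch series itself: the hunks above pass BDRV_REQ_FUA down to Linux AIO and io_uring as RWF_DSYNC where pwritev2()/writev2 support is available, and fall back to an explicit flush in the thread-pool path. One way to exercise this writethrough path is a disk frontend with its write cache disabled, so every guest write is submitted with FUA; the file and node names below are placeholders.

    qemu-system-x86_64 \
        -blockdev driver=file,filename=test.img,node-name=proto0,cache.direct=on \
        -blockdev driver=raw,file=proto0,node-name=disk0 \
        -device virtio-blk-pci,drive=disk0,write-cache=off
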


@@ -1058,6 +1058,10 @@ bdrv_driver_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
         return -ENOMEDIUM;
     }
 
+    if (bs->open_flags & BDRV_O_NO_FLUSH) {
+        flags &= ~BDRV_REQ_FUA;
+    }
+
     if ((flags & BDRV_REQ_FUA) &&
         (~bs->supported_write_flags & BDRV_REQ_FUA)) {
         flags &= ~BDRV_REQ_FUA;


@@ -335,15 +335,24 @@ static void luring_deferred_fn(void *opaque)
  *
  */
 static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
-                            uint64_t offset, int type)
+                            uint64_t offset, int type, BdrvRequestFlags flags)
 {
     int ret;
     struct io_uring_sqe *sqes = &luringcb->sqeq;
 
     switch (type) {
     case QEMU_AIO_WRITE:
+#ifdef HAVE_IO_URING_PREP_WRITEV2
+    {
+        int luring_flags = (flags & BDRV_REQ_FUA) ? RWF_DSYNC : 0;
+        io_uring_prep_writev2(sqes, fd, luringcb->qiov->iov,
+                              luringcb->qiov->niov, offset, luring_flags);
+    }
+#else
+        assert(flags == 0);
         io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
                              luringcb->qiov->niov, offset);
+#endif
         break;
     case QEMU_AIO_ZONE_APPEND:
         io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
@@ -380,7 +389,8 @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
 }
 
 int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset,
-                                  QEMUIOVector *qiov, int type)
+                                  QEMUIOVector *qiov, int type,
+                                  BdrvRequestFlags flags)
 {
     int ret;
     AioContext *ctx = qemu_get_current_aio_context();
@@ -393,7 +403,7 @@ int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset,
     };
     trace_luring_co_submit(bs, s, &luringcb, fd, offset, qiov ? qiov->size : 0,
                            type);
-    ret = luring_do_submit(fd, &luringcb, s, offset, type);
+    ret = luring_do_submit(fd, &luringcb, s, offset, type, flags);
 
     if (ret < 0) {
         return ret;
@@ -448,3 +458,12 @@ void luring_cleanup(LuringState *s)
     trace_luring_cleanup_state(s);
     g_free(s);
 }
+
+bool luring_has_fua(void)
+{
+#ifdef HAVE_IO_URING_PREP_WRITEV2
+    return true;
+#else
+    return false;
+#endif
+}


@@ -368,7 +368,8 @@ static void laio_deferred_fn(void *opaque)
 }
 
 static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
-                          int type, uint64_t dev_max_batch)
+                          int type, BdrvRequestFlags flags,
+                          uint64_t dev_max_batch)
 {
     LinuxAioState *s = laiocb->ctx;
     struct iocb *iocbs = &laiocb->iocb;
@@ -376,7 +377,15 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
 
     switch (type) {
     case QEMU_AIO_WRITE:
+#ifdef HAVE_IO_PREP_PWRITEV2
+    {
+        int laio_flags = (flags & BDRV_REQ_FUA) ? RWF_DSYNC : 0;
+        io_prep_pwritev2(iocbs, fd, qiov->iov, qiov->niov, offset, laio_flags);
+    }
+#else
+        assert(flags == 0);
         io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
+#endif
         break;
     case QEMU_AIO_ZONE_APPEND:
         io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
@@ -409,7 +418,8 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
 }
 
 int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,
-                                int type, uint64_t dev_max_batch)
+                                int type, BdrvRequestFlags flags,
+                                uint64_t dev_max_batch)
 {
     int ret;
     AioContext *ctx = qemu_get_current_aio_context();
@@ -422,7 +432,7 @@ int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,
         .qiov       = qiov,
     };
 
-    ret = laio_do_submit(fd, &laiocb, offset, type, dev_max_batch);
+    ret = laio_do_submit(fd, &laiocb, offset, type, flags, dev_max_batch);
     if (ret < 0) {
         return ret;
     }
@@ -505,3 +515,12 @@ bool laio_has_fdsync(int fd)
     io_destroy(ctx);
     return (ret == -EINVAL) ? false : true;
 }
+
+bool laio_has_fua(void)
+{
+#ifdef HAVE_IO_PREP_PWRITEV2
+    return true;
+#else
+    return false;
+#endif
+}


@@ -296,6 +296,7 @@ int bdrv_snapshot_goto(BlockDriverState *bs,
         bdrv_graph_wrunlock();
 
         ret = bdrv_snapshot_goto(fallback_bs, snapshot_id, errp);
+        memset(bs->opaque, 0, drv->instance_size);
         open_ret = drv->bdrv_open(bs, options, bs->open_flags, &local_err);
         qobject_unref(options);
         if (open_ret < 0) {


@ -33,6 +33,7 @@
#endif #endif
#include "hw/virtio/virtio-bus.h" #include "hw/virtio/virtio-bus.h"
#include "migration/qemu-file-types.h" #include "migration/qemu-file-types.h"
#include "hw/virtio/iothread-vq-mapping.h"
#include "hw/virtio/virtio-access.h" #include "hw/virtio/virtio-access.h"
#include "hw/virtio/virtio-blk-common.h" #include "hw/virtio/virtio-blk-common.h"
#include "qemu/coroutine.h" #include "qemu/coroutine.h"
@ -1423,128 +1424,6 @@ static const BlockDevOps virtio_block_ops = {
.drained_end = virtio_blk_drained_end, .drained_end = virtio_blk_drained_end,
}; };
static bool
validate_iothread_vq_mapping_list(IOThreadVirtQueueMappingList *list,
uint16_t num_queues, Error **errp)
{
g_autofree unsigned long *vqs = bitmap_new(num_queues);
g_autoptr(GHashTable) iothreads =
g_hash_table_new(g_str_hash, g_str_equal);
for (IOThreadVirtQueueMappingList *node = list; node; node = node->next) {
const char *name = node->value->iothread;
uint16List *vq;
if (!iothread_by_id(name)) {
error_setg(errp, "IOThread \"%s\" object does not exist", name);
return false;
}
if (!g_hash_table_add(iothreads, (gpointer)name)) {
error_setg(errp,
"duplicate IOThread name \"%s\" in iothread-vq-mapping",
name);
return false;
}
if (node != list) {
if (!!node->value->vqs != !!list->value->vqs) {
error_setg(errp, "either all items in iothread-vq-mapping "
"must have vqs or none of them must have it");
return false;
}
}
for (vq = node->value->vqs; vq; vq = vq->next) {
if (vq->value >= num_queues) {
error_setg(errp, "vq index %u for IOThread \"%s\" must be "
"less than num_queues %u in iothread-vq-mapping",
vq->value, name, num_queues);
return false;
}
if (test_and_set_bit(vq->value, vqs)) {
error_setg(errp, "cannot assign vq %u to IOThread \"%s\" "
"because it is already assigned", vq->value, name);
return false;
}
}
}
if (list->value->vqs) {
for (uint16_t i = 0; i < num_queues; i++) {
if (!test_bit(i, vqs)) {
error_setg(errp,
"missing vq %u IOThread assignment in iothread-vq-mapping",
i);
return false;
}
}
}
return true;
}
/**
* apply_iothread_vq_mapping:
* @iothread_vq_mapping_list: The mapping of virtqueues to IOThreads.
* @vq_aio_context: The array of AioContext pointers to fill in.
* @num_queues: The length of @vq_aio_context.
* @errp: If an error occurs, a pointer to the area to store the error.
*
* Fill in the AioContext for each virtqueue in the @vq_aio_context array given
* the iothread-vq-mapping parameter in @iothread_vq_mapping_list.
*
* Returns: %true on success, %false on failure.
**/
static bool apply_iothread_vq_mapping(
IOThreadVirtQueueMappingList *iothread_vq_mapping_list,
AioContext **vq_aio_context,
uint16_t num_queues,
Error **errp)
{
IOThreadVirtQueueMappingList *node;
size_t num_iothreads = 0;
size_t cur_iothread = 0;
if (!validate_iothread_vq_mapping_list(iothread_vq_mapping_list,
num_queues, errp)) {
return false;
}
for (node = iothread_vq_mapping_list; node; node = node->next) {
num_iothreads++;
}
for (node = iothread_vq_mapping_list; node; node = node->next) {
IOThread *iothread = iothread_by_id(node->value->iothread);
AioContext *ctx = iothread_get_aio_context(iothread);
/* Released in virtio_blk_vq_aio_context_cleanup() */
object_ref(OBJECT(iothread));
if (node->value->vqs) {
uint16List *vq;
/* Explicit vq:IOThread assignment */
for (vq = node->value->vqs; vq; vq = vq->next) {
assert(vq->value < num_queues);
vq_aio_context[vq->value] = ctx;
}
} else {
/* Round-robin vq:IOThread assignment */
for (unsigned i = cur_iothread; i < num_queues;
i += num_iothreads) {
vq_aio_context[i] = ctx;
}
}
cur_iothread++;
}
return true;
}
/* Context: BQL held */ /* Context: BQL held */
static bool virtio_blk_vq_aio_context_init(VirtIOBlock *s, Error **errp) static bool virtio_blk_vq_aio_context_init(VirtIOBlock *s, Error **errp)
{ {
@ -1577,7 +1456,7 @@ static bool virtio_blk_vq_aio_context_init(VirtIOBlock *s, Error **errp)
s->vq_aio_context = g_new(AioContext *, conf->num_queues); s->vq_aio_context = g_new(AioContext *, conf->num_queues);
if (conf->iothread_vq_mapping_list) { if (conf->iothread_vq_mapping_list) {
if (!apply_iothread_vq_mapping(conf->iothread_vq_mapping_list, if (!iothread_vq_mapping_apply(conf->iothread_vq_mapping_list,
s->vq_aio_context, s->vq_aio_context,
conf->num_queues, conf->num_queues,
errp)) { errp)) {
@ -1611,12 +1490,7 @@ static void virtio_blk_vq_aio_context_cleanup(VirtIOBlock *s)
assert(!s->ioeventfd_started); assert(!s->ioeventfd_started);
if (conf->iothread_vq_mapping_list) { if (conf->iothread_vq_mapping_list) {
IOThreadVirtQueueMappingList *node; iothread_vq_mapping_cleanup(conf->iothread_vq_mapping_list);
for (node = conf->iothread_vq_mapping_list; node; node = node->next) {
IOThread *iothread = iothread_by_id(node->value->iothread);
object_unref(OBJECT(iothread));
}
} }
if (conf->iothread) { if (conf->iothread) {


@@ -968,8 +968,7 @@ static void ide_dma_cb(void *opaque, int ret)
                                           BDRV_SECTOR_SIZE, ide_dma_cb, s);
         break;
     case IDE_DMA_TRIM:
-        s->bus->dma->aiocb = dma_blk_io(blk_get_aio_context(s->blk),
-                                        &s->sg, offset, BDRV_SECTOR_SIZE,
+        s->bus->dma->aiocb = dma_blk_io(&s->sg, offset, BDRV_SECTOR_SIZE,
                                         ide_issue_trim, s, ide_dma_cb, s,
                                         DMA_DIRECTION_TO_DEVICE);
         break;


@@ -187,8 +187,7 @@ static void pmac_ide_transfer_cb(void *opaque, int ret)
                                         pmac_ide_transfer_cb, io);
         break;
     case IDE_DMA_TRIM:
-        s->bus->dma->aiocb = dma_blk_io(blk_get_aio_context(s->blk), &s->sg,
-                                        offset, 0x1, ide_issue_trim, s,
+        s->bus->dma->aiocb = dma_blk_io(&s->sg, offset, 0x1, ide_issue_trim, s,
                                         pmac_ide_transfer_cb, io,
                                         DMA_DIRECTION_TO_DEVICE);
         break;


@ -100,10 +100,17 @@ static void scsi_device_for_each_req_sync(SCSIDevice *s,
assert(!runstate_is_running()); assert(!runstate_is_running());
assert(qemu_in_main_thread()); assert(qemu_in_main_thread());
/*
* Locking is not necessary because the guest is stopped and no other
* threads can be accessing the requests list, but take the lock for
* consistency.
*/
WITH_QEMU_LOCK_GUARD(&s->requests_lock) {
QTAILQ_FOREACH_SAFE(req, &s->requests, next, next_req) { QTAILQ_FOREACH_SAFE(req, &s->requests, next, next_req) {
fn(req, opaque); fn(req, opaque);
} }
} }
}
typedef struct { typedef struct {
SCSIDevice *s; SCSIDevice *s;
@ -115,21 +122,29 @@ static void scsi_device_for_each_req_async_bh(void *opaque)
{ {
g_autofree SCSIDeviceForEachReqAsyncData *data = opaque; g_autofree SCSIDeviceForEachReqAsyncData *data = opaque;
SCSIDevice *s = data->s; SCSIDevice *s = data->s;
AioContext *ctx; g_autoptr(GList) reqs = NULL;
/*
* Build a list of requests in this AioContext so fn() can be invoked later
* outside requests_lock.
*/
WITH_QEMU_LOCK_GUARD(&s->requests_lock) {
AioContext *ctx = qemu_get_current_aio_context();
SCSIRequest *req; SCSIRequest *req;
SCSIRequest *next; SCSIRequest *next;
/*
* The BB cannot have changed contexts between this BH being scheduled and
* now: BBs' AioContexts, when they have a node attached, can only be
* changed via bdrv_try_change_aio_context(), in a drained section. While
* we have the in-flight counter incremented, that drain must block.
*/
ctx = blk_get_aio_context(s->conf.blk);
assert(ctx == qemu_get_current_aio_context());
QTAILQ_FOREACH_SAFE(req, &s->requests, next, next) { QTAILQ_FOREACH_SAFE(req, &s->requests, next, next) {
data->fn(req, data->fn_opaque); if (req->ctx == ctx) {
scsi_req_ref(req); /* dropped after calling fn() */
reqs = g_list_prepend(reqs, req);
}
}
}
/* Call fn() on each request */
for (GList *elem = g_list_first(reqs); elem; elem = g_list_next(elem)) {
data->fn(elem->data, data->fn_opaque);
scsi_req_unref(elem->data);
} }
/* Drop the reference taken by scsi_device_for_each_req_async() */ /* Drop the reference taken by scsi_device_for_each_req_async() */
@ -139,9 +154,35 @@ static void scsi_device_for_each_req_async_bh(void *opaque)
blk_dec_in_flight(s->conf.blk); blk_dec_in_flight(s->conf.blk);
} }
static void scsi_device_for_each_req_async_do_ctx(gpointer key, gpointer value,
gpointer user_data)
{
AioContext *ctx = key;
SCSIDeviceForEachReqAsyncData *params = user_data;
SCSIDeviceForEachReqAsyncData *data;
data = g_new(SCSIDeviceForEachReqAsyncData, 1);
data->s = params->s;
data->fn = params->fn;
data->fn_opaque = params->fn_opaque;
/*
* Hold a reference to the SCSIDevice until
* scsi_device_for_each_req_async_bh() finishes.
*/
object_ref(OBJECT(data->s));
/* Paired with scsi_device_for_each_req_async_bh() */
blk_inc_in_flight(data->s->conf.blk);
aio_bh_schedule_oneshot(ctx, scsi_device_for_each_req_async_bh, data);
}
/* /*
* Schedule @fn() to be invoked for each enqueued request in device @s. @fn() * Schedule @fn() to be invoked for each enqueued request in device @s. @fn()
* runs in the AioContext that is executing the request. * must be thread-safe because it runs concurrently in each AioContext that is
* executing a request.
*
* Keeps the BlockBackend's in-flight counter incremented until everything is * Keeps the BlockBackend's in-flight counter incremented until everything is
* done, so draining it will settle all scheduled @fn() calls. * done, so draining it will settle all scheduled @fn() calls.
*/ */
@ -151,24 +192,26 @@ static void scsi_device_for_each_req_async(SCSIDevice *s,
{ {
assert(qemu_in_main_thread()); assert(qemu_in_main_thread());
SCSIDeviceForEachReqAsyncData *data = /* The set of AioContexts where the requests are being processed */
g_new(SCSIDeviceForEachReqAsyncData, 1); g_autoptr(GHashTable) aio_contexts = g_hash_table_new(NULL, NULL);
WITH_QEMU_LOCK_GUARD(&s->requests_lock) {
SCSIRequest *req;
QTAILQ_FOREACH(req, &s->requests, next) {
g_hash_table_add(aio_contexts, req->ctx);
}
}
data->s = s; /* Schedule a BH for each AioContext */
data->fn = fn; SCSIDeviceForEachReqAsyncData params = {
data->fn_opaque = opaque; .s = s,
.fn = fn,
/* .fn_opaque = opaque,
* Hold a reference to the SCSIDevice until };
* scsi_device_for_each_req_async_bh() finishes. g_hash_table_foreach(
*/ aio_contexts,
object_ref(OBJECT(s)); scsi_device_for_each_req_async_do_ctx,
&params
/* Paired with blk_dec_in_flight() in scsi_device_for_each_req_async_bh() */ );
blk_inc_in_flight(s->conf.blk);
aio_bh_schedule_oneshot(blk_get_aio_context(s->conf.blk),
scsi_device_for_each_req_async_bh,
data);
} }
static void scsi_device_realize(SCSIDevice *s, Error **errp) static void scsi_device_realize(SCSIDevice *s, Error **errp)
@ -349,6 +392,7 @@ static void scsi_qdev_realize(DeviceState *qdev, Error **errp)
dev->lun = lun; dev->lun = lun;
} }
qemu_mutex_init(&dev->requests_lock);
QTAILQ_INIT(&dev->requests); QTAILQ_INIT(&dev->requests);
scsi_device_realize(dev, &local_err); scsi_device_realize(dev, &local_err);
if (local_err) { if (local_err) {
@ -369,6 +413,8 @@ static void scsi_qdev_unrealize(DeviceState *qdev)
scsi_device_purge_requests(dev, SENSE_CODE(NO_SENSE)); scsi_device_purge_requests(dev, SENSE_CODE(NO_SENSE));
qemu_mutex_destroy(&dev->requests_lock);
scsi_device_unrealize(dev); scsi_device_unrealize(dev);
blockdev_mark_auto_del(dev->conf.blk); blockdev_mark_auto_del(dev->conf.blk);
@ -868,6 +914,7 @@ invalid_opcode:
} }
} }
req->ctx = qemu_get_current_aio_context();
req->cmd = cmd; req->cmd = cmd;
req->residual = req->cmd.xfer; req->residual = req->cmd.xfer;
@ -964,8 +1011,11 @@ static void scsi_req_enqueue_internal(SCSIRequest *req)
req->sg = NULL; req->sg = NULL;
} }
req->enqueued = true; req->enqueued = true;
WITH_QEMU_LOCK_GUARD(&req->dev->requests_lock) {
QTAILQ_INSERT_TAIL(&req->dev->requests, req, next); QTAILQ_INSERT_TAIL(&req->dev->requests, req, next);
} }
}
int32_t scsi_req_enqueue(SCSIRequest *req) int32_t scsi_req_enqueue(SCSIRequest *req)
{ {
@ -984,7 +1034,9 @@ static void scsi_req_dequeue(SCSIRequest *req)
trace_scsi_req_dequeue(req->dev->id, req->lun, req->tag); trace_scsi_req_dequeue(req->dev->id, req->lun, req->tag);
req->retry = false; req->retry = false;
if (req->enqueued) { if (req->enqueued) {
WITH_QEMU_LOCK_GUARD(&req->dev->requests_lock) {
QTAILQ_REMOVE(&req->dev->requests, req, next); QTAILQ_REMOVE(&req->dev->requests, req, next);
}
req->enqueued = false; req->enqueued = false;
scsi_req_unref(req); scsi_req_unref(req);
} }
@ -1961,8 +2013,7 @@ static void scsi_device_class_init(ObjectClass *klass, void *data)
static void scsi_dev_instance_init(Object *obj) static void scsi_dev_instance_init(Object *obj)
{ {
DeviceState *dev = DEVICE(obj); SCSIDevice *s = SCSI_DEVICE(obj);
SCSIDevice *s = SCSI_DEVICE(dev);
device_add_bootindex_property(obj, &s->conf.bootindex, device_add_bootindex_property(obj, &s->conf.bootindex,
"bootindex", NULL, "bootindex", NULL,


@ -106,7 +106,6 @@ struct SCSIDiskState {
uint64_t max_unmap_size; uint64_t max_unmap_size;
uint64_t max_io_size; uint64_t max_io_size;
uint32_t quirks; uint32_t quirks;
QEMUBH *bh;
char *version; char *version;
char *serial; char *serial;
char *vendor; char *vendor;
@ -329,9 +328,8 @@ static void scsi_aio_complete(void *opaque, int ret)
SCSIDiskReq *r = (SCSIDiskReq *)opaque; SCSIDiskReq *r = (SCSIDiskReq *)opaque;
SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev); SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
/* The request must only run in the BlockBackend's AioContext */ /* The request must run in its AioContext */
assert(blk_get_aio_context(s->qdev.conf.blk) == assert(r->req.ctx == qemu_get_current_aio_context());
qemu_get_current_aio_context());
assert(r->req.aiocb != NULL); assert(r->req.aiocb != NULL);
r->req.aiocb = NULL; r->req.aiocb = NULL;
@ -431,12 +429,10 @@ static void scsi_dma_complete(void *opaque, int ret)
static void scsi_read_complete_noio(SCSIDiskReq *r, int ret) static void scsi_read_complete_noio(SCSIDiskReq *r, int ret)
{ {
SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
uint32_t n; uint32_t n;
/* The request must only run in the BlockBackend's AioContext */ /* The request must run in its AioContext */
assert(blk_get_aio_context(s->qdev.conf.blk) == assert(r->req.ctx == qemu_get_current_aio_context());
qemu_get_current_aio_context());
assert(r->req.aiocb == NULL); assert(r->req.aiocb == NULL);
if (scsi_disk_req_check_error(r, ret, ret > 0)) { if (scsi_disk_req_check_error(r, ret, ret > 0)) {
@ -488,8 +484,7 @@ static void scsi_do_read(SCSIDiskReq *r, int ret)
if (r->req.sg) { if (r->req.sg) {
dma_acct_start(s->qdev.conf.blk, &r->acct, r->req.sg, BLOCK_ACCT_READ); dma_acct_start(s->qdev.conf.blk, &r->acct, r->req.sg, BLOCK_ACCT_READ);
r->req.residual -= r->req.sg->size; r->req.residual -= r->req.sg->size;
r->req.aiocb = dma_blk_io(blk_get_aio_context(s->qdev.conf.blk), r->req.aiocb = dma_blk_io(r->req.sg, r->sector << BDRV_SECTOR_BITS,
r->req.sg, r->sector << BDRV_SECTOR_BITS,
BDRV_SECTOR_SIZE, BDRV_SECTOR_SIZE,
sdc->dma_readv, r, scsi_dma_complete, r, sdc->dma_readv, r, scsi_dma_complete, r,
DMA_DIRECTION_FROM_DEVICE); DMA_DIRECTION_FROM_DEVICE);
@ -564,12 +559,10 @@ static void scsi_read_data(SCSIRequest *req)
static void scsi_write_complete_noio(SCSIDiskReq *r, int ret) static void scsi_write_complete_noio(SCSIDiskReq *r, int ret)
{ {
SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
uint32_t n; uint32_t n;
/* The request must only run in the BlockBackend's AioContext */ /* The request must run in its AioContext */
assert(blk_get_aio_context(s->qdev.conf.blk) == assert(r->req.ctx == qemu_get_current_aio_context());
qemu_get_current_aio_context());
assert (r->req.aiocb == NULL); assert (r->req.aiocb == NULL);
if (scsi_disk_req_check_error(r, ret, ret > 0)) { if (scsi_disk_req_check_error(r, ret, ret > 0)) {
@ -651,8 +644,7 @@ static void scsi_write_data(SCSIRequest *req)
if (r->req.sg) { if (r->req.sg) {
dma_acct_start(s->qdev.conf.blk, &r->acct, r->req.sg, BLOCK_ACCT_WRITE); dma_acct_start(s->qdev.conf.blk, &r->acct, r->req.sg, BLOCK_ACCT_WRITE);
r->req.residual -= r->req.sg->size; r->req.residual -= r->req.sg->size;
r->req.aiocb = dma_blk_io(blk_get_aio_context(s->qdev.conf.blk), r->req.aiocb = dma_blk_io(r->req.sg, r->sector << BDRV_SECTOR_BITS,
r->req.sg, r->sector << BDRV_SECTOR_BITS,
BDRV_SECTOR_SIZE, BDRV_SECTOR_SIZE,
sdc->dma_writev, r, scsi_dma_complete, r, sdc->dma_writev, r, scsi_dma_complete, r,
DMA_DIRECTION_TO_DEVICE); DMA_DIRECTION_TO_DEVICE);


@ -18,6 +18,7 @@
#include "system/block-backend.h" #include "system/block-backend.h"
#include "hw/scsi/scsi.h" #include "hw/scsi/scsi.h"
#include "scsi/constants.h" #include "scsi/constants.h"
#include "hw/virtio/iothread-vq-mapping.h"
#include "hw/virtio/virtio-bus.h" #include "hw/virtio/virtio-bus.h"
/* Context: BQL held */ /* Context: BQL held */
@ -28,7 +29,14 @@ void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error **errp)
BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
if (vs->conf.iothread) { if (vs->conf.iothread && vs->conf.iothread_vq_mapping_list) {
error_setg(errp,
"iothread and iothread-vq-mapping properties cannot be set "
"at the same time");
return;
}
if (vs->conf.iothread || vs->conf.iothread_vq_mapping_list) {
if (!k->set_guest_notifiers || !k->ioeventfd_assign) { if (!k->set_guest_notifiers || !k->ioeventfd_assign) {
error_setg(errp, error_setg(errp,
"device is incompatible with iothread " "device is incompatible with iothread "
@ -39,13 +47,62 @@ void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error **errp)
error_setg(errp, "ioeventfd is required for iothread"); error_setg(errp, "ioeventfd is required for iothread");
return; return;
} }
s->ctx = iothread_get_aio_context(vs->conf.iothread); }
} else {
if (!virtio_device_ioeventfd_enabled(vdev)) { s->vq_aio_context = g_new(AioContext *, vs->conf.num_queues +
VIRTIO_SCSI_VQ_NUM_FIXED);
/*
* Handle the ctrl virtqueue in the main loop thread where device resets
* can be performed.
*/
s->vq_aio_context[0] = qemu_get_aio_context();
/*
* Handle the event virtqueue in the main loop thread where its no_poll
* behavior won't stop IOThread polling.
*/
s->vq_aio_context[1] = qemu_get_aio_context();
if (vs->conf.iothread_vq_mapping_list) {
if (!iothread_vq_mapping_apply(vs->conf.iothread_vq_mapping_list,
&s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED],
vs->conf.num_queues, errp)) {
g_free(s->vq_aio_context);
s->vq_aio_context = NULL;
return; return;
} }
s->ctx = qemu_get_aio_context(); } else if (vs->conf.iothread) {
AioContext *ctx = iothread_get_aio_context(vs->conf.iothread);
for (uint16_t i = 0; i < vs->conf.num_queues; i++) {
s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED + i] = ctx;
} }
/* Released in virtio_scsi_dataplane_cleanup() */
object_ref(OBJECT(vs->conf.iothread));
} else {
AioContext *ctx = qemu_get_aio_context();
for (unsigned i = 0; i < vs->conf.num_queues; i++) {
s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED + i] = ctx;
}
}
}
/* Context: BQL held */
void virtio_scsi_dataplane_cleanup(VirtIOSCSI *s)
{
VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s);
if (vs->conf.iothread_vq_mapping_list) {
iothread_vq_mapping_cleanup(vs->conf.iothread_vq_mapping_list);
}
if (vs->conf.iothread) {
object_unref(OBJECT(vs->conf.iothread));
}
g_free(s->vq_aio_context);
s->vq_aio_context = NULL;
} }
static int virtio_scsi_set_host_notifier(VirtIOSCSI *s, VirtQueue *vq, int n) static int virtio_scsi_set_host_notifier(VirtIOSCSI *s, VirtQueue *vq, int n)
@ -66,31 +123,20 @@ static int virtio_scsi_set_host_notifier(VirtIOSCSI *s, VirtQueue *vq, int n)
} }
/* Context: BH in IOThread */ /* Context: BH in IOThread */
static void virtio_scsi_dataplane_stop_bh(void *opaque) static void virtio_scsi_dataplane_stop_vq_bh(void *opaque)
{ {
VirtIOSCSI *s = opaque; AioContext *ctx = qemu_get_current_aio_context();
VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s); VirtQueue *vq = opaque;
EventNotifier *host_notifier; EventNotifier *host_notifier;
int i;
virtio_queue_aio_detach_host_notifier(vs->ctrl_vq, s->ctx); virtio_queue_aio_detach_host_notifier(vq, ctx);
host_notifier = virtio_queue_get_host_notifier(vs->ctrl_vq); host_notifier = virtio_queue_get_host_notifier(vq);
/* /*
* Test and clear notifier after disabling event, in case poll callback * Test and clear notifier after disabling event, in case poll callback
* didn't have time to run. * didn't have time to run.
*/ */
virtio_queue_host_notifier_read(host_notifier); virtio_queue_host_notifier_read(host_notifier);
virtio_queue_aio_detach_host_notifier(vs->event_vq, s->ctx);
host_notifier = virtio_queue_get_host_notifier(vs->event_vq);
virtio_queue_host_notifier_read(host_notifier);
for (i = 0; i < vs->conf.num_queues; i++) {
virtio_queue_aio_detach_host_notifier(vs->cmd_vqs[i], s->ctx);
host_notifier = virtio_queue_get_host_notifier(vs->cmd_vqs[i]);
virtio_queue_host_notifier_read(host_notifier);
}
} }
/* Context: BQL held */ /* Context: BQL held */
@ -154,11 +200,14 @@ int virtio_scsi_dataplane_start(VirtIODevice *vdev)
smp_wmb(); /* paired with aio_notify_accept() */ smp_wmb(); /* paired with aio_notify_accept() */
if (s->bus.drain_count == 0) { if (s->bus.drain_count == 0) {
virtio_queue_aio_attach_host_notifier(vs->ctrl_vq, s->ctx); virtio_queue_aio_attach_host_notifier(vs->ctrl_vq,
virtio_queue_aio_attach_host_notifier_no_poll(vs->event_vq, s->ctx); s->vq_aio_context[0]);
virtio_queue_aio_attach_host_notifier_no_poll(vs->event_vq,
s->vq_aio_context[1]);
for (i = 0; i < vs->conf.num_queues; i++) { for (i = 0; i < vs->conf.num_queues; i++) {
virtio_queue_aio_attach_host_notifier(vs->cmd_vqs[i], s->ctx); AioContext *ctx = s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED + i];
virtio_queue_aio_attach_host_notifier(vs->cmd_vqs[i], ctx);
} }
} }
return 0; return 0;
@ -207,7 +256,11 @@ void virtio_scsi_dataplane_stop(VirtIODevice *vdev)
s->dataplane_stopping = true; s->dataplane_stopping = true;
if (s->bus.drain_count == 0) { if (s->bus.drain_count == 0) {
aio_wait_bh_oneshot(s->ctx, virtio_scsi_dataplane_stop_bh, s); for (i = 0; i < vs->conf.num_queues + VIRTIO_SCSI_VQ_NUM_FIXED; i++) {
VirtQueue *vq = virtio_get_queue(&vs->parent_obj, i);
AioContext *ctx = s->vq_aio_context[i];
aio_wait_bh_oneshot(ctx, virtio_scsi_dataplane_stop_vq_bh, vq);
}
} }
blk_drain_all(); /* ensure there are no in-flight requests */ blk_drain_all(); /* ensure there are no in-flight requests */


@ -27,6 +27,7 @@
#include "hw/qdev-properties.h" #include "hw/qdev-properties.h"
#include "hw/scsi/scsi.h" #include "hw/scsi/scsi.h"
#include "scsi/constants.h" #include "scsi/constants.h"
#include "hw/virtio/iothread-vq-mapping.h"
#include "hw/virtio/virtio-bus.h" #include "hw/virtio/virtio-bus.h"
#include "hw/virtio/virtio-access.h" #include "hw/virtio/virtio-access.h"
#include "trace.h" #include "trace.h"
@ -47,7 +48,7 @@ typedef struct VirtIOSCSIReq {
/* Used for two-stage request submission and TMFs deferred to BH */ /* Used for two-stage request submission and TMFs deferred to BH */
QTAILQ_ENTRY(VirtIOSCSIReq) next; QTAILQ_ENTRY(VirtIOSCSIReq) next;
/* Used for cancellation of request during TMFs */ /* Used for cancellation of request during TMFs. Atomic. */
int remaining; int remaining;
SCSIRequest *sreq; SCSIRequest *sreq;
@ -102,13 +103,18 @@ static void virtio_scsi_free_req(VirtIOSCSIReq *req)
g_free(req); g_free(req);
} }
static void virtio_scsi_complete_req(VirtIOSCSIReq *req) static void virtio_scsi_complete_req(VirtIOSCSIReq *req, QemuMutex *vq_lock)
{ {
VirtIOSCSI *s = req->dev; VirtIOSCSI *s = req->dev;
VirtQueue *vq = req->vq; VirtQueue *vq = req->vq;
VirtIODevice *vdev = VIRTIO_DEVICE(s); VirtIODevice *vdev = VIRTIO_DEVICE(s);
qemu_iovec_from_buf(&req->resp_iov, 0, &req->resp, req->resp_size); qemu_iovec_from_buf(&req->resp_iov, 0, &req->resp, req->resp_size);
if (vq_lock) {
qemu_mutex_lock(vq_lock);
}
virtqueue_push(vq, &req->elem, req->qsgl.size + req->resp_iov.size); virtqueue_push(vq, &req->elem, req->qsgl.size + req->resp_iov.size);
if (s->dataplane_started && !s->dataplane_fenced) { if (s->dataplane_started && !s->dataplane_fenced) {
virtio_notify_irqfd(vdev, vq); virtio_notify_irqfd(vdev, vq);
@ -116,6 +122,10 @@ static void virtio_scsi_complete_req(VirtIOSCSIReq *req)
virtio_notify(vdev, vq); virtio_notify(vdev, vq);
} }
if (vq_lock) {
qemu_mutex_unlock(vq_lock);
}
if (req->sreq) { if (req->sreq) {
req->sreq->hba_private = NULL; req->sreq->hba_private = NULL;
scsi_req_unref(req->sreq); scsi_req_unref(req->sreq);
@ -123,34 +133,20 @@ static void virtio_scsi_complete_req(VirtIOSCSIReq *req)
virtio_scsi_free_req(req); virtio_scsi_free_req(req);
} }
static void virtio_scsi_complete_req_bh(void *opaque) static void virtio_scsi_bad_req(VirtIOSCSIReq *req, QemuMutex *vq_lock)
{
VirtIOSCSIReq *req = opaque;
virtio_scsi_complete_req(req);
}
/*
* Called from virtio_scsi_do_one_tmf_bh() in main loop thread. The main loop
* thread cannot touch the virtqueue since that could race with an IOThread.
*/
static void virtio_scsi_complete_req_from_main_loop(VirtIOSCSIReq *req)
{
VirtIOSCSI *s = req->dev;
if (!s->ctx || s->ctx == qemu_get_aio_context()) {
/* No need to schedule a BH when there is no IOThread */
virtio_scsi_complete_req(req);
} else {
/* Run request completion in the IOThread */
aio_wait_bh_oneshot(s->ctx, virtio_scsi_complete_req_bh, req);
}
}
static void virtio_scsi_bad_req(VirtIOSCSIReq *req)
{ {
virtio_error(VIRTIO_DEVICE(req->dev), "wrong size for virtio-scsi headers"); virtio_error(VIRTIO_DEVICE(req->dev), "wrong size for virtio-scsi headers");
if (vq_lock) {
qemu_mutex_lock(vq_lock);
}
virtqueue_detach_element(req->vq, &req->elem, 0); virtqueue_detach_element(req->vq, &req->elem, 0);
if (vq_lock) {
qemu_mutex_unlock(vq_lock);
}
virtio_scsi_free_req(req); virtio_scsi_free_req(req);
} }
@ -235,12 +231,21 @@ static int virtio_scsi_parse_req(VirtIOSCSIReq *req,
return 0; return 0;
} }
static VirtIOSCSIReq *virtio_scsi_pop_req(VirtIOSCSI *s, VirtQueue *vq) static VirtIOSCSIReq *virtio_scsi_pop_req(VirtIOSCSI *s, VirtQueue *vq, QemuMutex *vq_lock)
{ {
VirtIOSCSICommon *vs = (VirtIOSCSICommon *)s; VirtIOSCSICommon *vs = (VirtIOSCSICommon *)s;
VirtIOSCSIReq *req; VirtIOSCSIReq *req;
if (vq_lock) {
qemu_mutex_lock(vq_lock);
}
req = virtqueue_pop(vq, sizeof(VirtIOSCSIReq) + vs->cdb_size); req = virtqueue_pop(vq, sizeof(VirtIOSCSIReq) + vs->cdb_size);
if (vq_lock) {
qemu_mutex_unlock(vq_lock);
}
if (!req) { if (!req) {
return NULL; return NULL;
} }
@ -294,137 +299,158 @@ typedef struct {
VirtIOSCSIReq *tmf_req; VirtIOSCSIReq *tmf_req;
} VirtIOSCSICancelNotifier; } VirtIOSCSICancelNotifier;
static void virtio_scsi_tmf_dec_remaining(VirtIOSCSIReq *tmf)
{
if (qatomic_fetch_dec(&tmf->remaining) == 1) {
trace_virtio_scsi_tmf_resp(virtio_scsi_get_lun(tmf->req.tmf.lun),
tmf->req.tmf.tag, tmf->resp.tmf.response);
virtio_scsi_complete_req(tmf, &tmf->dev->ctrl_lock);
}
}
static void virtio_scsi_cancel_notify(Notifier *notifier, void *data) static void virtio_scsi_cancel_notify(Notifier *notifier, void *data)
{ {
VirtIOSCSICancelNotifier *n = container_of(notifier, VirtIOSCSICancelNotifier *n = container_of(notifier,
VirtIOSCSICancelNotifier, VirtIOSCSICancelNotifier,
notifier); notifier);
if (--n->tmf_req->remaining == 0) { virtio_scsi_tmf_dec_remaining(n->tmf_req);
VirtIOSCSIReq *req = n->tmf_req;
trace_virtio_scsi_tmf_resp(virtio_scsi_get_lun(req->req.tmf.lun),
req->req.tmf.tag, req->resp.tmf.response);
virtio_scsi_complete_req(req);
}
g_free(n); g_free(n);
} }
static inline void virtio_scsi_ctx_check(VirtIOSCSI *s, SCSIDevice *d) static void virtio_scsi_tmf_cancel_req(VirtIOSCSIReq *tmf, SCSIRequest *r)
{ {
if (s->dataplane_started && d && blk_is_available(d->conf.blk)) { VirtIOSCSICancelNotifier *notifier;
assert(blk_get_aio_context(d->conf.blk) == s->ctx);
} assert(r->ctx == qemu_get_current_aio_context());
/* Decremented in virtio_scsi_cancel_notify() */
qatomic_inc(&tmf->remaining);
notifier = g_new(VirtIOSCSICancelNotifier, 1);
notifier->notifier.notify = virtio_scsi_cancel_notify;
notifier->tmf_req = tmf;
scsi_req_cancel_async(r, &notifier->notifier);
} }
static void virtio_scsi_do_one_tmf_bh(VirtIOSCSIReq *req) /* Execute a TMF on the requests in the current AioContext */
static void virtio_scsi_do_tmf_aio_context(void *opaque)
{ {
VirtIOSCSI *s = req->dev; AioContext *ctx = qemu_get_current_aio_context();
SCSIDevice *d = virtio_scsi_device_get(s, req->req.tmf.lun); VirtIOSCSIReq *tmf = opaque;
BusChild *kid; VirtIOSCSI *s = tmf->dev;
int target; SCSIDevice *d = virtio_scsi_device_get(s, tmf->req.tmf.lun);
SCSIRequest *r;
bool match_tag;
switch (req->req.tmf.subtype) {
case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET:
if (!d) { if (!d) {
req->resp.tmf.response = VIRTIO_SCSI_S_BAD_TARGET; tmf->resp.tmf.response = VIRTIO_SCSI_S_BAD_TARGET;
goto out; virtio_scsi_tmf_dec_remaining(tmf);
return;
} }
if (d->lun != virtio_scsi_get_lun(req->req.tmf.lun)) {
req->resp.tmf.response = VIRTIO_SCSI_S_INCORRECT_LUN; /*
goto out; * This function could handle other subtypes that need to be processed in
} * the request's AioContext in the future, but for now only request
qatomic_inc(&s->resetting); * cancelation subtypes are performed here.
device_cold_reset(&d->qdev); */
qatomic_dec(&s->resetting); switch (tmf->req.tmf.subtype) {
case VIRTIO_SCSI_T_TMF_ABORT_TASK:
match_tag = true;
break; break;
case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET:
case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET: case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET:
target = req->req.tmf.lun[1]; match_tag = false;
qatomic_inc(&s->resetting);
rcu_read_lock();
QTAILQ_FOREACH_RCU(kid, &s->bus.qbus.children, sibling) {
SCSIDevice *d1 = SCSI_DEVICE(kid->child);
if (d1->channel == 0 && d1->id == target) {
device_cold_reset(&d1->qdev);
}
}
rcu_read_unlock();
qatomic_dec(&s->resetting);
break; break;
default: default:
g_assert_not_reached(); g_assert_not_reached();
} }
out: WITH_QEMU_LOCK_GUARD(&d->requests_lock) {
object_unref(OBJECT(d)); QTAILQ_FOREACH(r, &d->requests, next) {
virtio_scsi_complete_req_from_main_loop(req); VirtIOSCSIReq *cmd_req = r->hba_private;
assert(cmd_req); /* request has hba_private while enqueued */
if (r->ctx != ctx) {
continue;
}
if (match_tag && cmd_req->req.cmd.tag != tmf->req.tmf.tag) {
continue;
}
virtio_scsi_tmf_cancel_req(tmf, r);
}
} }
/* Some TMFs must be processed from the main loop thread */ /* Incremented by virtio_scsi_do_tmf() */
static void virtio_scsi_do_tmf_bh(void *opaque) virtio_scsi_tmf_dec_remaining(tmf);
{
VirtIOSCSI *s = opaque;
QTAILQ_HEAD(, VirtIOSCSIReq) reqs = QTAILQ_HEAD_INITIALIZER(reqs);
VirtIOSCSIReq *req;
VirtIOSCSIReq *tmp;
object_unref(d);
}
static void dummy_bh(void *opaque)
{
/* Do nothing */
}
/*
* Wait for pending virtio_scsi_defer_tmf_to_aio_context() BHs.
*/
static void virtio_scsi_flush_defer_tmf_to_aio_context(VirtIOSCSI *s)
{
GLOBAL_STATE_CODE(); GLOBAL_STATE_CODE();
WITH_QEMU_LOCK_GUARD(&s->tmf_bh_lock) { assert(!s->dataplane_started);
QTAILQ_FOREACH_SAFE(req, &s->tmf_bh_list, next, tmp) {
QTAILQ_REMOVE(&s->tmf_bh_list, req, next);
QTAILQ_INSERT_TAIL(&reqs, req, next);
}
qemu_bh_delete(s->tmf_bh); for (uint32_t i = 0; i < s->parent_obj.conf.num_queues; i++) {
s->tmf_bh = NULL; AioContext *ctx = s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED + i];
}
QTAILQ_FOREACH_SAFE(req, &reqs, next, tmp) { /* Our BH only runs after previously scheduled BHs */
QTAILQ_REMOVE(&reqs, req, next); aio_wait_bh_oneshot(ctx, dummy_bh, NULL);
virtio_scsi_do_one_tmf_bh(req);
} }
} }
static void virtio_scsi_reset_tmf_bh(VirtIOSCSI *s) /*
* Run the TMF in a specific AioContext, handling only requests in that
* AioContext. This is necessary because requests can run in different
* AioContext and it is only possible to cancel them from the AioContext where
* they are running.
*/
static void virtio_scsi_defer_tmf_to_aio_context(VirtIOSCSIReq *tmf,
AioContext *ctx)
{ {
VirtIOSCSIReq *req; /* Decremented in virtio_scsi_do_tmf_aio_context() */
VirtIOSCSIReq *tmp; qatomic_inc(&tmf->remaining);
GLOBAL_STATE_CODE(); /* See virtio_scsi_flush_defer_tmf_to_aio_context() cleanup during reset */
aio_bh_schedule_oneshot(ctx, virtio_scsi_do_tmf_aio_context, tmf);
/* Called after ioeventfd has been stopped, so tmf_bh_lock is not needed */
if (s->tmf_bh) {
qemu_bh_delete(s->tmf_bh);
s->tmf_bh = NULL;
} }
QTAILQ_FOREACH_SAFE(req, &s->tmf_bh_list, next, tmp) { /*
QTAILQ_REMOVE(&s->tmf_bh_list, req, next); * Returns the AioContext for a given TMF's tag field or NULL. Note that the
* request identified by the tag may have completed by the time you can execute
/* SAM-6 6.3.2 Hard reset */ * a BH in the AioContext, so don't assume the request still exists in your BH.
req->resp.tmf.response = VIRTIO_SCSI_S_TARGET_FAILURE; */
virtio_scsi_complete_req(req); static AioContext *find_aio_context_for_tmf_tag(SCSIDevice *d,
} VirtIOSCSIReq *tmf)
}
static void virtio_scsi_defer_tmf_to_bh(VirtIOSCSIReq *req)
{ {
VirtIOSCSI *s = req->dev; WITH_QEMU_LOCK_GUARD(&d->requests_lock) {
SCSIRequest *r;
SCSIRequest *next;
WITH_QEMU_LOCK_GUARD(&s->tmf_bh_lock) { QTAILQ_FOREACH_SAFE(r, &d->requests, next, next) {
QTAILQ_INSERT_TAIL(&s->tmf_bh_list, req, next); VirtIOSCSIReq *cmd_req = r->hba_private;
if (!s->tmf_bh) { /* hba_private is non-NULL while the request is enqueued */
s->tmf_bh = qemu_bh_new(virtio_scsi_do_tmf_bh, s); assert(cmd_req);
qemu_bh_schedule(s->tmf_bh);
if (cmd_req->req.cmd.tag == tmf->req.tmf.tag) {
return r->ctx;
} }
} }
} }
return NULL;
}
/* Return 0 if the request is ready to be completed and return to guest; /* Return 0 if the request is ready to be completed and return to guest;
* -EINPROGRESS if the request is submitted and will be completed later, in the * -EINPROGRESS if the request is submitted and will be completed later, in the
@ -433,9 +459,9 @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req)
{ {
SCSIDevice *d = virtio_scsi_device_get(s, req->req.tmf.lun); SCSIDevice *d = virtio_scsi_device_get(s, req->req.tmf.lun);
SCSIRequest *r, *next; SCSIRequest *r, *next;
AioContext *ctx;
int ret = 0; int ret = 0;
virtio_scsi_ctx_check(s, d);
/* Here VIRTIO_SCSI_S_OK means "FUNCTION COMPLETE". */ /* Here VIRTIO_SCSI_S_OK means "FUNCTION COMPLETE". */
req->resp.tmf.response = VIRTIO_SCSI_S_OK; req->resp.tmf.response = VIRTIO_SCSI_S_OK;
@ -450,7 +476,22 @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req)
req->req.tmf.tag, req->req.tmf.subtype); req->req.tmf.tag, req->req.tmf.subtype);
switch (req->req.tmf.subtype) { switch (req->req.tmf.subtype) {
case VIRTIO_SCSI_T_TMF_ABORT_TASK: case VIRTIO_SCSI_T_TMF_ABORT_TASK: {
if (!d) {
goto fail;
}
if (d->lun != virtio_scsi_get_lun(req->req.tmf.lun)) {
goto incorrect_lun;
}
ctx = find_aio_context_for_tmf_tag(d, req);
if (ctx) {
virtio_scsi_defer_tmf_to_aio_context(req, ctx);
ret = -EINPROGRESS;
}
break;
}
case VIRTIO_SCSI_T_TMF_QUERY_TASK: case VIRTIO_SCSI_T_TMF_QUERY_TASK:
if (!d) { if (!d) {
goto fail; goto fail;
@ -458,44 +499,82 @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req)
if (d->lun != virtio_scsi_get_lun(req->req.tmf.lun)) { if (d->lun != virtio_scsi_get_lun(req->req.tmf.lun)) {
goto incorrect_lun; goto incorrect_lun;
} }
QTAILQ_FOREACH_SAFE(r, &d->requests, next, next) {
WITH_QEMU_LOCK_GUARD(&d->requests_lock) {
QTAILQ_FOREACH(r, &d->requests, next) {
VirtIOSCSIReq *cmd_req = r->hba_private; VirtIOSCSIReq *cmd_req = r->hba_private;
if (cmd_req && cmd_req->req.cmd.tag == req->req.tmf.tag) { assert(cmd_req); /* request has hba_private while enqueued */
break;
} if (cmd_req->req.cmd.tag == req->req.tmf.tag) {
}
if (r) {
/* /*
* Assert that the request has not been completed yet, we * "If the specified command is present in the task set,
* check for it in the loop above. * then return a service response set to FUNCTION
*/ * SUCCEEDED".
assert(r->hba_private);
if (req->req.tmf.subtype == VIRTIO_SCSI_T_TMF_QUERY_TASK) {
/* "If the specified command is present in the task set, then
* return a service response set to FUNCTION SUCCEEDED".
*/ */
req->resp.tmf.response = VIRTIO_SCSI_S_FUNCTION_SUCCEEDED; req->resp.tmf.response = VIRTIO_SCSI_S_FUNCTION_SUCCEEDED;
} else { }
VirtIOSCSICancelNotifier *notifier;
req->remaining = 1;
notifier = g_new(VirtIOSCSICancelNotifier, 1);
notifier->tmf_req = req;
notifier->notifier.notify = virtio_scsi_cancel_notify;
scsi_req_cancel_async(r, &notifier->notifier);
ret = -EINPROGRESS;
} }
} }
break; break;
case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET: case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET:
case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET: if (!d) {
virtio_scsi_defer_tmf_to_bh(req); goto fail;
ret = -EINPROGRESS; }
if (d->lun != virtio_scsi_get_lun(req->req.tmf.lun)) {
goto incorrect_lun;
}
qatomic_inc(&s->resetting);
device_cold_reset(&d->qdev);
qatomic_dec(&s->resetting);
break; break;
case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET: {
BusChild *kid;
int target = req->req.tmf.lun[1];
qatomic_inc(&s->resetting);
rcu_read_lock();
QTAILQ_FOREACH_RCU(kid, &s->bus.qbus.children, sibling) {
SCSIDevice *d1 = SCSI_DEVICE(kid->child);
if (d1->channel == 0 && d1->id == target) {
device_cold_reset(&d1->qdev);
}
}
rcu_read_unlock();
qatomic_dec(&s->resetting);
break;
}
case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET: case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET:
case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET: case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET: {
g_autoptr(GHashTable) aio_contexts = g_hash_table_new(NULL, NULL);
if (!d) {
goto fail;
}
if (d->lun != virtio_scsi_get_lun(req->req.tmf.lun)) {
goto incorrect_lun;
}
qatomic_inc(&req->remaining);
for (uint32_t i = 0; i < s->parent_obj.conf.num_queues; i++) {
ctx = s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED + i];
if (!g_hash_table_add(aio_contexts, ctx)) {
continue; /* skip previously added AioContext */
}
virtio_scsi_defer_tmf_to_aio_context(req, ctx);
}
virtio_scsi_tmf_dec_remaining(req);
ret = -EINPROGRESS;
break;
}
case VIRTIO_SCSI_T_TMF_QUERY_TASK_SET: case VIRTIO_SCSI_T_TMF_QUERY_TASK_SET:
if (!d) { if (!d) {
goto fail; goto fail;
@ -504,34 +583,19 @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req)
goto incorrect_lun; goto incorrect_lun;
} }
/* Add 1 to "remaining" until virtio_scsi_do_tmf returns. WITH_QEMU_LOCK_GUARD(&d->requests_lock) {
* This way, if the bus starts calling back to the notifiers
* even before we finish the loop, virtio_scsi_cancel_notify
* will not complete the TMF too early.
*/
req->remaining = 1;
QTAILQ_FOREACH_SAFE(r, &d->requests, next, next) { QTAILQ_FOREACH_SAFE(r, &d->requests, next, next) {
if (r->hba_private) { /* Request has hba_private while enqueued */
if (req->req.tmf.subtype == VIRTIO_SCSI_T_TMF_QUERY_TASK_SET) { assert(r->hba_private);
/* "If there is any command present in the task set, then
/*
* "If there is any command present in the task set, then
* return a service response set to FUNCTION SUCCEEDED". * return a service response set to FUNCTION SUCCEEDED".
*/ */
req->resp.tmf.response = VIRTIO_SCSI_S_FUNCTION_SUCCEEDED; req->resp.tmf.response = VIRTIO_SCSI_S_FUNCTION_SUCCEEDED;
break; break;
} else {
VirtIOSCSICancelNotifier *notifier;
req->remaining++;
notifier = g_new(VirtIOSCSICancelNotifier, 1);
notifier->notifier.notify = virtio_scsi_cancel_notify;
notifier->tmf_req = req;
scsi_req_cancel_async(r, &notifier->notifier);
} }
} }
}
if (--req->remaining > 0) {
ret = -EINPROGRESS;
}
break; break;
case VIRTIO_SCSI_T_TMF_CLEAR_ACA: case VIRTIO_SCSI_T_TMF_CLEAR_ACA:
@ -562,7 +626,7 @@ static void virtio_scsi_handle_ctrl_req(VirtIOSCSI *s, VirtIOSCSIReq *req)
if (iov_to_buf(req->elem.out_sg, req->elem.out_num, 0, if (iov_to_buf(req->elem.out_sg, req->elem.out_num, 0,
&type, sizeof(type)) < sizeof(type)) { &type, sizeof(type)) < sizeof(type)) {
virtio_scsi_bad_req(req); virtio_scsi_bad_req(req, &s->ctrl_lock);
return; return;
} }
@ -570,7 +634,7 @@ static void virtio_scsi_handle_ctrl_req(VirtIOSCSI *s, VirtIOSCSIReq *req)
if (type == VIRTIO_SCSI_T_TMF) { if (type == VIRTIO_SCSI_T_TMF) {
if (virtio_scsi_parse_req(req, sizeof(VirtIOSCSICtrlTMFReq), if (virtio_scsi_parse_req(req, sizeof(VirtIOSCSICtrlTMFReq),
sizeof(VirtIOSCSICtrlTMFResp)) < 0) { sizeof(VirtIOSCSICtrlTMFResp)) < 0) {
virtio_scsi_bad_req(req); virtio_scsi_bad_req(req, &s->ctrl_lock);
return; return;
} else { } else {
r = virtio_scsi_do_tmf(s, req); r = virtio_scsi_do_tmf(s, req);
@ -580,7 +644,7 @@ static void virtio_scsi_handle_ctrl_req(VirtIOSCSI *s, VirtIOSCSIReq *req)
type == VIRTIO_SCSI_T_AN_SUBSCRIBE) { type == VIRTIO_SCSI_T_AN_SUBSCRIBE) {
if (virtio_scsi_parse_req(req, sizeof(VirtIOSCSICtrlANReq), if (virtio_scsi_parse_req(req, sizeof(VirtIOSCSICtrlANReq),
sizeof(VirtIOSCSICtrlANResp)) < 0) { sizeof(VirtIOSCSICtrlANResp)) < 0) {
virtio_scsi_bad_req(req); virtio_scsi_bad_req(req, &s->ctrl_lock);
return; return;
} else { } else {
req->req.an.event_requested = req->req.an.event_requested =
@ -600,7 +664,7 @@ static void virtio_scsi_handle_ctrl_req(VirtIOSCSI *s, VirtIOSCSIReq *req)
type == VIRTIO_SCSI_T_AN_SUBSCRIBE) type == VIRTIO_SCSI_T_AN_SUBSCRIBE)
trace_virtio_scsi_an_resp(virtio_scsi_get_lun(req->req.an.lun), trace_virtio_scsi_an_resp(virtio_scsi_get_lun(req->req.an.lun),
req->resp.an.response); req->resp.an.response);
virtio_scsi_complete_req(req); virtio_scsi_complete_req(req, &s->ctrl_lock);
} else { } else {
assert(r == -EINPROGRESS); assert(r == -EINPROGRESS);
} }
@ -610,7 +674,7 @@ static void virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq)
{ {
VirtIOSCSIReq *req; VirtIOSCSIReq *req;
while ((req = virtio_scsi_pop_req(s, vq))) { while ((req = virtio_scsi_pop_req(s, vq, &s->ctrl_lock))) {
virtio_scsi_handle_ctrl_req(s, req); virtio_scsi_handle_ctrl_req(s, req);
} }
} }
@ -625,9 +689,12 @@ static void virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq)
*/ */
static bool virtio_scsi_defer_to_dataplane(VirtIOSCSI *s) static bool virtio_scsi_defer_to_dataplane(VirtIOSCSI *s)
{ {
if (!s->ctx || s->dataplane_started) { if (s->dataplane_started) {
return false; return false;
} }
if (s->vq_aio_context[0] == qemu_get_aio_context()) {
return false; /* not using IOThreads */
}
virtio_device_start_ioeventfd(&s->parent_obj.parent_obj);
return !s->dataplane_fenced;
@@ -654,7 +721,7 @@ static void virtio_scsi_complete_cmd_req(VirtIOSCSIReq *req)
* in virtio_scsi_command_complete.
*/
req->resp_size = sizeof(VirtIOSCSICmdResp);
-virtio_scsi_complete_req(req);
+virtio_scsi_complete_req(req, NULL);
}
static void virtio_scsi_command_failed(SCSIRequest *r)
@@ -788,7 +855,7 @@ static int virtio_scsi_handle_cmd_req_prepare(VirtIOSCSI *s, VirtIOSCSIReq *req)
virtio_scsi_fail_cmd_req(req);
return -ENOTSUP;
} else {
-virtio_scsi_bad_req(req);
+virtio_scsi_bad_req(req, NULL);
return -EINVAL;
}
}
@@ -801,7 +868,6 @@ static int virtio_scsi_handle_cmd_req_prepare(VirtIOSCSI *s, VirtIOSCSIReq *req)
virtio_scsi_complete_cmd_req(req);
return -ENOENT;
}
virtio_scsi_ctx_check(s, d);
req->sreq = scsi_req_new(d, req->req.cmd.tag,
virtio_scsi_get_lun(req->req.cmd.lun),
req->req.cmd.cdb, vs->cdb_size, req);
@@ -843,7 +909,7 @@ static void virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq)
virtio_queue_set_notification(vq, 0);
}
-while ((req = virtio_scsi_pop_req(s, vq))) {
+while ((req = virtio_scsi_pop_req(s, vq, NULL))) {
ret = virtio_scsi_handle_cmd_req_prepare(s, req);
if (!ret) {
QTAILQ_INSERT_TAIL(&reqs, req, next);
@@ -936,7 +1002,7 @@ static void virtio_scsi_reset(VirtIODevice *vdev)
assert(!s->dataplane_started);
-virtio_scsi_reset_tmf_bh(s);
+virtio_scsi_flush_defer_tmf_to_aio_context(s);
qatomic_inc(&s->resetting);
bus_cold_reset(BUS(&s->bus));
@@ -944,8 +1010,11 @@ static void virtio_scsi_reset(VirtIODevice *vdev)
vs->sense_size = VIRTIO_SCSI_SENSE_DEFAULT_SIZE;
vs->cdb_size = VIRTIO_SCSI_CDB_DEFAULT_SIZE;
WITH_QEMU_LOCK_GUARD(&s->event_lock) {
s->events_dropped = false;
}
}
typedef struct {
uint32_t event;
@@ -973,7 +1042,8 @@ static void virtio_scsi_push_event(VirtIOSCSI *s,
return;
}
-req = virtio_scsi_pop_req(s, vs->event_vq);
+req = virtio_scsi_pop_req(s, vs->event_vq, &s->event_lock);
WITH_QEMU_LOCK_GUARD(&s->event_lock) {
if (!req) {
s->events_dropped = true;
return;
@@ -983,9 +1053,10 @@ static void virtio_scsi_push_event(VirtIOSCSI *s,
event |= VIRTIO_SCSI_T_EVENTS_MISSED;
s->events_dropped = false;
}
}
if (virtio_scsi_parse_req(req, 0, sizeof(VirtIOSCSIEvent))) {
-virtio_scsi_bad_req(req);
+virtio_scsi_bad_req(req, &s->event_lock);
return;
}
@@ -1005,12 +1076,18 @@ static void virtio_scsi_push_event(VirtIOSCSI *s,
}
trace_virtio_scsi_event(virtio_scsi_get_lun(evt->lun), event, reason);
-virtio_scsi_complete_req(req);
+virtio_scsi_complete_req(req, &s->event_lock);
}
static void virtio_scsi_handle_event_vq(VirtIOSCSI *s, VirtQueue *vq)
{
-if (s->events_dropped) {
+bool events_dropped;
WITH_QEMU_LOCK_GUARD(&s->event_lock) {
events_dropped = s->events_dropped;
}
if (events_dropped) {
VirtIOSCSIEventInfo info = {
.event = VIRTIO_SCSI_T_NO_EVENT,
};
@@ -1061,14 +1138,16 @@ static void virtio_scsi_hotplug(HotplugHandler *hotplug_dev, DeviceState *dev,
{
VirtIODevice *vdev = VIRTIO_DEVICE(hotplug_dev);
VirtIOSCSI *s = VIRTIO_SCSI(vdev);
AioContext *ctx = s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED];
SCSIDevice *sd = SCSI_DEVICE(dev);
int ret;
-if (s->ctx && !s->dataplane_fenced) {
-ret = blk_set_aio_context(sd->conf.blk, s->ctx, errp);
-if (ret < 0) {
-return;
-}
+if (ctx != qemu_get_aio_context() && !s->dataplane_fenced) {
+/*
+ * Try to make the BlockBackend's AioContext match ours. Ignore failure
+ * because I/O will still work although block jobs and other users
+ * might be slower when multiple AioContexts use a BlockBackend.
+ */
+blk_set_aio_context(sd->conf.blk, ctx, NULL);
}
if (virtio_vdev_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) {
@@ -1103,7 +1182,7 @@ static void virtio_scsi_hotunplug(HotplugHandler *hotplug_dev, DeviceState *dev,
qdev_simple_device_unplug_cb(hotplug_dev, dev, errp);
-if (s->ctx) {
+if (s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED] != qemu_get_aio_context()) {
/* If other users keep the BlockBackend in the iothread, that's ok */
blk_set_aio_context(sd->conf.blk, qemu_get_aio_context(), NULL);
}
@@ -1137,7 +1216,7 @@ static void virtio_scsi_drained_begin(SCSIBus *bus)
for (uint32_t i = 0; i < total_queues; i++) {
VirtQueue *vq = virtio_get_queue(vdev, i);
-virtio_queue_aio_detach_host_notifier(vq, s->ctx);
+virtio_queue_aio_detach_host_notifier(vq, s->vq_aio_context[i]);
}
}
@@ -1163,10 +1242,12 @@ static void virtio_scsi_drained_end(SCSIBus *bus)
for (uint32_t i = 0; i < total_queues; i++) {
VirtQueue *vq = virtio_get_queue(vdev, i);
AioContext *ctx = s->vq_aio_context[i];
if (vq == vs->event_vq) {
-virtio_queue_aio_attach_host_notifier_no_poll(vq, s->ctx);
+virtio_queue_aio_attach_host_notifier_no_poll(vq, ctx);
} else {
-virtio_queue_aio_attach_host_notifier(vq, s->ctx);
+virtio_queue_aio_attach_host_notifier(vq, ctx);
}
}
}
@@ -1235,8 +1316,8 @@ static void virtio_scsi_device_realize(DeviceState *dev, Error **errp)
VirtIOSCSI *s = VIRTIO_SCSI(dev);
Error *err = NULL;
-QTAILQ_INIT(&s->tmf_bh_list);
-qemu_mutex_init(&s->tmf_bh_lock);
+qemu_mutex_init(&s->ctrl_lock);
+qemu_mutex_init(&s->event_lock);
virtio_scsi_common_realize(dev,
virtio_scsi_handle_ctrl,
@@ -1271,15 +1352,16 @@ void virtio_scsi_common_unrealize(DeviceState *dev)
virtio_cleanup(vdev);
}
/* main loop */
static void virtio_scsi_device_unrealize(DeviceState *dev)
{
VirtIOSCSI *s = VIRTIO_SCSI(dev);
-virtio_scsi_reset_tmf_bh(s);
+virtio_scsi_dataplane_cleanup(s);
qbus_set_hotplug_handler(BUS(&s->bus), NULL);
virtio_scsi_common_unrealize(dev);
-qemu_mutex_destroy(&s->tmf_bh_lock);
+qemu_mutex_destroy(&s->event_lock);
+qemu_mutex_destroy(&s->ctrl_lock);
}
static const Property virtio_scsi_properties[] = {
@@ -1299,6 +1381,8 @@ static const Property virtio_scsi_properties[] = {
VIRTIO_SCSI_F_CHANGE, true),
DEFINE_PROP_LINK("iothread", VirtIOSCSI, parent_obj.conf.iothread,
TYPE_IOTHREAD, IOThread *),
DEFINE_PROP_IOTHREAD_VQ_MAPPING_LIST("iothread-vq-mapping", VirtIOSCSI,
parent_obj.conf.iothread_vq_mapping_list),
};
static const VMStateDescription vmstate_virtio_scsi = {


@@ -0,0 +1,131 @@
/*
* IOThread Virtqueue Mapping
*
* Copyright Red Hat, Inc
*
* SPDX-License-Identifier: GPL-2.0-only
*/
#include "qemu/osdep.h"
#include "system/iothread.h"
#include "hw/virtio/iothread-vq-mapping.h"
static bool
iothread_vq_mapping_validate(IOThreadVirtQueueMappingList *list, uint16_t
num_queues, Error **errp)
{
g_autofree unsigned long *vqs = bitmap_new(num_queues);
g_autoptr(GHashTable) iothreads =
g_hash_table_new(g_str_hash, g_str_equal);
for (IOThreadVirtQueueMappingList *node = list; node; node = node->next) {
const char *name = node->value->iothread;
uint16List *vq;
if (!iothread_by_id(name)) {
error_setg(errp, "IOThread \"%s\" object does not exist", name);
return false;
}
if (!g_hash_table_add(iothreads, (gpointer)name)) {
error_setg(errp,
"duplicate IOThread name \"%s\" in iothread-vq-mapping",
name);
return false;
}
if (node != list) {
if (!!node->value->vqs != !!list->value->vqs) {
error_setg(errp, "either all items in iothread-vq-mapping "
"must have vqs or none of them must have it");
return false;
}
}
for (vq = node->value->vqs; vq; vq = vq->next) {
if (vq->value >= num_queues) {
error_setg(errp, "vq index %u for IOThread \"%s\" must be "
"less than num_queues %u in iothread-vq-mapping",
vq->value, name, num_queues);
return false;
}
if (test_and_set_bit(vq->value, vqs)) {
error_setg(errp, "cannot assign vq %u to IOThread \"%s\" "
"because it is already assigned", vq->value, name);
return false;
}
}
}
if (list->value->vqs) {
for (uint16_t i = 0; i < num_queues; i++) {
if (!test_bit(i, vqs)) {
error_setg(errp,
"missing vq %u IOThread assignment in iothread-vq-mapping",
i);
return false;
}
}
}
return true;
}
bool iothread_vq_mapping_apply(
IOThreadVirtQueueMappingList *list,
AioContext **vq_aio_context,
uint16_t num_queues,
Error **errp)
{
IOThreadVirtQueueMappingList *node;
size_t num_iothreads = 0;
size_t cur_iothread = 0;
if (!iothread_vq_mapping_validate(list, num_queues, errp)) {
return false;
}
for (node = list; node; node = node->next) {
num_iothreads++;
}
for (node = list; node; node = node->next) {
IOThread *iothread = iothread_by_id(node->value->iothread);
AioContext *ctx = iothread_get_aio_context(iothread);
/* Released in virtio_blk_vq_aio_context_cleanup() */
object_ref(OBJECT(iothread));
if (node->value->vqs) {
uint16List *vq;
/* Explicit vq:IOThread assignment */
for (vq = node->value->vqs; vq; vq = vq->next) {
assert(vq->value < num_queues);
vq_aio_context[vq->value] = ctx;
}
} else {
/* Round-robin vq:IOThread assignment */
for (unsigned i = cur_iothread; i < num_queues;
i += num_iothreads) {
vq_aio_context[i] = ctx;
}
}
cur_iothread++;
}
return true;
}
void iothread_vq_mapping_cleanup(IOThreadVirtQueueMappingList *list)
{
IOThreadVirtQueueMappingList *node;
for (node = list; node; node = node->next) {
IOThread *iothread = iothread_by_id(node->value->iothread);
object_unref(OBJECT(iothread));
}
}
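
For intuition, here is a tiny standalone sketch (not part of the patch) of what the round-robin branch in iothread_vq_mapping_apply() above produces; the IOThread names and queue count are made up for the example:

#include <stdio.h>

int main(void)
{
    /* Assumed example: two IOThreads, four virtqueues, no explicit "vqs" lists */
    const char *iothreads[] = { "iothread0", "iothread1" };
    const unsigned num_iothreads = 2, num_queues = 4;
    const char *vq_aio_context[4];

    /* Same stride pattern as the round-robin loop in iothread_vq_mapping_apply() */
    for (unsigned cur_iothread = 0; cur_iothread < num_iothreads; cur_iothread++) {
        for (unsigned i = cur_iothread; i < num_queues; i += num_iothreads) {
            vq_aio_context[i] = iothreads[cur_iothread];
        }
    }

    for (unsigned i = 0; i < num_queues; i++) {
        printf("vq %u -> %s\n", i, vq_aio_context[i]);
    }
    /* Output: vq 0 and vq 2 map to iothread0, vq 1 and vq 3 map to iothread1 */
    return 0;
}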


@@ -1,5 +1,6 @@
system_virtio_ss = ss.source_set()
system_virtio_ss.add(files('virtio-bus.c'))
+system_virtio_ss.add(files('iothread-vq-mapping.c'))
system_virtio_ss.add(when: 'CONFIG_VIRTIO_PCI', if_true: files('virtio-pci.c'))
system_virtio_ss.add(when: 'CONFIG_VIRTIO_MMIO', if_true: files('virtio-mmio.c'))
system_virtio_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('virtio-crypto.c'))


@@ -123,6 +123,10 @@ struct BHListSlice {
typedef QSLIST_HEAD(, AioHandler) AioHandlerSList;
+typedef struct AioPolledEvent {
+int64_t ns; /* current polling time in nanoseconds */
+} AioPolledEvent;
struct AioContext {
GSource source;
@@ -229,7 +233,6 @@ struct AioContext {
int poll_disable_cnt;
/* Polling mode parameters */
-int64_t poll_ns; /* current polling time in nanoseconds */
int64_t poll_max_ns; /* maximum polling time in nanoseconds */
int64_t poll_grow; /* polling time growth factor */
int64_t poll_shrink; /* polling time shrink factor */


@@ -17,6 +17,7 @@
#define QEMU_RAW_AIO_H
#include "block/aio.h"
+#include "block/block-common.h"
#include "qemu/iov.h"
/* AIO request types */
@@ -58,11 +59,18 @@ void laio_cleanup(LinuxAioState *s);
/* laio_co_submit: submit I/O requests in the thread's current AioContext. */
int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,
-int type, uint64_t dev_max_batch);
+int type, BdrvRequestFlags flags,
+uint64_t dev_max_batch);
bool laio_has_fdsync(int);
+bool laio_has_fua(void);
void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context);
void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context);
+#else
+static inline bool laio_has_fua(void)
+{
+return false;
+}
#endif
/* io_uring.c - Linux io_uring implementation */
#ifdef CONFIG_LINUX_IO_URING
@@ -71,9 +79,16 @@ void luring_cleanup(LuringState *s);
/* luring_co_submit: submit I/O requests in the thread's current AioContext. */
int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset,
-QEMUIOVector *qiov, int type);
+QEMUIOVector *qiov, int type,
+BdrvRequestFlags flags);
void luring_detach_aio_context(LuringState *s, AioContext *old_context);
void luring_attach_aio_context(LuringState *s, AioContext *new_context);
+bool luring_has_fua(void);
+#else
+static inline bool luring_has_fua(void)
+{
+return false;
+}
#endif
#ifdef _WIN32 #ifdef _WIN32


@@ -24,6 +24,7 @@ struct SCSIRequest {
SCSIBus *bus;
SCSIDevice *dev;
const SCSIReqOps *ops;
+AioContext *ctx;
uint32_t refcount;
uint32_t tag;
uint32_t lun;
@@ -48,6 +49,8 @@ struct SCSIRequest {
bool dma_started;
BlockAIOCB *aiocb;
QEMUSGList *sg;
+/* Protected by SCSIDevice->requests_lock */
QTAILQ_ENTRY(SCSIRequest) next;
};
@@ -76,10 +79,7 @@ struct SCSIDevice
uint8_t sense[SCSI_SENSE_BUF_SIZE];
uint32_t sense_len;
-/*
-* The requests list is only accessed from the AioContext that executes
-* requests or from the main loop when IOThread processing is stopped.
-*/
+QemuMutex requests_lock; /* protects the requests list */
QTAILQ_HEAD(, SCSIRequest) requests;
uint32_t channel;


@@ -0,0 +1,45 @@
/*
* IOThread Virtqueue Mapping
*
* Copyright Red Hat, Inc
*
* SPDX-License-Identifier: GPL-2.0-only
*/
#ifndef HW_VIRTIO_IOTHREAD_VQ_MAPPING_H
#define HW_VIRTIO_IOTHREAD_VQ_MAPPING_H
#include "qapi/error.h"
#include "qapi/qapi-types-virtio.h"
/**
* iothread_vq_mapping_apply:
* @list: The mapping of virtqueues to IOThreads.
* @vq_aio_context: The array of AioContext pointers to fill in.
* @num_queues: The length of @vq_aio_context.
* @errp: If an error occurs, a pointer to the area to store the error.
*
* Fill in the AioContext for each virtqueue in the @vq_aio_context array given
* the iothread-vq-mapping parameter in @list.
*
* iothread_vq_mapping_cleanup() must be called to free IOThread object
* references after this function returns success.
*
* Returns: %true on success, %false on failure.
**/
bool iothread_vq_mapping_apply(
IOThreadVirtQueueMappingList *list,
AioContext **vq_aio_context,
uint16_t num_queues,
Error **errp);
/**
* iothread_vq_mapping_cleanup:
* @list: The mapping of virtqueues to IOThreads.
*
* Release IOThread object references that were acquired by
* iothread_vq_mapping_apply().
*/
void iothread_vq_mapping_cleanup(IOThreadVirtQueueMappingList *list);
#endif /* HW_VIRTIO_IOTHREAD_VQ_MAPPING_H */
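
As a rough illustration of how this API is meant to be consumed (this sketch is not taken from the patches): a device model applies the mapping at realize time and releases it at unrealize time. The MyVirtioDev type, its field names and num_queues are hypothetical; only iothread_vq_mapping_apply() and iothread_vq_mapping_cleanup() come from the header above, and the sketch assumes the iothread-vq-mapping property was actually set (real devices fall back to a default mapping when the list is NULL):

#include "qemu/osdep.h"
#include "hw/virtio/iothread-vq-mapping.h"

/* Hypothetical device state; only the two fields used below are assumed */
typedef struct MyVirtioDev {
    IOThreadVirtQueueMappingList *iothread_vq_mapping_list; /* from a property */
    AioContext **vq_aio_context;                            /* one entry per virtqueue */
} MyVirtioDev;

static bool my_virtio_dev_apply_vq_mapping(MyVirtioDev *dev, uint16_t num_queues,
                                           Error **errp)
{
    dev->vq_aio_context = g_new0(AioContext *, num_queues);

    /* Fills vq_aio_context[0..num_queues-1] and takes IOThread references */
    if (!iothread_vq_mapping_apply(dev->iothread_vq_mapping_list,
                                   dev->vq_aio_context, num_queues, errp)) {
        g_free(dev->vq_aio_context);
        dev->vq_aio_context = NULL;
        return false;
    }
    return true;
}

static void my_virtio_dev_cleanup_vq_mapping(MyVirtioDev *dev)
{
    /* Drops the IOThread references acquired by iothread_vq_mapping_apply() */
    iothread_vq_mapping_cleanup(dev->iothread_vq_mapping_list);
    g_free(dev->vq_aio_context);
    dev->vq_aio_context = NULL;
}

virtio-blk and the new virtio-scsi code follow roughly this pattern, with an additional fallback to a single IOThread or the main loop AioContext when no mapping list is configured.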


@@ -22,6 +22,7 @@
#include "hw/virtio/virtio.h"
#include "hw/scsi/scsi.h"
#include "chardev/char-fe.h"
+#include "qapi/qapi-types-virtio.h"
#include "system/iothread.h"
#define TYPE_VIRTIO_SCSI_COMMON "virtio-scsi-common"
@@ -60,6 +61,7 @@ struct VirtIOSCSIConf {
CharBackend chardev;
uint32_t boot_tpgt;
IOThread *iothread;
+IOThreadVirtQueueMappingList *iothread_vq_mapping_list;
};
struct VirtIOSCSI;
@@ -82,18 +84,14 @@ struct VirtIOSCSI {
SCSIBus bus;
int resetting; /* written from main loop thread, read from any thread */
+QemuMutex event_lock; /* protects event_vq and events_dropped */
bool events_dropped;
-/*
-* TMFs deferred to main loop BH. These fields are protected by
-* tmf_bh_lock.
-*/
-QemuMutex tmf_bh_lock;
-QEMUBH *tmf_bh;
-QTAILQ_HEAD(, VirtIOSCSIReq) tmf_bh_list;
+QemuMutex ctrl_lock; /* protects ctrl_vq */
/* Fields for dataplane below */
-AioContext *ctx; /* one iothread per virtio-scsi-pci for now */
+AioContext **vq_aio_context; /* per-virtqueue AioContext pointer */
bool dataplane_started;
bool dataplane_starting;
@@ -111,6 +109,7 @@ void virtio_scsi_common_realize(DeviceState *dev,
void virtio_scsi_common_unrealize(DeviceState *dev);
void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error **errp);
+void virtio_scsi_dataplane_cleanup(VirtIOSCSI *s);
int virtio_scsi_dataplane_start(VirtIODevice *s);
void virtio_scsi_dataplane_stop(VirtIODevice *s);


@@ -86,7 +86,6 @@ bool blk_supports_write_perm(BlockBackend *blk);
bool blk_is_sg(BlockBackend *blk);
void blk_set_enable_write_cache(BlockBackend *blk, bool wce);
int blk_get_flags(BlockBackend *blk);
-bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp);
int blk_set_aio_context(BlockBackend *blk, AioContext *new_context,
Error **errp);
void blk_add_aio_context_notifier(BlockBackend *blk,


@@ -290,8 +290,7 @@ typedef BlockAIOCB *DMAIOFunc(int64_t offset, QEMUIOVector *iov,
BlockCompletionFunc *cb, void *cb_opaque,
void *opaque);
-BlockAIOCB *dma_blk_io(AioContext *ctx,
-QEMUSGList *sg, uint64_t offset, uint32_t align,
+BlockAIOCB *dma_blk_io(QEMUSGList *sg, uint64_t offset, uint32_t align,
DMAIOFunc *io_func, void *io_func_opaque,
BlockCompletionFunc *cb, void *opaque, DMADirection dir);
BlockAIOCB *dma_blk_read(BlockBackend *blk,


@@ -2727,6 +2727,14 @@ config_host_data.set('HAVE_OPTRESET',
cc.has_header_symbol('getopt.h', 'optreset'))
config_host_data.set('HAVE_IPPROTO_MPTCP',
cc.has_header_symbol('netinet/in.h', 'IPPROTO_MPTCP'))
if libaio.found()
config_host_data.set('HAVE_IO_PREP_PWRITEV2',
cc.has_header_symbol('libaio.h', 'io_prep_pwritev2'))
endif
if linux_io_uring.found()
config_host_data.set('HAVE_IO_URING_PREP_WRITEV2',
cc.has_header_symbol('liburing.h', 'io_uring_prep_writev2'))
endif
# has_member
config_host_data.set('HAVE_SIGEV_NOTIFY_THREAD_ID',

scripts/qcow2-to-stdout.py (new executable file)

@@ -0,0 +1,449 @@
#!/usr/bin/env python3
# This tool reads a disk image in any format and converts it to qcow2,
# writing the result directly to stdout.
#
# Copyright (C) 2024 Igalia, S.L.
#
# Authors: Alberto Garcia <berto@igalia.com>
# Madeeha Javed <javed@igalia.com>
#
# SPDX-License-Identifier: GPL-2.0-or-later
#
# qcow2 files produced by this script are always arranged like this:
#
# - qcow2 header
# - refcount table
# - refcount blocks
# - L1 table
# - L2 tables
# - Data clusters
#
# A note about variable names: in qcow2 there is one refcount table
# and one (active) L1 table, although each can occupy several
# clusters. For the sake of simplicity the code sometimes talks about
# refcount tables and L1 tables when referring to those clusters.
import argparse
import errno
import math
import os
import signal
import struct
import subprocess
import sys
import tempfile
import time
from contextlib import contextmanager
QCOW2_DEFAULT_CLUSTER_SIZE = 65536
QCOW2_DEFAULT_REFCOUNT_BITS = 16
QCOW2_FEATURE_NAME_TABLE = 0x6803F857
QCOW2_DATA_FILE_NAME_STRING = 0x44415441
QCOW2_V3_HEADER_LENGTH = 112 # Header length in QEMU 9.0. Must be a multiple of 8
QCOW2_INCOMPAT_DATA_FILE_BIT = 2
QCOW2_AUTOCLEAR_DATA_FILE_RAW_BIT = 1
QCOW_OFLAG_COPIED = 1 << 63
QEMU_STORAGE_DAEMON = "qemu-storage-daemon"
def bitmap_set(bitmap, idx):
bitmap[idx // 8] |= 1 << (idx % 8)
def bitmap_is_set(bitmap, idx):
return (bitmap[idx // 8] & (1 << (idx % 8))) != 0
def bitmap_iterator(bitmap, length):
for idx in range(length):
if bitmap_is_set(bitmap, idx):
yield idx
def align_up(num, d):
return d * math.ceil(num / d)
# Holes in the input file contain only zeroes so we can skip them and
# save time. This function returns the indexes of the clusters that
# are known to contain data. Those are the ones that we need to read.
def clusters_with_data(fd, cluster_size):
data_to = 0
while True:
try:
data_from = os.lseek(fd, data_to, os.SEEK_DATA)
data_to = align_up(os.lseek(fd, data_from, os.SEEK_HOLE), cluster_size)
for idx in range(data_from // cluster_size, data_to // cluster_size):
yield idx
except OSError as err:
if err.errno == errno.ENXIO: # End of file reached
break
raise err
# write_qcow2_content() expects a raw input file. If we have a different
# format we can use qemu-storage-daemon to make it appear as raw.
@contextmanager
def get_input_as_raw_file(input_file, input_format):
if input_format == "raw":
yield input_file
return
try:
temp_dir = tempfile.mkdtemp()
pid_file = os.path.join(temp_dir, "pid")
raw_file = os.path.join(temp_dir, "raw")
open(raw_file, "wb").close()
ret = subprocess.run(
[
QEMU_STORAGE_DAEMON,
"--daemonize",
"--pidfile", pid_file,
"--blockdev", f"driver=file,node-name=file0,driver=file,filename={input_file},read-only=on",
"--blockdev", f"driver={input_format},node-name=disk0,file=file0,read-only=on",
"--export", f"type=fuse,id=export0,node-name=disk0,mountpoint={raw_file},writable=off",
],
capture_output=True,
)
if ret.returncode != 0:
sys.exit("[Error] Could not start the qemu-storage-daemon:\n" +
ret.stderr.decode().rstrip('\n'))
yield raw_file
finally:
# Kill the storage daemon on exit
# and remove all temporary files
if os.path.exists(pid_file):
with open(pid_file, "r") as f:
pid = int(f.readline())
os.kill(pid, signal.SIGTERM)
while os.path.exists(pid_file):
time.sleep(0.1)
os.unlink(raw_file)
os.rmdir(temp_dir)
def write_features(cluster, offset, data_file_name):
if data_file_name is not None:
encoded_name = data_file_name.encode("utf-8")
padded_name_len = align_up(len(encoded_name), 8)
struct.pack_into(f">II{padded_name_len}s", cluster, offset,
QCOW2_DATA_FILE_NAME_STRING,
len(encoded_name),
encoded_name)
offset += 8 + padded_name_len
qcow2_features = [
# Incompatible
(0, 0, "dirty bit"),
(0, 1, "corrupt bit"),
(0, 2, "external data file"),
(0, 3, "compression type"),
(0, 4, "extended L2 entries"),
# Compatible
(1, 0, "lazy refcounts"),
# Autoclear
(2, 0, "bitmaps"),
(2, 1, "raw external data"),
]
struct.pack_into(">I", cluster, offset, QCOW2_FEATURE_NAME_TABLE)
struct.pack_into(">I", cluster, offset + 4, len(qcow2_features) * 48)
offset += 8
for feature_type, feature_bit, feature_name in qcow2_features:
struct.pack_into(">BB46s", cluster, offset,
feature_type, feature_bit, feature_name.encode("ascii"))
offset += 48
def write_qcow2_content(input_file, cluster_size, refcount_bits, data_file_name, data_file_raw):
# Some basic values
l1_entries_per_table = cluster_size // 8
l2_entries_per_table = cluster_size // 8
refcounts_per_table = cluster_size // 8
refcounts_per_block = cluster_size * 8 // refcount_bits
# Virtual disk size, number of data clusters and L1 entries
disk_size = align_up(os.path.getsize(input_file), 512)
total_data_clusters = math.ceil(disk_size / cluster_size)
l1_entries = math.ceil(total_data_clusters / l2_entries_per_table)
allocated_l1_tables = math.ceil(l1_entries / l1_entries_per_table)
# Max L1 table size is 32 MB (QCOW_MAX_L1_SIZE in block/qcow2.h)
if (l1_entries * 8) > (32 * 1024 * 1024):
sys.exit("[Error] The image size is too large. Try using a larger cluster size.")
# Two bitmaps indicating which L1 and L2 entries are set
l1_bitmap = bytearray(allocated_l1_tables * l1_entries_per_table // 8)
l2_bitmap = bytearray(l1_entries * l2_entries_per_table // 8)
allocated_l2_tables = 0
allocated_data_clusters = 0
if data_file_raw:
# If data_file_raw is set then all clusters are allocated and
# we don't need to read the input file at all.
allocated_l2_tables = l1_entries
for idx in range(l1_entries):
bitmap_set(l1_bitmap, idx)
for idx in range(total_data_clusters):
bitmap_set(l2_bitmap, idx)
else:
# Open the input file for reading
fd = os.open(input_file, os.O_RDONLY)
zero_cluster = bytes(cluster_size)
# Read all the clusters that contain data
for idx in clusters_with_data(fd, cluster_size):
cluster = os.pread(fd, cluster_size, cluster_size * idx)
# If the last cluster is smaller than cluster_size pad it with zeroes
if len(cluster) < cluster_size:
cluster += bytes(cluster_size - len(cluster))
# If a cluster has non-zero data then it must be allocated
# in the output file and its L2 entry must be set
if cluster != zero_cluster:
bitmap_set(l2_bitmap, idx)
allocated_data_clusters += 1
# Allocated data clusters also need their corresponding L1 entry and L2 table
l1_idx = math.floor(idx / l2_entries_per_table)
if not bitmap_is_set(l1_bitmap, l1_idx):
bitmap_set(l1_bitmap, l1_idx)
allocated_l2_tables += 1
# Total amount of allocated clusters excluding the refcount blocks and table
total_allocated_clusters = 1 + allocated_l1_tables + allocated_l2_tables
if data_file_name is None:
total_allocated_clusters += allocated_data_clusters
# Clusters allocated for the refcount blocks and table
allocated_refcount_blocks = math.ceil(total_allocated_clusters / refcounts_per_block)
allocated_refcount_tables = math.ceil(allocated_refcount_blocks / refcounts_per_table)
# Now we have a problem because allocated_refcount_blocks and allocated_refcount_tables...
# (a) increase total_allocated_clusters, and
# (b) need to be recalculated when total_allocated_clusters is increased
# So we need to repeat the calculation as long as the numbers change
while True:
new_total_allocated_clusters = total_allocated_clusters + allocated_refcount_tables + allocated_refcount_blocks
new_allocated_refcount_blocks = math.ceil(new_total_allocated_clusters / refcounts_per_block)
if new_allocated_refcount_blocks > allocated_refcount_blocks:
allocated_refcount_blocks = new_allocated_refcount_blocks
allocated_refcount_tables = math.ceil(allocated_refcount_blocks / refcounts_per_table)
else:
break
# Now that we have the final numbers we can update total_allocated_clusters
total_allocated_clusters += allocated_refcount_tables + allocated_refcount_blocks
# At this point we have the exact number of clusters that the output
# image is going to use so we can calculate all the offsets.
current_cluster_idx = 1
refcount_table_offset = current_cluster_idx * cluster_size
current_cluster_idx += allocated_refcount_tables
refcount_block_offset = current_cluster_idx * cluster_size
current_cluster_idx += allocated_refcount_blocks
l1_table_offset = current_cluster_idx * cluster_size
current_cluster_idx += allocated_l1_tables
l2_table_offset = current_cluster_idx * cluster_size
current_cluster_idx += allocated_l2_tables
data_clusters_offset = current_cluster_idx * cluster_size
# Calculate some values used in the qcow2 header
if allocated_l1_tables == 0:
l1_table_offset = 0
hdr_cluster_bits = int(math.log2(cluster_size))
hdr_refcount_bits = int(math.log2(refcount_bits))
hdr_length = QCOW2_V3_HEADER_LENGTH
hdr_incompat_features = 0
if data_file_name is not None:
hdr_incompat_features |= 1 << QCOW2_INCOMPAT_DATA_FILE_BIT
hdr_autoclear_features = 0
if data_file_raw:
hdr_autoclear_features |= 1 << QCOW2_AUTOCLEAR_DATA_FILE_RAW_BIT
### Write qcow2 header
cluster = bytearray(cluster_size)
struct.pack_into(">4sIQIIQIIQQIIQQQQII", cluster, 0,
b"QFI\xfb", # QCOW magic string
3, # version
0, # backing file offset
0, # backing file sizes
hdr_cluster_bits,
disk_size,
0, # encryption method
l1_entries,
l1_table_offset,
refcount_table_offset,
allocated_refcount_tables,
0, # number of snapshots
0, # snapshot table offset
hdr_incompat_features,
0, # compatible features
hdr_autoclear_features,
hdr_refcount_bits,
hdr_length,
)
write_features(cluster, hdr_length, data_file_name)
sys.stdout.buffer.write(cluster)
### Write refcount table
cur_offset = refcount_block_offset
remaining_refcount_table_entries = allocated_refcount_blocks # Each entry is a pointer to a refcount block
while remaining_refcount_table_entries > 0:
cluster = bytearray(cluster_size)
to_write = min(remaining_refcount_table_entries, refcounts_per_table)
remaining_refcount_table_entries -= to_write
for idx in range(to_write):
struct.pack_into(">Q", cluster, idx * 8, cur_offset)
cur_offset += cluster_size
sys.stdout.buffer.write(cluster)
### Write refcount blocks
remaining_refcount_block_entries = total_allocated_clusters # One entry for each allocated cluster
for tbl in range(allocated_refcount_blocks):
cluster = bytearray(cluster_size)
to_write = min(remaining_refcount_block_entries, refcounts_per_block)
remaining_refcount_block_entries -= to_write
# All refcount entries contain the number 1. The only difference
# is their bit width, defined when the image is created.
for idx in range(to_write):
if refcount_bits == 64:
struct.pack_into(">Q", cluster, idx * 8, 1)
elif refcount_bits == 32:
struct.pack_into(">L", cluster, idx * 4, 1)
elif refcount_bits == 16:
struct.pack_into(">H", cluster, idx * 2, 1)
elif refcount_bits == 8:
cluster[idx] = 1
elif refcount_bits == 4:
cluster[idx // 2] |= 1 << ((idx % 2) * 4)
elif refcount_bits == 2:
cluster[idx // 4] |= 1 << ((idx % 4) * 2)
elif refcount_bits == 1:
cluster[idx // 8] |= 1 << (idx % 8)
sys.stdout.buffer.write(cluster)
### Write L1 table
cur_offset = l2_table_offset
for tbl in range(allocated_l1_tables):
cluster = bytearray(cluster_size)
for idx in range(l1_entries_per_table):
l1_idx = tbl * l1_entries_per_table + idx
if bitmap_is_set(l1_bitmap, l1_idx):
struct.pack_into(">Q", cluster, idx * 8, cur_offset | QCOW_OFLAG_COPIED)
cur_offset += cluster_size
sys.stdout.buffer.write(cluster)
### Write L2 tables
cur_offset = data_clusters_offset
for tbl in range(l1_entries):
# Skip the empty L2 tables. We can identify them because
# there is no L1 entry pointing at them.
if bitmap_is_set(l1_bitmap, tbl):
cluster = bytearray(cluster_size)
for idx in range(l2_entries_per_table):
l2_idx = tbl * l2_entries_per_table + idx
if bitmap_is_set(l2_bitmap, l2_idx):
if data_file_name is None:
struct.pack_into(">Q", cluster, idx * 8, cur_offset | QCOW_OFLAG_COPIED)
cur_offset += cluster_size
else:
struct.pack_into(">Q", cluster, idx * 8, (l2_idx * cluster_size) | QCOW_OFLAG_COPIED)
sys.stdout.buffer.write(cluster)
### Write data clusters
if data_file_name is None:
for idx in bitmap_iterator(l2_bitmap, total_data_clusters):
cluster = os.pread(fd, cluster_size, cluster_size * idx)
# If the last cluster is smaller than cluster_size pad it with zeroes
if len(cluster) < cluster_size:
cluster += bytes(cluster_size - len(cluster))
sys.stdout.buffer.write(cluster)
if not data_file_raw:
os.close(fd)
def main():
# Command-line arguments
parser = argparse.ArgumentParser(
description="This program converts a QEMU disk image to qcow2 "
"and writes it to the standard output"
)
parser.add_argument("input_file", help="name of the input file")
parser.add_argument(
"-f",
dest="input_format",
metavar="input_format",
help="format of the input file (default: raw)",
default="raw",
)
parser.add_argument(
"-c",
dest="cluster_size",
metavar="cluster_size",
help=f"qcow2 cluster size (default: {QCOW2_DEFAULT_CLUSTER_SIZE})",
default=QCOW2_DEFAULT_CLUSTER_SIZE,
type=int,
choices=[1 << x for x in range(9, 22)],
)
parser.add_argument(
"-r",
dest="refcount_bits",
metavar="refcount_bits",
help=f"width of the reference count entries (default: {QCOW2_DEFAULT_REFCOUNT_BITS})",
default=QCOW2_DEFAULT_REFCOUNT_BITS,
type=int,
choices=[1 << x for x in range(7)],
)
parser.add_argument(
"-d",
dest="data_file",
help="create an image with input_file as an external data file",
action="store_true",
)
parser.add_argument(
"-R",
dest="data_file_raw",
help="enable data_file_raw on the generated image (implies -d)",
action="store_true",
)
args = parser.parse_args()
if args.data_file_raw:
args.data_file = True
if not os.path.isfile(args.input_file):
sys.exit(f"[Error] {args.input_file} does not exist or is not a regular file.")
if args.data_file and args.input_format != "raw":
sys.exit("[Error] External data files can only be used with raw input images")
# A 512 byte header is too small for the data file name extension
if args.data_file and args.cluster_size == 512:
sys.exit("[Error] External data files require a larger cluster size")
if sys.stdout.isatty():
sys.exit("[Error] Refusing to write to a tty. Try redirecting stdout.")
if args.data_file:
data_file_name = args.input_file
else:
data_file_name = None
with get_input_as_raw_file(args.input_file, args.input_format) as raw_file:
write_qcow2_content(
raw_file,
args.cluster_size,
args.refcount_bits,
data_file_name,
args.data_file_raw,
)
if __name__ == "__main__":
main()


@@ -211,7 +211,7 @@ static const AIOCBInfo dma_aiocb_info = {
.cancel_async = dma_aio_cancel,
};
-BlockAIOCB *dma_blk_io(AioContext *ctx,
+BlockAIOCB *dma_blk_io(
QEMUSGList *sg, uint64_t offset, uint32_t align,
DMAIOFunc *io_func, void *io_func_opaque,
BlockCompletionFunc *cb,
@@ -223,7 +223,7 @@ BlockAIOCB *dma_blk_io(AioContext *ctx,
dbs->acb = NULL;
dbs->sg = sg;
-dbs->ctx = ctx;
+dbs->ctx = qemu_get_current_aio_context();
dbs->offset = offset;
dbs->align = align;
dbs->sg_cur_index = 0;
@@ -251,7 +251,7 @@ BlockAIOCB *dma_blk_read(BlockBackend *blk,
QEMUSGList *sg, uint64_t offset, uint32_t align,
void (*cb)(void *opaque, int ret), void *opaque)
{
-return dma_blk_io(blk_get_aio_context(blk), sg, offset, align,
+return dma_blk_io(sg, offset, align,
dma_blk_read_io_func, blk, cb, opaque,
DMA_DIRECTION_FROM_DEVICE);
}
@@ -269,7 +269,7 @@ BlockAIOCB *dma_blk_write(BlockBackend *blk,
QEMUSGList *sg, uint64_t offset, uint32_t align,
void (*cb)(void *opaque, int ret), void *opaque)
{
-return dma_blk_io(blk_get_aio_context(blk), sg, offset, align,
+return dma_blk_io(sg, offset, align,
dma_blk_write_io_func, blk, cb, opaque,
DMA_DIRECTION_TO_DEVICE);
}


@@ -181,7 +181,7 @@ QEMU X.Y.Z monitor - type 'help' for more information
Testing: -drive file=TEST_DIR/t.qcow2,if=none,node-name=disk -object iothread,id=thread0 -device virtio-scsi,iothread=thread0,id=virtio-scsi0 -device scsi-hd,bus=virtio-scsi0.0,drive=disk,share-rw=on -device virtio-scsi,id=virtio-scsi1 -device scsi-hd,bus=virtio-scsi1.0,drive=disk,share-rw=on
QEMU X.Y.Z monitor - type 'help' for more information
-(qemu) QEMU_PROG: -device scsi-hd,bus=virtio-scsi1.0,drive=disk,share-rw=on: Cannot change iothread of active block backend
+(qemu) quit
Testing: -drive file=TEST_DIR/t.qcow2,if=none,node-name=disk -object iothread,id=thread0 -device virtio-scsi,iothread=thread0,id=virtio-scsi0 -device scsi-hd,bus=virtio-scsi0.0,drive=disk,share-rw=on -device virtio-blk-pci,drive=disk,iothread=thread0,share-rw=on
QEMU X.Y.Z monitor - type 'help' for more information


@@ -22,7 +22,7 @@ import iotests
from iotests import filter_qemu_io, filter_qtest
-iotests.script_initialize(supported_fmts=['generic'],
+iotests.script_initialize(supported_fmts=['qcow2', 'qed', 'raw'],
supported_protocols=['file'],
supported_platforms=['linux'])


@@ -28,6 +28,9 @@
/* Stop userspace polling on a handler if it isn't active for some time */
#define POLL_IDLE_INTERVAL_NS (7 * NANOSECONDS_PER_SECOND)
+static void adjust_polling_time(AioContext *ctx, AioPolledEvent *poll,
+int64_t block_ns);
bool aio_poll_disabled(AioContext *ctx)
{
return qatomic_read(&ctx->poll_disable_cnt);
@@ -392,7 +395,8 @@ static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node)
* scanning all handlers with aio_dispatch_handlers().
*/
static bool aio_dispatch_ready_handlers(AioContext *ctx,
-AioHandlerList *ready_list)
+AioHandlerList *ready_list,
+int64_t block_ns)
{
bool progress = false;
AioHandler *node;
@@ -400,6 +404,14 @@ static bool aio_dispatch_ready_handlers(AioContext *ctx,
while ((node = QLIST_FIRST(ready_list))) {
QLIST_REMOVE(node, node_ready);
progress = aio_dispatch_handler(ctx, node) || progress;
+/*
+ * Adjust polling time only after aio_dispatch_handler(), which can
+ * add the handler to ctx->poll_aio_handlers.
+ */
+if (ctx->poll_max_ns && QLIST_IS_INSERTED(node, node_poll)) {
+adjust_polling_time(ctx, &node->poll, block_ns);
+}
}
return progress;
@@ -579,13 +591,19 @@ static bool run_poll_handlers(AioContext *ctx, AioHandlerList *ready_list,
static bool try_poll_mode(AioContext *ctx, AioHandlerList *ready_list,
int64_t *timeout)
{
+AioHandler *node;
int64_t max_ns;
if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) {
return false;
}
-max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
+max_ns = 0;
+QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
+max_ns = MAX(max_ns, node->poll.ns);
+}
+max_ns = qemu_soonest_timeout(*timeout, max_ns);
if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
/*
* Enable poll mode. It pairs with the poll_set_started() in
@@ -600,6 +618,46 @@ static bool try_poll_mode(AioContext *ctx, AioHandlerList *ready_list,
return false;
}
static void adjust_polling_time(AioContext *ctx, AioPolledEvent *poll,
int64_t block_ns)
{
if (block_ns <= poll->ns) {
/* This is the sweet spot, no adjustment needed */
} else if (block_ns > ctx->poll_max_ns) {
/* We'd have to poll for too long, poll less */
int64_t old = poll->ns;
if (ctx->poll_shrink) {
poll->ns /= ctx->poll_shrink;
} else {
poll->ns = 0;
}
trace_poll_shrink(ctx, old, poll->ns);
} else if (poll->ns < ctx->poll_max_ns &&
block_ns < ctx->poll_max_ns) {
/* There is room to grow, poll longer */
int64_t old = poll->ns;
int64_t grow = ctx->poll_grow;
if (grow == 0) {
grow = 2;
}
if (poll->ns) {
poll->ns *= grow;
} else {
poll->ns = 4000; /* start polling at 4 microseconds */
}
if (poll->ns > ctx->poll_max_ns) {
poll->ns = ctx->poll_max_ns;
}
trace_poll_grow(ctx, old, poll->ns);
}
}
bool aio_poll(AioContext *ctx, bool blocking)
{
AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
@@ -607,6 +665,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
bool use_notify_me;
int64_t timeout;
int64_t start = 0;
+int64_t block_ns = 0;
/*
* There cannot be two concurrent aio_poll calls for the same AioContext (or
@@ -679,49 +738,13 @@ bool aio_poll(AioContext *ctx, bool blocking)
aio_notify_accept(ctx);
-/* Adjust polling time */
+/* Calculate blocked time for adaptive polling */
if (ctx->poll_max_ns) {
-int64_t block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start;
+block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start;
if (block_ns <= ctx->poll_ns) {
/* This is the sweet spot, no adjustment needed */
} else if (block_ns > ctx->poll_max_ns) {
/* We'd have to poll for too long, poll less */
int64_t old = ctx->poll_ns;
if (ctx->poll_shrink) {
ctx->poll_ns /= ctx->poll_shrink;
} else {
ctx->poll_ns = 0;
}
trace_poll_shrink(ctx, old, ctx->poll_ns);
} else if (ctx->poll_ns < ctx->poll_max_ns &&
block_ns < ctx->poll_max_ns) {
/* There is room to grow, poll longer */
int64_t old = ctx->poll_ns;
int64_t grow = ctx->poll_grow;
if (grow == 0) {
grow = 2;
}
if (ctx->poll_ns) {
ctx->poll_ns *= grow;
} else {
ctx->poll_ns = 4000; /* start polling at 4 microseconds */
}
if (ctx->poll_ns > ctx->poll_max_ns) {
ctx->poll_ns = ctx->poll_max_ns;
}
trace_poll_grow(ctx, old, ctx->poll_ns);
}
}
progress |= aio_bh_poll(ctx);
-progress |= aio_dispatch_ready_handlers(ctx, &ready_list);
+progress |= aio_dispatch_ready_handlers(ctx, &ready_list, block_ns);
aio_free_deleted_handlers(ctx);
@@ -767,11 +790,18 @@ void aio_context_use_g_source(AioContext *ctx)
void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
int64_t grow, int64_t shrink, Error **errp)
{
+AioHandler *node;
+qemu_lockcnt_inc(&ctx->list_lock);
+QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+node->poll.ns = 0;
+}
+qemu_lockcnt_dec(&ctx->list_lock);
/* No thread synchronization here, it doesn't matter if an incorrect value
* is used once.
*/
ctx->poll_max_ns = max_ns;
-ctx->poll_ns = 0;
ctx->poll_grow = grow;
ctx->poll_shrink = shrink;
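
For reference, a small self-contained sketch of the grow/shrink policy that adjust_polling_time() above applies to each handler's polling window; the tuning constants are assumptions for the example (QEMU takes the real values from aio_context_set_poll_params()), and the simplified adjust() helper is hypothetical:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/* Assumed example parameters, not QEMU defaults */
#define POLL_MAX_NS 32000
#define POLL_GROW   2
#define POLL_SHRINK 2

/* Same decision structure as adjust_polling_time(), for a single handler */
static int64_t adjust(int64_t poll_ns, int64_t block_ns)
{
    if (block_ns <= poll_ns) {
        /* sweet spot, keep the current polling window */
    } else if (block_ns > POLL_MAX_NS) {
        /* we'd have to poll for too long, shrink */
        poll_ns = POLL_SHRINK ? poll_ns / POLL_SHRINK : 0;
    } else if (poll_ns < POLL_MAX_NS && block_ns < POLL_MAX_NS) {
        /* there is room to grow */
        poll_ns = poll_ns ? poll_ns * POLL_GROW : 4000;
        if (poll_ns > POLL_MAX_NS) {
            poll_ns = POLL_MAX_NS;
        }
    }
    return poll_ns;
}

int main(void)
{
    int64_t poll_ns = 0;
    int64_t blocked[] = { 6000, 6000, 6000, 500000, 500000, 2000 };

    for (size_t i = 0; i < sizeof(blocked) / sizeof(blocked[0]); i++) {
        poll_ns = adjust(poll_ns, blocked[i]);
        printf("block_ns=%-7" PRId64 " -> poll_ns=%" PRId64 "\n",
               blocked[i], poll_ns);
    }
    return 0;
}

With these numbers the per-handler window grows 0 -> 4000 -> 8000 ns while polling keeps paying off, and shrinks back to 4000 and then 2000 ns once the handler starts blocking for longer than the maximum polling time.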


@@ -38,6 +38,7 @@ struct AioHandler {
#endif
int64_t poll_idle_timeout; /* when to stop userspace polling */
bool poll_ready; /* has polling detected an event? */
+AioPolledEvent poll;
};
/* Add a handler to a ready list */


@@ -609,7 +609,6 @@ AioContext *aio_context_new(Error **errp)
qemu_rec_mutex_init(&ctx->lock);
timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx);
-ctx->poll_ns = 0;
ctx->poll_max_ns = 0;
ctx->poll_grow = 0;
ctx->poll_shrink = 0;