Block layer patches

- scsi-disk: Add native FUA write support, enable FUA by default
 - qemu-img: fix offset calculation in bench
 - file-posix: allow BLKZEROOUT with -t writeback
 - file-posix: Probe paths and retry SG_IO on potential path errors
 -----BEGIN PGP SIGNATURE-----
 
 iQJFBAABCAAvFiEE3D3rFZqa+V09dFb+fwmycsiPL9YFAmgvbSARHGt3b2xmQHJl
 ZGhhdC5jb20ACgkQfwmycsiPL9ZDmhAAm34XjSe/aQJG76Nll18eO8lnoWidsnjX
 OoSI3//O77dE0AuMOxSVvfve+ZkV4ehZDeo+GeREBZh3TGBPHf+elVfo3XEu/L5u
 +jWTNgoNe7pdlkxB4pv9boHIy2C7+/odVNtmVggxuNy2kyQoKP0tXHEjqiHjNzB6
 jCHXegklC9psXt+wgQGJpFkeI8r2OtABkEP15PtI02gvQyk+spBg0sr6pB5FP2ZY
 y8YWgOXeXis6QHqARMpqoiRGxhCQrWuKuzwZfeyNmLvYLWCBqmt3Opk+3kMPU6NB
 KrXZHCJXtaIlLO7YeurPXcnFQsJ94IY7x1TvIza5mgY+ct7mal2uN4u29PnhnLLm
 eXSgSiXhb8h9PY8KfOQfU9brclijcbV8Rn7sSP6WPX00bHspib275gNG4RLPxnxi
 AezWfBg1IOYAvliwq99ZY+Ts+faezo7XiNQbfNpZ82pzxhO6IqemSH2IcDS6SpjO
 mNoGLNmCi3CpQw1bdlnqwiU1OxWsHK2627VGyLZXPnOrYPr+erN/A4Nucxr2bzJk
 69dKg0/ekTucTtKiF1uRl/bkhTOHplOogyyuKala6ogsthvBV8jmGAEEqZnMTVtx
 opAl7MSpVa4FWQ7C2LHi5vYEVNGC8OLXYBy09N73m/Q+Fs+z6bViFy3BQdUO+Nh1
 OjmUGzSrqRo=
 =yjOc
 -----END PGP SIGNATURE-----

Merge tag 'for-upstream' of https://repo.or.cz/qemu/kevin into staging

Block layer patches

- scsi-disk: Add native FUA write support, enable FUA by default
- qemu-img: fix offset calculation in bench
- file-posix: allow BLKZEROOUT with -t writeback
- file-posix: Probe paths and retry SG_IO on potential path errors

# -----BEGIN PGP SIGNATURE-----
#
# iQJFBAABCAAvFiEE3D3rFZqa+V09dFb+fwmycsiPL9YFAmgvbSARHGt3b2xmQHJl
# ZGhhdC5jb20ACgkQfwmycsiPL9ZDmhAAm34XjSe/aQJG76Nll18eO8lnoWidsnjX
# OoSI3//O77dE0AuMOxSVvfve+ZkV4ehZDeo+GeREBZh3TGBPHf+elVfo3XEu/L5u
# +jWTNgoNe7pdlkxB4pv9boHIy2C7+/odVNtmVggxuNy2kyQoKP0tXHEjqiHjNzB6
# jCHXegklC9psXt+wgQGJpFkeI8r2OtABkEP15PtI02gvQyk+spBg0sr6pB5FP2ZY
# y8YWgOXeXis6QHqARMpqoiRGxhCQrWuKuzwZfeyNmLvYLWCBqmt3Opk+3kMPU6NB
# KrXZHCJXtaIlLO7YeurPXcnFQsJ94IY7x1TvIza5mgY+ct7mal2uN4u29PnhnLLm
# eXSgSiXhb8h9PY8KfOQfU9brclijcbV8Rn7sSP6WPX00bHspib275gNG4RLPxnxi
# AezWfBg1IOYAvliwq99ZY+Ts+faezo7XiNQbfNpZ82pzxhO6IqemSH2IcDS6SpjO
# mNoGLNmCi3CpQw1bdlnqwiU1OxWsHK2627VGyLZXPnOrYPr+erN/A4Nucxr2bzJk
# 69dKg0/ekTucTtKiF1uRl/bkhTOHplOogyyuKala6ogsthvBV8jmGAEEqZnMTVtx
# opAl7MSpVa4FWQ7C2LHi5vYEVNGC8OLXYBy09N73m/Q+Fs+z6bViFy3BQdUO+Nh1
# OjmUGzSrqRo=
# =yjOc
# -----END PGP SIGNATURE-----
# gpg: Signature made Thu 22 May 2025 14:29:52 EDT
# gpg:                using RSA key DC3DEB159A9AF95D3D7456FE7F09B272C88F2FD6
# gpg:                issuer "kwolf@redhat.com"
# gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>" [full]
# Primary key fingerprint: DC3D EB15 9A9A F95D 3D74  56FE 7F09 B272 C88F 2FD6

* tag 'for-upstream' of https://repo.or.cz/qemu/kevin:
  file-posix: Probe paths and retry SG_IO on potential path errors
  file-posix: allow BLKZEROOUT with -t writeback
  qemu-img: fix offset calculation in bench
  scsi-disk: Advertise FUA support by default
  scsi-disk: Add native FUA write support

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
This commit is contained in:
Stefan Hajnoczi 2025-05-23 09:26:29 -04:00
commit 668df86ee8
4 changed files with 135 additions and 54 deletions

View file

@ -41,6 +41,7 @@
#include "scsi/pr-manager.h"
#include "scsi/constants.h"
#include "scsi/utils.h"
#if defined(__APPLE__) && (__MACH__)
#include <sys/ioctl.h>
@ -72,6 +73,7 @@
#include <linux/blkzoned.h>
#endif
#include <linux/cdrom.h>
#include <linux/dm-ioctl.h>
#include <linux/fd.h>
#include <linux/fs.h>
#include <linux/hdreg.h>
@ -138,6 +140,22 @@
#define RAW_LOCK_PERM_BASE 100
#define RAW_LOCK_SHARED_BASE 200
/*
* Multiple retries are mostly meant for two separate scenarios:
*
* - DM_MPATH_PROBE_PATHS returns success, but before SG_IO completes, another
* path goes down.
*
* - DM_MPATH_PROBE_PATHS failed all paths in the current path group, so we have
* to send another SG_IO to switch to another path group to probe the paths in
* it.
*
* Even if each path is in a separate path group (path_grouping_policy set to
* failover), it's rare to have more than eight path groups - and even then
* pretty unlikely that only bad path groups would be chosen in eight retries.
*/
#define SG_IO_MAX_RETRIES 8
typedef struct BDRVRawState {
int fd;
bool use_lock;
@ -165,6 +183,7 @@ typedef struct BDRVRawState {
bool use_linux_aio:1;
bool has_laio_fdsync:1;
bool use_linux_io_uring:1;
bool use_mpath:1;
int page_cache_inconsistent; /* errno from fdatasync failure */
bool has_fallocate;
bool needs_alignment;
@ -785,17 +804,6 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
}
#endif
if (S_ISBLK(st.st_mode)) {
#ifdef __linux__
/* On Linux 3.10, BLKDISCARD leaves stale data in the page cache. Do
* not rely on the contents of discarded blocks unless using O_DIRECT.
* Same for BLKZEROOUT.
*/
if (!(bs->open_flags & BDRV_O_NOCACHE)) {
s->has_write_zeroes = false;
}
#endif
}
#ifdef __FreeBSD__
if (S_ISCHR(st.st_mode)) {
/*
@ -4264,15 +4272,105 @@ hdev_open_Mac_error:
/* Since this does ioctl the device must be already opened */
bs->sg = hdev_is_sg(bs);
/* sg devices aren't even block devices and can't use dm-mpath */
s->use_mpath = !bs->sg;
return ret;
}
#if defined(__linux__)
#if defined(DM_MPATH_PROBE_PATHS)
static bool coroutine_fn sgio_path_error(int ret, sg_io_hdr_t *io_hdr)
{
if (ret < 0) {
switch (ret) {
case -ENODEV:
return true;
case -EAGAIN:
/*
* The device is probably suspended. This happens while the dm table
* is reloaded, e.g. because a path is added or removed. This is an
* operation that should complete within 1ms, so just wait a bit and
* retry.
*
* If the device was suspended for another reason, we'll wait and
* retry SG_IO_MAX_RETRIES times. This is a tolerable delay before
* we return an error and potentially stop the VM.
*/
qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000);
return true;
default:
return false;
}
}
if (io_hdr->host_status != SCSI_HOST_OK) {
return true;
}
switch (io_hdr->status) {
case GOOD:
case CONDITION_GOOD:
case INTERMEDIATE_GOOD:
case INTERMEDIATE_C_GOOD:
case RESERVATION_CONFLICT:
case COMMAND_TERMINATED:
return false;
case CHECK_CONDITION:
return !scsi_sense_buf_is_guest_recoverable(io_hdr->sbp,
io_hdr->mx_sb_len);
default:
return true;
}
}
static bool coroutine_fn hdev_co_ioctl_sgio_retry(RawPosixAIOData *acb, int ret)
{
BDRVRawState *s = acb->bs->opaque;
RawPosixAIOData probe_acb;
if (!s->use_mpath) {
return false;
}
if (!sgio_path_error(ret, acb->ioctl.buf)) {
return false;
}
probe_acb = (RawPosixAIOData) {
.bs = acb->bs,
.aio_type = QEMU_AIO_IOCTL,
.aio_fildes = s->fd,
.aio_offset = 0,
.ioctl = {
.buf = NULL,
.cmd = DM_MPATH_PROBE_PATHS,
},
};
ret = raw_thread_pool_submit(handle_aiocb_ioctl, &probe_acb);
if (ret == -ENOTTY) {
s->use_mpath = false;
} else if (ret == -EAGAIN) {
/* The device might be suspended for a table reload, worth retrying */
return true;
}
return ret == 0;
}
#else
static bool coroutine_fn hdev_co_ioctl_sgio_retry(RawPosixAIOData *acb, int ret)
{
return false;
}
#endif /* DM_MPATH_PROBE_PATHS */
static int coroutine_fn
hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
BDRVRawState *s = bs->opaque;
RawPosixAIOData acb;
int retries = SG_IO_MAX_RETRIES;
int ret;
ret = fd_open(bs);
@ -4300,7 +4398,11 @@ hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
},
};
return raw_thread_pool_submit(handle_aiocb_ioctl, &acb);
do {
ret = raw_thread_pool_submit(handle_aiocb_ioctl, &acb);
} while (req == SG_IO && retries-- && hdev_co_ioctl_sgio_retry(&acb, ret));
return ret;
}
#endif /* linux */

View file

@ -37,7 +37,9 @@
#include "hw/virtio/virtio-iommu.h"
#include "audio/audio.h"
GlobalProperty hw_compat_10_0[] = {};
GlobalProperty hw_compat_10_0[] = {
{ "scsi-hd", "dpofua", "off" },
};
const size_t hw_compat_10_0_len = G_N_ELEMENTS(hw_compat_10_0);
GlobalProperty hw_compat_9_2[] = {

View file

@ -74,7 +74,7 @@ struct SCSIDiskClass {
*/
DMAIOFunc *dma_readv;
DMAIOFunc *dma_writev;
bool (*need_fua_emulation)(SCSICommand *cmd);
bool (*need_fua)(SCSICommand *cmd);
void (*update_sense)(SCSIRequest *r);
};
@ -85,7 +85,7 @@ typedef struct SCSIDiskReq {
uint32_t sector_count;
uint32_t buflen;
bool started;
bool need_fua_emulation;
bool need_fua;
struct iovec iov;
QEMUIOVector qiov;
BlockAcctCookie acct;
@ -389,24 +389,6 @@ static bool scsi_is_cmd_fua(SCSICommand *cmd)
}
}
static void scsi_write_do_fua(SCSIDiskReq *r)
{
SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
assert(r->req.aiocb == NULL);
assert(!r->req.io_canceled);
if (r->need_fua_emulation) {
block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct, 0,
BLOCK_ACCT_FLUSH);
r->req.aiocb = blk_aio_flush(s->qdev.conf.blk, scsi_aio_complete, r);
return;
}
scsi_req_complete(&r->req, GOOD);
scsi_req_unref(&r->req);
}
static void scsi_dma_complete_noio(SCSIDiskReq *r, int ret)
{
assert(r->req.aiocb == NULL);
@ -416,12 +398,7 @@ static void scsi_dma_complete_noio(SCSIDiskReq *r, int ret)
r->sector += r->sector_count;
r->sector_count = 0;
if (r->req.cmd.mode == SCSI_XFER_TO_DEV) {
scsi_write_do_fua(r);
return;
} else {
scsi_req_complete(&r->req, GOOD);
}
scsi_req_complete(&r->req, GOOD);
done:
scsi_req_unref(&r->req);
@ -564,7 +541,7 @@ static void scsi_read_data(SCSIRequest *req)
first = !r->started;
r->started = true;
if (first && r->need_fua_emulation) {
if (first && r->need_fua) {
block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct, 0,
BLOCK_ACCT_FLUSH);
r->req.aiocb = blk_aio_flush(s->qdev.conf.blk, scsi_do_read_cb, r);
@ -589,8 +566,7 @@ static void scsi_write_complete_noio(SCSIDiskReq *r, int ret)
r->sector += n;
r->sector_count -= n;
if (r->sector_count == 0) {
scsi_write_do_fua(r);
return;
scsi_req_complete(&r->req, GOOD);
} else {
scsi_init_iovec(r, SCSI_DMA_BUF_SIZE);
trace_scsi_disk_write_complete_noio(r->req.tag, r->qiov.size);
@ -623,6 +599,7 @@ static void scsi_write_data(SCSIRequest *req)
SCSIDiskReq *r = DO_UPCAST(SCSIDiskReq, req, req);
SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
SCSIDiskClass *sdc = (SCSIDiskClass *) object_get_class(OBJECT(s));
BlockCompletionFunc *cb;
/* No data transfer may already be in progress */
assert(r->req.aiocb == NULL);
@ -648,11 +625,10 @@ static void scsi_write_data(SCSIRequest *req)
if (r->req.cmd.buf[0] == VERIFY_10 || r->req.cmd.buf[0] == VERIFY_12 ||
r->req.cmd.buf[0] == VERIFY_16) {
if (r->req.sg) {
scsi_dma_complete_noio(r, 0);
} else {
scsi_write_complete_noio(r, 0);
}
block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct, 0,
BLOCK_ACCT_FLUSH);
cb = r->req.sg ? scsi_dma_complete : scsi_write_complete;
r->req.aiocb = blk_aio_flush(s->qdev.conf.blk, cb, r);
return;
}
@ -2391,7 +2367,7 @@ static int32_t scsi_disk_dma_command(SCSIRequest *req, uint8_t *buf)
scsi_check_condition(r, SENSE_CODE(LBA_OUT_OF_RANGE));
return 0;
}
r->need_fua_emulation = sdc->need_fua_emulation(&r->req.cmd);
r->need_fua = sdc->need_fua(&r->req.cmd);
if (r->sector_count == 0) {
scsi_req_complete(&r->req, GOOD);
}
@ -3137,7 +3113,8 @@ BlockAIOCB *scsi_dma_writev(int64_t offset, QEMUIOVector *iov,
{
SCSIDiskReq *r = opaque;
SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
return blk_aio_pwritev(s->qdev.conf.blk, offset, iov, 0, cb, cb_opaque);
int flags = r->need_fua ? BDRV_REQ_FUA : 0;
return blk_aio_pwritev(s->qdev.conf.blk, offset, iov, flags, cb, cb_opaque);
}
static char *scsi_property_get_loadparm(Object *obj, Error **errp)
@ -3186,7 +3163,7 @@ static void scsi_disk_base_class_initfn(ObjectClass *klass, const void *data)
device_class_set_legacy_reset(dc, scsi_disk_reset);
sdc->dma_readv = scsi_dma_readv;
sdc->dma_writev = scsi_dma_writev;
sdc->need_fua_emulation = scsi_is_cmd_fua;
sdc->need_fua = scsi_is_cmd_fua;
}
static const TypeInfo scsi_disk_base_info = {
@ -3215,7 +3192,7 @@ static const Property scsi_hd_properties[] = {
DEFINE_PROP_BIT("removable", SCSIDiskState, features,
SCSI_DISK_F_REMOVABLE, false),
DEFINE_PROP_BIT("dpofua", SCSIDiskState, features,
SCSI_DISK_F_DPOFUA, false),
SCSI_DISK_F_DPOFUA, true),
DEFINE_PROP_UINT64("wwn", SCSIDiskState, qdev.wwn, 0),
DEFINE_PROP_UINT64("port_wwn", SCSIDiskState, qdev.port_wwn, 0),
DEFINE_PROP_UINT16("port_index", SCSIDiskState, port_index, 0),
@ -3338,7 +3315,7 @@ static void scsi_block_class_initfn(ObjectClass *klass, const void *data)
sdc->dma_readv = scsi_block_dma_readv;
sdc->dma_writev = scsi_block_dma_writev;
sdc->update_sense = scsi_block_update_sense;
sdc->need_fua_emulation = scsi_block_no_fua;
sdc->need_fua = scsi_block_no_fua;
dc->desc = "SCSI block device passthrough";
device_class_set_props(dc, scsi_block_properties);
dc->vmsd = &vmstate_scsi_disk_state;

View file

@ -4488,10 +4488,10 @@ static void bench_cb(void *opaque, int ret)
*/
b->in_flight++;
b->offset += b->step;
if (b->image_size == 0) {
if (b->image_size <= b->bufsize) {
b->offset = 0;
} else {
b->offset %= b->image_size;
b->offset %= b->image_size - b->bufsize;
}
if (b->write) {
acb = blk_aio_pwritev(b->blk, offset, b->qiov, 0, bench_cb, b);