mirror of
https://github.com/Motorhead1991/qemu.git
synced 2025-08-03 15:53:54 -06:00
Block layer patches
- scsi-disk: Add native FUA write support, enable FUA by default - qemu-img: fix offset calculation in bench - file-posix: allow BLKZEROOUT with -t writeback - file-posix: Probe paths and retry SG_IO on potential path errors -----BEGIN PGP SIGNATURE----- iQJFBAABCAAvFiEE3D3rFZqa+V09dFb+fwmycsiPL9YFAmgvbSARHGt3b2xmQHJl ZGhhdC5jb20ACgkQfwmycsiPL9ZDmhAAm34XjSe/aQJG76Nll18eO8lnoWidsnjX OoSI3//O77dE0AuMOxSVvfve+ZkV4ehZDeo+GeREBZh3TGBPHf+elVfo3XEu/L5u +jWTNgoNe7pdlkxB4pv9boHIy2C7+/odVNtmVggxuNy2kyQoKP0tXHEjqiHjNzB6 jCHXegklC9psXt+wgQGJpFkeI8r2OtABkEP15PtI02gvQyk+spBg0sr6pB5FP2ZY y8YWgOXeXis6QHqARMpqoiRGxhCQrWuKuzwZfeyNmLvYLWCBqmt3Opk+3kMPU6NB KrXZHCJXtaIlLO7YeurPXcnFQsJ94IY7x1TvIza5mgY+ct7mal2uN4u29PnhnLLm eXSgSiXhb8h9PY8KfOQfU9brclijcbV8Rn7sSP6WPX00bHspib275gNG4RLPxnxi AezWfBg1IOYAvliwq99ZY+Ts+faezo7XiNQbfNpZ82pzxhO6IqemSH2IcDS6SpjO mNoGLNmCi3CpQw1bdlnqwiU1OxWsHK2627VGyLZXPnOrYPr+erN/A4Nucxr2bzJk 69dKg0/ekTucTtKiF1uRl/bkhTOHplOogyyuKala6ogsthvBV8jmGAEEqZnMTVtx opAl7MSpVa4FWQ7C2LHi5vYEVNGC8OLXYBy09N73m/Q+Fs+z6bViFy3BQdUO+Nh1 OjmUGzSrqRo= =yjOc -----END PGP SIGNATURE----- Merge tag 'for-upstream' of https://repo.or.cz/qemu/kevin into staging Block layer patches - scsi-disk: Add native FUA write support, enable FUA by default - qemu-img: fix offset calculation in bench - file-posix: allow BLKZEROOUT with -t writeback - file-posix: Probe paths and retry SG_IO on potential path errors # -----BEGIN PGP SIGNATURE----- # # iQJFBAABCAAvFiEE3D3rFZqa+V09dFb+fwmycsiPL9YFAmgvbSARHGt3b2xmQHJl # ZGhhdC5jb20ACgkQfwmycsiPL9ZDmhAAm34XjSe/aQJG76Nll18eO8lnoWidsnjX # OoSI3//O77dE0AuMOxSVvfve+ZkV4ehZDeo+GeREBZh3TGBPHf+elVfo3XEu/L5u # +jWTNgoNe7pdlkxB4pv9boHIy2C7+/odVNtmVggxuNy2kyQoKP0tXHEjqiHjNzB6 # jCHXegklC9psXt+wgQGJpFkeI8r2OtABkEP15PtI02gvQyk+spBg0sr6pB5FP2ZY # y8YWgOXeXis6QHqARMpqoiRGxhCQrWuKuzwZfeyNmLvYLWCBqmt3Opk+3kMPU6NB # KrXZHCJXtaIlLO7YeurPXcnFQsJ94IY7x1TvIza5mgY+ct7mal2uN4u29PnhnLLm # eXSgSiXhb8h9PY8KfOQfU9brclijcbV8Rn7sSP6WPX00bHspib275gNG4RLPxnxi # 
AezWfBg1IOYAvliwq99ZY+Ts+faezo7XiNQbfNpZ82pzxhO6IqemSH2IcDS6SpjO # mNoGLNmCi3CpQw1bdlnqwiU1OxWsHK2627VGyLZXPnOrYPr+erN/A4Nucxr2bzJk # 69dKg0/ekTucTtKiF1uRl/bkhTOHplOogyyuKala6ogsthvBV8jmGAEEqZnMTVtx # opAl7MSpVa4FWQ7C2LHi5vYEVNGC8OLXYBy09N73m/Q+Fs+z6bViFy3BQdUO+Nh1 # OjmUGzSrqRo= # =yjOc # -----END PGP SIGNATURE----- # gpg: Signature made Thu 22 May 2025 14:29:52 EDT # gpg: using RSA key DC3DEB159A9AF95D3D7456FE7F09B272C88F2FD6 # gpg: issuer "kwolf@redhat.com" # gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>" [full] # Primary key fingerprint: DC3D EB15 9A9A F95D 3D74 56FE 7F09 B272 C88F 2FD6 * tag 'for-upstream' of https://repo.or.cz/qemu/kevin: file-posix: Probe paths and retry SG_IO on potential path errors file-posix: allow BLKZEROOUT with -t writeback qemu-img: fix offset calculation in bench scsi-disk: Advertise FUA support by default scsi-disk: Add native FUA write support Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
This commit is contained in:
commit
668df86ee8
4 changed files with 135 additions and 54 deletions
|
@@ -41,6 +41,7 @@
|
|||
|
||||
#include "scsi/pr-manager.h"
|
||||
#include "scsi/constants.h"
|
||||
#include "scsi/utils.h"
|
||||
|
||||
#if defined(__APPLE__) && (__MACH__)
|
||||
#include <sys/ioctl.h>
|
||||
|
@@ -72,6 +73,7 @@
|
|||
#include <linux/blkzoned.h>
|
||||
#endif
|
||||
#include <linux/cdrom.h>
|
||||
#include <linux/dm-ioctl.h>
|
||||
#include <linux/fd.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/hdreg.h>
|
||||
|
@@ -138,6 +140,22 @@
|
|||
#define RAW_LOCK_PERM_BASE 100
|
||||
#define RAW_LOCK_SHARED_BASE 200
|
||||
|
||||
/*
|
||||
* Multiple retries are mostly meant for two separate scenarios:
|
||||
*
|
||||
* - DM_MPATH_PROBE_PATHS returns success, but before SG_IO completes, another
|
||||
* path goes down.
|
||||
*
|
||||
* - DM_MPATH_PROBE_PATHS failed all paths in the current path group, so we have
|
||||
* to send another SG_IO to switch to another path group to probe the paths in
|
||||
* it.
|
||||
*
|
||||
* Even if each path is in a separate path group (path_grouping_policy set to
|
||||
* failover), it's rare to have more than eight path groups - and even then
|
||||
* pretty unlikely that only bad path groups would be chosen in eight retries.
|
||||
*/
|
||||
#define SG_IO_MAX_RETRIES 8
|
||||
|
||||
typedef struct BDRVRawState {
|
||||
int fd;
|
||||
bool use_lock;
|
||||
|
@@ -165,6 +183,7 @@ typedef struct BDRVRawState {
|
|||
bool use_linux_aio:1;
|
||||
bool has_laio_fdsync:1;
|
||||
bool use_linux_io_uring:1;
|
||||
bool use_mpath:1;
|
||||
int page_cache_inconsistent; /* errno from fdatasync failure */
|
||||
bool has_fallocate;
|
||||
bool needs_alignment;
|
||||
|
@@ -785,17 +804,6 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
|
|||
}
|
||||
#endif
|
||||
|
||||
if (S_ISBLK(st.st_mode)) {
|
||||
#ifdef __linux__
|
||||
/* On Linux 3.10, BLKDISCARD leaves stale data in the page cache. Do
|
||||
* not rely on the contents of discarded blocks unless using O_DIRECT.
|
||||
* Same for BLKZEROOUT.
|
||||
*/
|
||||
if (!(bs->open_flags & BDRV_O_NOCACHE)) {
|
||||
s->has_write_zeroes = false;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#ifdef __FreeBSD__
|
||||
if (S_ISCHR(st.st_mode)) {
|
||||
/*
|
||||
|
@@ -4264,15 +4272,105 @@ hdev_open_Mac_error:
|
|||
/* Since this does ioctl the device must be already opened */
|
||||
bs->sg = hdev_is_sg(bs);
|
||||
|
||||
/* sg devices aren't even block devices and can't use dm-mpath */
|
||||
s->use_mpath = !bs->sg;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
#if defined(__linux__)
|
||||
#if defined(DM_MPATH_PROBE_PATHS)
|
||||
/*
 * Decide whether the outcome of an SG_IO ioctl should be treated as a
 * potential path error.
 *
 * @ret:    return value of the ioctl submission (0 or a negative errno)
 * @io_hdr: the SG_IO header that was passed to the ioctl; only inspected
 *          when @ret is not negative
 *
 * Returns true if the result looks like a path error, false otherwise.
 *
 * Note that the -EAGAIN case sleeps for 1 ms before returning.
 */
static bool coroutine_fn sgio_path_error(int ret, sg_io_hdr_t *io_hdr)
{
    bool guest_recoverable;

    if (ret == -EAGAIN) {
        /*
         * The device is probably suspended. This happens while the dm table
         * is reloaded, e.g. because a path is added or removed. Such a
         * reload should complete within 1ms, so wait that long and report
         * the error as retryable.
         *
         * If the device was suspended for another reason, the caller ends
         * up waiting and retrying SG_IO_MAX_RETRIES times. This is a
         * tolerable delay before an error is returned and the VM is
         * potentially stopped.
         */
        qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000);
        return true;
    }

    if (ret < 0) {
        /* Of the remaining errno values, only -ENODEV counts */
        return ret == -ENODEV;
    }

    /* The ioctl itself succeeded; look at what the SCSI layers reported */
    if (io_hdr->host_status != SCSI_HOST_OK) {
        return true;
    }

    if (io_hdr->status == CHECK_CONDITION) {
        /* A path error unless the sense data is guest recoverable */
        guest_recoverable =
            scsi_sense_buf_is_guest_recoverable(io_hdr->sbp,
                                                io_hdr->mx_sb_len);
        return !guest_recoverable;
    }

    switch (io_hdr->status) {
    case GOOD:
    case CONDITION_GOOD:
    case INTERMEDIATE_GOOD:
    case INTERMEDIATE_C_GOOD:
    case RESERVATION_CONFLICT:
    case COMMAND_TERMINATED:
        return false;
    default:
        /* Any other SCSI status is treated as a path error */
        return true;
    }
}
|
||||
|
||||
/*
 * Decide whether a finished SG_IO request should be submitted again.
 *
 * @acb: the request that was just executed; acb->ioctl.buf is handed to
 *       sgio_path_error() as the SG_IO header
 * @ret: result of that request
 *
 * If multipath handling is enabled for the device and the result looks like
 * a path error, a DM_MPATH_PROBE_PATHS ioctl is issued on the same file
 * descriptor. Returns true if the caller should retry the SG_IO request.
 */
static bool coroutine_fn hdev_co_ioctl_sgio_retry(RawPosixAIOData *acb, int ret)
{
    BDRVRawState *state = acb->bs->opaque;
    RawPosixAIOData probe = {
        .bs         = acb->bs,
        .aio_type   = QEMU_AIO_IOCTL,
        .aio_fildes = state->fd,
        .aio_offset = 0,
        .ioctl      = {
            .buf        = NULL,
            .cmd        = DM_MPATH_PROBE_PATHS,
        },
    };

    /* Nothing to probe unless mpath is in use and this is a path error */
    if (!state->use_mpath || !sgio_path_error(ret, acb->ioctl.buf)) {
        return false;
    }

    switch (raw_thread_pool_submit(handle_aiocb_ioctl, &probe)) {
    case 0:
        return true;
    case -EAGAIN:
        /* The device might be suspended for a table reload, worth retrying */
        return true;
    case -ENOTTY:
        /*
         * Stop probing on later requests: with use_mpath cleared, this
         * function returns false right away. (Presumably -ENOTTY means the
         * device does not implement the ioctl - confirm.)
         */
        state->use_mpath = false;
        return false;
    default:
        return false;
    }
}
|
||||
#else
|
||||
/*
 * Variant used when DM_MPATH_PROBE_PATHS is not defined at build time:
 * there is no way to probe paths, so an SG_IO request is never retried.
 */
static bool coroutine_fn hdev_co_ioctl_sgio_retry(RawPosixAIOData *acb, int ret)
{
    return false;
}
|
||||
#endif /* DM_MPATH_PROBE_PATHS */
|
||||
|
||||
static int coroutine_fn
|
||||
hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
|
||||
{
|
||||
BDRVRawState *s = bs->opaque;
|
||||
RawPosixAIOData acb;
|
||||
int retries = SG_IO_MAX_RETRIES;
|
||||
int ret;
|
||||
|
||||
ret = fd_open(bs);
|
||||
|
@@ -4300,7 +4398,11 @@ hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
|
|||
},
|
||||
};
|
||||
|
||||
return raw_thread_pool_submit(handle_aiocb_ioctl, &acb);
|
||||
do {
|
||||
ret = raw_thread_pool_submit(handle_aiocb_ioctl, &acb);
|
||||
} while (req == SG_IO && retries-- && hdev_co_ioctl_sgio_retry(&acb, ret));
|
||||
|
||||
return ret;
|
||||
}
|
||||
#endif /* linux */
|
||||
|
||||
|
|
|
@@ -37,7 +37,9 @@
|
|||
#include "hw/virtio/virtio-iommu.h"
|
||||
#include "audio/audio.h"
|
||||
|
||||
GlobalProperty hw_compat_10_0[] = {};
|
||||
GlobalProperty hw_compat_10_0[] = {
|
||||
{ "scsi-hd", "dpofua", "off" },
|
||||
};
|
||||
const size_t hw_compat_10_0_len = G_N_ELEMENTS(hw_compat_10_0);
|
||||
|
||||
GlobalProperty hw_compat_9_2[] = {
|
||||
|
|
|
@@ -74,7 +74,7 @@ struct SCSIDiskClass {
|
|||
*/
|
||||
DMAIOFunc *dma_readv;
|
||||
DMAIOFunc *dma_writev;
|
||||
bool (*need_fua_emulation)(SCSICommand *cmd);
|
||||
bool (*need_fua)(SCSICommand *cmd);
|
||||
void (*update_sense)(SCSIRequest *r);
|
||||
};
|
||||
|
||||
|
@@ -85,7 +85,7 @@ typedef struct SCSIDiskReq {
|
|||
uint32_t sector_count;
|
||||
uint32_t buflen;
|
||||
bool started;
|
||||
bool need_fua_emulation;
|
||||
bool need_fua;
|
||||
struct iovec iov;
|
||||
QEMUIOVector qiov;
|
||||
BlockAcctCookie acct;
|
||||
|
@@ -389,24 +389,6 @@ static bool scsi_is_cmd_fua(SCSICommand *cmd)
|
|||
}
|
||||
}
|
||||
|
||||
/*
 * Finish a write request once no I/O is pending for it.
 *
 * If the request needs FUA emulation, a flush is started (accounted as
 * BLOCK_ACCT_FLUSH) with scsi_aio_complete as its completion callback.
 * Otherwise the request is completed with GOOD status right away and the
 * reference is dropped here.
 */
static void scsi_write_do_fua(SCSIDiskReq *r)
{
    SCSIDiskState *disk = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);

    /* Must only be called with no AIO in flight and no cancellation */
    assert(r->req.aiocb == NULL);
    assert(!r->req.io_canceled);

    if (!r->need_fua_emulation) {
        scsi_req_complete(&r->req, GOOD);
        scsi_req_unref(&r->req);
        return;
    }

    block_acct_start(blk_get_stats(disk->qdev.conf.blk), &r->acct, 0,
                     BLOCK_ACCT_FLUSH);
    r->req.aiocb = blk_aio_flush(disk->qdev.conf.blk, scsi_aio_complete, r);
}
|
||||
|
||||
static void scsi_dma_complete_noio(SCSIDiskReq *r, int ret)
|
||||
{
|
||||
assert(r->req.aiocb == NULL);
|
||||
|
@@ -416,12 +398,7 @@ static void scsi_dma_complete_noio(SCSIDiskReq *r, int ret)
|
|||
|
||||
r->sector += r->sector_count;
|
||||
r->sector_count = 0;
|
||||
if (r->req.cmd.mode == SCSI_XFER_TO_DEV) {
|
||||
scsi_write_do_fua(r);
|
||||
return;
|
||||
} else {
|
||||
scsi_req_complete(&r->req, GOOD);
|
||||
}
|
||||
scsi_req_complete(&r->req, GOOD);
|
||||
|
||||
done:
|
||||
scsi_req_unref(&r->req);
|
||||
|
@@ -564,7 +541,7 @@ static void scsi_read_data(SCSIRequest *req)
|
|||
|
||||
first = !r->started;
|
||||
r->started = true;
|
||||
if (first && r->need_fua_emulation) {
|
||||
if (first && r->need_fua) {
|
||||
block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct, 0,
|
||||
BLOCK_ACCT_FLUSH);
|
||||
r->req.aiocb = blk_aio_flush(s->qdev.conf.blk, scsi_do_read_cb, r);
|
||||
|
@@ -589,8 +566,7 @@ static void scsi_write_complete_noio(SCSIDiskReq *r, int ret)
|
|||
r->sector += n;
|
||||
r->sector_count -= n;
|
||||
if (r->sector_count == 0) {
|
||||
scsi_write_do_fua(r);
|
||||
return;
|
||||
scsi_req_complete(&r->req, GOOD);
|
||||
} else {
|
||||
scsi_init_iovec(r, SCSI_DMA_BUF_SIZE);
|
||||
trace_scsi_disk_write_complete_noio(r->req.tag, r->qiov.size);
|
||||
|
@@ -623,6 +599,7 @@ static void scsi_write_data(SCSIRequest *req)
|
|||
SCSIDiskReq *r = DO_UPCAST(SCSIDiskReq, req, req);
|
||||
SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
|
||||
SCSIDiskClass *sdc = (SCSIDiskClass *) object_get_class(OBJECT(s));
|
||||
BlockCompletionFunc *cb;
|
||||
|
||||
/* No data transfer may already be in progress */
|
||||
assert(r->req.aiocb == NULL);
|
||||
|
@@ -648,11 +625,10 @@ static void scsi_write_data(SCSIRequest *req)
|
|||
|
||||
if (r->req.cmd.buf[0] == VERIFY_10 || r->req.cmd.buf[0] == VERIFY_12 ||
|
||||
r->req.cmd.buf[0] == VERIFY_16) {
|
||||
if (r->req.sg) {
|
||||
scsi_dma_complete_noio(r, 0);
|
||||
} else {
|
||||
scsi_write_complete_noio(r, 0);
|
||||
}
|
||||
block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct, 0,
|
||||
BLOCK_ACCT_FLUSH);
|
||||
cb = r->req.sg ? scsi_dma_complete : scsi_write_complete;
|
||||
r->req.aiocb = blk_aio_flush(s->qdev.conf.blk, cb, r);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@@ -2391,7 +2367,7 @@ static int32_t scsi_disk_dma_command(SCSIRequest *req, uint8_t *buf)
|
|||
scsi_check_condition(r, SENSE_CODE(LBA_OUT_OF_RANGE));
|
||||
return 0;
|
||||
}
|
||||
r->need_fua_emulation = sdc->need_fua_emulation(&r->req.cmd);
|
||||
r->need_fua = sdc->need_fua(&r->req.cmd);
|
||||
if (r->sector_count == 0) {
|
||||
scsi_req_complete(&r->req, GOOD);
|
||||
}
|
||||
|
@@ -3137,7 +3113,8 @@ BlockAIOCB *scsi_dma_writev(int64_t offset, QEMUIOVector *iov,
|
|||
{
|
||||
SCSIDiskReq *r = opaque;
|
||||
SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
|
||||
return blk_aio_pwritev(s->qdev.conf.blk, offset, iov, 0, cb, cb_opaque);
|
||||
int flags = r->need_fua ? BDRV_REQ_FUA : 0;
|
||||
return blk_aio_pwritev(s->qdev.conf.blk, offset, iov, flags, cb, cb_opaque);
|
||||
}
|
||||
|
||||
static char *scsi_property_get_loadparm(Object *obj, Error **errp)
|
||||
|
@@ -3186,7 +3163,7 @@ static void scsi_disk_base_class_initfn(ObjectClass *klass, const void *data)
|
|||
device_class_set_legacy_reset(dc, scsi_disk_reset);
|
||||
sdc->dma_readv = scsi_dma_readv;
|
||||
sdc->dma_writev = scsi_dma_writev;
|
||||
sdc->need_fua_emulation = scsi_is_cmd_fua;
|
||||
sdc->need_fua = scsi_is_cmd_fua;
|
||||
}
|
||||
|
||||
static const TypeInfo scsi_disk_base_info = {
|
||||
|
@@ -3215,7 +3192,7 @@ static const Property scsi_hd_properties[] = {
|
|||
DEFINE_PROP_BIT("removable", SCSIDiskState, features,
|
||||
SCSI_DISK_F_REMOVABLE, false),
|
||||
DEFINE_PROP_BIT("dpofua", SCSIDiskState, features,
|
||||
SCSI_DISK_F_DPOFUA, false),
|
||||
SCSI_DISK_F_DPOFUA, true),
|
||||
DEFINE_PROP_UINT64("wwn", SCSIDiskState, qdev.wwn, 0),
|
||||
DEFINE_PROP_UINT64("port_wwn", SCSIDiskState, qdev.port_wwn, 0),
|
||||
DEFINE_PROP_UINT16("port_index", SCSIDiskState, port_index, 0),
|
||||
|
@@ -3338,7 +3315,7 @@ static void scsi_block_class_initfn(ObjectClass *klass, const void *data)
|
|||
sdc->dma_readv = scsi_block_dma_readv;
|
||||
sdc->dma_writev = scsi_block_dma_writev;
|
||||
sdc->update_sense = scsi_block_update_sense;
|
||||
sdc->need_fua_emulation = scsi_block_no_fua;
|
||||
sdc->need_fua = scsi_block_no_fua;
|
||||
dc->desc = "SCSI block device passthrough";
|
||||
device_class_set_props(dc, scsi_block_properties);
|
||||
dc->vmsd = &vmstate_scsi_disk_state;
|
||||
|
|
|
@@ -4488,10 +4488,10 @@ static void bench_cb(void *opaque, int ret)
|
|||
*/
|
||||
b->in_flight++;
|
||||
b->offset += b->step;
|
||||
if (b->image_size == 0) {
|
||||
if (b->image_size <= b->bufsize) {
|
||||
b->offset = 0;
|
||||
} else {
|
||||
b->offset %= b->image_size;
|
||||
b->offset %= b->image_size - b->bufsize;
|
||||
}
|
||||
if (b->write) {
|
||||
acb = blk_aio_pwritev(b->blk, offset, b->qiov, 0, bench_cb, b);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue