block/block-backend: add block layer APIs resembling Linux ZonedBlockDevice ioctls

Add zoned device option to host_device BlockDriver. It will be presented only
for zoned host block devices. By adding zone management operations to the
host_device BlockDriver, users can use the new block layer APIs
including Report Zone and the zone management operations
(open, close, finish, reset, reset_all).

Qemu-io uses the new APIs to perform zoned storage commands of the device:
zone_report(zrp), zone_open(zo), zone_close(zc), zone_reset(zrs),
zone_finish(zf).

For example, to test zone_report, use the following command:
$ ./build/qemu-io --image-opts -n driver=host_device,filename=/dev/nullb0 \
-c "zrp offset nr_zones"

Signed-off-by: Sam Li <faithilikerun@gmail.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Acked-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20230508045533.175575-4-faithilikerun@gmail.com
Message-id: 20230324090605.28361-4-faithilikerun@gmail.com
[Adjust commit message prefix as suggested by Philippe Mathieu-Daudé
<philmd@linaro.org> and remove spurious ret = -errno in
raw_co_zone_mgmt().
--Stefan]
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
This commit is contained in:
Sam Li 2023-05-08 12:55:28 +08:00 committed by Stefan Hajnoczi
parent a735b56e49
commit 6d43eaa396
9 changed files with 696 additions and 3 deletions

View file

@ -68,6 +68,9 @@
#include <sys/param.h>
#include <sys/syscall.h>
#include <sys/vfs.h>
#if defined(CONFIG_BLKZONED)
#include <linux/blkzoned.h>
#endif
#include <linux/cdrom.h>
#include <linux/fd.h>
#include <linux/fs.h>
@ -216,6 +219,13 @@ typedef struct RawPosixAIOData {
PreallocMode prealloc;
Error **errp;
} truncate;
struct {
unsigned int *nr_zones;
BlockZoneDescriptor *zones;
} zone_report;
struct {
unsigned long op;
} zone_mgmt;
};
} RawPosixAIOData;
@ -1234,6 +1244,7 @@ static int get_sysfs_str_val(struct stat *st, const char *attribute,
}
#endif
#if defined(CONFIG_BLKZONED)
static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned)
{
g_autofree char *val = NULL;
@ -1255,6 +1266,7 @@ static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned)
}
return 0;
}
#endif /* defined(CONFIG_BLKZONED) */
/*
* Get a sysfs attribute value as a long integer.
@ -1298,6 +1310,7 @@ static int hdev_get_max_segments(int fd, struct stat *st)
#endif
}
#if defined(CONFIG_BLKZONED)
static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
Error **errp)
{
@ -1311,7 +1324,54 @@ static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
return;
}
bs->bl.zoned = zoned;
ret = get_sysfs_long_val(st, "max_open_zones");
if (ret >= 0) {
bs->bl.max_open_zones = ret;
}
ret = get_sysfs_long_val(st, "max_active_zones");
if (ret >= 0) {
bs->bl.max_active_zones = ret;
}
/*
* The zoned device must at least have zone size and nr_zones fields.
*/
ret = get_sysfs_long_val(st, "chunk_sectors");
if (ret < 0) {
error_setg_errno(errp, -ret, "Unable to read chunk_sectors "
"sysfs attribute");
return;
} else if (!ret) {
error_setg(errp, "Read 0 from chunk_sectors sysfs attribute");
return;
}
bs->bl.zone_size = ret << BDRV_SECTOR_BITS;
ret = get_sysfs_long_val(st, "nr_zones");
if (ret < 0) {
error_setg_errno(errp, -ret, "Unable to read nr_zones "
"sysfs attribute");
return;
} else if (!ret) {
error_setg(errp, "Read 0 from nr_zones sysfs attribute");
return;
}
bs->bl.nr_zones = ret;
ret = get_sysfs_long_val(st, "zone_append_max_bytes");
if (ret > 0) {
bs->bl.max_append_sectors = ret >> BDRV_SECTOR_BITS;
}
}
#else /* !defined(CONFIG_BLKZONED) */
/*
 * Variant of raw_refresh_zoned_limits() compiled when CONFIG_BLKZONED is
 * not defined: records the device as not zoned (BLK_Z_NONE) in bs->bl.
 * @st and @errp are not used here.
 */
static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
Error **errp)
{
bs->bl.zoned = BLK_Z_NONE;
}
#endif /* !defined(CONFIG_BLKZONED) */
static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
{
@ -1379,9 +1439,12 @@ static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
BDRVRawState *s = bs->opaque;
int ret;
/* If DASD, get blocksizes */
/* If DASD or zoned devices, get blocksizes */
if (check_for_dasd(s->fd) < 0) {
return -ENOTSUP;
/* zoned devices are not DASD */
if (bs->bl.zoned == BLK_Z_NONE) {
return -ENOTSUP;
}
}
ret = probe_logical_blocksize(s->fd, &bsz->log);
if (ret < 0) {
@ -1849,6 +1912,147 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
}
#endif
/*
* parse_zone - Fill a zone descriptor
*/
#if defined(CONFIG_BLKZONED)
/*
 * Convert one kernel zone record (@blkz) into a QEMU zone descriptor (@zone).
 *
 * The start, length, write pointer and capacity values are scaled with
 * BDRV_SECTOR_BITS (sector units to bytes) and stored first; the zone type
 * and zone condition are then mapped to their BLK_ZT_* / BLK_ZS_* values.
 *
 * Returns 0 on success.  Returns -ENOTSUP, after reporting the offending
 * value, if the type or the condition is not one of the known values; the
 * fields stored before the failure remain written in @zone.
 */
static inline int parse_zone(struct BlockZoneDescriptor *zone,
                             const struct blk_zone *blkz)
{
    zone->start = blkz->start << BDRV_SECTOR_BITS;
    zone->length = blkz->len << BDRV_SECTOR_BITS;
    zone->wp = blkz->wp << BDRV_SECTOR_BITS;
#ifdef HAVE_BLK_ZONE_REP_CAPACITY
    zone->cap = blkz->capacity << BDRV_SECTOR_BITS;
#else
    /* Without a capacity field, the zone length stands in for it. */
    zone->cap = blkz->len << BDRV_SECTOR_BITS;
#endif

    /* Zone type */
    if (blkz->type == BLK_ZONE_TYPE_SEQWRITE_REQ) {
        zone->type = BLK_ZT_SWR;
    } else if (blkz->type == BLK_ZONE_TYPE_SEQWRITE_PREF) {
        zone->type = BLK_ZT_SWP;
    } else if (blkz->type == BLK_ZONE_TYPE_CONVENTIONAL) {
        zone->type = BLK_ZT_CONV;
    } else {
        error_report("Unsupported zone type: 0x%x", blkz->type);
        return -ENOTSUP;
    }

    /* Zone condition */
    switch (blkz->cond) {
    case BLK_ZONE_COND_NOT_WP:
        zone->state = BLK_ZS_NOT_WP;
        return 0;
    case BLK_ZONE_COND_EMPTY:
        zone->state = BLK_ZS_EMPTY;
        return 0;
    case BLK_ZONE_COND_IMP_OPEN:
        zone->state = BLK_ZS_IOPEN;
        return 0;
    case BLK_ZONE_COND_EXP_OPEN:
        zone->state = BLK_ZS_EOPEN;
        return 0;
    case BLK_ZONE_COND_CLOSED:
        zone->state = BLK_ZS_CLOSED;
        return 0;
    case BLK_ZONE_COND_READONLY:
        zone->state = BLK_ZS_RDONLY;
        return 0;
    case BLK_ZONE_COND_FULL:
        zone->state = BLK_ZS_FULL;
        return 0;
    case BLK_ZONE_COND_OFFLINE:
        zone->state = BLK_ZS_OFFLINE;
        return 0;
    default:
        break;
    }

    error_report("Unsupported zone state: 0x%x", blkz->cond);
    return -ENOTSUP;
}
#endif
#if defined(CONFIG_BLKZONED)
/*
 * Worker for a zone report request; raw_co_zone_report() hands it to
 * raw_thread_pool_submit().
 *
 * @opaque is the RawPosixAIOData describing the request:
 *   - aio_fildes: file descriptor the BLKREPORTZONE ioctl is issued on
 *   - aio_offset: byte offset at which the report starts
 *   - zone_report.nr_zones: on entry, the maximum number of zones to report;
 *     on successful return, the number of descriptors actually stored
 *   - zone_report.zones: output array receiving the parsed descriptors
 *
 * The ioctl is repeated, each time starting right after the last zone
 * received, until the requested number of zones has been collected or the
 * kernel returns an empty report.
 *
 * Returns 0 on success, -errno if the ioctl fails, or the error returned by
 * parse_zone().  *nr_zones is left untouched on failure.
 */
static int handle_aiocb_zone_report(void *opaque)
{
    RawPosixAIOData *aiocb = opaque;
    int fd = aiocb->aio_fildes;
    unsigned int *nr_zones = aiocb->zone_report.nr_zones;
    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
    /* zoned block devices use 512-byte sectors */
    uint64_t sector = aiocb->aio_offset / 512;
    struct blk_zone *blkz;
    size_t rep_size;
    unsigned int nrz;
    int ret;
    unsigned int n = 0, i = 0;

    nrz = *nr_zones;
    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
    g_autofree struct blk_zone_report *rep = g_malloc(rep_size);

    /* The zone records follow the report header in the same buffer */
    blkz = (struct blk_zone *)(rep + 1);
    while (n < nrz) {
        memset(rep, 0, rep_size);
        rep->sector = sector;
        rep->nr_zones = nrz - n;

        do {
            ret = ioctl(fd, BLKREPORTZONE, rep);
        } while (ret != 0 && errno == EINTR);
        if (ret != 0) {
            /*
             * Latch errno before reporting: the reporting call is free to
             * modify it, and the caller must get the ioctl's error code.
             */
            ret = -errno;
            error_report("%d: ioctl BLKREPORTZONE at %" PRIu64 " failed %d",
                         fd, sector, -ret);
            return ret;
        }

        /* An empty report means there are no further zones to return */
        if (!rep->nr_zones) {
            break;
        }

        for (i = 0; i < rep->nr_zones; i++, n++) {
            ret = parse_zone(&zones[n], &blkz[i]);
            if (ret != 0) {
                return ret;
            }

            /* The next report should start after the last zone reported */
            sector = blkz[i].start + blkz[i].len;
        }
    }

    *nr_zones = n;
    return 0;
}
#endif
#if defined(CONFIG_BLKZONED)
/*
 * Worker for a zone management request; raw_co_zone_mgmt() hands it to
 * raw_thread_pool_submit().
 *
 * @opaque is the RawPosixAIOData describing the request:
 *   - aio_fildes: file descriptor the ioctl is issued on
 *   - aio_offset / aio_nbytes: byte range, converted here to 512-byte
 *     sector units for struct blk_zone_range
 *   - zone_mgmt.op: the ioctl request code to execute on that range
 *
 * The ioctl is retried while it fails with EINTR.
 *
 * Returns the ioctl's result on success (0), or -errno on failure so that
 * the caller receives the actual error code rather than a bare -1.
 */
static int handle_aiocb_zone_mgmt(void *opaque)
{
    RawPosixAIOData *aiocb = opaque;
    int fd = aiocb->aio_fildes;
    uint64_t sector = aiocb->aio_offset / 512;
    int64_t nr_sectors = aiocb->aio_nbytes / 512;
    struct blk_zone_range range;
    int ret;

    /* Execute the operation */
    range.sector = sector;
    range.nr_sectors = nr_sectors;
    do {
        ret = ioctl(fd, aiocb->zone_mgmt.op, &range);
    } while (ret != 0 && errno == EINTR);

    return ret < 0 ? -errno : ret;
}
#endif
static int handle_aiocb_copy_range(void *opaque)
{
RawPosixAIOData *aiocb = opaque;
@ -3028,6 +3232,104 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
}
}
/*
* zone report - Get a zone block device's information in the form
* of an array of zone descriptors.
* zones is an array of zone descriptors to hold zone information on reply;
* offset can be any byte within the entire size of the device;
* nr_zones points to the maximum number of zones to report on input and is
* updated to the number of zones actually reported on successful return.
*/
#if defined(CONFIG_BLKZONED)
/*
 * Issue a zone report for @bs starting at byte @offset.
 *
 * Packs the file descriptor of the driver state, the offset and the two
 * output pointers (@nr_zones, @zones) into a QEMU_AIO_ZONE_REPORT request
 * and submits it with handle_aiocb_zone_report() as the worker.
 *
 * Returns the value produced by raw_thread_pool_submit().
 */
static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
                                           unsigned int *nr_zones,
                                           BlockZoneDescriptor *zones)
{
    BDRVRawState *state = bs->opaque;
    RawPosixAIOData request = {
        .aio_type = QEMU_AIO_ZONE_REPORT,
        .aio_fildes = state->fd,
        .aio_offset = offset,
        .bs = bs,
        .zone_report.nr_zones = nr_zones,
        .zone_report.zones = zones,
    };

    return raw_thread_pool_submit(handle_aiocb_zone_report, &request);
}
#endif
/*
* zone management operations - Execute an operation on a zone
*/
#if defined(CONFIG_BLKZONED)
/*
 * Execute zone management operation @op on the byte range
 * [@offset, @offset + @len) of @bs.
 *
 * Validation performed before anything is submitted:
 *   - @offset must be a multiple of the zone size (bs->bl.zone_size);
 *   - the range must not extend past the device capacity
 *     (bs->total_sectors converted to bytes);
 *   - @len must be a multiple of the zone size, unless the range ends
 *     exactly at the device capacity.
 * NOTE(review): the mask-based tests are alignment tests only when the zone
 * size is a power of two - confirm this holds for every supported device.
 *
 * @op is then translated to the matching BLK*ZONE ioctl request code and the
 * request is submitted with handle_aiocb_zone_mgmt() as the worker.
 *
 * Returns 0 on success, -EINVAL if validation fails, -ENOTSUP for an unknown
 * @op, or the non-zero value returned by the submitted request (which is
 * also reported together with the ioctl name).
 */
static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
                                         int64_t offset, int64_t len)
{
    BDRVRawState *s = bs->opaque;
    RawPosixAIOData acb;
    int64_t zone_size, zone_size_mask;
    const char *op_name;
    unsigned long zo;
    int ret;
    int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;

    zone_size = bs->bl.zone_size;
    zone_size_mask = zone_size - 1;
    if (offset & zone_size_mask) {
        error_report("sector offset %" PRId64 " is not aligned to zone size "
                     "%" PRId64 "", offset / 512, zone_size / 512);
        return -EINVAL;
    }

    /*
     * Report a range running past the end of the device as such, instead of
     * describing it as a zone size alignment problem.
     */
    if (offset + len > capacity) {
        error_report("%" PRId64 " sectors at sector offset %" PRId64
                     " exceed the device capacity of %" PRId64 " sectors",
                     len / 512, offset / 512, capacity / 512);
        return -EINVAL;
    }

    /* A range ending exactly at the capacity is exempt from this check */
    if ((offset + len) < capacity && (len & zone_size_mask)) {
        error_report("number of sectors %" PRId64 " is not aligned to zone size"
                     " %" PRId64 "", len / 512, zone_size / 512);
        return -EINVAL;
    }

    switch (op) {
    case BLK_ZO_OPEN:
        op_name = "BLKOPENZONE";
        zo = BLKOPENZONE;
        break;
    case BLK_ZO_CLOSE:
        op_name = "BLKCLOSEZONE";
        zo = BLKCLOSEZONE;
        break;
    case BLK_ZO_FINISH:
        op_name = "BLKFINISHZONE";
        zo = BLKFINISHZONE;
        break;
    case BLK_ZO_RESET:
        op_name = "BLKRESETZONE";
        zo = BLKRESETZONE;
        break;
    default:
        error_report("Unsupported zone op: 0x%x", op);
        return -ENOTSUP;
    }

    acb = (RawPosixAIOData) {
        .bs             = bs,
        .aio_fildes     = s->fd,
        .aio_type       = QEMU_AIO_ZONE_MGMT,
        .aio_offset     = offset,
        .aio_nbytes     = len,
        .zone_mgmt  = {
            .op = zo,
        },
    };

    ret = raw_thread_pool_submit(handle_aiocb_zone_mgmt, &acb);
    if (ret != 0) {
        error_report("ioctl %s failed %d", op_name, ret);
    }

    return ret;
}
#endif
static coroutine_fn int
raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
bool blkdev)
@ -3783,6 +4085,13 @@ static BlockDriver bdrv_host_device = {
#ifdef __linux__
.bdrv_co_ioctl = hdev_co_ioctl,
#endif
/* zoned device */
#if defined(CONFIG_BLKZONED)
/* zone management operations */
.bdrv_co_zone_report = raw_co_zone_report,
.bdrv_co_zone_mgmt = raw_co_zone_mgmt,
#endif
};
#if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)