Merge tag 'pull-block-2023-11-06' of https://gitlab.com/hreitz/qemu into staging

Block patches:
- One patch to make qcow2's discard-no-unref option do a better job of what
  it is supposed to do (i.e. prevent fragmentation)
- Two fixes for zoned requests

# -----BEGIN PGP SIGNATURE-----
#
# iQJGBAABCAAwFiEEy2LXoO44KeRfAE00ofpA0JgBnN8FAmVJHbgSHGhyZWl0ekBy
# ZWRoYXQuY29tAAoJEKH6QNCYAZzfLn4QAKxuUYZaXirv6K4U2tW4aAJtc5uESdwv
# WYhG7YU7MleBGCY0fRoih5thrPrzRLC8o1QhbRcA36+/PAZf4BYrJEfqLUdzuN5x
# 6Vb1n3NRUzPD1+VfL/B9hVZhFbtTOUZuxPGEqCoHAmqBaeKuYRT1bLZbtRtPVLSk
# 5eTMiyrpRMlBWc7O71eGKLqU4k0vAznwHBGf2Z93qWAsKcRZCwbAWYa7Q6rJ9jJ8
# 1jNsQuAk0p74/uGEpFhoEVrFEcV6pMbI4+jB9i0t9YYxT0tLIdIX1VUx+AHJfItk
# IF2stB6SFOaAy2W3Fn+0oJvz40aMLzg9VjEeTpGmdlKC67ZTYa6Obwzy5WNLPIap
# k7VUheUEe8qoKUtxQNxGLR/HKEJSFXyhU0lgAGxE1gl2xc1QFFFsrimpwFd3d37j
# 3PwfhjARHonf4ZXgsvtIjb7nG9seMZYO7Vht0OztJyW8c2XN5OFVPir9xLbd9VUg
# wZNGB8jAsHgj77+S/mRIwpP+laKL8wB7zYZ1mgFI98QJIYqL8tGdV/IiUhLljHzc
# XAmwekOhBMMbgHhliBy9zDuTy59+zZ0FoxZPn/JvBjqBAkEnz9EbhHxi2imQg+1d
# XSoLbx1X1yEbepWz8mCGiveLIPkt+3qMJuuQF76nURaA+nm3tCl/nKca6QLnVKzU
# 2QtPWS0qRmwd
# =5w7S
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 07 Nov 2023 01:09:12 HKT
# gpg:                using RSA key CB62D7A0EE3829E45F004D34A1FA40D098019CDF
# gpg:                issuer "hreitz@redhat.com"
# gpg: Good signature from "Hanna Reitz <hreitz@redhat.com>" [unknown]
# gpg: WARNING: The key's User ID is not certified with a trusted signature!
# gpg:          There is no indication that the signature belongs to the owner.
# Primary key fingerprint: CB62 D7A0 EE38 29E4 5F00  4D34 A1FA 40D0 9801 9CDF

* tag 'pull-block-2023-11-06' of https://gitlab.com/hreitz/qemu:
  file-posix: fix over-writing of returning zone_append offset
  block/file-posix: fix update_zones_wp() caller
  qcow2: keep reference on zeroize with discard-no-unref enabled

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
commit 80aaef96b1
Author: Stefan Hajnoczi <stefanha@redhat.com>
Date:   2023-11-07 09:42:17 +08:00

4 changed files with 51 additions and 28 deletions

diff --git a/block/file-posix.c b/block/file-posix.c
@@ -160,7 +160,6 @@ typedef struct BDRVRawState {
     bool has_write_zeroes:1;
     bool use_linux_aio:1;
     bool use_linux_io_uring:1;
-    int64_t *offset; /* offset of zone append operation */
     int page_cache_inconsistent; /* errno from fdatasync failure */
     bool has_fallocate;
     bool needs_alignment;
@@ -2445,12 +2444,13 @@ static bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
     return true;
 }
 
-static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
+static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
                                    uint64_t bytes, QEMUIOVector *qiov, int type)
 {
     BDRVRawState *s = bs->opaque;
     RawPosixAIOData acb;
     int ret;
+    uint64_t offset = *offset_ptr;
 
     if (fd_open(bs) < 0)
         return -EIO;
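
The signature change above is the heart of the zone-append fix: the caller's
result slot now travels through raw_co_prw() as offset_ptr instead of being
parked in the per-device BDRVRawState.offset field, where a second in-flight
zone append could overwrite it before the first completed. A minimal
standalone C sketch of the two patterns (the names below are illustrative,
not QEMU's):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Racy pattern: one shared result slot per device, as with the old
     * BDRVRawState.offset field. */
    struct dev {
        int64_t *append_result;        /* last submitter wins */
    };

    static void submit_shared(struct dev *d, int64_t *result)
    {
        d->append_result = result;     /* request B clobbers request A's
                                        * slot before A's I/O completes */
    }

    static void complete_shared(struct dev *d, int64_t where)
    {
        *d->append_result = where;     /* may land in the wrong request */
    }

    /* Fixed pattern: the result slot is part of the request itself, like
     * the new offset_ptr parameter. */
    struct req {
        int64_t *offset_ptr;           /* in: target zone; out: where the
                                        * append actually landed */
    };

    static void complete_per_request(struct req *r, int64_t where)
    {
        *r->offset_ptr = where;        /* always the submitter's slot */
    }

    int main(void)
    {
        struct dev d;
        int64_t a = 0, b = 0;

        submit_shared(&d, &a);
        submit_shared(&d, &b);         /* A's result pointer is lost */
        complete_shared(&d, 4096);     /* meant for A, lands in b */
        complete_shared(&d, 8192);
        printf("shared: a=%" PRId64 " b=%" PRId64 "\n", a, b); /* a=0 b=8192 */

        struct req ra = { .offset_ptr = &a }, rb = { .offset_ptr = &b };
        complete_per_request(&ra, 4096);
        complete_per_request(&rb, 8192);
        printf("per-request: a=%" PRId64 " b=%" PRId64 "\n", a, b);
        return 0;
    }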
@@ -2513,8 +2513,8 @@ out:
         uint64_t *wp = &wps->wp[offset / bs->bl.zone_size];
         if (!BDRV_ZT_IS_CONV(*wp)) {
             if (type & QEMU_AIO_ZONE_APPEND) {
-                *s->offset = *wp;
-                trace_zbd_zone_append_complete(bs, *s->offset
+                *offset_ptr = *wp;
+                trace_zbd_zone_append_complete(bs, *offset_ptr
                                                >> BDRV_SECTOR_BITS);
             }
             /* Advance the wp if needed */
@@ -2523,7 +2523,10 @@ out:
             }
         }
     } else {
-        update_zones_wp(bs, s->fd, 0, 1);
+        /*
+         * write and append write are not allowed to cross zone boundaries
+         */
+        update_zones_wp(bs, s->fd, offset, 1);
     }
 
     qemu_co_mutex_unlock(&wps->colock);
@@ -2536,14 +2539,14 @@ static int coroutine_fn raw_co_preadv(BlockDriverState *bs, int64_t offset,
                                       int64_t bytes, QEMUIOVector *qiov,
                                       BdrvRequestFlags flags)
 {
-    return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ);
+    return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_READ);
 }
 
 static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset,
                                        int64_t bytes, QEMUIOVector *qiov,
                                        BdrvRequestFlags flags)
 {
-    return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
+    return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_WRITE);
 }
 
 static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
@@ -3470,7 +3473,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
                             len >> BDRV_SECTOR_BITS);
     ret = raw_thread_pool_submit(handle_aiocb_zone_mgmt, &acb);
     if (ret != 0) {
-        update_zones_wp(bs, s->fd, offset, i);
+        update_zones_wp(bs, s->fd, offset, nrz);
         error_report("ioctl %s failed %d", op_name, ret);
         return ret;
     }
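
Both update_zones_wp() changes above enforce the same invariant: once a zoned
write or a zone-management ioctl fails, the cached write pointers can no
longer be trusted and must be re-read from the device, starting at the zone
the request actually touched (not zone 0) and covering every zone the
operation spans (nrz). A simplified, self-contained sketch of that recovery
step, assuming a report-zones style query like the one file-posix performs:

    #include <stdint.h>

    /* Illustrative stand-ins for QEMU's zone bookkeeping; not the real
     * types. */
    struct zone_cache {
        uint64_t zone_size;
        uint64_t wp[16];               /* cached write pointer per zone */
    };

    /* Stub for the device query; file-posix does this kind of lookup via
     * ioctl(BLKREPORTZONE). */
    static uint64_t device_wp(int fd, uint64_t zone_start)
    {
        (void)fd;
        return zone_start;             /* pretend the zone was reset */
    }

    /*
     * Re-read the write pointers of the `count` zones starting at the
     * zone containing byte offset `offset`.  Passing offset 0 (the old
     * bug) refreshed zone 0 no matter where the failed request was, and
     * passing the wrong count under-refreshes after a failed
     * zone-management operation.
     */
    static void refresh_wp(struct zone_cache *c, int fd,
                           uint64_t offset, unsigned count)
    {
        uint64_t first = offset / c->zone_size;
        for (unsigned i = 0; i < count; i++) {
            c->wp[first + i] = device_wp(fd, (first + i) * c->zone_size);
        }
    }

    int main(void)
    {
        struct zone_cache c = { .zone_size = 1 << 20 };
        refresh_wp(&c, /* fd */ -1, 5 << 20, 2);   /* zones 5 and 6 */
        return 0;
    }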
@@ -3506,8 +3509,6 @@ static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
     int64_t zone_size_mask = bs->bl.zone_size - 1;
     int64_t iov_len = 0;
     int64_t len = 0;
-    BDRVRawState *s = bs->opaque;
-    s->offset = offset;
 
     if (*offset & zone_size_mask) {
         error_report("sector offset %" PRId64 " is not aligned to zone size "
@@ -3528,7 +3529,7 @@ static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
     }
 
     trace_zbd_zone_append(bs, *offset >> BDRV_SECTOR_BITS);
-    return raw_co_prw(bs, *offset, len, qiov, QEMU_AIO_ZONE_APPEND);
+    return raw_co_prw(bs, offset, len, qiov, QEMU_AIO_ZONE_APPEND);
 }
 #endif

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
@@ -1983,7 +1983,7 @@ discard_in_l2_slice(BlockDriverState *bs, uint64_t offset, uint64_t nb_clusters,
                 /* If we keep the reference, pass on the discard still */
                 bdrv_pdiscard(s->data_file, old_l2_entry & L2E_OFFSET_MASK,
                               s->cluster_size);
-           }
+            }
         }
     }
 
     qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
@@ -2061,9 +2061,15 @@ zero_in_l2_slice(BlockDriverState *bs, uint64_t offset,
         QCow2ClusterType type = qcow2_get_cluster_type(bs, old_l2_entry);
         bool unmap = (type == QCOW2_CLUSTER_COMPRESSED) ||
             ((flags & BDRV_REQ_MAY_UNMAP) && qcow2_cluster_is_allocated(type));
-        uint64_t new_l2_entry = unmap ? 0 : old_l2_entry;
+        bool keep_reference =
+            (s->discard_no_unref && type != QCOW2_CLUSTER_COMPRESSED);
+        uint64_t new_l2_entry = old_l2_entry;
         uint64_t new_l2_bitmap = old_l2_bitmap;
 
+        if (unmap && !keep_reference) {
+            new_l2_entry = 0;
+        }
+
         if (has_subclusters(s)) {
             new_l2_bitmap = QCOW_L2_BITMAP_ALL_ZEROES;
         } else {
@@ -2081,9 +2087,17 @@ zero_in_l2_slice(BlockDriverState *bs, uint64_t offset,
             set_l2_bitmap(s, l2_slice, l2_index + i, new_l2_bitmap);
         }
 
-        /* Then decrease the refcount */
         if (unmap) {
-            qcow2_free_any_cluster(bs, old_l2_entry, QCOW2_DISCARD_REQUEST);
+            if (!keep_reference) {
+                /* Then decrease the refcount */
+                qcow2_free_any_cluster(bs, old_l2_entry, QCOW2_DISCARD_REQUEST);
+            } else if (s->discard_passthrough[QCOW2_DISCARD_REQUEST] &&
+                       (type == QCOW2_CLUSTER_NORMAL ||
+                        type == QCOW2_CLUSTER_ZERO_ALLOC)) {
+                /* If we keep the reference, pass on the discard still */
+                bdrv_pdiscard(s->data_file, old_l2_entry & L2E_OFFSET_MASK,
+                              s->cluster_size);
+            }
         }
     }
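
The zero_in_l2_slice() change reads most clearly as a decision table: unmap
says whether zeroizing may drop the mapping at all, and the new
keep_reference says whether discard-no-unref overrides that for
non-compressed clusters, keeping the L2 entry and refcount while still
passing the discard down when pass-discard-request allows. A compact model of
that logic (simplified names, not QEMU's exact types, and omitting the
subcluster-bitmap update, which makes the guest read zeroes either way):

    #include <assert.h>
    #include <stdbool.h>

    /* Simplified stand-ins for qcow2's cluster types. */
    enum cluster_type { CL_COMPRESSED, CL_NORMAL, CL_ZERO_ALLOC, CL_UNALLOCATED };

    struct decision {
        bool clear_l2_entry;   /* drop the host-cluster mapping from L2 */
        bool free_cluster;     /* decrease the refcount */
        bool pass_discard;     /* bdrv_pdiscard() the data file anyway */
    };

    static struct decision zeroize(enum cluster_type type, bool may_unmap,
                                   bool discard_no_unref, bool passthrough)
    {
        bool allocated = (type == CL_NORMAL || type == CL_ZERO_ALLOC);
        /* Compressed clusters are always unmapped; others only on
         * BDRV_REQ_MAY_UNMAP. */
        bool unmap = (type == CL_COMPRESSED) || (may_unmap && allocated);
        /* discard-no-unref never applies to compressed clusters. */
        bool keep_ref = discard_no_unref && type != CL_COMPRESSED;

        struct decision d = {
            .clear_l2_entry = unmap && !keep_ref,
            .free_cluster   = unmap && !keep_ref,
            .pass_discard   = unmap && keep_ref && passthrough && allocated,
        };
        return d;
    }

    int main(void)
    {
        /* MAY_UNMAP zeroize of a normal cluster, discard-no-unref=on:
         * mapping and refcount stay, the discard is still passed down. */
        struct decision d = zeroize(CL_NORMAL, true, true, true);
        assert(!d.clear_l2_entry && !d.free_cluster && d.pass_discard);
        return 0;
    }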

diff --git a/qapi/block-core.json b/qapi/block-core.json
@@ -3528,16 +3528,20 @@
 # @pass-discard-other: whether discard requests for the data source
 #     should be issued on other occasions where a cluster gets freed
 #
-# @discard-no-unref: when enabled, discards from the guest will not
-#     cause cluster allocations to be relinquished.  This prevents
-#     qcow2 fragmentation that would be caused by such discards.
-#     Besides potential performance degradation, such fragmentation
-#     can lead to increased allocation of clusters past the end of the
-#     image file, resulting in image files whose file length can grow
-#     much larger than their guest disk size would suggest.  If image
-#     file length is of concern (e.g. when storing qcow2 images
-#     directly on block devices), you should consider enabling this
-#     option.  (since 8.1)
+# @discard-no-unref: when enabled, data clusters will remain
+#     preallocated when they are no longer used, e.g. because they are
+#     discarded or converted to zero clusters.  As usual, whether the
+#     old data is discarded or kept on the protocol level (i.e. in the
+#     image file) depends on the setting of the pass-discard-request
+#     option.  Keeping the clusters preallocated prevents qcow2
+#     fragmentation that would otherwise be caused by freeing and
+#     re-allocating them later.  Besides potential performance
+#     degradation, such fragmentation can lead to increased allocation
+#     of clusters past the end of the image file, resulting in image
+#     files whose file length can grow much larger than their guest disk
+#     size would suggest.  If image file length is of concern (e.g. when
+#     storing qcow2 images directly on block devices), you should
+#     consider enabling this option.  (since 8.1)
 #
 # @overlap-check: which overlap checks to perform for writes to the
 #     image, defaults to 'cached' (since 2.2)

diff --git a/qemu-options.hx b/qemu-options.hx
@@ -1457,9 +1457,13 @@ SRST
         (on/off; default: off)
 
     ``discard-no-unref``
-        When enabled, discards from the guest will not cause cluster
-        allocations to be relinquished. This prevents qcow2 fragmentation
-        that would be caused by such discards. Besides potential
+        When enabled, data clusters will remain preallocated when they are
+        no longer used, e.g. because they are discarded or converted to
+        zero clusters. As usual, whether the old data is discarded or kept
+        on the protocol level (i.e. in the image file) depends on the
+        setting of the pass-discard-request option. Keeping the clusters
+        preallocated prevents qcow2 fragmentation that would otherwise be
+        caused by freeing and re-allocating them later. Besides potential
         performance degradation, such fragmentation can lead to increased
         allocation of clusters past the end of the image file,
         resulting in image files whose file length can grow much larger
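
As a usage note (the option string here is an illustration, not part of this
commit): the option is set per qcow2 node on the command line, e.g.
-blockdev driver=qcow2,node-name=disk0,file.driver=file,file.filename=disk.qcow2,discard-no-unref=on.
Whether the no-longer-used data is then actually discarded inside the image
file remains governed by pass-discard-request, exactly as both documentation
hunks above describe.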