-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1

iQEcBAABAgAGBQJUV2wdAAoJEJykq7OBq3PIjcAH/29rl938ETw1wjXxYe3uH+R6
K2yFEiPh9/cOJSH0mJ+gD8DZIN+iyR4eoQGP2s5ALFPcX3bkYxRLlUeYK0BCp883
esc7gO6XPhLvTVqP0xgACRCdUwH2I0VTToDlHjXXZogyI/DuDX3gzWJufE3x1DGs
WNTMOp5n/uYkWH3rI3DkInmbSddEz3pgX65a8BuYtw0V/RSeSRnHKDYHMygvJBRL
EVfWRNeOIrZ730CyJry0t8ITjsZxiBDKXR5glNSwaIfQUfGkTSWi9YNSurNYkUDr
aMS2rgvOVlrOUDKTHUj9oS3jgoGWcDtlk9E1MeSoyIptbRoMhdFVl1AUJZsrMJU=
=Mfbu
-----END PGP SIGNATURE-----

Merge remote-tracking branch 'remotes/stefanha/tags/block-pull-request' into staging

# gpg: Signature made Mon 03 Nov 2014 11:50:53 GMT using RSA key ID 81AB73C8
# gpg: Good signature from "Stefan Hajnoczi <stefanha@redhat.com>"
# gpg:                 aka "Stefan Hajnoczi <stefanha@gmail.com>"

* remotes/stefanha/tags/block-pull-request: (53 commits)
  block: declare blockjobs and dataplane friends!
  block: let commit blockjob run in BDS AioContext
  block: let mirror blockjob run in BDS AioContext
  block: let stream blockjob run in BDS AioContext
  block: let backup blockjob run in BDS AioContext
  block: add bdrv_drain()
  blockjob: add block_job_defer_to_main_loop()
  blockdev: add note that block_job_cb() must be thread-safe
  blockdev: acquire AioContext in blockdev_mark_auto_del()
  blockdev: acquire AioContext in do_qmp_query_block_jobs_one()
  block: acquire AioContext in generic blockjob QMP commands
  iotests: Expand test 061
  block/qcow2: Simplify shared L2 handling in amend
  block/qcow2: Make get_refcount() global
  block/qcow2: Implement status CB for amend
  qemu-img: Fix insignificant memleak
  qemu-img: Add progress output for amend
  block: Add status callback to bdrv_amend_options()
  block: qemu-iotest 107 supports NFS
  iotests: Add test for qcow2's bdrv_make_empty
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
commit 9a33c0c851
Author: Peter Maydell <peter.maydell@linaro.org>
Date:   2014-11-03 18:34:08 +00:00
53 changed files with 1637 additions and 377 deletions

block/Makefile.objs

@ -9,7 +9,7 @@ block-obj-y += block-backend.o snapshot.o qapi.o
block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
block-obj-$(CONFIG_POSIX) += raw-posix.o
block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
block-obj-y += null.o
block-obj-y += null.o mirror.o
block-obj-y += nbd.o nbd-client.o sheepdog.o
block-obj-$(CONFIG_LIBISCSI) += iscsi.o
@ -23,7 +23,6 @@ block-obj-y += accounting.o
common-obj-y += stream.o
common-obj-y += commit.o
common-obj-y += mirror.o
common-obj-y += backup.o
iscsi.o-cflags := $(LIBISCSI_CFLAGS)

block/backup.c

@ -227,9 +227,25 @@ static BlockErrorAction backup_error_action(BackupBlockJob *job,
}
}
typedef struct {
int ret;
} BackupCompleteData;
static void backup_complete(BlockJob *job, void *opaque)
{
BackupBlockJob *s = container_of(job, BackupBlockJob, common);
BackupCompleteData *data = opaque;
bdrv_unref(s->target);
block_job_completed(job, data->ret);
g_free(data);
}
static void coroutine_fn backup_run(void *opaque)
{
BackupBlockJob *job = opaque;
BackupCompleteData *data;
BlockDriverState *bs = job->common.bs;
BlockDriverState *target = job->target;
BlockdevOnError on_target_error = job->on_target_error;
@ -344,9 +360,10 @@ static void coroutine_fn backup_run(void *opaque)
hbitmap_free(job->bitmap);
bdrv_iostatus_disable(target);
bdrv_unref(target);
block_job_completed(&job->common, ret);
data = g_malloc(sizeof(*data));
data->ret = ret;
block_job_defer_to_main_loop(&job->common, backup_complete, data);
}
void backup_start(BlockDriverState *bs, BlockDriverState *target,

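The backup hunk above is the template for the whole series: the coroutine no longer calls block_job_completed() (and, here, bdrv_unref()) itself, but packs its return value into a heap-allocated struct and hands it to block_job_defer_to_main_loop(), so the final graph manipulation happens in the main loop rather than in the job's AioContext. A minimal sketch of the pattern, matching the callback signature used above (ExampleCompleteData and the function names are illustrative, not QEMU code):

#include "block/blockjob.h"  /* BlockJob, block_job_defer_to_main_loop() */

typedef struct {
    int ret;
} ExampleCompleteData;

static void example_complete(BlockJob *job, void *opaque)
{
    ExampleCompleteData *data = opaque;

    /* Runs in the main loop, where graph changes such as bdrv_unref()
     * are safe even when the job's BDS lives in another AioContext. */
    block_job_completed(job, data->ret);
    g_free(data);
}

static void coroutine_fn example_run(void *opaque)
{
    BlockJob *job = opaque;
    ExampleCompleteData *data;
    int ret = 0;

    /* ... the actual copy loop would set ret here ... */

    data = g_malloc(sizeof(*data));
    data->ret = ret;
    block_job_defer_to_main_loop(job, example_complete, data);
}
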
block/blkdebug.c

@ -195,6 +195,8 @@ static const char *event_names[BLKDBG_EVENT_MAX] = {
[BLKDBG_PWRITEV] = "pwritev",
[BLKDBG_PWRITEV_ZERO] = "pwritev_zero",
[BLKDBG_PWRITEV_DONE] = "pwritev_done",
[BLKDBG_EMPTY_IMAGE_PREPARE] = "empty_image_prepare",
};
static int get_event_by_name(const char *name, BlkDebugEvent *event)

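The new empty_image_prepare event is fired by qcow2's bdrv_make_empty implementation (added later in this commit), so error injection can target the emptying path. A hypothetical blkdebug rule file (name and values illustrative) that fails that step with EIO:

[inject-error]
event = "empty_image_prepare"
errno = "5"

It would be passed with the usual syntax, e.g. blkdebug:blkdebug.conf:test.qcow2.
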
block/commit.c

@ -60,17 +60,50 @@ static int coroutine_fn commit_populate(BlockDriverState *bs,
return 0;
}
static void coroutine_fn commit_run(void *opaque)
typedef struct {
int ret;
} CommitCompleteData;
static void commit_complete(BlockJob *job, void *opaque)
{
CommitBlockJob *s = opaque;
CommitBlockJob *s = container_of(job, CommitBlockJob, common);
CommitCompleteData *data = opaque;
BlockDriverState *active = s->active;
BlockDriverState *top = s->top;
BlockDriverState *base = s->base;
BlockDriverState *overlay_bs;
int ret = data->ret;
if (!block_job_is_cancelled(&s->common) && ret == 0) {
/* success */
ret = bdrv_drop_intermediate(active, top, base, s->backing_file_str);
}
/* restore base open flags here if appropriate (e.g., change the base back
* to r/o). These reopens do not need to be atomic, since we won't abort
* even on failure here */
if (s->base_flags != bdrv_get_flags(base)) {
bdrv_reopen(base, s->base_flags, NULL);
}
overlay_bs = bdrv_find_overlay(active, top);
if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) {
bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL);
}
g_free(s->backing_file_str);
block_job_completed(&s->common, ret);
g_free(data);
}
static void coroutine_fn commit_run(void *opaque)
{
CommitBlockJob *s = opaque;
CommitCompleteData *data;
BlockDriverState *top = s->top;
BlockDriverState *base = s->base;
int64_t sector_num, end;
int ret = 0;
int n = 0;
void *buf;
void *buf = NULL;
int bytes_written = 0;
int64_t base_len;
@ -78,18 +111,18 @@ static void coroutine_fn commit_run(void *opaque)
if (s->common.len < 0) {
goto exit_restore_reopen;
goto out;
}
ret = base_len = bdrv_getlength(base);
if (base_len < 0) {
goto exit_restore_reopen;
goto out;
}
if (base_len < s->common.len) {
ret = bdrv_truncate(base, s->common.len);
if (ret) {
goto exit_restore_reopen;
goto out;
}
}
@ -128,7 +161,7 @@ wait:
if (s->on_error == BLOCKDEV_ON_ERROR_STOP ||
s->on_error == BLOCKDEV_ON_ERROR_REPORT||
(s->on_error == BLOCKDEV_ON_ERROR_ENOSPC && ret == -ENOSPC)) {
goto exit_free_buf;
goto out;
} else {
n = 0;
continue;
@ -140,27 +173,12 @@ wait:
ret = 0;
if (!block_job_is_cancelled(&s->common) && sector_num == end) {
/* success */
ret = bdrv_drop_intermediate(active, top, base, s->backing_file_str);
}
exit_free_buf:
out:
qemu_vfree(buf);
exit_restore_reopen:
/* restore base open flags here if appropriate (e.g., change the base back
* to r/o). These reopens do not need to be atomic, since we won't abort
* even on failure here */
if (s->base_flags != bdrv_get_flags(base)) {
bdrv_reopen(base, s->base_flags, NULL);
}
overlay_bs = bdrv_find_overlay(active, top);
if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) {
bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL);
}
g_free(s->backing_file_str);
block_job_completed(&s->common, ret);
data = g_malloc(sizeof(*data));
data->ret = ret;
block_job_defer_to_main_loop(&s->common, commit_complete, data);
}
static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp)

block/curl.c

@ -64,6 +64,7 @@ static CURLMcode __curl_multi_socket_action(CURLM *multi_handle,
#define SECTOR_SIZE 512
#define READ_AHEAD_DEFAULT (256 * 1024)
#define CURL_TIMEOUT_DEFAULT 5
#define CURL_TIMEOUT_MAX 10000
#define FIND_RET_NONE 0
#define FIND_RET_OK 1
@ -112,7 +113,7 @@ typedef struct BDRVCURLState {
char *url;
size_t readahead_size;
bool sslverify;
int timeout;
uint64_t timeout;
char *cookie;
bool accept_range;
AioContext *aio_context;
@ -390,7 +391,7 @@ static CURLState *curl_init_state(BlockDriverState *bs, BDRVCURLState *s)
if (s->cookie) {
curl_easy_setopt(state->curl, CURLOPT_COOKIE, s->cookie);
}
curl_easy_setopt(state->curl, CURLOPT_TIMEOUT, s->timeout);
curl_easy_setopt(state->curl, CURLOPT_TIMEOUT, (long)s->timeout);
curl_easy_setopt(state->curl, CURLOPT_WRITEFUNCTION,
(void *)curl_read_cb);
curl_easy_setopt(state->curl, CURLOPT_WRITEDATA, (void *)state);
@ -546,6 +547,10 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags,
s->timeout = qemu_opt_get_number(opts, CURL_BLOCK_OPT_TIMEOUT,
CURL_TIMEOUT_DEFAULT);
if (s->timeout > CURL_TIMEOUT_MAX) {
error_setg(errp, "timeout parameter is too large or negative");
goto out_noclean;
}
s->sslverify = qemu_opt_get_bool(opts, CURL_BLOCK_OPT_SSLVERIFY, true);

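Since s->timeout is now uint64_t and qemu_opt_get_number() returns an unsigned value, a negative option value wraps around to a huge number, so the single upper-bound check above catches both cases; that is why the message says "too large or negative". A standalone illustration (not QEMU code; only CURL_TIMEOUT_MAX is taken from the hunk above):

#include <stdint.h>
#include <stdio.h>

#define CURL_TIMEOUT_MAX 10000

int main(void)
{
    int64_t user_input = -5;       /* what the user typed */
    uint64_t timeout = user_input; /* wraps to 18446744073709551611 */

    if (timeout > CURL_TIMEOUT_MAX) {
        printf("rejected: timeout parameter is too large or negative\n");
    }
    return 0;
}
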
block/iscsi.c

@ -362,6 +362,12 @@ static int coroutine_fn iscsi_co_writev(BlockDriverState *bs,
return -EINVAL;
}
if (bs->bl.max_transfer_length && nb_sectors > bs->bl.max_transfer_length) {
error_report("iSCSI Error: Write of %d sectors exceeds max_xfer_len "
"of %d sectors", nb_sectors, bs->bl.max_transfer_length);
return -EINVAL;
}
lba = sector_qemu2lun(sector_num, iscsilun);
num_sectors = sector_qemu2lun(nb_sectors, iscsilun);
iscsi_co_init_iscsitask(iscsilun, &iTask);
@ -529,6 +535,12 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs,
return -EINVAL;
}
if (bs->bl.max_transfer_length && nb_sectors > bs->bl.max_transfer_length) {
error_report("iSCSI Error: Read of %d sectors exceeds max_xfer_len "
"of %d sectors", nb_sectors, bs->bl.max_transfer_length);
return -EINVAL;
}
if (iscsilun->lbprz && nb_sectors >= ISCSI_CHECKALLOC_THRES &&
!iscsi_allocationmap_is_allocated(iscsilun, sector_num, nb_sectors)) {
int64_t ret;
@ -1489,31 +1501,44 @@ static void iscsi_close(BlockDriverState *bs)
memset(iscsilun, 0, sizeof(IscsiLun));
}
static int sector_limits_lun2qemu(int64_t sector, IscsiLun *iscsilun)
{
return MIN(sector_lun2qemu(sector, iscsilun), INT_MAX / 2 + 1);
}
static void iscsi_refresh_limits(BlockDriverState *bs, Error **errp)
{
IscsiLun *iscsilun = bs->opaque;
/* We don't actually refresh here, but just return data queried in
* iscsi_open(): iscsi targets don't change their limits. */
IscsiLun *iscsilun = bs->opaque;
uint32_t max_xfer_len = iscsilun->use_16_for_rw ? 0xffffffff : 0xffff;
if (iscsilun->bl.max_xfer_len) {
max_xfer_len = MIN(max_xfer_len, iscsilun->bl.max_xfer_len);
}
bs->bl.max_transfer_length = sector_limits_lun2qemu(max_xfer_len, iscsilun);
if (iscsilun->lbp.lbpu) {
if (iscsilun->bl.max_unmap < 0xffffffff) {
bs->bl.max_discard = sector_lun2qemu(iscsilun->bl.max_unmap,
iscsilun);
bs->bl.max_discard =
sector_limits_lun2qemu(iscsilun->bl.max_unmap, iscsilun);
}
bs->bl.discard_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran,
iscsilun);
bs->bl.discard_alignment =
sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun);
}
if (iscsilun->bl.max_ws_len < 0xffffffff) {
bs->bl.max_write_zeroes = sector_lun2qemu(iscsilun->bl.max_ws_len,
iscsilun);
bs->bl.max_write_zeroes =
sector_limits_lun2qemu(iscsilun->bl.max_ws_len, iscsilun);
}
if (iscsilun->lbp.lbpws) {
bs->bl.write_zeroes_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran,
iscsilun);
bs->bl.write_zeroes_alignment =
sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun);
}
bs->bl.opt_transfer_length = sector_lun2qemu(iscsilun->bl.opt_xfer_len,
iscsilun);
bs->bl.opt_transfer_length =
sector_limits_lun2qemu(iscsilun->bl.opt_xfer_len, iscsilun);
}
/* Since iscsi_open() ignores bdrv_flags, there is nothing to do here in

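sector_limits_lun2qemu() converts a limit from LUN blocks to 512-byte sectors and clamps it to INT_MAX / 2 + 1 so the result still fits the int-typed BlockLimits fields (the extra factor of two presumably leaves headroom for later arithmetic). A standalone sketch of the overflow being guarded against (the 4 KiB block size and the limit value are hypothetical):

#include <inttypes.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t max_xfer_len = 0xffffffffu; /* device limit, in LUN blocks */
    int block_size = 4096;               /* bytes per LUN block */

    int64_t sectors = (int64_t)max_xfer_len * (block_size / 512);
    printf("converted: %" PRId64 " sectors (INT_MAX is %d)\n",
           sectors, INT_MAX);

    int clamped = sectors > INT_MAX / 2 + 1 ? INT_MAX / 2 + 1 : (int)sectors;
    printf("clamped:   %d sectors\n", clamped);
    return 0;
}
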
block/mirror.c

@ -45,6 +45,7 @@ typedef struct MirrorBlockJob {
int64_t sector_num;
int64_t granularity;
size_t buf_size;
int64_t bdev_length;
unsigned long *cow_bitmap;
BdrvDirtyBitmap *dirty_bitmap;
HBitmapIter hbi;
@ -54,6 +55,7 @@ typedef struct MirrorBlockJob {
unsigned long *in_flight_bitmap;
int in_flight;
int sectors_in_flight;
int ret;
} MirrorBlockJob;
@ -87,6 +89,7 @@ static void mirror_iteration_done(MirrorOp *op, int ret)
trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors, ret);
s->in_flight--;
s->sectors_in_flight -= op->nb_sectors;
iov = op->qiov.iov;
for (i = 0; i < op->qiov.niov; i++) {
MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base;
@ -98,8 +101,11 @@ static void mirror_iteration_done(MirrorOp *op, int ret)
chunk_num = op->sector_num / sectors_per_chunk;
nb_chunks = op->nb_sectors / sectors_per_chunk;
bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks);
if (s->cow_bitmap && ret >= 0) {
bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
if (ret >= 0) {
if (s->cow_bitmap) {
bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
}
s->common.offset += (uint64_t)op->nb_sectors * BDRV_SECTOR_SIZE;
}
qemu_iovec_destroy(&op->qiov);
@ -172,7 +178,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
hbitmap_next_sector = s->sector_num;
sector_num = s->sector_num;
sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
end = s->common.len >> BDRV_SECTOR_BITS;
end = s->bdev_length / BDRV_SECTOR_SIZE;
/* Extend the QEMUIOVector to include all adjacent blocks that will
* be copied in this operation.
@ -284,6 +290,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
/* Copy the dirty cluster. */
s->in_flight++;
s->sectors_in_flight += nb_sectors;
trace_mirror_one_iteration(s, sector_num, nb_sectors);
bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
mirror_read_complete, op);
@ -314,9 +321,56 @@ static void mirror_drain(MirrorBlockJob *s)
}
}
typedef struct {
int ret;
} MirrorExitData;
static void mirror_exit(BlockJob *job, void *opaque)
{
MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
MirrorExitData *data = opaque;
AioContext *replace_aio_context = NULL;
if (s->to_replace) {
replace_aio_context = bdrv_get_aio_context(s->to_replace);
aio_context_acquire(replace_aio_context);
}
if (s->should_complete && data->ret == 0) {
BlockDriverState *to_replace = s->common.bs;
if (s->to_replace) {
to_replace = s->to_replace;
}
if (bdrv_get_flags(s->target) != bdrv_get_flags(to_replace)) {
bdrv_reopen(s->target, bdrv_get_flags(to_replace), NULL);
}
bdrv_swap(s->target, to_replace);
if (s->common.driver->job_type == BLOCK_JOB_TYPE_COMMIT) {
/* drop the bs loop chain formed by the swap: break the loop then
* trigger the unref from the top one */
BlockDriverState *p = s->base->backing_hd;
bdrv_set_backing_hd(s->base, NULL);
bdrv_unref(p);
}
}
if (s->to_replace) {
bdrv_op_unblock_all(s->to_replace, s->replace_blocker);
error_free(s->replace_blocker);
bdrv_unref(s->to_replace);
}
if (replace_aio_context) {
aio_context_release(replace_aio_context);
}
g_free(s->replaces);
bdrv_unref(s->target);
block_job_completed(&s->common, data->ret);
g_free(data);
}
static void coroutine_fn mirror_run(void *opaque)
{
MirrorBlockJob *s = opaque;
MirrorExitData *data;
BlockDriverState *bs = s->common.bs;
int64_t sector_num, end, sectors_per_chunk, length;
uint64_t last_pause_ns;
@ -329,11 +383,11 @@ static void coroutine_fn mirror_run(void *opaque)
goto immediate_exit;
}
s->common.len = bdrv_getlength(bs);
if (s->common.len < 0) {
ret = s->common.len;
s->bdev_length = bdrv_getlength(bs);
if (s->bdev_length < 0) {
ret = s->bdev_length;
goto immediate_exit;
} else if (s->common.len == 0) {
} else if (s->bdev_length == 0) {
/* Report BLOCK_JOB_READY and wait for complete. */
block_job_event_ready(&s->common);
s->synced = true;
@ -344,7 +398,7 @@ static void coroutine_fn mirror_run(void *opaque)
goto immediate_exit;
}
length = DIV_ROUND_UP(s->common.len, s->granularity);
length = DIV_ROUND_UP(s->bdev_length, s->granularity);
s->in_flight_bitmap = bitmap_new(length);
/* If we have no backing file yet in the destination, we cannot let
@ -364,7 +418,7 @@ static void coroutine_fn mirror_run(void *opaque)
}
}
end = s->common.len >> BDRV_SECTOR_BITS;
end = s->bdev_length / BDRV_SECTOR_SIZE;
s->buf = qemu_try_blockalign(bs, s->buf_size);
if (s->buf == NULL) {
ret = -ENOMEM;
@ -409,6 +463,12 @@ static void coroutine_fn mirror_run(void *opaque)
}
cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap);
/* s->common.offset contains the number of bytes already processed so
* far, cnt is the number of dirty sectors remaining and
* s->sectors_in_flight is the number of sectors currently being
* processed; together those are the current total operation length */
s->common.len = s->common.offset +
(cnt + s->sectors_in_flight) * BDRV_SECTOR_SIZE;
/* Note that even when no rate limit is applied we need to yield
* periodically with no pending I/O so that qemu_aio_flush() returns.
@ -445,7 +505,6 @@ static void coroutine_fn mirror_run(void *opaque)
* report completion. This way, block-job-cancel will leave
* the target in a consistent state.
*/
s->common.offset = end * BDRV_SECTOR_SIZE;
if (!s->synced) {
block_job_event_ready(&s->common);
s->synced = true;
@ -467,15 +526,13 @@ static void coroutine_fn mirror_run(void *opaque)
* mirror_populate runs.
*/
trace_mirror_before_drain(s, cnt);
bdrv_drain_all();
bdrv_drain(bs);
cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap);
}
ret = 0;
trace_mirror_before_sleep(s, cnt, s->synced, delay_ns);
if (!s->synced) {
/* Publish progress */
s->common.offset = (end - cnt) * BDRV_SECTOR_SIZE;
block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
if (block_job_is_cancelled(&s->common)) {
break;
@ -510,31 +567,10 @@ immediate_exit:
g_free(s->in_flight_bitmap);
bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);
bdrv_iostatus_disable(s->target);
if (s->should_complete && ret == 0) {
BlockDriverState *to_replace = s->common.bs;
if (s->to_replace) {
to_replace = s->to_replace;
}
if (bdrv_get_flags(s->target) != bdrv_get_flags(to_replace)) {
bdrv_reopen(s->target, bdrv_get_flags(to_replace), NULL);
}
bdrv_swap(s->target, to_replace);
if (s->common.driver->job_type == BLOCK_JOB_TYPE_COMMIT) {
/* drop the bs loop chain formed by the swap: break the loop then
* trigger the unref from the top one */
BlockDriverState *p = s->base->backing_hd;
bdrv_set_backing_hd(s->base, NULL);
bdrv_unref(p);
}
}
if (s->to_replace) {
bdrv_op_unblock_all(s->to_replace, s->replace_blocker);
error_free(s->replace_blocker);
bdrv_unref(s->to_replace);
}
g_free(s->replaces);
bdrv_unref(s->target);
block_job_completed(&s->common, ret);
data = g_malloc(sizeof(*data));
data->ret = ret;
block_job_defer_to_main_loop(&s->common, mirror_exit, data);
}
static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
@ -574,16 +610,23 @@ static void mirror_complete(BlockJob *job, Error **errp)
/* check the target bs is not blocked and block all operations on it */
if (s->replaces) {
AioContext *replace_aio_context;
s->to_replace = check_to_replace_node(s->replaces, &local_err);
if (!s->to_replace) {
error_propagate(errp, local_err);
return;
}
replace_aio_context = bdrv_get_aio_context(s->to_replace);
aio_context_acquire(replace_aio_context);
error_setg(&s->replace_blocker,
"block device is in use by block-job-complete");
bdrv_op_block_all(s->to_replace, s->replace_blocker);
bdrv_ref(s->to_replace);
aio_context_release(replace_aio_context);
}
s->should_complete = true;

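With the job length no longer fixed at the device length, mirror recomputes s->common.len on every iteration as bytes already copied plus an estimate of what remains (dirty sectors plus sectors in flight), so progress reporting stays meaningful while new guest writes keep dirtying the source. A standalone sketch of that accounting (all numbers invented):

#include <stdint.h>
#include <stdio.h>

#define SECTOR_SIZE 512

int main(void)
{
    uint64_t offset = 700 * SECTOR_SIZE; /* bytes already processed */
    int64_t dirty_sectors = 250;         /* remaining dirty sectors */
    int sectors_in_flight = 50;          /* currently being copied */

    uint64_t len = offset + (dirty_sectors + sectors_in_flight) * SECTOR_SIZE;
    printf("len=%llu offset=%llu (%.0f%% done)\n",
           (unsigned long long)len, (unsigned long long)offset,
           100.0 * offset / len);
    return 0;
}
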
block/parallels.c

@ -155,7 +155,7 @@ static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num)
offset = sector_num % s->tracks;
/* not allocated */
if ((index > s->catalog_size) || (s->catalog_bitmap[index] == 0))
if ((index >= s->catalog_size) || (s->catalog_bitmap[index] == 0))
return -1;
return
((uint64_t)s->catalog_bitmap[index] * s->off_multiplier + offset) * 512;

block/qcow2-cluster.c

@ -1414,7 +1414,7 @@ int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
* clusters.
*/
static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
unsigned int nb_clusters, enum qcow2_discard_type type)
unsigned int nb_clusters, enum qcow2_discard_type type, bool full_discard)
{
BDRVQcowState *s = bs->opaque;
uint64_t *l2_table;
@ -1436,23 +1436,30 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
old_l2_entry = be64_to_cpu(l2_table[l2_index + i]);
/*
* Make sure that a discarded area reads back as zeroes for v3 images
* (we cannot do it for v2 without actually writing a zero-filled
* buffer). We can skip the operation if the cluster is already marked
* as zero, or if it's unallocated and we don't have a backing file.
* If full_discard is false, make sure that a discarded area reads back
* as zeroes for v3 images (we cannot do it for v2 without actually
* writing a zero-filled buffer). We can skip the operation if the
* cluster is already marked as zero, or if it's unallocated and we
* don't have a backing file.
*
* TODO We might want to use bdrv_get_block_status(bs) here, but we're
* holding s->lock, so that doesn't work today.
*
* If full_discard is true, the sector should not read back as zeroes,
* but rather fall through to the backing file.
*/
switch (qcow2_get_cluster_type(old_l2_entry)) {
case QCOW2_CLUSTER_UNALLOCATED:
if (!bs->backing_hd) {
if (full_discard || !bs->backing_hd) {
continue;
}
break;
case QCOW2_CLUSTER_ZERO:
continue;
if (!full_discard) {
continue;
}
break;
case QCOW2_CLUSTER_NORMAL:
case QCOW2_CLUSTER_COMPRESSED:
@ -1464,7 +1471,7 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
/* First remove L2 entries */
qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
if (s->qcow_version >= 3) {
if (!full_discard && s->qcow_version >= 3) {
l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
} else {
l2_table[l2_index + i] = cpu_to_be64(0);
@ -1483,7 +1490,7 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
}
int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
int nb_sectors, enum qcow2_discard_type type)
int nb_sectors, enum qcow2_discard_type type, bool full_discard)
{
BDRVQcowState *s = bs->opaque;
uint64_t end_offset;
@ -1506,7 +1513,7 @@ int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
/* Each L2 table is handled by its own loop iteration */
while (nb_clusters > 0) {
ret = discard_single_l2(bs, offset, nb_clusters, type);
ret = discard_single_l2(bs, offset, nb_clusters, type, full_discard);
if (ret < 0) {
goto fail;
}
@ -1606,15 +1613,14 @@ fail:
* Expands all zero clusters in a specific L1 table (or deallocates them, for
* non-backed non-pre-allocated zero clusters).
*
* expanded_clusters is a bitmap where every bit corresponds to one cluster in
* the image file; a bit gets set if the corresponding cluster has been used for
* zero expansion (i.e., has been filled with zeroes and is referenced from an
* L2 table). nb_clusters contains the total cluster count of the image file,
* i.e., the number of bits in expanded_clusters.
* l1_entries and *visited_l1_entries are used to keep track of progress for
* status_cb(). l1_entries contains the total number of L1 entries and
* *visited_l1_entries counts all visited L1 entries.
*/
static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
int l1_size, uint8_t **expanded_clusters,
uint64_t *nb_clusters)
int l1_size, int64_t *visited_l1_entries,
int64_t l1_entries,
BlockDriverAmendStatusCB *status_cb)
{
BDRVQcowState *s = bs->opaque;
bool is_active_l1 = (l1_table == s->l1_table);
@ -1634,9 +1640,14 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
for (i = 0; i < l1_size; i++) {
uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK;
bool l2_dirty = false;
int l2_refcount;
if (!l2_offset) {
/* unallocated */
(*visited_l1_entries)++;
if (status_cb) {
status_cb(bs, *visited_l1_entries, l1_entries);
}
continue;
}
@ -1653,33 +1664,19 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
goto fail;
}
l2_refcount = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits);
if (l2_refcount < 0) {
ret = l2_refcount;
goto fail;
}
for (j = 0; j < s->l2_size; j++) {
uint64_t l2_entry = be64_to_cpu(l2_table[j]);
int64_t offset = l2_entry & L2E_OFFSET_MASK, cluster_index;
int64_t offset = l2_entry & L2E_OFFSET_MASK;
int cluster_type = qcow2_get_cluster_type(l2_entry);
bool preallocated = offset != 0;
if (cluster_type == QCOW2_CLUSTER_NORMAL) {
cluster_index = offset >> s->cluster_bits;
assert((cluster_index >= 0) && (cluster_index < *nb_clusters));
if ((*expanded_clusters)[cluster_index / 8] &
(1 << (cluster_index % 8))) {
/* Probably a shared L2 table; this cluster was a zero
* cluster which has been expanded, its refcount
* therefore most likely requires an update. */
ret = qcow2_update_cluster_refcount(bs, cluster_index, 1,
QCOW2_DISCARD_NEVER);
if (ret < 0) {
goto fail;
}
/* Since we just increased the refcount, the COPIED flag may
* no longer be set. */
l2_table[j] = cpu_to_be64(l2_entry & ~QCOW_OFLAG_COPIED);
l2_dirty = true;
}
continue;
}
else if (qcow2_get_cluster_type(l2_entry) != QCOW2_CLUSTER_ZERO) {
if (cluster_type != QCOW2_CLUSTER_ZERO) {
continue;
}
@ -1697,6 +1694,19 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
ret = offset;
goto fail;
}
if (l2_refcount > 1) {
/* For shared L2 tables, set the refcount accordingly (it is
* already 1 and needs to be l2_refcount) */
ret = qcow2_update_cluster_refcount(bs,
offset >> s->cluster_bits, l2_refcount - 1,
QCOW2_DISCARD_OTHER);
if (ret < 0) {
qcow2_free_clusters(bs, offset, s->cluster_size,
QCOW2_DISCARD_OTHER);
goto fail;
}
}
}
ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
@ -1718,29 +1728,12 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
goto fail;
}
l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
l2_dirty = true;
cluster_index = offset >> s->cluster_bits;
if (cluster_index >= *nb_clusters) {
uint64_t old_bitmap_size = (*nb_clusters + 7) / 8;
uint64_t new_bitmap_size;
/* The offset may lie beyond the old end of the underlying image
* file for growable files only */
assert(bs->file->growable);
*nb_clusters = size_to_clusters(s, bs->file->total_sectors *
BDRV_SECTOR_SIZE);
new_bitmap_size = (*nb_clusters + 7) / 8;
*expanded_clusters = g_realloc(*expanded_clusters,
new_bitmap_size);
/* clear the newly allocated space */
memset(&(*expanded_clusters)[old_bitmap_size], 0,
new_bitmap_size - old_bitmap_size);
if (l2_refcount == 1) {
l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
} else {
l2_table[j] = cpu_to_be64(offset);
}
assert((cluster_index >= 0) && (cluster_index < *nb_clusters));
(*expanded_clusters)[cluster_index / 8] |= 1 << (cluster_index % 8);
l2_dirty = true;
}
if (is_active_l1) {
@ -1769,6 +1762,11 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
}
}
}
(*visited_l1_entries)++;
if (status_cb) {
status_cb(bs, *visited_l1_entries, l1_entries);
}
}
ret = 0;
@ -1795,25 +1793,25 @@ fail:
* allocation for pre-allocated ones). This is important for downgrading to a
* qcow2 version which doesn't yet support metadata zero clusters.
*/
int qcow2_expand_zero_clusters(BlockDriverState *bs)
int qcow2_expand_zero_clusters(BlockDriverState *bs,
BlockDriverAmendStatusCB *status_cb)
{
BDRVQcowState *s = bs->opaque;
uint64_t *l1_table = NULL;
uint64_t nb_clusters;
uint8_t *expanded_clusters;
int64_t l1_entries = 0, visited_l1_entries = 0;
int ret;
int i, j;
nb_clusters = size_to_clusters(s, bs->file->total_sectors *
BDRV_SECTOR_SIZE);
expanded_clusters = g_try_malloc0((nb_clusters + 7) / 8);
if (expanded_clusters == NULL) {
ret = -ENOMEM;
goto fail;
if (status_cb) {
l1_entries = s->l1_size;
for (i = 0; i < s->nb_snapshots; i++) {
l1_entries += s->snapshots[i].l1_size;
}
}
ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size,
&expanded_clusters, &nb_clusters);
&visited_l1_entries, l1_entries,
status_cb);
if (ret < 0) {
goto fail;
}
@ -1847,7 +1845,8 @@ int qcow2_expand_zero_clusters(BlockDriverState *bs)
}
ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size,
&expanded_clusters, &nb_clusters);
&visited_l1_entries, l1_entries,
status_cb);
if (ret < 0) {
goto fail;
}
@ -1856,7 +1855,6 @@ int qcow2_expand_zero_clusters(BlockDriverState *bs)
ret = 0;
fail:
g_free(expanded_clusters);
g_free(l1_table);
return ret;
}

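The shared-L2 simplification drops the expanded_clusters bitmap entirely: a data cluster freshly allocated for zero expansion starts with refcount 1, and if the L2 table pointing at it is shared by l2_refcount owners, the refcount is bumped once by l2_refcount - 1 instead of being patched up lazily on later visits. A standalone arithmetic sketch (values invented):

#include <stdio.h>

int main(void)
{
    int l2_refcount = 3;      /* active L1 plus two snapshots share the L2 */
    int cluster_refcount = 1; /* fresh qcow2_alloc_clusters() allocation */

    cluster_refcount += l2_refcount - 1; /* the QCOW2_DISCARD_OTHER update */
    printf("data cluster refcount: %d\n", cluster_refcount);

    /* QCOW_OFLAG_COPIED may only be set while the refcount is exactly 1,
     * which is why the hunk above sets it only when l2_refcount == 1. */
    printf("OFLAG_COPIED allowed: %s\n",
           cluster_refcount == 1 ? "yes" : "no");
    return 0;
}
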
block/qcow2-refcount.c

@ -91,7 +91,7 @@ static int load_refcount_block(BlockDriverState *bs,
* return value is the refcount of the cluster, negative values are -errno
* and indicate an error.
*/
static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index)
{
BDRVQcowState *s = bs->opaque;
uint64_t refcount_table_index, block_index;
@ -629,8 +629,7 @@ fail:
}
/*
* Increases or decreases the refcount of a given cluster by one.
* addend must be 1 or -1.
* Increases or decreases the refcount of a given cluster.
*
* If the return value is non-negative, it is the new refcount of the cluster.
* If it is negative, it is -errno and indicates an error.
@ -649,7 +648,7 @@ int qcow2_update_cluster_refcount(BlockDriverState *bs,
return ret;
}
return get_refcount(bs, cluster_index);
return qcow2_get_refcount(bs, cluster_index);
}
@ -670,7 +669,7 @@ static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
retry:
for(i = 0; i < nb_clusters; i++) {
uint64_t next_cluster_index = s->free_cluster_index++;
refcount = get_refcount(bs, next_cluster_index);
refcount = qcow2_get_refcount(bs, next_cluster_index);
if (refcount < 0) {
return refcount;
@ -734,7 +733,7 @@ int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
/* Check how many clusters there are free */
cluster_index = offset >> s->cluster_bits;
for(i = 0; i < nb_clusters; i++) {
refcount = get_refcount(bs, cluster_index++);
refcount = qcow2_get_refcount(bs, cluster_index++);
if (refcount < 0) {
return refcount;
@ -981,7 +980,7 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
cluster_index, addend,
QCOW2_DISCARD_SNAPSHOT);
} else {
refcount = get_refcount(bs, cluster_index);
refcount = qcow2_get_refcount(bs, cluster_index);
}
if (refcount < 0) {
@ -1021,7 +1020,7 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
refcount = qcow2_update_cluster_refcount(bs, l2_offset >>
s->cluster_bits, addend, QCOW2_DISCARD_SNAPSHOT);
} else {
refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
refcount = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits);
}
if (refcount < 0) {
ret = refcount;
@ -1332,8 +1331,8 @@ fail:
* Checks the OFLAG_COPIED flag for all L1 and L2 entries.
*
* This function does not print an error message nor does it increment
* check_errors if get_refcount fails (this is because such an error will have
* been already detected and sufficiently signaled by the calling function
* check_errors if qcow2_get_refcount fails (this is because such an error will
* have been already detected and sufficiently signaled by the calling function
* (qcow2_check_refcounts) by the time this function is called).
*/
static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
@ -1354,7 +1353,7 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
continue;
}
refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
refcount = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits);
if (refcount < 0) {
/* don't print message nor increment check_errors */
continue;
@ -1396,7 +1395,8 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
if ((cluster_type == QCOW2_CLUSTER_NORMAL) ||
((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) {
refcount = get_refcount(bs, data_offset >> s->cluster_bits);
refcount = qcow2_get_refcount(bs,
data_offset >> s->cluster_bits);
if (refcount < 0) {
/* don't print message nor increment check_errors */
continue;
@ -1632,7 +1632,7 @@ static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
int refcount1, refcount2, ret;
for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
refcount1 = get_refcount(bs, i);
refcount1 = qcow2_get_refcount(bs, i);
if (refcount1 < 0) {
fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
i, strerror(-refcount1));

block/qcow2-snapshot.c

@ -441,7 +441,7 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
qcow2_discard_clusters(bs, qcow2_vm_state_offset(s),
align_offset(sn->vm_state_size, s->cluster_size)
>> BDRV_SECTOR_BITS,
QCOW2_DISCARD_NEVER);
QCOW2_DISCARD_NEVER, false);
#ifdef DEBUG_ALLOC
{

block/qcow2.c

@ -2089,7 +2089,7 @@ static coroutine_fn int qcow2_co_discard(BlockDriverState *bs,
qemu_co_mutex_lock(&s->lock);
ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS,
nb_sectors, QCOW2_DISCARD_REQUEST);
nb_sectors, QCOW2_DISCARD_REQUEST, false);
qemu_co_mutex_unlock(&s->lock);
return ret;
}
@ -2230,6 +2230,195 @@ fail:
return ret;
}
static int make_completely_empty(BlockDriverState *bs)
{
BDRVQcowState *s = bs->opaque;
int ret, l1_clusters;
int64_t offset;
uint64_t *new_reftable = NULL;
uint64_t rt_entry, l1_size2;
struct {
uint64_t l1_offset;
uint64_t reftable_offset;
uint32_t reftable_clusters;
} QEMU_PACKED l1_ofs_rt_ofs_cls;
ret = qcow2_cache_empty(bs, s->l2_table_cache);
if (ret < 0) {
goto fail;
}
ret = qcow2_cache_empty(bs, s->refcount_block_cache);
if (ret < 0) {
goto fail;
}
/* Refcounts will be broken utterly */
ret = qcow2_mark_dirty(bs);
if (ret < 0) {
goto fail;
}
BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t));
l1_size2 = (uint64_t)s->l1_size * sizeof(uint64_t);
/* After this call, neither the in-memory nor the on-disk refcount
* information accurately describe the actual references */
ret = bdrv_write_zeroes(bs->file, s->l1_table_offset / BDRV_SECTOR_SIZE,
l1_clusters * s->cluster_sectors, 0);
if (ret < 0) {
goto fail_broken_refcounts;
}
memset(s->l1_table, 0, l1_size2);
BLKDBG_EVENT(bs->file, BLKDBG_EMPTY_IMAGE_PREPARE);
/* Overwrite enough clusters at the beginning of the sectors to place
* the refcount table, a refcount block and the L1 table in; this may
* overwrite parts of the existing refcount and L1 table, which is not
* an issue because the dirty flag is set, complete data loss is in fact
* desired and partial data loss is consequently fine as well */
ret = bdrv_write_zeroes(bs->file, s->cluster_size / BDRV_SECTOR_SIZE,
(2 + l1_clusters) * s->cluster_size /
BDRV_SECTOR_SIZE, 0);
/* This call (even if it failed overall) may have overwritten on-disk
* refcount structures; in that case, the in-memory refcount information
* will probably differ from the on-disk information which makes the BDS
* unusable */
if (ret < 0) {
goto fail_broken_refcounts;
}
BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE);
/* "Create" an empty reftable (one cluster) directly after the image
* header and an empty L1 table three clusters after the image header;
* the cluster between those two will be used as the first refblock */
cpu_to_be64w(&l1_ofs_rt_ofs_cls.l1_offset, 3 * s->cluster_size);
cpu_to_be64w(&l1_ofs_rt_ofs_cls.reftable_offset, s->cluster_size);
cpu_to_be32w(&l1_ofs_rt_ofs_cls.reftable_clusters, 1);
ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_table_offset),
&l1_ofs_rt_ofs_cls, sizeof(l1_ofs_rt_ofs_cls));
if (ret < 0) {
goto fail_broken_refcounts;
}
s->l1_table_offset = 3 * s->cluster_size;
new_reftable = g_try_new0(uint64_t, s->cluster_size / sizeof(uint64_t));
if (!new_reftable) {
ret = -ENOMEM;
goto fail_broken_refcounts;
}
s->refcount_table_offset = s->cluster_size;
s->refcount_table_size = s->cluster_size / sizeof(uint64_t);
g_free(s->refcount_table);
s->refcount_table = new_reftable;
new_reftable = NULL;
/* Now the in-memory refcount information again corresponds to the on-disk
* information (reftable is empty and no refblocks (the refblock cache is
* empty)); however, this means some clusters (e.g. the image header) are
* referenced, but not refcounted, but the normal qcow2 code assumes that
* the in-memory information is always correct */
BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
/* Enter the first refblock into the reftable */
rt_entry = cpu_to_be64(2 * s->cluster_size);
ret = bdrv_pwrite_sync(bs->file, s->cluster_size,
&rt_entry, sizeof(rt_entry));
if (ret < 0) {
goto fail_broken_refcounts;
}
s->refcount_table[0] = 2 * s->cluster_size;
s->free_cluster_index = 0;
assert(3 + l1_clusters <= s->refcount_block_size);
offset = qcow2_alloc_clusters(bs, 3 * s->cluster_size + l1_size2);
if (offset < 0) {
ret = offset;
goto fail_broken_refcounts;
} else if (offset > 0) {
error_report("First cluster in emptied image is in use");
abort();
}
/* Now finally the in-memory information corresponds to the on-disk
* structures and is correct */
ret = qcow2_mark_clean(bs);
if (ret < 0) {
goto fail;
}
ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size);
if (ret < 0) {
goto fail;
}
return 0;
fail_broken_refcounts:
/* The BDS is unusable at this point. If we wanted to make it usable, we
* would have to call qcow2_refcount_close(), qcow2_refcount_init(),
* qcow2_check_refcounts(), qcow2_refcount_close() and qcow2_refcount_init()
* again. However, because the functions which could have caused this error
* path to be taken are used by those functions as well, it's very likely
* that that sequence will fail as well. Therefore, just eject the BDS. */
bs->drv = NULL;
fail:
g_free(new_reftable);
return ret;
}
static int qcow2_make_empty(BlockDriverState *bs)
{
BDRVQcowState *s = bs->opaque;
uint64_t start_sector;
int sector_step = INT_MAX / BDRV_SECTOR_SIZE;
int l1_clusters, ret = 0;
l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t));
if (s->qcow_version >= 3 && !s->snapshots &&
3 + l1_clusters <= s->refcount_block_size) {
/* The following function only works for qcow2 v3 images (it requires
* the dirty flag) and only as long as there are no snapshots (because
* it completely empties the image). Furthermore, the L1 table and three
* additional clusters (image header, refcount table, one refcount
* block) have to fit inside one refcount block. */
return make_completely_empty(bs);
}
/* This fallback code simply discards every active cluster; this is slow,
* but works in all cases */
for (start_sector = 0; start_sector < bs->total_sectors;
start_sector += sector_step)
{
/* As this function is generally used after committing an external
* snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the
* default action for this kind of discard is to pass the discard,
* which will ideally result in an actually smaller image file, as
* is probably desired. */
ret = qcow2_discard_clusters(bs, start_sector * BDRV_SECTOR_SIZE,
MIN(sector_step,
bs->total_sectors - start_sector),
QCOW2_DISCARD_SNAPSHOT, true);
if (ret < 0) {
break;
}
}
return ret;
}
static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
{
BDRVQcowState *s = bs->opaque;
@ -2361,7 +2550,8 @@ static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf,
* Downgrades an image's version. To achieve this, any incompatible features
* have to be removed.
*/
static int qcow2_downgrade(BlockDriverState *bs, int target_version)
static int qcow2_downgrade(BlockDriverState *bs, int target_version,
BlockDriverAmendStatusCB *status_cb)
{
BDRVQcowState *s = bs->opaque;
int current_version = s->qcow_version;
@ -2410,7 +2600,7 @@ static int qcow2_downgrade(BlockDriverState *bs, int target_version)
/* clearing autoclear features is trivial */
s->autoclear_features = 0;
ret = qcow2_expand_zero_clusters(bs);
ret = qcow2_expand_zero_clusters(bs, status_cb);
if (ret < 0) {
return ret;
}
@ -2424,7 +2614,8 @@ static int qcow2_downgrade(BlockDriverState *bs, int target_version)
return 0;
}
static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts)
static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
BlockDriverAmendStatusCB *status_cb)
{
BDRVQcowState *s = bs->opaque;
int old_version = s->qcow_version, new_version = old_version;
@ -2502,7 +2693,7 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts)
return ret;
}
} else {
ret = qcow2_downgrade(bs, new_version);
ret = qcow2_downgrade(bs, new_version, status_cb);
if (ret < 0) {
return ret;
}
@ -2676,6 +2867,7 @@ static BlockDriver bdrv_qcow2 = {
.bdrv_co_discard = qcow2_co_discard,
.bdrv_truncate = qcow2_truncate,
.bdrv_write_compressed = qcow2_write_compressed,
.bdrv_make_empty = qcow2_make_empty,
.bdrv_snapshot_create = qcow2_snapshot_create,
.bdrv_snapshot_goto = qcow2_snapshot_goto,

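make_completely_empty() can only run when the image header, the new reftable (cluster 1), the first refblock (cluster 2) and the L1 table (starting at cluster 3) are all refcounted by that single refblock, hence the assert(3 + l1_clusters <= s->refcount_block_size) above. A standalone check of that precondition (cluster and L1 sizes are hypothetical; qcow2 refcount entries are 16 bits wide at this point):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int cluster_size = 65536;
    int refcount_block_size = cluster_size / 2; /* 16-bit refcount entries */
    int64_t l1_size = 8192;                     /* L1 entries */
    int l1_clusters =
        (int)((l1_size + cluster_size / 8 - 1) / (cluster_size / 8));

    printf("clusters needed: %d, covered by one refblock: %d -> %s\n",
           3 + l1_clusters, refcount_block_size,
           3 + l1_clusters <= refcount_block_size
               ? "fast path" : "discard fallback");
    return 0;
}
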
block/qcow2.h

@ -487,6 +487,8 @@ void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset,
int qcow2_refcount_init(BlockDriverState *bs);
void qcow2_refcount_close(BlockDriverState *bs);
int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index);
int qcow2_update_cluster_refcount(BlockDriverState *bs, int64_t cluster_index,
int addend, enum qcow2_discard_type type);
@ -534,10 +536,11 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m);
int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
int nb_sectors, enum qcow2_discard_type type);
int nb_sectors, enum qcow2_discard_type type, bool full_discard);
int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors);
int qcow2_expand_zero_clusters(BlockDriverState *bs);
int qcow2_expand_zero_clusters(BlockDriverState *bs,
BlockDriverAmendStatusCB *status_cb);
/* qcow2-snapshot.c functions */
int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info);

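The header now exposes qcow2_get_refcount() and threads a BlockDriverAmendStatusCB through qcow2_expand_zero_clusters(). A hypothetical status callback matching that typedef (name and output are illustrative; qemu-img's real callback drives its progress display):

#include <inttypes.h>
#include <stdio.h>

/* Assumes QEMU's headers for BlockDriverState. offset is the work done
 * so far and total_work_size the current estimate of the total, in
 * whatever unit the driver chooses (visited L1 entries, for zero
 * expansion). */
static void example_amend_status(BlockDriverState *bs, int64_t offset,
                                 int64_t total_work_size)
{
    fprintf(stderr, "\ramend: %" PRId64 "/%" PRId64, offset, total_work_size);
}
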
block/raw-posix.c

@ -1481,12 +1481,12 @@ out:
return result;
}
static int64_t try_fiemap(BlockDriverState *bs, off_t start, off_t *data,
off_t *hole, int nb_sectors, int *pnum)
static int try_fiemap(BlockDriverState *bs, off_t start, off_t *data,
off_t *hole, int nb_sectors)
{
#ifdef CONFIG_FIEMAP
BDRVRawState *s = bs->opaque;
int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
int ret = 0;
struct {
struct fiemap fm;
struct fiemap_extent fe;
@ -1527,18 +1527,14 @@ static int64_t try_fiemap(BlockDriverState *bs, off_t start, off_t *data,
#endif
}
static int64_t try_seek_hole(BlockDriverState *bs, off_t start, off_t *data,
off_t *hole, int *pnum)
static int try_seek_hole(BlockDriverState *bs, off_t start, off_t *data,
off_t *hole)
{
#if defined SEEK_HOLE && defined SEEK_DATA
BDRVRawState *s = bs->opaque;
*hole = lseek(s->fd, start, SEEK_HOLE);
if (*hole == -1) {
/* -ENXIO indicates that sector_num was past the end of the file.
* There is a virtual hole there. */
assert(errno != -ENXIO);
return -errno;
}
@ -1552,7 +1548,7 @@ static int64_t try_seek_hole(BlockDriverState *bs, off_t start, off_t *data,
}
}
return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
return 0;
#else
return -ENOTSUP;
#endif
@ -1578,7 +1574,8 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
int nb_sectors, int *pnum)
{
off_t start, data = 0, hole = 0;
int64_t ret;
int64_t total_size;
int ret;
ret = fd_open(bs);
if (ret < 0) {
@ -1586,29 +1583,38 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
}
start = sector_num * BDRV_SECTOR_SIZE;
total_size = bdrv_getlength(bs);
if (total_size < 0) {
return total_size;
} else if (start >= total_size) {
*pnum = 0;
return 0;
} else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) {
nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE);
}
ret = try_seek_hole(bs, start, &data, &hole, pnum);
ret = try_seek_hole(bs, start, &data, &hole);
if (ret < 0) {
ret = try_fiemap(bs, start, &data, &hole, nb_sectors, pnum);
ret = try_fiemap(bs, start, &data, &hole, nb_sectors);
if (ret < 0) {
/* Assume everything is allocated. */
data = 0;
hole = start + nb_sectors * BDRV_SECTOR_SIZE;
ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
ret = 0;
}
}
assert(ret >= 0);
if (data <= start) {
/* On a data extent, compute sectors to the end of the extent. */
*pnum = MIN(nb_sectors, (hole - start) / BDRV_SECTOR_SIZE);
return ret | BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
} else {
/* On a hole, compute sectors to the beginning of the next extent. */
*pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
ret &= ~BDRV_BLOCK_DATA;
ret |= BDRV_BLOCK_ZERO;
return ret | BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID | start;
}
return ret;
}
static coroutine_fn BlockAIOCB *raw_aio_discard(BlockDriverState *bs,

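try_seek_hole() and try_fiemap() now return 0 on success and leave flag construction to raw_co_get_block_status(). The probing underneath try_seek_hole() is plain lseek(); a standalone Linux demo ("test.img" is a placeholder path):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
    int fd = open(argc > 1 ? argv[1] : "test.img", O_RDONLY);
    if (fd < 0) {
        perror("open");
        return 1;
    }

    off_t start = 0;
    off_t hole = lseek(fd, start, SEEK_HOLE); /* end of data after start */
    off_t data = lseek(fd, start, SEEK_DATA); /* first data at/after start */

    /* At or past EOF both calls fail with errno == ENXIO -- a legitimate
     * result, which is why the patch drops the assert and simply returns
     * -errno; the caller now clamps nb_sectors to the file size first. */
    printf("data at %lld, next hole at %lld\n",
           (long long)data, (long long)hole);
    close(fd);
    return 0;
}
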
block/rbd.c

@ -887,6 +887,18 @@ static BlockAIOCB* qemu_rbd_aio_discard(BlockDriverState *bs,
}
#endif
#ifdef LIBRBD_SUPPORTS_INVALIDATE
static void qemu_rbd_invalidate_cache(BlockDriverState *bs,
Error **errp)
{
BDRVRBDState *s = bs->opaque;
int r = rbd_invalidate_cache(s->image);
if (r < 0) {
error_setg_errno(errp, -r, "Failed to invalidate the cache");
}
}
#endif
static QemuOptsList qemu_rbd_create_opts = {
.name = "rbd-create-opts",
.head = QTAILQ_HEAD_INITIALIZER(qemu_rbd_create_opts.head),
@ -936,6 +948,9 @@ static BlockDriver bdrv_rbd = {
.bdrv_snapshot_delete = qemu_rbd_snap_remove,
.bdrv_snapshot_list = qemu_rbd_snap_list,
.bdrv_snapshot_goto = qemu_rbd_snap_rollback,
#ifdef LIBRBD_SUPPORTS_INVALIDATE
.bdrv_invalidate_cache = qemu_rbd_invalidate_cache,
#endif
};
static void bdrv_rbd_init(void)

block/snapshot.c

@ -236,6 +236,10 @@ int bdrv_snapshot_delete(BlockDriverState *bs,
error_setg(errp, "snapshot_id and name are both NULL");
return -EINVAL;
}
/* drain all pending i/o before deleting snapshot */
bdrv_drain_all();
if (drv->bdrv_snapshot_delete) {
return drv->bdrv_snapshot_delete(bs, snapshot_id, name, errp);
}

block/stream.c

@ -79,9 +79,39 @@ static void close_unused_images(BlockDriverState *top, BlockDriverState *base,
bdrv_refresh_limits(top, NULL);
}
typedef struct {
int ret;
bool reached_end;
} StreamCompleteData;
static void stream_complete(BlockJob *job, void *opaque)
{
StreamBlockJob *s = container_of(job, StreamBlockJob, common);
StreamCompleteData *data = opaque;
BlockDriverState *base = s->base;
if (!block_job_is_cancelled(&s->common) && data->reached_end &&
data->ret == 0) {
const char *base_id = NULL, *base_fmt = NULL;
if (base) {
base_id = s->backing_file_str;
if (base->drv) {
base_fmt = base->drv->format_name;
}
}
data->ret = bdrv_change_backing_file(job->bs, base_id, base_fmt);
close_unused_images(job->bs, base, base_id);
}
g_free(s->backing_file_str);
block_job_completed(&s->common, data->ret);
g_free(data);
}
static void coroutine_fn stream_run(void *opaque)
{
StreamBlockJob *s = opaque;
StreamCompleteData *data;
BlockDriverState *bs = s->common.bs;
BlockDriverState *base = s->base;
int64_t sector_num, end;
@ -183,21 +213,13 @@ wait:
/* Do not remove the backing file if an error was there but ignored. */
ret = error;
if (!block_job_is_cancelled(&s->common) && sector_num == end && ret == 0) {
const char *base_id = NULL, *base_fmt = NULL;
if (base) {
base_id = s->backing_file_str;
if (base->drv) {
base_fmt = base->drv->format_name;
}
}
ret = bdrv_change_backing_file(bs, base_id, base_fmt);
close_unused_images(bs, base, base_id);
}
qemu_vfree(buf);
g_free(s->backing_file_str);
block_job_completed(&s->common, ret);
/* Modify backing chain and close BDSes in main loop */
data = g_malloc(sizeof(*data));
data->ret = ret;
data->reached_end = sector_num == end;
block_job_defer_to_main_loop(&s->common, stream_complete, data);
}
static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp)