qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
To: qemu-block@nongnu.org
Cc: qemu-devel@nongnu.org, crosa@redhat.com, ehabkost@redhat.com,
	mreitz@redhat.com, kwolf@redhat.com, vsementsov@virtuozzo.com,
	jsnow@redhat.com
Subject: [PATCH v3 6/6] block/qcow2: use seqcache for compressed writes
Date: Fri,  5 Mar 2021 20:35:07 +0300	[thread overview]
Message-ID: <20210305173507.393137-7-vsementsov@virtuozzo.com> (raw)
In-Reply-To: <20210305173507.393137-1-vsementsov@virtuozzo.com>

Compressed writes are unaligned to 512, which works very slow in
O_DIRECT mode. Let's use the cache.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
---
 block/coroutines.h     |   3 +
 block/qcow2.h          |   4 ++
 block/qcow2-refcount.c |  10 +++
 block/qcow2.c          | 158 ++++++++++++++++++++++++++++++++++++++---
 4 files changed, 164 insertions(+), 11 deletions(-)

diff --git a/block/coroutines.h b/block/coroutines.h
index 4cfb4946e6..cb8e05830e 100644
--- a/block/coroutines.h
+++ b/block/coroutines.h
@@ -66,4 +66,7 @@ int coroutine_fn bdrv_co_readv_vmstate(BlockDriverState *bs,
 int coroutine_fn bdrv_co_writev_vmstate(BlockDriverState *bs,
                                         QEMUIOVector *qiov, int64_t pos);
 
+int coroutine_fn qcow2_co_flush_compressed_cache(BlockDriverState *bs);
+int generated_co_wrapper qcow2_flush_compressed_cache(BlockDriverState *bs);
+
 #endif /* BLOCK_COROUTINES_INT_H */
diff --git a/block/qcow2.h b/block/qcow2.h
index e18d8dfbae..8b8fed0950 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -28,6 +28,7 @@
 #include "crypto/block.h"
 #include "qemu/coroutine.h"
 #include "qemu/units.h"
+#include "qemu/seqcache.h"
 #include "block/block_int.h"
 
 //#define DEBUG_ALLOC
@@ -422,6 +423,9 @@ typedef struct BDRVQcow2State {
     Qcow2CompressionType compression_type;
 
     GHashTable *inflight_writes_counters;
+
+    SeqCache *compressed_cache;
+    int compressed_flush_ret;
 } BDRVQcow2State;
 
 typedef struct Qcow2COWRegion {
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 464d133368..218917308e 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -29,6 +29,7 @@
 #include "qemu/bswap.h"
 #include "qemu/cutils.h"
 #include "trace.h"
+#include "block/coroutines.h"
 
 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
                                     uint64_t max);
@@ -1040,6 +1041,10 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                 qcow2_cache_discard(s->l2_table_cache, table);
             }
 
+            if (s->compressed_cache) {
+                seqcache_discard_cluster(s->compressed_cache, cluster_offset);
+            }
+
             if (s->discard_passthrough[type]) {
                 update_refcount_discard(bs, cluster_offset, s->cluster_size);
             }
@@ -1349,6 +1354,11 @@ int coroutine_fn qcow2_write_caches(BlockDriverState *bs)
     BDRVQcow2State *s = bs->opaque;
     int ret;
 
+    ret = qcow2_flush_compressed_cache(bs);
+    if (ret < 0) {
+        return ret;
+    }
+
     ret = qcow2_cache_write(bs, s->l2_table_cache);
     if (ret < 0) {
         return ret;
diff --git a/block/qcow2.c b/block/qcow2.c
index 6ee94421e0..5f3713cd6f 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -42,6 +42,7 @@
 #include "qapi/qapi-visit-block-core.h"
 #include "crypto.h"
 #include "block/aio_task.h"
+#include "block/coroutines.h"
 
 /*
   Differences with QCOW:
@@ -1834,6 +1835,10 @@ static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options,
     s->inflight_writes_counters =
         g_hash_table_new_full(g_int64_hash, g_int64_equal, g_free, g_free);
 
+    if (!has_data_file(bs) && (bs->file->bs->open_flags & BDRV_O_NOCACHE)) {
+        s->compressed_cache = seqcache_new(s->cluster_size);
+    }
+
     return ret;
 
  fail:
@@ -2653,6 +2658,91 @@ fail_nometa:
     return ret;
 }
 
+/*
+ * Flush one cluster of compressed cache if exists. If @unfinished is true and
+ * there is no finished cluster to flush, flush the unfinished one. Otherwise
+ * flush only finished clusters.
+ *
+ * Return 0 if nothing to flush, 1 if one cluster successfully flushed and <0 on
+ * error.
+ */
+static int coroutine_fn
+qcow2_co_compressed_flush_one(BlockDriverState *bs, bool unfinished)
+{
+    BDRVQcow2State *s = bs->opaque;
+    int ret;
+    int64_t align = 1;
+    int64_t offset, bytes;
+    uint8_t *buf;
+
+    if (!s->compressed_cache) {
+        return 0;
+    }
+
+    if (!seqcache_get_next_flush(s->compressed_cache, &offset, &bytes, &buf,
+                                 &unfinished))
+    {
+        return 0;
+    }
+
+    qcow2_inflight_writes_inc(bs, offset, bytes);
+
+    /*
+     * Don't try to align-up unfinished cached cluster, parallel write to it is
+     * possible! For finished host clusters data in the tail of the cluster will
+     * be never used. So, take some good alignment for speed.
+     *
+     * Note also, that seqcache guarantees that allocated size of @buf is enough
+     * to fill the cluster up to the end, so we are safe to align up with
+     * align <= cluster_size.
+     */
+    if (!unfinished) {
+        align = MIN(s->cluster_size,
+                    MAX(s->data_file->bs->bl.request_alignment,
+                        bdrv_opt_mem_align(bs)));
+    }
+
+    BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
+    ret = bdrv_co_pwrite(s->data_file, offset,
+                         QEMU_ALIGN_UP(offset + bytes, align) - offset,
+                         buf, 0);
+    if (ret < 0 && s->compressed_flush_ret == 0) {
+        /*
+         * The data that was "written" earlier is lost. We don't want to
+         * care with storing it somehow to retry flushing later. Also, how much
+         * data will we have to store, if not drop it after failed flush?
+         * After this point qcow2_co_flush_compressed_cache() (and
+         * qcow2_flush_compressed_cache()) never return success.
+         */
+        s->compressed_flush_ret = ret;
+    }
+
+    seqcache_discard_cluster(s->compressed_cache, offset);
+
+    qcow2_inflight_writes_dec(bs, offset, bytes);
+
+    return ret < 0 ? ret : 1;
+}
+
+int coroutine_fn qcow2_co_flush_compressed_cache(BlockDriverState *bs)
+{
+    BDRVQcow2State *s = bs->opaque;
+
+    if (s->compressed_cache) {
+        uint64_t nb_clusters = seqcache_nb_clusters(s->compressed_cache);
+
+        /*
+         * If parallel writes are possible we don't want to loop forever. So
+         * flush only clusters available at start of flush operation.
+         */
+        while (nb_clusters--) {
+            qcow2_co_compressed_flush_one(bs, true);
+        }
+    }
+
+    return s->compressed_flush_ret;
+}
+
 static int qcow2_inactivate(BlockDriverState *bs)
 {
     BDRVQcow2State *s = bs->opaque;
@@ -2667,6 +2757,13 @@ static int qcow2_inactivate(BlockDriverState *bs)
                           bdrv_get_device_or_node_name(bs));
     }
 
+    ret = qcow2_flush_compressed_cache(bs);
+    if (ret) {
+        result = ret;
+        error_report("Failed to flush the compressed write cache: %s",
+                     strerror(-ret));
+    }
+
     ret = qcow2_cache_flush(bs, s->l2_table_cache);
     if (ret) {
         result = ret;
@@ -2699,6 +2796,12 @@ static void qcow2_close(BlockDriverState *bs)
         qcow2_inactivate(bs);
     }
 
+    /*
+     * Cache should be flushed in qcow2_inactivate() and should be empty in
+     * inactive mode. So we are safe to free it.
+     */
+    seqcache_free(s->compressed_cache);
+
     cache_clean_timer_del(bs);
     qcow2_cache_destroy(s->l2_table_cache);
     qcow2_cache_destroy(s->refcount_block_cache);
@@ -4558,18 +4661,42 @@ qcow2_co_pwritev_compressed_task(BlockDriverState *bs,
         goto fail;
     }
 
-    qcow2_inflight_writes_inc(bs, cluster_offset, out_len);
+    if (s->compressed_cache) {
+        /*
+         * It's important to do seqcache_write() in the same critical section
+         * (by s->lock) as qcow2_alloc_compressed_cluster_offset(), so that the
+         * cache is filled sequentially.
+         */
+        seqcache_write(s->compressed_cache, cluster_offset, out_len, out_buf);
 
-    qemu_co_mutex_unlock(&s->lock);
+        qemu_co_mutex_unlock(&s->lock);
 
-    BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
-    ret = bdrv_co_pwrite(s->data_file, cluster_offset, out_len, out_buf, 0);
+        ret = qcow2_co_compressed_flush_one(bs, false);
+        if (ret < 0) {
+            /*
+             * if ret < 0 it probably not this request which failed, but some
+             * previous one that was cached. Moreover, when we write to the
+             * cache we probably may always report success, because
+             * seqcache_write() never fails. Still, it's better to inform
+             * compressed backup now then wait until final flush.
+             */
+            goto fail;
+        }
+    } else {
+        qcow2_inflight_writes_inc(bs, cluster_offset, out_len);
 
-    qcow2_inflight_writes_dec(bs, cluster_offset, out_len);
+        qemu_co_mutex_unlock(&s->lock);
 
-    if (ret < 0) {
-        goto fail;
+        BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
+        ret = bdrv_co_pwrite(s->data_file, cluster_offset, out_len, out_buf, 0);
+
+        qcow2_inflight_writes_dec(bs, cluster_offset, out_len);
+
+        if (ret < 0) {
+            goto fail;
+        }
     }
+
 success:
     ret = 0;
 fail:
@@ -4681,10 +4808,19 @@ qcow2_co_preadv_compressed(BlockDriverState *bs,
 
     out_buf = qemu_blockalign(bs, s->cluster_size);
 
-    BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
-    ret = bdrv_co_pread(bs->file, coffset, csize, buf, 0);
-    if (ret < 0) {
-        goto fail;
+    /*
+     * seqcache_read may return less bytes than csize, as csize may exceed
+     * actual compressed data size. So we are OK if seqcache_read returns
+     * something > 0.
+     */
+    if (!s->compressed_cache ||
+        seqcache_read(s->compressed_cache, coffset, csize, buf) <= 0)
+    {
+        BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
+        ret = bdrv_co_pread(bs->file, coffset, csize, buf, 0);
+        if (ret < 0) {
+            goto fail;
+        }
     }
 
     if (qcow2_co_decompress(bs, out_buf, s->cluster_size, buf, csize) < 0) {
-- 
2.29.2



  parent reply	other threads:[~2021-03-05 18:35 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-03-05 17:35 [PATCH v3 0/6] qcow2: compressed write cache Vladimir Sementsov-Ogievskiy
2021-03-05 17:35 ` [PATCH v3 1/6] block-jobs: flush target at the end of .run() Vladimir Sementsov-Ogievskiy
2021-03-11 16:57   ` Max Reitz
2021-03-05 17:35 ` [PATCH v3 2/6] iotests: add qcow2-discard-during-rewrite Vladimir Sementsov-Ogievskiy
2021-03-05 17:35 ` [PATCH v3 3/6] block/qcow2: introduce inflight writes counters: fix discard Vladimir Sementsov-Ogievskiy
2021-03-11 19:58   ` Max Reitz
2021-03-12  9:09     ` Vladimir Sementsov-Ogievskiy
2021-03-12 11:17       ` Max Reitz
2021-03-12 12:32         ` Vladimir Sementsov-Ogievskiy
2021-03-12 12:42           ` Vladimir Sementsov-Ogievskiy
2021-03-12 15:01             ` Max Reitz
2021-03-12 12:46           ` Vladimir Sementsov-Ogievskiy
2021-03-12 15:10             ` Max Reitz
2021-03-12 15:24               ` Vladimir Sementsov-Ogievskiy
2021-03-12 15:52                 ` Max Reitz
2021-03-12 16:03                   ` Vladimir Sementsov-Ogievskiy
2021-03-12 14:58           ` Max Reitz
2021-03-12 15:39             ` Vladimir Sementsov-Ogievskiy
2021-03-05 17:35 ` [PATCH v3 4/6] util: implement seqcache Vladimir Sementsov-Ogievskiy
2021-03-12 13:41   ` Max Reitz
2021-03-12 14:37     ` Vladimir Sementsov-Ogievskiy
2021-03-12 15:13       ` Max Reitz
2021-06-04 14:31   ` Vladimir Sementsov-Ogievskiy
2021-03-05 17:35 ` [PATCH v3 5/6] block-coroutine-wrapper: allow non bdrv_ prefix Vladimir Sementsov-Ogievskiy
2021-03-12 16:53   ` Max Reitz
2021-03-05 17:35 ` Vladimir Sementsov-Ogievskiy [this message]
2021-03-12 18:15   ` [PATCH v3 6/6] block/qcow2: use seqcache for compressed writes Max Reitz
2021-03-12 18:43     ` Vladimir Sementsov-Ogievskiy
2021-03-15  9:58       ` Max Reitz
2021-03-15 14:40         ` Vladimir Sementsov-Ogievskiy
2021-03-16 12:25           ` Max Reitz
2021-03-16 17:48             ` Vladimir Sementsov-Ogievskiy
2021-03-17  8:09               ` Max Reitz
2021-03-12 18:45     ` Vladimir Sementsov-Ogievskiy
2021-03-29 20:18     ` Vladimir Sementsov-Ogievskiy

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210305173507.393137-7-vsementsov@virtuozzo.com \
    --to=vsementsov@virtuozzo.com \
    --cc=crosa@redhat.com \
    --cc=ehabkost@redhat.com \
    --cc=jsnow@redhat.com \
    --cc=kwolf@redhat.com \
    --cc=mreitz@redhat.com \
    --cc=qemu-block@nongnu.org \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).