qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v1 0/2] qcow2: fix parallel rewrite and discard
@ 2021-02-25 10:10 Vladimir Sementsov-Ogievskiy
  2021-02-25 10:10 ` [PATCH v1 1/2] iotests: add qcow2-discard-during-rewrite Vladimir Sementsov-Ogievskiy
  2021-02-25 10:10 ` [PATCH v1 2/2] block/qcow2: introduce inflight writes counters: fix discard Vladimir Sementsov-Ogievskiy
  0 siblings, 2 replies; 3+ messages in thread
From: Vladimir Sementsov-Ogievskiy @ 2021-02-25 10:10 UTC (permalink / raw)
  To: qemu-block; +Cc: qemu-devel, mreitz, kwolf, Vladimir Sementsov-Ogievskiy

Hi all! It occurs that nothing prevents discarding and reallocating host
cluster during data writing. This way data writing will pollute another
flash allocated cluster of data or metadata.

Here is my suggestion to fix it basing on improved refcounts model. Look
at 02 for details.

I don't insist on this version, and will soon send a v2, based on
CoRwLock, as Kevin suggested, which should look simpler. Still, with v1
we keep possibility of relatively async discard.. Doesn't seem worth the
complexity.. But I'd like to share my idea of additional "runtime"
reference counters for clusters, as it may be needed later if we face
problems with more restrictive CoRwLock or may be for some other task.
So here is it.

Vladimir Sementsov-Ogievskiy (2):
  iotests: add qcow2-discard-during-rewrite
  block/qcow2: introduce inflight writes counters: fix discard

 block/qcow2.h                                 |   9 +
 block/qcow2-refcount.c                        | 154 +++++++++++++++++-
 block/qcow2.c                                 |  26 ++-
 .../tests/qcow2-discard-during-rewrite        |  72 ++++++++
 .../tests/qcow2-discard-during-rewrite.out    |  21 +++
 5 files changed, 278 insertions(+), 4 deletions(-)
 create mode 100755 tests/qemu-iotests/tests/qcow2-discard-during-rewrite
 create mode 100644 tests/qemu-iotests/tests/qcow2-discard-during-rewrite.out

-- 
2.29.2



^ permalink raw reply	[flat|nested] 3+ messages in thread

* [PATCH v1 1/2] iotests: add qcow2-discard-during-rewrite
  2021-02-25 10:10 [PATCH v1 0/2] qcow2: fix parallel rewrite and discard Vladimir Sementsov-Ogievskiy
@ 2021-02-25 10:10 ` Vladimir Sementsov-Ogievskiy
  2021-02-25 10:10 ` [PATCH v1 2/2] block/qcow2: introduce inflight writes counters: fix discard Vladimir Sementsov-Ogievskiy
  1 sibling, 0 replies; 3+ messages in thread
From: Vladimir Sementsov-Ogievskiy @ 2021-02-25 10:10 UTC (permalink / raw)
  To: qemu-block; +Cc: qemu-devel, mreitz, kwolf, Vladimir Sementsov-Ogievskiy

Simple test:
 - start writing to allocated cluster A
 - discard this cluster
 - write to another unallocated cluster B (it's allocated in same place
   where A was allocated)
 - continue writing to A

For now last action pollutes cluster B which is a bug fixed by the
following commit.

For now, add test to "disabled" group, so that it doesn't run
automatically.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
---
 .../tests/qcow2-discard-during-rewrite        | 72 +++++++++++++++++++
 .../tests/qcow2-discard-during-rewrite.out    | 21 ++++++
 2 files changed, 93 insertions(+)
 create mode 100755 tests/qemu-iotests/tests/qcow2-discard-during-rewrite
 create mode 100644 tests/qemu-iotests/tests/qcow2-discard-during-rewrite.out

diff --git a/tests/qemu-iotests/tests/qcow2-discard-during-rewrite b/tests/qemu-iotests/tests/qcow2-discard-during-rewrite
new file mode 100755
index 0000000000..7f0d8a107a
--- /dev/null
+++ b/tests/qemu-iotests/tests/qcow2-discard-during-rewrite
@@ -0,0 +1,72 @@
+#!/usr/bin/env bash
+# group: quick disabled
+#
+# Test discarding (and reusing) host cluster during writing data to it.
+#
+# Copyright (c) 2021 Virtuozzo International GmbH.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+# creator
+owner=vsementsov@virtuozzo.com
+
+seq=`basename $0`
+echo "QA output created by $seq"
+
+status=1	# failure is the default!
+
+_cleanup()
+{
+    _cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./../common.rc
+. ./../common.filter
+
+_supported_fmt qcow2
+_supported_proto file fuse
+_supported_os Linux
+
+size=1M
+_make_test_img $size
+
+(
+cat <<EOF
+write -P 1 0 64K
+
+break pwritev A
+aio_write -P 2 0 64K
+wait_break A
+
+discard 0 64K
+write -P 3 128K 64K
+read -P 3 128K 64K
+
+break pwritev_done B
+resume A
+wait_break B
+resume B
+
+read -P 0 0 64K
+read -P 3 128K 64K
+EOF
+) | $QEMU_IO blkdebug::$TEST_IMG | _filter_qemu_io
+
+# success, all done
+echo "*** done"
+rm -f $seq.full
+status=0
diff --git a/tests/qemu-iotests/tests/qcow2-discard-during-rewrite.out b/tests/qemu-iotests/tests/qcow2-discard-during-rewrite.out
new file mode 100644
index 0000000000..8e75b2fbff
--- /dev/null
+++ b/tests/qemu-iotests/tests/qcow2-discard-during-rewrite.out
@@ -0,0 +1,21 @@
+QA output created by qcow2-discard-during-rewrite
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576
+wrote 65536/65536 bytes at offset 0
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+blkdebug: Suspended request 'A'
+discard 65536/65536 bytes at offset 0
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset 131072
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 65536/65536 bytes at offset 131072
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+blkdebug: Resuming request 'A'
+blkdebug: Suspended request 'B'
+blkdebug: Resuming request 'B'
+wrote 65536/65536 bytes at offset 0
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 65536/65536 bytes at offset 0
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 65536/65536 bytes at offset 131072
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+*** done
-- 
2.29.2



^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH v1 2/2] block/qcow2: introduce inflight writes counters: fix discard
  2021-02-25 10:10 [PATCH v1 0/2] qcow2: fix parallel rewrite and discard Vladimir Sementsov-Ogievskiy
  2021-02-25 10:10 ` [PATCH v1 1/2] iotests: add qcow2-discard-during-rewrite Vladimir Sementsov-Ogievskiy
@ 2021-02-25 10:10 ` Vladimir Sementsov-Ogievskiy
  1 sibling, 0 replies; 3+ messages in thread
From: Vladimir Sementsov-Ogievskiy @ 2021-02-25 10:10 UTC (permalink / raw)
  To: qemu-block; +Cc: qemu-devel, mreitz, kwolf, Vladimir Sementsov-Ogievskiy

There is a bug in qcow2: host cluster can be discarded (refcount
becomes 0) and reused during data write. In this case data write may
pollute another cluster (recently allocated) or even metadata.

To fix the issue let's track inflight writes to host cluster in the
hash table and consider new counter when discarding and reusing host
clusters.

Enable qcow2-discard-during-rewrite as it is fixed.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
---
 block/qcow2.h                                 |   9 +
 block/qcow2-refcount.c                        | 154 +++++++++++++++++-
 block/qcow2.c                                 |  26 ++-
 .../tests/qcow2-discard-during-rewrite        |   2 +-
 4 files changed, 186 insertions(+), 5 deletions(-)

diff --git a/block/qcow2.h b/block/qcow2.h
index 0678073b74..fea2525a76 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -420,6 +420,8 @@ typedef struct BDRVQcow2State {
      * is to convert the image with the desired compression type set.
      */
     Qcow2CompressionType compression_type;
+
+    GHashTable *inflight_writes_counters;
 } BDRVQcow2State;
 
 typedef struct Qcow2COWRegion {
@@ -896,6 +898,13 @@ int qcow2_shrink_reftable(BlockDriverState *bs);
 int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size);
 int qcow2_detect_metadata_preallocation(BlockDriverState *bs);
 
+int qcow2_inflight_writes_inc_locked(BlockDriverState *bs, int64_t offset,
+                                     int64_t length);
+int qcow2_inflight_writes_dec(BlockDriverState *bs, int64_t offset,
+                              int64_t length);
+int qcow2_inflight_writes_dec_locked(BlockDriverState *bs, int64_t offset,
+                                     int64_t length);
+
 /* qcow2-cluster.c functions */
 int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
                         bool exact_size);
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 8e649b008e..0ecb1167a6 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -799,6 +799,145 @@ found:
     }
 }
 
+/*
+ * Qcow2InFlightRefcount is a type for values of s->inflight_writes_counters
+ * hasm map. And it's keys are cluster indices.
+ */
+typedef struct Qcow2InFlightRefcount {
+    /*
+     * Number of in-flight writes to the cluster, always > 0, as when becomes
+     * 0 the entry is removed from s->inflight_writes_counters.
+     */
+    uint64_t inflight_writes_cnt;
+
+    /* Cluster refcount is known to be zero */
+    bool refcount_zero;
+
+    /* Cluster refcount was made zero with this discard-type */
+    enum qcow2_discard_type type;
+} Qcow2InFlightRefcount;
+
+static Qcow2InFlightRefcount *find_infl_wr(BDRVQcow2State *s,
+                                           int64_t cluster_index)
+{
+    Qcow2InFlightRefcount *infl;
+
+    if (!s->inflight_writes_counters) {
+        return NULL;
+    }
+
+    infl = g_hash_table_lookup(s->inflight_writes_counters, &cluster_index);
+
+    if (infl) {
+        assert(infl->inflight_writes_cnt > 0);
+    }
+
+    return infl;
+}
+
+/*
+ * Returns true if there are any in-flight writes to the cluster blocking
+ * its reallocation.
+ */
+static bool has_infl_wr(BDRVQcow2State *s, int64_t cluster_index)
+{
+    return !!find_infl_wr(s, cluster_index);
+}
+
+static int update_inflight_write_cnt(BlockDriverState *bs,
+                                     int64_t offset, int64_t length,
+                                     bool decrease, bool locked)
+{
+    BDRVQcow2State *s = bs->opaque;
+    int64_t start, last, cluster_offset;
+
+    if (locked) {
+        qemu_co_mutex_assert_locked(&s->lock);
+    }
+
+    start = start_of_cluster(s, offset);
+    last = start_of_cluster(s, offset + length - 1);
+    for (cluster_offset = start; cluster_offset <= last;
+         cluster_offset += s->cluster_size)
+    {
+        int64_t cluster_index = cluster_offset >> s->cluster_bits;
+        Qcow2InFlightRefcount *infl = find_infl_wr(s, cluster_index);
+
+        if (!infl) {
+            infl = g_new0(Qcow2InFlightRefcount, 1);
+            g_hash_table_insert(s->inflight_writes_counters,
+                                g_memdup(&cluster_index, sizeof(cluster_index)),
+                                infl);
+        }
+
+        if (decrease) {
+            assert(infl->inflight_writes_cnt >= 1);
+            infl->inflight_writes_cnt--;
+        } else {
+            infl->inflight_writes_cnt++;
+        }
+
+        if (infl->inflight_writes_cnt == 0) {
+            bool refcount_zero = infl->refcount_zero;
+            enum qcow2_discard_type type = infl->type;
+
+            g_hash_table_remove(s->inflight_writes_counters, &cluster_index);
+
+            if (refcount_zero) {
+                /*
+                 * Slow path. We must reset normal refcount to actually release
+                 * the cluster.
+                 */
+                int ret;
+
+                if (!locked) {
+                    qemu_co_mutex_lock(&s->lock);
+                }
+                ret = qcow2_update_cluster_refcount(bs, cluster_index, 0,
+                                                    true, type);
+                if (!locked) {
+                    qemu_co_mutex_unlock(&s->lock);
+                }
+
+                if (ret < 0) {
+                    return ret;
+                }
+            }
+        }
+
+    }
+
+    return 0;
+}
+
+/*
+ * It makes sense to call qcow2_inflight_writes_inc() only in same s->lock
+ * critical section where corresponding region was allocated (or taken from L2
+ * table).
+ */
+int qcow2_inflight_writes_inc_locked(BlockDriverState *bs, int64_t offset,
+                                     int64_t length)
+{
+    return update_inflight_write_cnt(bs, offset, length, false, true);
+}
+
+/*
+ * Called with s->lock not locked by caller. Will take s->lock only if need to
+ * release the cluster (refcount is 0 and inflight-write-cnt becomes zero).
+ */
+int qcow2_inflight_writes_dec(BlockDriverState *bs, int64_t offset,
+                              int64_t length)
+{
+    return update_inflight_write_cnt(bs, offset, length, true, false);
+}
+
+/* Called with s->lock locked. */
+int qcow2_inflight_writes_dec_locked(BlockDriverState *bs, int64_t offset,
+                                     int64_t length)
+{
+    return update_inflight_write_cnt(bs, offset, length, true, true);
+}
+
 /* XXX: cache several refcount block clusters ? */
 /* @addend is the absolute value of the addend; if @decrease is set, @addend
  * will be subtracted from the current refcount, otherwise it will be added */
@@ -885,6 +1024,13 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
 
         if (refcount == 0) {
             void *table;
+            Qcow2InFlightRefcount *infl = find_infl_wr(s, cluster_index);
+
+            if (infl) {
+                infl->refcount_zero = true;
+                infl->type = type;
+                continue;
+            }
 
             table = qcow2_cache_is_table_offset(s->refcount_block_cache,
                                                 offset);
@@ -983,7 +1129,7 @@ retry:
 
         if (ret < 0) {
             return ret;
-        } else if (refcount != 0) {
+        } else if (refcount != 0 || has_infl_wr(s, next_cluster_index)) {
             goto retry;
         }
     }
@@ -1046,7 +1192,7 @@ int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
             ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
             if (ret < 0) {
                 return ret;
-            } else if (refcount != 0) {
+            } else if (refcount != 0 || has_infl_wr(s, cluster_index)) {
                 break;
             }
         }
@@ -2294,7 +2440,9 @@ static int64_t alloc_clusters_imrt(BlockDriverState *bs,
          contiguous_free_clusters < cluster_count;
          cluster++)
     {
-        if (!s->get_refcount(*refcount_table, cluster)) {
+        if (!s->get_refcount(*refcount_table, cluster) &&
+            !has_infl_wr(s, cluster))
+        {
             contiguous_free_clusters++;
             if (first_gap) {
                 /* If this is the first free cluster found, update
diff --git a/block/qcow2.c b/block/qcow2.c
index d9f49a52e7..00a5a2eca0 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1831,6 +1831,8 @@ static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options,
 #endif
 
     qemu_co_queue_init(&s->thread_task_queue);
+    s->inflight_writes_counters =
+        g_hash_table_new_full(g_int64_hash, g_int64_equal, g_free, g_free);
 
     return ret;
 
@@ -2546,6 +2548,9 @@ out_unlocked:
 
 out_locked:
     qcow2_handle_l2meta(bs, &l2meta, false);
+
+    qcow2_inflight_writes_dec_locked(bs, host_offset, bytes);
+
     qemu_co_mutex_unlock(&s->lock);
 
     qemu_vfree(crypt_buf);
@@ -2605,6 +2610,8 @@ static coroutine_fn int qcow2_co_pwritev_part(
             goto out_locked;
         }
 
+        qcow2_inflight_writes_inc_locked(bs, host_offset, cur_bytes);
+
         qemu_co_mutex_unlock(&s->lock);
 
         if (!aio && cur_bytes != bytes) {
@@ -2707,6 +2714,9 @@ static void qcow2_close(BlockDriverState *bs)
     g_free(s->image_backing_file);
     g_free(s->image_backing_format);
 
+    assert(g_hash_table_size(s->inflight_writes_counters) == 0);
+    g_hash_table_unref(s->inflight_writes_counters);
+
     if (has_data_file(bs)) {
         bdrv_unref_child(bs, s->data_file);
         s->data_file = NULL;
@@ -4097,10 +4107,17 @@ qcow2_co_copy_range_to(BlockDriverState *bs,
             goto fail;
         }
 
+        qcow2_inflight_writes_inc_locked(bs, host_offset, cur_bytes);
+
         qemu_co_mutex_unlock(&s->lock);
+
         ret = bdrv_co_copy_range_to(src, src_offset, s->data_file, host_offset,
                                     cur_bytes, read_flags, write_flags);
+
         qemu_co_mutex_lock(&s->lock);
+
+        qcow2_inflight_writes_dec_locked(bs, host_offset, cur_bytes);
+
         if (ret < 0) {
             goto fail;
         }
@@ -4536,13 +4553,20 @@ qcow2_co_pwritev_compressed_task(BlockDriverState *bs,
     }
 
     ret = qcow2_pre_write_overlap_check(bs, 0, cluster_offset, out_len, true);
-    qemu_co_mutex_unlock(&s->lock);
     if (ret < 0) {
+        qemu_co_mutex_unlock(&s->lock);
         goto fail;
     }
 
+    qcow2_inflight_writes_inc_locked(bs, cluster_offset, out_len);
+
+    qemu_co_mutex_unlock(&s->lock);
+
     BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
     ret = bdrv_co_pwrite(s->data_file, cluster_offset, out_len, out_buf, 0);
+
+    qcow2_inflight_writes_dec(bs, cluster_offset, out_len);
+
     if (ret < 0) {
         goto fail;
     }
diff --git a/tests/qemu-iotests/tests/qcow2-discard-during-rewrite b/tests/qemu-iotests/tests/qcow2-discard-during-rewrite
index 7f0d8a107a..2e2e0d2cb0 100755
--- a/tests/qemu-iotests/tests/qcow2-discard-during-rewrite
+++ b/tests/qemu-iotests/tests/qcow2-discard-during-rewrite
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# group: quick disabled
+# group: quick
 #
 # Test discarding (and reusing) host cluster during writing data to it.
 #
-- 
2.29.2



^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2021-02-25 10:13 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-02-25 10:10 [PATCH v1 0/2] qcow2: fix parallel rewrite and discard Vladimir Sementsov-Ogievskiy
2021-02-25 10:10 ` [PATCH v1 1/2] iotests: add qcow2-discard-during-rewrite Vladimir Sementsov-Ogievskiy
2021-02-25 10:10 ` [PATCH v1 2/2] block/qcow2: introduce inflight writes counters: fix discard Vladimir Sementsov-Ogievskiy

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).