* [Qemu-devel] [PATCH][RFC][0/2] REF+/REF- optimization
From: Frediano Ziglio @ 2011-09-13  7:53 UTC
  To: kwolf; +Cc: qemu-devel, Frediano Ziglio

These patches trade off between leaks and speed for cluster
refcounts.

Refcount increments (REF+ or refp) are handled in a different way from
decrements (REF- or refm). The reason is that postponing or not
flushing a REF- causes "just" a leak, while postponing a REF+ causes
corruption. (A refcount left too high merely wastes space; one left
too low would let a cluster that is still in use be reallocated.)

To optimize REF- I just use an array to store offsets; when a flush is
requested or the array reaches a limit (currently 1022 entries), the
array is sorted and written to disk. I use an array of offsets instead
of ranges to support compression (an offset can appear multiple times
in the array).
I consider this patch quite ready.
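
For illustration, here is a minimal self-contained sketch of the REF-
cache idea described above. All names are invented and the persistence
callback is abstracted away; the real code is in patch 1/2 below.

    #include <stdint.h>
    #include <stdlib.h>

    #define REFM_CACHE_LEN 1024

    static uint64_t refm_cache[REFM_CACHE_LEN];
    static int refm_cache_index;

    static int uint64_cmp(const void *a, const void *b)
    {
        uint64_t x = *(const uint64_t *)a, y = *(const uint64_t *)b;
        return x < y ? -1 : x > y;
    }

    /* Flush: sorting makes offsets that share a refcount block adjacent,
     * so each block is loaded and written once. Duplicate offsets are
     * intentional: each occurrence is one REF- (compressed clusters can
     * share a host cluster). */
    static void refm_flush(void (*refm_dec)(uint64_t))
    {
        int i;

        qsort(refm_cache, refm_cache_index, sizeof(uint64_t), uint64_cmp);
        for (i = 0; i < refm_cache_index; i++) {
            refm_dec(refm_cache[i]);
        }
        refm_cache_index = 0;
    }

    /* Record one REF-; flush first if the array is full. */
    static void refm_add(uint64_t offset, void (*refm_dec)(uint64_t))
    {
        if (refm_cache_index >= REFM_CACHE_LEN) {
            refm_flush(refm_dec);
        }
        refm_cache[refm_cache_index++] = offset;
    }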

To optimize REF+ I mark a range as allocated and hand out new clusters
from this range (avoiding refcount writes to disk). When a flush is
requested, or in some situations (like snapshots), this cache is
disabled and flushed (the unused part is written back as REF-).
I do not consider this patch ready; it works and passes all io-tests,
but for instance I would like to avoid allocating new refcount clusters
during preallocation.
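
Again for illustration, a sketch of the REF+ idea with invented names
(the real code is in patch 2/2 below). The key property is that the
refcounts for the whole range are written *before* any cluster from it
is handed out, so a crash can only leak the unused tail, never corrupt:

    #include <stdint.h>

    #define PREALLOC_BATCH 1024

    /* Assumed helpers, not the real QEMU API: */
    extern int64_t first_free_cluster(void);
    extern void write_refcounts(int64_t first, int64_t count, int addend);
    extern void refm_add_cluster(int64_t cluster_index);

    /* [begin, end) of cluster indexes whose refcounts are already on
     * disk as allocated, but which have not been handed out yet. */
    static int64_t prealloc_begin, prealloc_end;

    static int64_t alloc_cluster(void)
    {
        if (prealloc_begin == prealloc_end) {
            /* Refill: one refcount update covers the whole batch. */
            prealloc_begin = first_free_cluster();
            prealloc_end = prealloc_begin + PREALLOC_BATCH;
            write_refcounts(prealloc_begin, PREALLOC_BATCH, +1);
        }
        return prealloc_begin++;    /* fast path: no refcount I/O */
    }

    /* On flush or snapshot, return the unused tail as deferred REF-. */
    static void prealloc_flush(void)
    {
        int64_t i;

        for (i = prealloc_begin; i < prealloc_end; i++) {
            refm_add_cluster(i);
        }
        prealloc_begin = prealloc_end = 0;
    }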

The end speedup is quite visible when allocating clusters (more than 20%).


Frediano Ziglio (2):
  qcow2: optimize refminus updates
  qcow2: ref+ optimization

 block/qcow2-refcount.c |  270 +++++++++++++++++++++++++++++++++++++++++++++---
 block/qcow2.c          |    2 +
 block/qcow2.h          |   16 +++
 3 files changed, 275 insertions(+), 13 deletions(-)


* [Qemu-devel] [PATCH][RFC][1/2] qcow2: optimize refminus updates
From: Frediano Ziglio @ 2011-09-13  7:53 UTC
  To: kwolf; +Cc: qemu-devel, Frediano Ziglio

Cache refcount decrements in an array to trade off between
leaks and speed.

Signed-off-by: Frediano Ziglio <freddy77@gmail.com>
---
 block/qcow2-refcount.c |  142 ++++++++++++++++++++++++++++++++++++++++++++++--
 block/qcow2.c          |    1 +
 block/qcow2.h          |   14 +++++
 3 files changed, 153 insertions(+), 4 deletions(-)

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 9605367..7d59b68 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -40,6 +40,13 @@ int qcow2_refcount_init(BlockDriverState *bs)
     BDRVQcowState *s = bs->opaque;
     int ret, refcount_table_size2, i;
 
+    s->refm_cache_index = 0;
+    s->refm_cache_len = 1024;
+    s->refm_cache = g_malloc(s->refm_cache_len * sizeof(uint64_t));
+    if (!s->refm_cache) {
+        goto fail;
+    }
+
     refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
     s->refcount_table = g_malloc(refcount_table_size2);
     if (s->refcount_table_size > 0) {
@@ -53,12 +60,14 @@ int qcow2_refcount_init(BlockDriverState *bs)
     }
     return 0;
  fail:
+    g_free(s->refm_cache);
     return -ENOMEM;
 }
 
 void qcow2_refcount_close(BlockDriverState *bs)
 {
     BDRVQcowState *s = bs->opaque;
+    g_free(s->refm_cache);
     g_free(s->refcount_table);
 }
 
@@ -634,13 +643,21 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
 void qcow2_free_clusters(BlockDriverState *bs,
                           int64_t offset, int64_t size)
 {
+    BDRVQcowState *s = bs->opaque;
     int ret;
+    int64_t start, last;
 
     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
-    ret = update_refcount(bs, offset, size, -1);
-    if (ret < 0) {
-        fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
-        /* TODO Remember the clusters to free them later and avoid leaking */
+    start = offset & ~(s->cluster_size - 1);
+    last = (offset + size - 1) & ~(s->cluster_size - 1);
+    for (; start <= last; start += s->cluster_size) {
+        ret = qcow2_refm_add(bs, start);
+        if (ret < 0) {
+            fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
+            /* TODO Remember the clusters to free them later
+             * and avoid leaking */
+            break;
+        }
     }
 }
 
@@ -1165,3 +1182,120 @@ fail:
     return ret;
 }
 
+int qcow2_refm_add_any(BlockDriverState *bs, int64_t offset)
+{
+    BDRVQcowState *s = bs->opaque;
+
+    offset &= ~QCOW_OFLAG_COPIED;
+    if (s->refm_cache_index + 2 > s->refm_cache_len) {
+        int ret = qcow2_refm_flush(bs);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    if ((offset & QCOW_OFLAG_COMPRESSED)) {
+        int nb_csectors = ((offset >> s->csize_shift) & s->csize_mask) + 1;
+        int64_t last;
+
+        offset = (offset & s->cluster_offset_mask) & ~511;
+        last  = offset + nb_csectors * 512 - 1;
+        if (!in_same_refcount_block(s, offset, last)) {
+            s->refm_cache[s->refm_cache_index++] = last;
+        }
+    }
+    s->refm_cache[s->refm_cache_index++] = offset;
+    return 0;
+}
+
+static int uint64_cmp(const void *a, const void *b)
+{
+#define A (*((const uint64_t *)a))
+#define B (*((const uint64_t *)b))
+    if (A == B) {
+        return 0;
+    }
+    return A > B ? 1 : -1;
+#undef A
+#undef B
+}
+
+int qcow2_refm_flush(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint16_t *refcount_block = NULL;
+    int64_t old_table_index = -1;
+    int ret, i, saved_index = 0;
+    int len = s->refm_cache_index;
+
+    /* sort cache */
+    qsort(s->refm_cache, len, sizeof(uint64_t), uint64_cmp);
+
+    /* save */
+    for (i = 0; i < len; ++i) {
+        uint64_t cluster_offset = s->refm_cache[i];
+        int block_index, refcount;
+        int64_t cluster_index = cluster_offset >> s->cluster_bits;
+        int64_t table_index =
+            cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
+
+        /* Load the refcount block and allocate it if needed */
+        if (table_index != old_table_index) {
+            if (refcount_block) {
+                ret = qcow2_cache_put(bs, s->refcount_block_cache,
+                    (void **) &refcount_block);
+                if (ret < 0) {
+                    goto fail;
+                }
+                saved_index = i;
+                refcount_block = NULL;
+            }
+
+            ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
+            if (ret < 0) {
+                goto fail;
+            }
+        }
+        old_table_index = table_index;
+
+        qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);
+
+        /* we can update the count and save it */
+        block_index = cluster_index &
+            ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
+
+        refcount = be16_to_cpu(refcount_block[block_index]);
+        refcount--;
+        if (refcount < 0) {
+            ret = -EINVAL;
+            goto fail;
+        }
+        if (refcount == 0 && cluster_index < s->free_cluster_index) {
+            s->free_cluster_index = cluster_index;
+        }
+        refcount_block[block_index] = cpu_to_be16(refcount);
+    }
+
+    saved_index = len = 0;
+    s->refm_cache_index = 0;
+    ret = 0;
+fail:
+    /* Write last changed block to disk */
+    if (refcount_block) {
+        int wret;
+        wret = qcow2_cache_put(bs, s->refcount_block_cache,
+            (void **) &refcount_block);
+        if (wret < 0) {
+            return ret < 0 ? ret : wret;
+        }
+    }
+
+    if (saved_index < len) {
+        memmove(s->refm_cache, s->refm_cache + saved_index,
+            (len - saved_index) * sizeof(uint64_t));
+        s->refm_cache_index = len - saved_index;
+    }
+
+    return ret;
+}
+
diff --git a/block/qcow2.c b/block/qcow2.c
index 510ff68..89ae765 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -622,6 +622,7 @@ static void qcow2_close(BlockDriverState *bs)
     g_free(s->l1_table);
 
     qcow2_cache_flush(bs, s->l2_table_cache);
+    qcow2_refm_flush(bs);
     qcow2_cache_flush(bs, s->refcount_block_cache);
 
     qcow2_cache_destroy(bs, s->l2_table_cache);
diff --git a/block/qcow2.h b/block/qcow2.h
index 531af39..49d3d55 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -103,6 +103,8 @@ typedef struct BDRVQcowState {
 
     Qcow2Cache* l2_table_cache;
     Qcow2Cache* refcount_block_cache;
+    int refm_cache_len, refm_cache_index;
+    uint64_t *refm_cache;
 
     uint8_t *cluster_cache;
     uint8_t *cluster_data;
@@ -181,6 +183,18 @@ int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
 int qcow2_refcount_init(BlockDriverState *bs);
 void qcow2_refcount_close(BlockDriverState *bs);
 
+int qcow2_refm_add_any(BlockDriverState *bs, int64_t offset);
+int qcow2_refm_flush(BlockDriverState *bs);
+static inline int qcow2_refm_add(BlockDriverState *bs, int64_t offset)
+{
+    BDRVQcowState *s = bs->opaque;
+    if (s->refm_cache_index < s->refm_cache_len) {
+        s->refm_cache[s->refm_cache_index++] = offset;
+        return 0;
+    }
+    return qcow2_refm_add_any(bs, offset);
+}
+
 int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size);
 int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size);
 void qcow2_free_clusters(BlockDriverState *bs,
-- 
1.7.1


* [Qemu-devel] [PATCH][RFC][2/2] qcow2: ref+ optimization
From: Frediano Ziglio @ 2011-09-13  7:53 UTC
  To: kwolf; +Cc: qemu-devel, Frediano Ziglio

Preallocate multiple refcount increments in order to collapse
allocation writes. This causes leaks in case of a QEMU crash, but
no corruption.

Signed-off-by: Frediano Ziglio <freddy77@gmail.com>
---
 block/qcow2-refcount.c |  128 ++++++++++++++++++++++++++++++++++++++++++++---
 block/qcow2.c          |    1 +
 block/qcow2.h          |    2 +
 3 files changed, 122 insertions(+), 9 deletions(-)

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 7d59b68..3792cda 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -30,6 +30,7 @@ static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size);
 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                             int64_t offset, int64_t length,
                             int addend);
+static void qcow2_refp_enable(BlockDriverState *bs);
 
 
 /*********************************************************/
@@ -117,6 +118,12 @@ static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
         ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
     refcount = be16_to_cpu(refcount_block[block_index]);
 
+    /* ignore preallocation */
+    if (cluster_index >= s->refp_prealloc_begin
+        && cluster_index < s->refp_prealloc_end) {
+        --refcount;
+    }
+
     ret = qcow2_cache_put(bs, s->refcount_block_cache,
         (void**) &refcount_block);
     if (ret < 0) {
@@ -207,6 +214,10 @@ static int alloc_refcount_block(BlockDriverState *bs,
      *   refcount block into the cache
      */
 
+    uint64_t old_free_cluster_index = s->free_cluster_index;
+    qcow2_refp_flush(bs);
+    s->free_cluster_index = old_free_cluster_index;
+
     *refcount_block = NULL;
 
     /* We write to the refcount table, so we might depend on L2 tables */
@@ -215,6 +226,7 @@ static int alloc_refcount_block(BlockDriverState *bs,
     /* Allocate the refcount block itself and mark it as used */
     int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
     if (new_block < 0) {
+        qcow2_refp_enable(bs);
         return new_block;
     }
 
@@ -279,6 +291,7 @@ static int alloc_refcount_block(BlockDriverState *bs,
         }
 
         s->refcount_table[refcount_table_index] = new_block;
+        qcow2_refp_enable(bs);
         return 0;
     }
 
@@ -400,10 +413,11 @@ static int alloc_refcount_block(BlockDriverState *bs,
     s->refcount_table_offset = table_offset;
 
     /* Free old table. Remember, we must not change free_cluster_index */
-    uint64_t old_free_cluster_index = s->free_cluster_index;
+    old_free_cluster_index = s->free_cluster_index;
     qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t));
     s->free_cluster_index = old_free_cluster_index;
 
+    qcow2_refp_enable(bs);
     ret = load_refcount_block(bs, new_block, (void**) refcount_block);
     if (ret < 0) {
         return ret;
@@ -417,6 +431,7 @@ fail_block:
     if (*refcount_block != NULL) {
         qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
     }
+    qcow2_refp_enable(bs);
     return ret;
 }
 
@@ -529,9 +544,23 @@ static int update_cluster_refcount(BlockDriverState *bs,
     BDRVQcowState *s = bs->opaque;
     int ret;
 
-    ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend);
-    if (ret < 0) {
-        return ret;
+    /* handle preallocation */
+    if (cluster_index >= s->refp_prealloc_begin
+        && cluster_index < s->refp_prealloc_end) {
+
+        /* free previous (should never happen) */
+        int64_t index = s->refp_prealloc_begin;
+        for (; index < cluster_index; ++index) {
+            qcow2_refm_add(bs, index << s->cluster_bits);
+        }
+        addend--;
+        s->refp_prealloc_begin = cluster_index + 1;
+    }
+    if (addend) {
+        ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend);
+        if (ret < 0) {
+            return ret;
+        }
     }
 
     bdrv_flush(bs->file);
@@ -572,20 +601,94 @@ retry:
     return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
 }
 
+static void qcow2_refp_enable(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+
+    if (s->refp_prealloc_end < 0) {
+        /* enable again ? */
+        if (++s->refp_prealloc_end == 0) {
+            s->refp_prealloc_end =
+                s->refp_prealloc_begin;
+        }
+    }
+}
+
+int qcow2_refp_flush(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    int64_t index, end = s->refp_prealloc_end;
+
+    if (end < 0) {
+        s->refp_prealloc_end = end - 1;
+        return 0;
+    }
+
+    index = s->refp_prealloc_begin;
+    /* this disables further allocations */
+    s->refp_prealloc_end = -1;
+    for (; index < end; ++index) {
+        qcow2_refm_add(bs, index << s->cluster_bits);
+    }
+    qcow2_refm_flush(bs);
+    return 0;
+}
+
 int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size)
 {
-    int64_t offset;
-    int ret;
+    BDRVQcowState *s = bs->opaque;
+    int64_t offset, cluster_index;
+    int ret, nb_clusters;
+    uint32_t n_prealloc = 0;
 
     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
+    nb_clusters = size_to_clusters(s, size);
     offset = alloc_clusters_noref(bs, size);
     if (offset < 0) {
         return offset;
     }
 
-    ret = update_refcount(bs, offset, size, 1);
-    if (ret < 0) {
-        return ret;
+    /* preallocation */
+    cluster_index = offset >> s->cluster_bits;
+    if (cluster_index >= s->refp_prealloc_begin &&
+        cluster_index < s->refp_prealloc_end) {
+
+        /* free previous (should never happen) */
+        int64_t index = s->refp_prealloc_begin;
+        for (; index < cluster_index; ++index) {
+            qcow2_refm_add(bs, index << s->cluster_bits);
+        }
+        while (cluster_index < s->refp_prealloc_end
+            && nb_clusters > 0) {
+            --nb_clusters;
+            ++cluster_index;
+        }
+        s->refp_prealloc_begin = cluster_index;
+    }
+
+    /* try to allocate new space for preallocation */
+    if (s->refp_prealloc_begin == s->refp_prealloc_end) {
+        s->refp_prealloc_begin =
+            s->refp_prealloc_end = cluster_index + nb_clusters;
+        while (nb_clusters < 1024
+            && get_refcount(bs, s->refp_prealloc_begin + n_prealloc) == 0) {
+            ++nb_clusters;
+            ++n_prealloc;
+            s->refp_prealloc_end = -1;
+        }
+    }
+
+    if (nb_clusters) {
+        ret = update_refcount(bs, cluster_index << s->cluster_bits,
+                              nb_clusters << s->cluster_bits, 1);
+        if (ret < 0) {
+            return ret;
+        }
+        if (n_prealloc) {
+            assert(s->refp_prealloc_end == -1);
+            s->refp_prealloc_end =
+                s->refp_prealloc_begin + n_prealloc;
+        }
     }
 
     return offset;
@@ -739,6 +842,9 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
         l1_allocated = 0;
     }
 
+    /* disable preallocation */
+    qcow2_refp_flush(bs);
+
     for(i = 0; i < l1_size; i++) {
         l2_offset = l1_table[i];
         if (l2_offset) {
@@ -761,6 +867,7 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                                        s->csize_mask) + 1;
                         if (addend != 0) {
                             int ret;
+                            /* XXX preallocation ??? */
                             ret = update_refcount(bs,
                                 (offset & s->cluster_offset_mask) & ~511,
                                 nb_csectors * 512, addend);
@@ -836,6 +943,9 @@ fail:
     qcow2_cache_set_writethrough(bs, s->refcount_block_cache,
         old_refcount_writethrough);
 
+    /* enable preallocation again */
+    qcow2_refp_enable(bs);
+
     if (l1_modified) {
         for(i = 0; i < l1_size; i++)
             cpu_to_be64s(&l1_table[i]);
diff --git a/block/qcow2.c b/block/qcow2.c
index 89ae765..51014e1 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -622,6 +622,7 @@ static void qcow2_close(BlockDriverState *bs)
     g_free(s->l1_table);
 
     qcow2_cache_flush(bs, s->l2_table_cache);
+    qcow2_refp_flush(bs);
     qcow2_refm_flush(bs);
     qcow2_cache_flush(bs, s->refcount_block_cache);
 
diff --git a/block/qcow2.h b/block/qcow2.h
index 49d3d55..98b1ab5 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -105,6 +105,7 @@ typedef struct BDRVQcowState {
     Qcow2Cache* refcount_block_cache;
     int refm_cache_len, refm_cache_index;
     uint64_t *refm_cache;
+    int64_t refp_prealloc_begin, refp_prealloc_end;
 
     uint8_t *cluster_cache;
     uint8_t *cluster_data;
@@ -185,6 +186,7 @@ void qcow2_refcount_close(BlockDriverState *bs);
 
 int qcow2_refm_add_any(BlockDriverState *bs, int64_t offset);
 int qcow2_refm_flush(BlockDriverState *bs);
+int qcow2_refp_flush(BlockDriverState *bs);
 static inline int qcow2_refm_add(BlockDriverState *bs, int64_t offset)
 {
     BDRVQcowState *s = bs->opaque;
-- 
1.7.1


* Re: [Qemu-devel] [PATCH][RFC][0/2] REF+/REF- optimization
From: Kevin Wolf @ 2011-09-13 10:37 UTC
  To: Frediano Ziglio; +Cc: qemu-devel

On 13.09.2011 09:53, Frediano Ziglio wrote:
> These patches trade off between leaks and speed for cluster
> refcounts.
> 
> Refcount increments (REF+ or refp) are handled in a different way from
> decrements (REF- or refm). The reason is that postponing or not
> flushing a REF- causes "just" a leak, while postponing a REF+ causes
> corruption.
> 
> To optimize REF- I just use an array to store offsets; when a flush is
> requested or the array reaches a limit (currently 1022 entries), the
> array is sorted and written to disk. I use an array of offsets instead
> of ranges to support compression (an offset can appear multiple times
> in the array).
> I consider this patch quite ready.

Ok, first of all let's clarify what this optimises. I don't think it
changes anything at all for the writeback cache modes, because these
already do most operations in memory only. So this must be about
optimising some operations with cache=writethrough. REF- isn't about
normal cluster allocation, it is about COW with internal snapshots or
bdrv_discard. Do you have benchmarks for any of them?

I strongly disagree with your approach for REF-. We already have a
cache, and introducing a second one sounds like a bad idea. I think we
could get a very similar effect if we introduced a
qcow2_cache_entry_mark_dirty_wb() that marks a given refcount block as
dirty, but at the same time tells the cache that even in write-through
mode it can still treat this block as write-back. This should require
far fewer code changes.
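
A rough sketch of the suggestion, purely as an illustration; the
cache-entry fields below are simplified and invented for this sketch,
only the existing qcow2_cache_entry_mark_dirty() resembles it:

    #include <stdbool.h>

    /* Simplified model of a cache entry; the real Qcow2Cache internals
     * differ. */
    typedef struct Entry {
        bool dirty;
        bool wb_override;   /* write-back allowed even in WT mode */
    } Entry;

    /* Mark a block dirty, but let its write-out be deferred until the
     * next flush even when the cache as a whole is write-through. */
    static void cache_entry_mark_dirty_wb(Entry *e)
    {
        e->dirty = true;
        e->wb_override = true;
    }

    /* The cache's write-out decision would then become: */
    static bool must_write_through(bool cache_is_writethrough, const Entry *e)
    {
        return cache_is_writethrough && !e->wb_override;
    }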

But let's measure the effects first, I suspect that for cluster
allocation it doesn't help much because every REF- comes with a REF+.

> To optimize REF+ I mark a range as allocated and hand out new clusters
> from this range (avoiding refcount writes to disk). When a flush is
> requested, or in some situations (like snapshots), this cache is
> disabled and flushed (the unused part is written back as REF-).
> I do not consider this patch ready; it works and passes all io-tests,
> but for instance I would like to avoid allocating new refcount clusters
> during preallocation.

The only question here is if improving cache=writethrough cluster
allocation performance is worth the additional complexity in the already
complex refcounting code.

The alternative that was discussed before is the dirty bit approach that
is used in QED and would allow us to use writeback for all refcount
blocks, regardless of REF- or REF+. It would be an easier approach
requiring fewer code changes, but it comes with the cost of requiring an
fsck after a qemu crash.
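
A sketch of that alternative (the QCOW2_INCOMPAT_DIRTY flag is invented
here; qcow2_check_refcounts() is the existing checker, everything else
is assumed):

    /* Invented flag for this sketch; a real implementation would need a
     * new header field or image version. */
    #define QCOW2_INCOMPAT_DIRTY 0x1

    static int qcow2_check_if_dirty(BlockDriverState *bs, uint32_t flags)
    {
        if (flags & QCOW2_INCOMPAT_DIRTY) {
            /* Not closed cleanly: rebuild the refcounts instead of
             * trusting whatever made it to disk before the crash. */
            BdrvCheckResult result = {0};
            int ret = qcow2_check_refcounts(bs, &result);
            if (ret < 0) {
                return ret;
            }
        }
        return 0;
    }

    /* The flag would be set (and synced) before the first write-back
     * refcount update, and cleared again on clean close. */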

> The end speedup is quite visible when allocating clusters (more than 20%).

What benchmark do you use for testing this?

Kevin


* Re: [Qemu-devel] [PATCH][RFC][0/2] REF+/REF- optimization
From: Frediano Ziglio @ 2011-09-13 13:36 UTC
  To: Kevin Wolf; +Cc: qemu-devel

2011/9/13 Kevin Wolf <kwolf@redhat.com>:
> On 13.09.2011 09:53, Frediano Ziglio wrote:
>> These patches trade off between leaks and speed for cluster
>> refcounts.
>>
>> Refcount increments (REF+ or refp) are handled in a different way from
>> decrements (REF- or refm). The reason is that postponing or not
>> flushing a REF- causes "just" a leak, while postponing a REF+ causes
>> corruption.
>>
>> To optimize REF- I just use an array to store offsets; when a flush is
>> requested or the array reaches a limit (currently 1022 entries), the
>> array is sorted and written to disk. I use an array of offsets instead
>> of ranges to support compression (an offset can appear multiple times
>> in the array).
>> I consider this patch quite ready.
>
> Ok, first of all let's clarify what this optimises. I don't think it
> changes anything at all for the writeback cache modes, because these
> already do most operations in memory only. So this must be about
> optimising some operations with cache=writethrough. REF- isn't about
> normal cluster allocation, it is about COW with internal snapshots or
> bdrv_discard. Do you have benchmarks for any of them?
>
> I strongly disagree with your approach for REF-. We already have a
> cache, and introducing a second one sounds like a bad idea. I think we
> could get a very similar effect if we introduced a
> qcow2_cache_entry_mark_dirty_wb() that marks a given refcount block as
> dirty, but at the same time tells the cache that even in write-through
> mode it can still treat this block as write-back. This should require
> far fewer code changes.
>

Yes, this mainly optimizes writethrough. I did not test with writeback,
but it should improve that too (I think some flushes are still needed
there to keep consistency).
I'll try to write a qcow2_cache_entry_mark_dirty_wb patch and test it.

> But let's measure the effects first, I suspect that for cluster
> allocation it doesn't help much because every REF- comes with a REF+.
>

That's 50% of the effort if the REF- clusters are far from the REF+ ones :)

>> To optimize REF+ I mark a range as allocated and hand out new clusters
>> from this range (avoiding refcount writes to disk). When a flush is
>> requested, or in some situations (like snapshots), this cache is
>> disabled and flushed (the unused part is written back as REF-).
>> I do not consider this patch ready; it works and passes all io-tests,
>> but for instance I would like to avoid allocating new refcount clusters
>> during preallocation.
>
> The only question here is if improving cache=writethrough cluster
> allocation performance is worth the additional complexity in the already
> complex refcounting code.
>

I didn't see this optimization as a second-level cache, but yes, for
REF- it is a second cache.

> The alternative that was discussed before is the dirty bit approach that
> is used in QED and would allow us to use writeback for all refcount
> blocks, regardless of REF- or REF+. It would be an easier approach
> requiring fewer code changes, but it comes with the cost of requiring an
> fsck after a qemu crash.
>

I was thinking about changing the header magic the first time we change
a refcount, in order to mark the image as dirty: a newer QEMU would
recognize the flag, while an older one would refuse the unknown image.
Obviously the magic would be reverted on image close.
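
For illustration, a sketch of this idea (QCOW_MAGIC is the real value;
the dirty variant and the helper are invented):

    #define QCOW_MAGIC        0x514649fb    /* "QFI\xfb" */
    #define QCOW_MAGIC_DIRTY  0x514649fd    /* invented "dirty" magic */

    /* Flip the on-disk magic before the first deferred refcount update,
     * restore it on clean close. An older qemu would then refuse to
     * open a dirty image instead of silently trusting stale refcounts. */
    static int qcow2_set_dirty_magic(BlockDriverState *bs, bool dirty)
    {
        uint32_t magic = cpu_to_be32(dirty ? QCOW_MAGIC_DIRTY : QCOW_MAGIC);

        return bdrv_pwrite_sync(bs->file, 0, &magic, sizeof(magic));
    }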

>> The end speedup is quite visible when allocating clusters (more than 20%).
>
> What benchmark do you use for testing this?
>
> Kevin
>

Currently I'm using bonnie++, but I noted similar improvements with
iozone. The test script formats an image, then launches a Linux machine
which runs a script and saves the result to a file.
The test image is seen by this virtual machine as a separate disk.
The file on the host resides in a separate LV.
I got quite consistent results (of course not working on the machine
while testing; it is not actually dedicated to this job).

Right now I'm running the test (I added a test working on a snapshot image).

Frediano


* Re: [Qemu-devel] [PATCH][RFC][0/2] REF+/REF- optimization
From: Frediano Ziglio @ 2011-09-13 14:55 UTC
  To: Kevin Wolf; +Cc: qemu-devel

2011/9/13 Kevin Wolf <kwolf@redhat.com>:
> On 13.09.2011 09:53, Frediano Ziglio wrote:
>> These patches trade off between leaks and speed for cluster
>> refcounts.
>>
>> Refcount increments (REF+ or refp) are handled in a different way from
>> decrements (REF- or refm). The reason is that postponing or not
>> flushing a REF- causes "just" a leak, while postponing a REF+ causes
>> corruption.
>>
>> To optimize REF- I just use an array to store offsets; when a flush is
>> requested or the array reaches a limit (currently 1022 entries), the
>> array is sorted and written to disk. I use an array of offsets instead
>> of ranges to support compression (an offset can appear multiple times
>> in the array).
>> I consider this patch quite ready.
>
> Ok, first of all let's clarify what this optimises. I don't think it
> changes anything at all for the writeback cache modes, because these
> already do most operations in memory only. So this must be about
> optimising some operations with cache=writethrough. REF- isn't about
> normal cluster allocation, it is about COW with internal snapshots or
> bdrv_discard. Do you have benchmarks for any of them?
>
> I strongly disagree with your approach for REF-. We already have a
> cache, and introducing a second one sounds like a bad idea. I think we
> could get a very similar effect if we introduced a
> qcow2_cache_entry_mark_dirty_wb() that marks a given refcount block as
> dirty, but at the same time tells the cache that even in write-through
> mode it can still treat this block as write-back. This should require
> far fewer code changes.
>
> But let's measure the effects first, I suspect that for cluster
> allocation it doesn't help much because every REF- comes with a REF+.
>
>> To optimize REF+ I mark a range as allocated and hand out new clusters
>> from this range (avoiding refcount writes to disk). When a flush is
>> requested, or in some situations (like snapshots), this cache is
>> disabled and flushed (the unused part is written back as REF-).
>> I do not consider this patch ready; it works and passes all io-tests,
>> but for instance I would like to avoid allocating new refcount clusters
>> during preallocation.
>
> The only question here is if improving cache=writethrough cluster
> allocation performance is worth the additional complexity in the already
> complex refcounting code.
>
> The alternative that was discussed before is the dirty bit approach that
> is used in QED and would allow us to use writeback for all refcount
> blocks, regardless of REF- or REF+. It would be an easier approach
> requiring fewer code changes, but it comes with the cost of requiring an
> fsck after a qemu crash.
>
>> The end speedup is quite visible when allocating clusters (more than 20%).
>
> What benchmark do you use for testing this?
>
> Kevin
>

Here are some results (KB/s).

With patches (ref-, ref+); qcow2s is qcow2 with a snapshot:

run   raw    qcow2  qcow2s
1     22748  4878   4792
2     29557  15839  23144

Without patches:
run   raw    qcow2  qcow2s
1     21979  4308   1021
2     26249  13776  24182

Frediano


* Re: [Qemu-devel] [PATCH][RFC][0/2] REF+/REF- optimization
From: Kevin Wolf @ 2011-09-14  9:10 UTC
  To: Frediano Ziglio; +Cc: qemu-devel

On 13.09.2011 15:36, Frediano Ziglio wrote:
> 2011/9/13 Kevin Wolf <kwolf@redhat.com>:
>> On 13.09.2011 09:53, Frediano Ziglio wrote:
>>> These patches trade off between leaks and speed for cluster
>>> refcounts.
>>>
>>> Refcount increments (REF+ or refp) are handled in a different way from
>>> decrements (REF- or refm). The reason is that postponing or not
>>> flushing a REF- causes "just" a leak, while postponing a REF+ causes
>>> corruption.
>>>
>>> To optimize REF- I just use an array to store offsets; when a flush is
>>> requested or the array reaches a limit (currently 1022 entries), the
>>> array is sorted and written to disk. I use an array of offsets instead
>>> of ranges to support compression (an offset can appear multiple times
>>> in the array).
>>> I consider this patch quite ready.
>>
>> Ok, first of all let's clarify what this optimises. I don't think it
>> changes anything at all for the writeback cache modes, because these
>> already do most operations in memory only. So this must be about
>> optimising some operations with cache=writethrough. REF- isn't about
>> normal cluster allocation, it is about COW with internal snapshots or
>> bdrv_discard. Do you have benchmarks for any of them?
>>
>> I strongly disagree with your approach for REF-. We already have a
>> cache, and introducing a second one sounds like a bad idea. I think we
>> could get a very similar effect if we introduced a
>> qcow2_cache_entry_mark_dirty_wb() that marks a given refcount block as
>> dirty, but at the same time tells the cache that even in write-through
>> mode it can still treat this block as write-back. This should require
>> far fewer code changes.
>>
> 
> Yes, this mainly optimizes writethrough. I did not test with writeback,
> but it should improve that too (I think some flushes are still needed
> there to keep consistency).
> I'll try to write a qcow2_cache_entry_mark_dirty_wb patch and test it.

Great, thanks!

>> But let's measure the effects first, I suspect that for cluster
>> allocation it doesn't help much because every REF- comes with a REF+.
>>
> 
> That's 50% of the effort if the REF- clusters are far from the REF+ ones :)

I would expect that the next REF+ allocates exactly the REF- cluster.
But you still have a point, we save the write on REF- and combine it
with the REF+ write.

>>> To optimize REF+ I mark a range as allocated and hand out new clusters
>>> from this range (avoiding refcount writes to disk). When a flush is
>>> requested, or in some situations (like snapshots), this cache is
>>> disabled and flushed (the unused part is written back as REF-).
>>> I do not consider this patch ready; it works and passes all io-tests,
>>> but for instance I would like to avoid allocating new refcount clusters
>>> during preallocation.
>>
>> The only question here is if improving cache=writethrough cluster
>> allocation performance is worth the additional complexity in the already
>> complex refcounting code.
>>
> 
> I didn't see this optimization as a second-level cache, but yes, for
> REF- it is a second cache.
> 
>> The alternative that was discussed before is the dirty bit approach that
>> is used in QED and would allow us to use writeback for all refcount
>> blocks, regardless of REF- or REF+. It would be an easier approach
>> requiring fewer code changes, but it comes with the cost of requiring an
>> fsck after a qemu crash.
>>
> 
> I was thinking about changing the header magic the first time we change
> a refcount, in order to mark the image as dirty: a newer QEMU would
> recognize the flag, while an older one would refuse the unknown image.
> Obviously the magic would be reverted on image close.

We've discussed this idea before and I think it wasn't considered a
great idea to automagically change the header in an incompatible way.
But we can always say that for improved performance you need to upgrade
your image to qcow2 v3.

>>> The end speedup is quite visible when allocating clusters (more than 20%).
>>
>> What benchmark do you use for testing this?
>>
>> Kevin
>>
> 
> Currently I'm using bonnie++, but I noted similar improvements with
> iozone. The test script formats an image, then launches a Linux machine
> which runs a script and saves the result to a file.
> The test image is seen by this virtual machine as a separate disk.
> The file on the host resides in a separate LV.
> I got quite consistent results (of course not working on the machine
> while testing; it is not actually dedicated to this job).
> 
> Right now I'm running the test (I added a test working on a snapshot image).

Okay. Let me guess the remaining variables: The image is on an ext4 host
filesystem, you use cache=writethrough and virtio-blk. You don't use
backing files, compression and encryption. For your tests with internal
snapshots you have exactly one internal snapshot that is taken
immediately before the benchmark. Oh, and not to forget, KVM is enabled.

Are these assumptions correct?

Kevin


* Re: [Qemu-devel] [PATCH][RFC][0/2] REF+/REF- optimization
From: Frediano Ziglio @ 2011-09-14  9:52 UTC
  To: Kevin Wolf; +Cc: qemu-devel

2011/9/14 Kevin Wolf <kwolf@redhat.com>:
> On 13.09.2011 15:36, Frediano Ziglio wrote:
>> 2011/9/13 Kevin Wolf <kwolf@redhat.com>:
>>> On 13.09.2011 09:53, Frediano Ziglio wrote:
>>>> These patches trade off between leaks and speed for cluster
>>>> refcounts.
>>>>
>>>> Refcount increments (REF+ or refp) are handled in a different way from
>>>> decrements (REF- or refm). The reason is that postponing or not
>>>> flushing a REF- causes "just" a leak, while postponing a REF+ causes
>>>> corruption.
>>>>
>>>> To optimize REF- I just use an array to store offsets; when a flush is
>>>> requested or the array reaches a limit (currently 1022 entries), the
>>>> array is sorted and written to disk. I use an array of offsets instead
>>>> of ranges to support compression (an offset can appear multiple times
>>>> in the array).
>>>> I consider this patch quite ready.
>>>
>>> Ok, first of all let's clarify what this optimises. I don't think it
>>> changes anything at all for the writeback cache modes, because these
>>> already do most operations in memory only. So this must be about
>>> optimising some operations with cache=writethrough. REF- isn't about
>>> normal cluster allocation, it is about COW with internal snapshots or
>>> bdrv_discard. Do you have benchmarks for any of them?
>>>
>>> I strongly disagree with your approach for REF-. We already have a
>>> cache, and introducing a second one sounds like a bad idea. I think we
>>> could get a very similar effect if we introduced a
>>> qcow2_cache_entry_mark_dirty_wb() that marks a given refcount block as
>>> dirty, but at the same time tells the cache that even in write-through
>>> mode it can still treat this block as write-back. This should require
>>> far fewer code changes.
>>>
>>
>> Yes, this mainly optimizes writethrough. I did not test with writeback,
>> but it should improve that too (I think some flushes are still needed
>> there to keep consistency).
>> I'll try to write a qcow2_cache_entry_mark_dirty_wb patch and test it.
>
> Great, thanks!
>

Don't expect the patch too soon, however; I'm quite busy these days.

>>> But let's measure the effects first, I suspect that for cluster
>>> allocation it doesn't help much because every REF- comes with a REF+.
>>>
>>
>> That's 50% of the effort if the REF- clusters are far from the REF+ ones :)
>
> I would expect that the next REF+ allocates exactly the REF- cluster.
> But you still have a point, we save the write on REF- and combine it
> with the REF+ write.
>

This is still a TODO for REF+ patch.

Oh... a while ago, looking at the refcount code, I realized that in
some cases a freed cluster can be reused only after a QEMU restart.
For instance:
- we get a single-cluster REF- which takes the refcount to 0
- free_cluster_index is decreased to this index
- we get a new request for 2 clusters
- free_cluster_index is increased
We skip the freed cluster, and if we don't get a new deallocation for
a cluster with an index lower than the freed one, that cluster is not
reused.
(I didn't test this behavior: no leak, no corruption, just the image
could be larger than expected.)
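
A simplified model of the allocator's forward scan (modelled on the
real alloc_clusters_noref(); refcount_of() is an assumed helper):

    #include <stdint.h>

    extern int refcount_of(int64_t cluster_index);  /* assumed helper */

    static int64_t free_cluster_index;

    static int64_t alloc_clusters_sketch(int64_t nb_clusters)
    {
        int64_t i;
    retry:
        for (i = 0; i < nb_clusters; i++) {
            if (refcount_of(free_cluster_index++) != 0) {
                goto retry;     /* restart the contiguous run */
            }
        }
        /* The index only ever moves forward here, so a lone free
         * cluster just behind the returned run (say index 5 is free,
         * we need 2 and find them at 6-7) is skipped and stays unused
         * until free_cluster_index is lowered again. */
        return free_cluster_index - nb_clusters;
    }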

>>>> To optimize REF+ I mark a range as allocated and hand out new clusters
>>>> from this range (avoiding refcount writes to disk). When a flush is
>>>> requested, or in some situations (like snapshots), this cache is
>>>> disabled and flushed (the unused part is written back as REF-).
>>>> I do not consider this patch ready; it works and passes all io-tests,
>>>> but for instance I would like to avoid allocating new refcount clusters
>>>> during preallocation.
>>>
>>> The only question here is if improving cache=writethrough cluster
>>> allocation performance is worth the additional complexity in the already
>>> complex refcounting code.
>>>
>>
>> I didn't see this optimization as a second-level cache, but yes, for
>> REF- it is a second cache.
>>
>>> The alternative that was discussed before is the dirty bit approach that
>>> is used in QED and would allow us to use writeback for all refcount
>>> blocks, regardless of REF- or REF+. It would be an easier approach
>>> requiring fewer code changes, but it comes with the cost of requiring an
>>> fsck after a qemu crash.
>>>
>>
>> I was thinking about changing the header magic the first time we change
>> a refcount, in order to mark the image as dirty: a newer QEMU would
>> recognize the flag, while an older one would refuse the unknown image.
>> Obviously the magic would be reverted on image close.
>
> We've discussed this idea before and I think it wasn't considered a
> great idea to automagically change the header in an incompatible way.
> But we can always say that for improved performance you need to upgrade
> your image to qcow2 v3.
>

I don't understand why there is no wiki page detailing the qcow3
changes. I saw your post in May; I have only followed this ML since
August, so I think I missed a lot of discussion on qcow improvements.

>>>> The end speedup is quite visible when allocating clusters (more than 20%).
>>>
>>> What benchmark do you use for testing this?
>>>
>>> Kevin
>>>
>>
>> Currently I'm using bonnie++, but I noted similar improvements with
>> iozone. The test script formats an image, then launches a Linux machine
>> which runs a script and saves the result to a file.
>> The test image is seen by this virtual machine as a separate disk.
>> The file on the host resides in a separate LV.
>> I got quite consistent results (of course not working on the machine
>> while testing; it is not actually dedicated to this job).
>>
>> Right now I'm running the test (I added a test working on a snapshot image).
>
> Okay. Let me guess the remaining variables: The image is on an ext4 host
> filesystem, you use cache=writethrough and virtio-blk. You don't use
> backing files, compression and encryption. For your tests with internal
> snapshots you have exactly one internal snapshot that is taken
> immediately before the benchmark. Oh, and not to forget, KVM is enabled.
>
> Are these assumptions correct?
>

Change ext4 to xfs and the assumptions are OK. Yes, I use internal
snapshots (REF- is useful only in this case). To produce "qcow2s" I
use these commands:

$QEMU_IMG create -f qcow2 -o preallocation=metadata $FILE 15g
$QEMU_IMG snapshot -c test $FILE

> Kevin
>

I ran the tests again (yesterday's tests were not that consistent;
today I stopped all unneeded services and avoided the X session
tunneled over ssh).

Without patches:
run   raw    qcow2   qcow2s
1     22758  4206    1054
2     28401  21745   17997

With patches (ref-, ref+):
run   raw    qcow2   qcow2s
1     22563  4965    4382
2     23823  21043   20904

Besides, I still don't understand the difference in the second runs
for raw (about 20%!!!); this confirms the huge improvement with
snapshots.

Frediano


* Re: [Qemu-devel] [PATCH][RFC][0/2] REF+/REF- optimization
From: Kevin Wolf @ 2011-09-14 10:21 UTC
  To: Frediano Ziglio; +Cc: qemu-devel

On 14.09.2011 11:52, Frediano Ziglio wrote:
> 2011/9/14 Kevin Wolf <kwolf@redhat.com>:
>> On 13.09.2011 15:36, Frediano Ziglio wrote:
>>> 2011/9/13 Kevin Wolf <kwolf@redhat.com>:
>>>> On 13.09.2011 09:53, Frediano Ziglio wrote:
>>>>> These patches trade off between leaks and speed for cluster
>>>>> refcounts.
>>>>>
>>>>> Refcount increments (REF+ or refp) are handled in a different way from
>>>>> decrements (REF- or refm). The reason is that postponing or not
>>>>> flushing a REF- causes "just" a leak, while postponing a REF+ causes
>>>>> corruption.
>>>>>
>>>>> To optimize REF- I just use an array to store offsets; when a flush is
>>>>> requested or the array reaches a limit (currently 1022 entries), the
>>>>> array is sorted and written to disk. I use an array of offsets instead
>>>>> of ranges to support compression (an offset can appear multiple times
>>>>> in the array).
>>>>> I consider this patch quite ready.
>>>>
>>>> Ok, first of all let's clarify what this optimises. I don't think it
>>>> changes anything at all for the writeback cache modes, because these
>>>> already do most operations in memory only. So this must be about
>>>> optimising some operations with cache=writethrough. REF- isn't about
>>>> normal cluster allocation, it is about COW with internal snapshots or
>>>> bdrv_discard. Do you have benchmarks for any of them?
>>>>
>>>> I strongly disagree with your approach for REF-. We already have a
>>>> cache, and introducing a second one sounds like a bad idea. I think we
>>>> could get a very similar effect if we introduced a
>>>> qcow2_cache_entry_mark_dirty_wb() that marks a given refcount block as
>>>> dirty, but at the same time tells the cache that even in write-through
>>>> mode it can still treat this block as write-back. This should require
>>>> far fewer code changes.
>>>>
>>>
>>> Yes, this mainly optimizes writethrough. I did not test with writeback,
>>> but it should improve that too (I think some flushes are still needed
>>> there to keep consistency).
>>> I'll try to write a qcow2_cache_entry_mark_dirty_wb patch and test it.
>>
>> Great, thanks!
>>
> 
> Don't expect the patch too soon, however; I'm quite busy these days.

Ok, no problem. It's not really urgent either.

>>>> But let's measure the effects first, I suspect that for cluster
>>>> allocation it doesn't help much because every REF- comes with a REF+.
>>>>
>>>
>>> That's 50% of the effort if the REF- clusters are far from the REF+ ones :)
>>
>> I would expect that the next REF+ allocates exactly the REF- cluster.
>> But you still have a point, we save the write on REF- and combine it
>> with the REF+ write.
>>
> 
> This is still a TODO for REF+ patch.

Actually, I was talking about the qcow2_cache_entry_mark_dirty_wb() case
without any other change. You get it automatically then.

> Oh... a while ago, looking at the refcount code, I realized that in
> some cases a freed cluster can be reused only after a QEMU restart.
> For instance:
> - we get a single-cluster REF- which takes the refcount to 0
> - free_cluster_index is decreased to this index
> - we get a new request for 2 clusters
> - free_cluster_index is increased
> We skip the freed cluster, and if we don't get a new deallocation for
> a cluster with an index lower than the freed one, that cluster is not
> reused.
> (I didn't test this behavior: no leak, no corruption, just the image
> could be larger than expected.)

Yes, I'm aware of that. I'm not sure if it matters in practice.

>>>>> To optimize REF+ I mark a range as allocated and hand out new clusters
>>>>> from this range (avoiding refcount writes to disk). When a flush is
>>>>> requested, or in some situations (like snapshots), this cache is
>>>>> disabled and flushed (the unused part is written back as REF-).
>>>>> I do not consider this patch ready; it works and passes all io-tests,
>>>>> but for instance I would like to avoid allocating new refcount clusters
>>>>> during preallocation.
>>>>
>>>> The only question here is if improving cache=writethrough cluster
>>>> allocation performance is worth the additional complexity in the already
>>>> complex refcounting code.
>>>>
>>>
>>> I didn't see this optimization as a second-level cache, but yes, for
>>> REF- it is a second cache.
>>>
>>>> The alternative that was discussed before is the dirty bit approach that
>>>> is used in QED and would allow us to use writeback for all refcount
>>>> blocks, regardless of REF- or REF+. It would be an easier approach
>>>> requiring fewer code changes, but it comes with the cost of requiring an
>>>> fsck after a qemu crash.
>>>>
>>>
>>> I was thinking about changing the header magic the first time we change
>>> a refcount, in order to mark the image as dirty: a newer QEMU would
>>> recognize the flag, while an older one would refuse the unknown image.
>>> Obviously the magic would be reverted on image close.
>>
>> We've discussed this idea before and I think it wasn't considered a
>> great idea to automagically change the header in an incompatible way.
>> But we can always say that for improved performance you need to upgrade
>> your image to qcow2 v3.
>>
> 
> I don't understand why there is no wiki page detailing the qcow3
> changes. I saw your post in May; I have only followed this ML since
> August, so I think I missed a lot of discussion on qcow improvements.

Unfortunately there have been almost no comments, so you can consider
RFC v2 as the current proposal.

>>>>> The end speedup is quite visible when allocating clusters (more than 20%).
>>>>
>>>> What benchmark do you use for testing this?
>>>>
>>>> Kevin
>>>>
>>>
>>> Currently I'm using bonnie++, but I noted similar improvements with
>>> iozone. The test script formats an image, then launches a Linux machine
>>> which runs a script and saves the result to a file.
>>> The test image is seen by this virtual machine as a separate disk.
>>> The file on the host resides in a separate LV.
>>> I got quite consistent results (of course not working on the machine
>>> while testing; it is not actually dedicated to this job).
>>>
>>> Right now I'm running the test (I added a test working on a snapshot image).
>>
>> Okay. Let me guess the remaining variables: The image is on an ext4 host
>> filesystem, you use cache=writethrough and virtio-blk. You don't use
>> backing files, compression and encryption. For your tests with internal
>> snapshots you have exactly one internal snapshot that is taken
>> immediately before the benchmark. Oh, and not to forget, KVM is enabled.
>>
>> Are these assumptions correct?
>>
> 
> Change ext4 to xfs and the assumptions are OK. Yes, I use internal
> snapshots (REF- is useful only in this case). To produce "qcow2s" I
> use these commands:
> 
> $QEMU_IMG create -f qcow2 -o preallocation=metadata $FILE 15g
> $QEMU_IMG snapshot -c test $FILE

Ah, using preallocation for this is a nice trick. :-)

Anyway, thanks, I think I understand now what you're measuring.

> I ran the tests again (yesterday's tests were not that consistent;
> today I stopped all unneeded services and avoided the X session
> tunneled over ssh).
> 
> Without patches:
> run   raw    qcow2   qcow2s
> 1     22758  4206    1054
> 2     28401  21745   17997
> 
> With patches (ref-, ref+):
> run   raw    qcow2   qcow2s
> 1     22563  4965    4382
> 2     23823  21043   20904
> 
> Besides, I still don't understand the difference in the second runs
> for raw (about 20%!!!); this confirms the huge improvement with
> snapshots.

Measuring with cache=writethrough means that you use the host page
cache, so I think that could be where you get the difference for raw.
Numbers should be more consistent for cache=none, but that's not what we
want to test...

Did you also test how each patch performs if applied on its own, without
the other one? It would be interesting to see how much REF- optimisation
contributes and how much REF+.

Kevin


* Re: [Qemu-devel] [PATCH][RFC][0/2] REF+/REF- optimization
From: Frediano Ziglio @ 2011-09-14 11:49 UTC
  To: Kevin Wolf; +Cc: qemu-devel

2011/9/14 Kevin Wolf <kwolf@redhat.com>:
[...]
>
>>>>>> To optimize REF+ I mark a range as allocated and hand out new clusters
>>>>>> from this range (avoiding refcount writes to disk). When a flush is
>>>>>> requested, or in some situations (like snapshots), this cache is
>>>>>> disabled and flushed (the unused part is written back as REF-).
>>>>>> I do not consider this patch ready; it works and passes all io-tests,
>>>>>> but for instance I would like to avoid allocating new refcount clusters
>>>>>> during preallocation.
>>>>>
>>>>> The only question here is if improving cache=writethrough cluster
>>>>> allocation performance is worth the additional complexity in the already
>>>>> complex refcounting code.
>>>>>
>>>>
>>>> I didn't see this optimization as a second-level cache, but yes, for
>>>> REF- it is a second cache.
>>>>
>>>>> The alternative that was discussed before is the dirty bit approach that
>>>>> is used in QED and would allow us to use writeback for all refcount
>>>>> blocks, regardless of REF- or REF+. It would be an easier approach
>>>>> requiring fewer code changes, but it comes with the cost of requiring an
>>>>> fsck after a qemu crash.
>>>>>
>>>>
>>>> I was thinking about changing the header magic the first time we change
>>>> a refcount, in order to mark the image as dirty: a newer QEMU would
>>>> recognize the flag, while an older one would refuse the unknown image.
>>>> Obviously the magic would be reverted on image close.
>>>
>>> We've discussed this idea before and I think it wasn't considered a
>>> great idea to automagically change the header in an incompatible way.
>>> But we can always say that for improved performance you need to upgrade
>>> your image to qcow2 v3.
>>>
>>
>> I don't understand why there is no wiki page detailing the qcow3
>> changes. I saw your post in May; I have only followed this ML since
>> August, so I think I missed a lot of discussion on qcow improvements.
>
> Unfortunately there have been almost no comments, so you can consider
> RFC v2 as the current proposal.
>

:(

>>>>>> The end speedup is quite visible when allocating clusters (more than 20%).
>>>>>
>>>>> What benchmark do you use for testing this?
>>>>>
>>>>> Kevin
>>>>>
>>>>
>>>> Currently I'm using bonnie++, but I noted similar improvements with
>>>> iozone. The test script formats an image, then launches a Linux machine
>>>> which runs a script and saves the result to a file.
>>>> The test image is seen by this virtual machine as a separate disk.
>>>> The file on the host resides in a separate LV.
>>>> I got quite consistent results (of course not working on the machine
>>>> while testing; it is not actually dedicated to this job).
>>>>
>>>> Right now I'm running the test (I added a test working on a snapshot image).
>>>
>>> Okay. Let me guess the remaining variables: The image is on an ext4 host
>>> filesystem, you use cache=writethrough and virtio-blk. You don't use
>>> backing files, compression and encryption. For your tests with internal
>>> snapshots you have exactly one internal snapshot that is taken
>>> immediately before the benchmark. Oh, and not to forget, KVM is enabled.
>>>
>>> Are these assumptions correct?
>>>
>>
>> Change ext4 to xfs and the assumptions are OK. Yes, I use internal
>> snapshots (REF- is useful only in this case). To produce "qcow2s" I
>> use these commands:
>>
>> $QEMU_IMG create -f qcow2 -o preallocation=metadata $FILE 15g
>> $QEMU_IMG snapshot -c test $FILE
>
> Ah, using preallocation for this is a nice trick. :-)
>
> Anyway, thanks, I think I understand now what you're measuring.
>

Yes, and real performance is even worse, because with only metadata
preallocated you get far fewer physical reads from disk.

>> I ran the tests again (yesterday's tests were not that consistent;
>> today I stopped all unneeded services and avoided the X session
>> tunneled over ssh).
>>
>> Without patches:
>> run   raw    qcow2   qcow2s
>> 1     22758  4206    1054
>> 2     28401  21745   17997
>>
>> With patches (ref-, ref+):
>> run   raw    qcow2   qcow2s
>> 1     22563  4965    4382
>> 2     23823  21043   20904
>>
>> Besides, I still don't understand the difference in the second runs
>> for raw (about 20%!!!); this confirms the huge improvement with
>> snapshots.
>
> Measuring with cache=writethrough means that you use the host page
> cache, so I think that could be where you get the difference for raw.
> Numbers should be more consistent for cache=none, but that's not what we
> want to test...
>
> Did you also test how each patch performs if applied on its own, without
> the other one? It would be interesting to see how much REF- optimisation
> contributes and how much REF+.
>
> Kevin
>

Here you are:

with only ref-:
run   raw    qcow2   qcow2s
1     22579  4072    4053
2     30322  19667   16830

So for snapshots the improvement comes mostly from REF-.

Frediano


* Re: [Qemu-devel] [PATCH][RFC][0/2] REF+/REF- optimization
From: Frediano Ziglio @ 2011-09-15  7:24 UTC
  To: Kevin Wolf; +Cc: qemu-devel

2011/9/14 Kevin Wolf <kwolf@redhat.com>:
...
>
>>>>> But let's measure the effects first, I suspect that for cluster
>>>>> allocation it doesn't help much because every REF- comes with a REF+.
>>>>>
>>>>
>>>> That's 50% of the effort if the REF- clusters are far from the REF+ ones :)
>>>
>>> I would expect that the next REF+ allocates exactly the REF- cluster.
>>> But you still have a point, we save the write on REF- and combine it
>>> with the REF+ write.
>>>
>>
>> This is still a TODO for REF+ patch.
>
> Actually, I was talking about the qcow2_cache_entry_mark_dirty_wb() case
> without any other change. You get it automatically then.
>

No, I forgot I had already thought about this. REF- clusters do not get
reused during normal operation; a refcount reaches 0 only during
snapshot deletion. That is: if it reached 0 it means it was 1, but if it
had been 1 the copied flag would have been set, so there would have been
no reason to decrement the counter at all (reductio ad absurdum).

Frediano

