All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [RFC PATCHv2] block: optimize zero writes with bdrv_write_zeroes
@ 2014-03-21 14:36 Peter Lieven
  2014-03-21 14:50 ` Eric Blake
  2014-03-26  9:16 ` Markus Armbruster
  0 siblings, 2 replies; 6+ messages in thread
From: Peter Lieven @ 2014-03-21 14:36 UTC (permalink / raw)
  To: qemu-devel; +Cc: kwolf, famz, Peter Lieven, mreitz, stefanha, pbonzini

this patch tries to optimize zero write requests
by automatically using bdrv_write_zeroes if it is
supported by the format.

this should significantly speed up file system initialization and
should speed zero write test used to test backend storage performance.

the difference can simply be tested by e.g.

dd if=/dev/zero of=/dev/vdX bs=1M

Signed-off-by: Peter Lieven <pl@kamp.de>
---
v1->v2: - add detect-zeroes=off|on|unmap knob to drive cmdline parameter
        - call zero detection only for format (bs->file != NULL)

 block.c                   |   39 ++++++++++++++++++++++++++++++++++++++-
 include/block/block_int.h |   12 ++++++++++++
 include/qemu-common.h     |    1 +
 qemu-options.hx           |    6 ++++++
 util/iov.c                |   21 +++++++++++++++++++++
 5 files changed, 78 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index acb70fd..6bda7ce 100644
--- a/block.c
+++ b/block.c
@@ -817,6 +817,25 @@ static int bdrv_assign_node_name(BlockDriverState *bs,
     return 0;
 }
 
+static int bdrv_set_detect_zeroes(BlockDriverState *bs,
+                                  const char *detect_zeroes,
+                                  Error **errp)
+{
+    if (!detect_zeroes || !strncmp(detect_zeroes, "off", 3)) {
+        bs->detect_zeroes = BDRV_DETECT_ZEROES_OFF;
+    } else if (!strncmp(detect_zeroes, "on", 2)) {
+        bs->detect_zeroes = BDRV_DETECT_ZEROES_ON;
+    } else if (!strncmp(detect_zeroes, "unmap", 5)) {
+        bs->detect_zeroes = BDRV_DETECT_ZEROES_UNMAP;
+    } else {
+        error_setg(errp, "invalid value for detect-zeroes: %s",
+                   detect_zeroes);
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
 /*
  * Common part for opening disk images and files
  *
@@ -827,7 +846,7 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
 {
     int ret, open_flags;
     const char *filename;
-    const char *node_name = NULL;
+    const char *node_name = NULL, *detect_zeroes = NULL;
     Error *local_err = NULL;
 
     assert(drv != NULL);
@@ -855,6 +874,13 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
     }
     qdict_del(options, "node-name");
 
+    detect_zeroes = qdict_get_try_str(options, "detect-zeroes");
+    ret = bdrv_set_detect_zeroes(bs, detect_zeroes, errp);
+    if (ret < 0) {
+        return ret;
+    }
+    qdict_del(options, "detect-zeroes");
+
     /* bdrv_open() with directly using a protocol as drv. This layer is already
      * opened, so assign it to bs (while file becomes a closed BlockDriverState)
      * and return immediately. */
@@ -3179,6 +3205,17 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
 
     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
 
+    if (!ret && bs->detect_zeroes != BDRV_DETECT_ZEROES_OFF &&
+        !(flags & BDRV_REQ_ZERO_WRITE) && bs->file &&
+        drv->bdrv_co_write_zeroes && qemu_iovec_is_zero(qiov)) {
+        flags |= BDRV_REQ_ZERO_WRITE;
+        /* if the device was not opened with discard=on the below flag
+         * is immediately cleared again in bdrv_co_do_write_zeroes */
+        if (bs->detect_zeroes == BDRV_DETECT_ZEROES_UNMAP) {
+            flags |= BDRV_REQ_MAY_UNMAP;
+        }
+    }
+
     if (ret < 0) {
         /* Do nothing, write notifier decided to fail this request */
     } else if (flags & BDRV_REQ_ZERO_WRITE) {
diff --git a/include/block/block_int.h b/include/block/block_int.h
index cd5bc73..7a3013a 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -271,6 +271,17 @@ typedef struct BlockLimits {
 } BlockLimits;
 
 /*
+ * Different operation modes for automatic zero detection
+ * to speed the write operation up with bdrv_write_zeroes.
+ */
+typedef enum {
+    BDRV_DETECT_ZEROES_OFF   = 0x0,
+    BDRV_DETECT_ZEROES_ON    = 0x1,
+    /* also set the BDRV_MAY_UNMAP flag with bdrv_write_zeroes */
+    BDRV_DETECT_ZEROES_UNMAP = 0x2,
+} BdrvDetectZeroes;
+
+/*
  * Note: the function bdrv_append() copies and swaps contents of
  * BlockDriverStates, so if you add new fields to this struct, please
  * inspect bdrv_append() to determine if the new fields need to be
@@ -365,6 +376,7 @@ struct BlockDriverState {
     BlockJob *job;
 
     QDict *options;
+    BdrvDetectZeroes detect_zeroes;
 };
 
 int get_tmp_filename(char *filename, int size);
diff --git a/include/qemu-common.h b/include/qemu-common.h
index c8a58a8..574da73 100644
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -330,6 +330,7 @@ void qemu_iovec_concat(QEMUIOVector *dst,
 void qemu_iovec_concat_iov(QEMUIOVector *dst,
                            struct iovec *src_iov, unsigned int src_cnt,
                            size_t soffset, size_t sbytes);
+bool qemu_iovec_is_zero(QEMUIOVector *qiov);
 void qemu_iovec_destroy(QEMUIOVector *qiov);
 void qemu_iovec_reset(QEMUIOVector *qiov);
 size_t qemu_iovec_to_buf(QEMUIOVector *qiov, size_t offset,
diff --git a/qemu-options.hx b/qemu-options.hx
index ee5437b..1836307 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -410,6 +410,7 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive,
     "       [,cache=writethrough|writeback|none|directsync|unsafe][,format=f]\n"
     "       [,serial=s][,addr=A][,id=name][,aio=threads|native]\n"
     "       [,readonly=on|off][,copy-on-read=on|off]\n"
+    "       [,detect-zeroes=on|off|unmap]\n"
     "       [[,bps=b]|[[,bps_rd=r][,bps_wr=w]]]\n"
     "       [[,iops=i]|[[,iops_rd=r][,iops_wr=w]]]\n"
     "       [[,bps_max=bm]|[[,bps_rd_max=rm][,bps_wr_max=wm]]]\n"
@@ -470,6 +471,11 @@ Open drive @option{file} as read-only. Guest write attempts will fail.
 @item copy-on-read=@var{copy-on-read}
 @var{copy-on-read} is "on" or "off" and enables whether to copy read backing
 file sectors into the image file.
+@item detect-zeroes=@var{detect-zeroes}
+@var{detect-zeroes} is "off", "on" or "unmap" and enables the automatic
+conversion of plain zero writes by the OS to driver specific optimized
+zero write commands. If "unmap" is choosen and @var{discard} is "on"
+a zero write may even be converted to an UNMAP operation.
 @end table
 
 By default, the @option{cache=writeback} mode is used. It will report data
diff --git a/util/iov.c b/util/iov.c
index 6569b5a..0b17392 100644
--- a/util/iov.c
+++ b/util/iov.c
@@ -335,6 +335,27 @@ void qemu_iovec_concat(QEMUIOVector *dst,
     qemu_iovec_concat_iov(dst, src->iov, src->niov, soffset, sbytes);
 }
 
+/*
+ *  check if the contents of the iovecs is all zero
+ */
+bool qemu_iovec_is_zero(QEMUIOVector *qiov)
+{
+    int i;
+    for (i = 0; i < qiov->niov; i++) {
+        size_t offs = qiov->iov[i].iov_len & ~(4 * sizeof(long) - 1);
+        uint8_t *ptr = qiov->iov[i].iov_base;
+        if (offs && !buffer_is_zero(qiov->iov[i].iov_base, offs)) {
+            return false;
+        }
+        for (; offs < qiov->iov[i].iov_len; offs++) {
+            if (ptr[offs]) {
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
 void qemu_iovec_destroy(QEMUIOVector *qiov)
 {
     assert(qiov->nalloc != -1);
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [Qemu-devel] [RFC PATCHv2] block: optimize zero writes with bdrv_write_zeroes
  2014-03-21 14:36 [Qemu-devel] [RFC PATCHv2] block: optimize zero writes with bdrv_write_zeroes Peter Lieven
@ 2014-03-21 14:50 ` Eric Blake
  2014-03-21 20:20   ` Peter Lieven
  2014-03-26  9:16 ` Markus Armbruster
  1 sibling, 1 reply; 6+ messages in thread
From: Eric Blake @ 2014-03-21 14:50 UTC (permalink / raw)
  To: Peter Lieven, qemu-devel; +Cc: kwolf, pbonzini, famz, stefanha, mreitz

[-- Attachment #1: Type: text/plain, Size: 2020 bytes --]

On 03/21/2014 08:36 AM, Peter Lieven wrote:
> this patch tries to optimize zero write requests
> by automatically using bdrv_write_zeroes if it is
> supported by the format.
> 
> this should significantly speed up file system initialization and
> should speed zero write test used to test backend storage performance.
> 
> the difference can simply be tested by e.g.
> 
> dd if=/dev/zero of=/dev/vdX bs=1M
> 
> Signed-off-by: Peter Lieven <pl@kamp.de>
> ---
> v1->v2: - add detect-zeroes=off|on|unmap knob to drive cmdline parameter
>         - call zero detection only for format (bs->file != NULL)
> 

> +static int bdrv_set_detect_zeroes(BlockDriverState *bs,
> +                                  const char *detect_zeroes,
> +                                  Error **errp)
> +{
> +    if (!detect_zeroes || !strncmp(detect_zeroes, "off", 3)) {
> +        bs->detect_zeroes = BDRV_DETECT_ZEROES_OFF;

This parses "offer"

> +    } else if (!strncmp(detect_zeroes, "on", 2)) {
> +        bs->detect_zeroes = BDRV_DETECT_ZEROES_ON;

and this parses "onto".

> +    } else if (!strncmp(detect_zeroes, "unmap", 5)) {
> +        bs->detect_zeroes = BDRV_DETECT_ZEROES_UNMAP;

In all three cases, shouldn't you be using strcmp() instead of strncmp()?

> +    } else {
> +        error_setg(errp, "invalid value for detect-zeroes: %s",
> +                   detect_zeroes);

Especially since you warn about other unknown spellings, it feels weird
to not warn about the spellings where the prefix matches but the overall
spelling is unknown.
>  file sectors into the image file.
> +@item detect-zeroes=@var{detect-zeroes}
> +@var{detect-zeroes} is "off", "on" or "unmap" and enables the automatic
> +conversion of plain zero writes by the OS to driver specific optimized
> +zero write commands. If "unmap" is choosen and @var{discard} is "on"

s/choosen/chosen/

-- 
Eric Blake   eblake redhat com    +1-919-301-3266
Libvirt virtualization library http://libvirt.org


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 604 bytes --]

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [Qemu-devel] [RFC PATCHv2] block: optimize zero writes with bdrv_write_zeroes
  2014-03-21 14:50 ` Eric Blake
@ 2014-03-21 20:20   ` Peter Lieven
  0 siblings, 0 replies; 6+ messages in thread
From: Peter Lieven @ 2014-03-21 20:20 UTC (permalink / raw)
  To: Eric Blake, qemu-devel; +Cc: kwolf, pbonzini, famz, stefanha, mreitz

Am 21.03.2014 15:50, schrieb Eric Blake:
> On 03/21/2014 08:36 AM, Peter Lieven wrote:
>> this patch tries to optimize zero write requests
>> by automatically using bdrv_write_zeroes if it is
>> supported by the format.
>>
>> this should significantly speed up file system initialization and
>> should speed zero write test used to test backend storage performance.
>>
>> the difference can simply be tested by e.g.
>>
>> dd if=/dev/zero of=/dev/vdX bs=1M
>>
>> Signed-off-by: Peter Lieven <pl@kamp.de>
>> ---
>> v1->v2: - add detect-zeroes=off|on|unmap knob to drive cmdline parameter
>>         - call zero detection only for format (bs->file != NULL)
>>
>> +static int bdrv_set_detect_zeroes(BlockDriverState *bs,
>> +                                  const char *detect_zeroes,
>> +                                  Error **errp)
>> +{
>> +    if (!detect_zeroes || !strncmp(detect_zeroes, "off", 3)) {
>> +        bs->detect_zeroes = BDRV_DETECT_ZEROES_OFF;
> This parses "offer"
>
>> +    } else if (!strncmp(detect_zeroes, "on", 2)) {
>> +        bs->detect_zeroes = BDRV_DETECT_ZEROES_ON;
> and this parses "onto".
>
>> +    } else if (!strncmp(detect_zeroes, "unmap", 5)) {
>> +        bs->detect_zeroes = BDRV_DETECT_ZEROES_UNMAP;
> In all three cases, shouldn't you be using strcmp() instead of strncmp()?

yes, you are right.

Thanks,
Peter

>
>> +    } else {
>> +        error_setg(errp, "invalid value for detect-zeroes: %s",
>> +                   detect_zeroes);
> Especially since you warn about other unknown spellings, it feels weird
> to not warn about the spellings where the prefix matches but the overall
> spelling is unknown.
>>  file sectors into the image file.
>> +@item detect-zeroes=@var{detect-zeroes}
>> +@var{detect-zeroes} is "off", "on" or "unmap" and enables the automatic
>> +conversion of plain zero writes by the OS to driver specific optimized
>> +zero write commands. If "unmap" is choosen and @var{discard} is "on"
> s/choosen/chosen/
>

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [Qemu-devel] [RFC PATCHv2] block: optimize zero writes with bdrv_write_zeroes
  2014-03-21 14:36 [Qemu-devel] [RFC PATCHv2] block: optimize zero writes with bdrv_write_zeroes Peter Lieven
  2014-03-21 14:50 ` Eric Blake
@ 2014-03-26  9:16 ` Markus Armbruster
  2014-03-27 13:06   ` Peter Lieven
  1 sibling, 1 reply; 6+ messages in thread
From: Markus Armbruster @ 2014-03-26  9:16 UTC (permalink / raw)
  To: Peter Lieven; +Cc: kwolf, famz, qemu-devel, mreitz, stefanha, pbonzini

Peter Lieven <pl@kamp.de> writes:

> this patch tries to optimize zero write requests
> by automatically using bdrv_write_zeroes if it is
> supported by the format.
>
> this should significantly speed up file system initialization and
> should speed zero write test used to test backend storage performance.
>
> the difference can simply be tested by e.g.
>
> dd if=/dev/zero of=/dev/vdX bs=1M

Got actual numbers?  Preferably for some operation that matters to
users.

[...]

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [Qemu-devel] [RFC PATCHv2] block: optimize zero writes with bdrv_write_zeroes
  2014-03-26  9:16 ` Markus Armbruster
@ 2014-03-27 13:06   ` Peter Lieven
  2014-03-27 14:07     ` Markus Armbruster
  0 siblings, 1 reply; 6+ messages in thread
From: Peter Lieven @ 2014-03-27 13:06 UTC (permalink / raw)
  To: Markus Armbruster; +Cc: kwolf, famz, qemu-devel, mreitz, stefanha, pbonzini

On 26.03.2014 10:16, Markus Armbruster wrote:
> Peter Lieven <pl@kamp.de> writes:
>
>> this patch tries to optimize zero write requests
>> by automatically using bdrv_write_zeroes if it is
>> supported by the format.
>>
>> this should significantly speed up file system initialization and
>> should speed zero write test used to test backend storage performance.
>>
>> the difference can simply be tested by e.g.
>>
>> dd if=/dev/zero of=/dev/vdX bs=1M
> Got actual numbers?  Preferably for some operation that matters to
> users.

To give you a rough figure I have an iSCSI Storage for testing that
has an 1GBit connection. The above command gives about 110MB/s
with detect-zeroes=off and about 980MB/s with detect-zeroes=unmap.

Additionally I ran a test with v1 of the patch:

---8<---

I created a 60GB qcow2 container and formatted it with ext4. To immediately show the difference
I disabled lazy inode table and lazy journal init (writing zero takes place immediately then).

Timing without the patch:
time mkfs.ext4 -E lazy_itable_init=0,lazy_journal_init=0 /dev/vda
real 1m5.649s
user 0m0.416s
sys  0m3.148s

Timing with the patch:
time mkfs.ext4 -E lazy_itable_init=0,lazy_journal_init=0 /dev/vdX
real 0m1.228s
user 0m0.116s
sys  0m0.732s

Container Size after Format without the patch: 1150615552 Byte (1097.3MB)
Container Size after Format with the patch: 24645632 Byte (23.5MB)

--->8---

Without the patch means detect-zeroes=off (default) and with the patch means detect-zeroes=unmap.

BR,
Peter

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [Qemu-devel] [RFC PATCHv2] block: optimize zero writes with bdrv_write_zeroes
  2014-03-27 13:06   ` Peter Lieven
@ 2014-03-27 14:07     ` Markus Armbruster
  0 siblings, 0 replies; 6+ messages in thread
From: Markus Armbruster @ 2014-03-27 14:07 UTC (permalink / raw)
  To: Peter Lieven; +Cc: kwolf, famz, qemu-devel, mreitz, stefanha, pbonzini

Peter Lieven <pl@kamp.de> writes:

> On 26.03.2014 10:16, Markus Armbruster wrote:
>> Peter Lieven <pl@kamp.de> writes:
>>
>>> this patch tries to optimize zero write requests
>>> by automatically using bdrv_write_zeroes if it is
>>> supported by the format.
>>>
>>> this should significantly speed up file system initialization and
>>> should speed zero write test used to test backend storage performance.
>>>
>>> the difference can simply be tested by e.g.
>>>
>>> dd if=/dev/zero of=/dev/vdX bs=1M
>> Got actual numbers?  Preferably for some operation that matters to
>> users.
>
> To give you a rough figure I have an iSCSI Storage for testing that
> has an 1GBit connection. The above command gives about 110MB/s
> with detect-zeroes=off and about 980MB/s with detect-zeroes=unmap.
>
> Additionally I ran a test with v1 of the patch:
>
> ---8<---
>
> I created a 60GB qcow2 container and formatted it with ext4. To
> immediately show the difference
> I disabled lazy inode table and lazy journal init (writing zero takes
> place immediately then).
>
> Timing without the patch:
> time mkfs.ext4 -E lazy_itable_init=0,lazy_journal_init=0 /dev/vda
> real 1m5.649s
> user 0m0.416s
> sys  0m3.148s
>
> Timing with the patch:
> time mkfs.ext4 -E lazy_itable_init=0,lazy_journal_init=0 /dev/vdX
> real 0m1.228s
> user 0m0.116s
> sys  0m0.732s
>
> Container Size after Format without the patch: 1150615552 Byte (1097.3MB)
> Container Size after Format with the patch: 24645632 Byte (23.5MB)
>
> --->8---
>
> Without the patch means detect-zeroes=off (default) and with the patch
> means detect-zeroes=unmap.

Recommend to document your testing in the commit message.

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2014-03-27 14:08 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-03-21 14:36 [Qemu-devel] [RFC PATCHv2] block: optimize zero writes with bdrv_write_zeroes Peter Lieven
2014-03-21 14:50 ` Eric Blake
2014-03-21 20:20   ` Peter Lieven
2014-03-26  9:16 ` Markus Armbruster
2014-03-27 13:06   ` Peter Lieven
2014-03-27 14:07     ` Markus Armbruster

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.