QEMU-Devel Archive on lore.kernel.org
 help / color / Atom feed
* [Qemu-devel] [PATCH 0/2] block/nvme: add support for write zeros and discard
@ 2019-08-25  7:15 Maxim Levitsky
  2019-08-25  7:15 ` [Qemu-devel] [PATCH 1/2] block/nvme: add support for write zeros Maxim Levitsky
  2019-08-25  7:15 ` [Qemu-devel] [PATCH 2/2] block/nvme: add support for discard Maxim Levitsky
  0 siblings, 2 replies; 15+ messages in thread
From: Maxim Levitsky @ 2019-08-25  7:15 UTC (permalink / raw)
  To: qemu-devel
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Max Reitz, Paolo Bonzini,
	Maxim Levitsky, John Snow

This is the second part of the patches I prepared
for this driver back when I worked on mdev-nvme.

Best regards,
	Maxim Levitsky

Maxim Levitsky (2):
  block/nvme: add support for write zeros
  block/nvme: add support for discard

 block/nvme.c         | 155 ++++++++++++++++++++++++++++++++++++++++++-
 block/trace-events   |   3 +
 include/block/nvme.h |  19 +++++-
 3 files changed, 175 insertions(+), 2 deletions(-)

-- 
2.17.2



^ permalink raw reply	[flat|nested] 15+ messages in thread

* [Qemu-devel] [PATCH 1/2] block/nvme: add support for write zeros
  2019-08-25  7:15 [Qemu-devel] [PATCH 0/2] block/nvme: add support for write zeros and discard Maxim Levitsky
@ 2019-08-25  7:15 ` Maxim Levitsky
  2019-08-27 22:22   ` John Snow
  2019-08-25  7:15 ` [Qemu-devel] [PATCH 2/2] block/nvme: add support for discard Maxim Levitsky
  1 sibling, 1 reply; 15+ messages in thread
From: Maxim Levitsky @ 2019-08-25  7:15 UTC (permalink / raw)
  To: qemu-devel
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Max Reitz, Paolo Bonzini,
	Maxim Levitsky, John Snow

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
 block/nvme.c         | 72 +++++++++++++++++++++++++++++++++++++++++++-
 block/trace-events   |  1 +
 include/block/nvme.h | 19 +++++++++++-
 3 files changed, 90 insertions(+), 2 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index 5be3a39b63..f8bd11e19a 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -111,6 +111,8 @@ typedef struct {
     uint64_t max_transfer;
     bool plugged;
 
+    bool supports_write_zeros;
+
     CoMutex dma_map_lock;
     CoQueue dma_flush_queue;
 
@@ -421,6 +423,7 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
     NvmeIdNs *idns;
     NvmeLBAF *lbaf;
     uint8_t *resp;
+    uint16_t oncs;
     int r;
     uint64_t iova;
     NvmeCmd cmd = {
@@ -458,6 +461,9 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
     s->max_transfer = MIN_NON_ZERO(s->max_transfer,
                           s->page_size / sizeof(uint64_t) * s->page_size);
 
+    oncs = le16_to_cpu(idctrl->oncs);
+    s->supports_write_zeros = (oncs & NVME_ONCS_WRITE_ZEROS) != 0;
+
     memset(resp, 0, 4096);
 
     cmd.cdw10 = 0;
@@ -470,6 +476,12 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
     s->nsze = le64_to_cpu(idns->nsze);
     lbaf = &idns->lbaf[NVME_ID_NS_FLBAS_INDEX(idns->flbas)];
 
+    if (NVME_ID_NS_DLFEAT_WRITE_ZEROS(idns->dlfeat) &&
+            NVME_ID_NS_DLFEAT_READ_BEHAVIOR(idns->dlfeat) ==
+                    NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROS) {
+        bs->supported_write_flags |= BDRV_REQ_MAY_UNMAP;
+    }
+
     if (lbaf->ms) {
         error_setg(errp, "Namespaces with metadata are not yet supported");
         goto out;
@@ -764,6 +776,8 @@ static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags,
     int ret;
     BDRVNVMeState *s = bs->opaque;
 
+    bs->supported_write_flags = BDRV_REQ_FUA;
+
     opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
     qemu_opts_absorb_qdict(opts, options, &error_abort);
     device = qemu_opt_get(opts, NVME_BLOCK_OPT_DEVICE);
@@ -792,7 +806,6 @@ static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags,
             goto fail;
         }
     }
-    bs->supported_write_flags = BDRV_REQ_FUA;
     return 0;
 fail:
     nvme_close(bs);
@@ -1086,6 +1099,60 @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
 }
 
 
+static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
+                                              int64_t offset,
+                                              int bytes,
+                                              BdrvRequestFlags flags)
+{
+    BDRVNVMeState *s = bs->opaque;
+    NVMeQueuePair *ioq = s->queues[1];
+    NVMeRequest *req;
+
+    uint32_t cdw12 = ((bytes >> s->blkshift) - 1) & 0xFFFF;
+
+    if (!s->supports_write_zeros) {
+        return -ENOTSUP;
+    }
+
+    NvmeCmd cmd = {
+        .opcode = NVME_CMD_WRITE_ZEROS,
+        .nsid = cpu_to_le32(s->nsid),
+        .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF),
+        .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF),
+    };
+
+    NVMeCoData data = {
+        .ctx = bdrv_get_aio_context(bs),
+        .ret = -EINPROGRESS,
+    };
+
+    if (flags & BDRV_REQ_MAY_UNMAP) {
+        cdw12 |= (1 << 25);
+    }
+
+    if (flags & BDRV_REQ_FUA) {
+        cdw12 |= (1 << 30);
+    }
+
+    cmd.cdw12 = cpu_to_le32(cdw12);
+
+    trace_nvme_write_zeros(s, offset, bytes, flags);
+    assert(s->nr_queues > 1);
+    req = nvme_get_free_req(ioq);
+    assert(req);
+
+    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
+
+    data.co = qemu_coroutine_self();
+    while (data.ret == -EINPROGRESS) {
+        qemu_coroutine_yield();
+    }
+
+    trace_nvme_rw_done(s, true, offset, bytes, data.ret);
+    return data.ret;
+}
+
+
 static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
                                BlockReopenQueue *queue, Error **errp)
 {
@@ -1190,6 +1257,9 @@ static BlockDriver bdrv_nvme = {
 
     .bdrv_co_preadv           = nvme_co_preadv,
     .bdrv_co_pwritev          = nvme_co_pwritev,
+
+    .bdrv_co_pwrite_zeroes    = nvme_co_pwrite_zeroes,
+
     .bdrv_co_flush_to_disk    = nvme_co_flush,
     .bdrv_reopen_prepare      = nvme_reopen_prepare,
 
diff --git a/block/trace-events b/block/trace-events
index 04209f058d..8209fbd0c7 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -149,6 +149,7 @@ nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6,
 nvme_handle_event(void *s) "s %p"
 nvme_poll_cb(void *s) "s %p"
 nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset %"PRId64" bytes %"PRId64" flags %d niov %d"
+nvme_write_zeros(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offset %"PRId64" bytes %"PRId64" flags %d"
 nvme_qiov_unaligned(const void *qiov, int n, void *base, size_t size, int align) "qiov %p n %d base %p size 0x%zx align 0x%x"
 nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset %"PRId64" bytes %"PRId64" niov %d is_write %d"
 nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset %"PRId64" bytes %"PRId64" ret %d"
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 3ec8efcc43..1f5b406344 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -653,12 +653,29 @@ typedef struct NvmeIdNs {
     uint8_t     mc;
     uint8_t     dpc;
     uint8_t     dps;
-    uint8_t     res30[98];
+
+    uint8_t     nmic;
+    uint8_t     rescap;
+    uint8_t     fpi;
+    uint8_t     dlfeat;
+
+    uint8_t     res30[94];
     NvmeLBAF    lbaf[16];
     uint8_t     res192[192];
     uint8_t     vs[3712];
 } NvmeIdNs;
 
+
+/*Deallocate Logical Block Features*/
+#define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat)       ((dlfeat) & 0x10)
+#define NVME_ID_NS_DLFEAT_WRITE_ZEROS(dlfeat)     ((dlfeat) & 0x08)
+
+#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR(dlfeat)     ((dlfeat) & 0x7)
+#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR_UNDEFINED   0
+#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROS       1
+#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ONES        2
+
+
 #define NVME_ID_NS_NSFEAT_THIN(nsfeat)      ((nsfeat & 0x1))
 #define NVME_ID_NS_FLBAS_EXTENDED(flbas)    ((flbas >> 4) & 0x1)
 #define NVME_ID_NS_FLBAS_INDEX(flbas)       ((flbas & 0xf))
-- 
2.17.2



^ permalink raw reply	[flat|nested] 15+ messages in thread

* [Qemu-devel] [PATCH 2/2] block/nvme: add support for discard
  2019-08-25  7:15 [Qemu-devel] [PATCH 0/2] block/nvme: add support for write zeros and discard Maxim Levitsky
  2019-08-25  7:15 ` [Qemu-devel] [PATCH 1/2] block/nvme: add support for write zeros Maxim Levitsky
@ 2019-08-25  7:15 ` Maxim Levitsky
  2019-08-27 22:29   ` John Snow
  1 sibling, 1 reply; 15+ messages in thread
From: Maxim Levitsky @ 2019-08-25  7:15 UTC (permalink / raw)
  To: qemu-devel
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Max Reitz, Paolo Bonzini,
	Maxim Levitsky, John Snow

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
 block/nvme.c       | 83 ++++++++++++++++++++++++++++++++++++++++++++++
 block/trace-events |  2 ++
 2 files changed, 85 insertions(+)

diff --git a/block/nvme.c b/block/nvme.c
index f8bd11e19a..dd041f39c9 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -112,6 +112,7 @@ typedef struct {
     bool plugged;
 
     bool supports_write_zeros;
+    bool supports_discard;
 
     CoMutex dma_map_lock;
     CoQueue dma_flush_queue;
@@ -463,6 +464,7 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
 
     oncs = le16_to_cpu(idctrl->oncs);
     s->supports_write_zeros = (oncs & NVME_ONCS_WRITE_ZEROS) != 0;
+    s->supports_discard = (oncs & NVME_ONCS_DSM) != 0;
 
     memset(resp, 0, 4096);
 
@@ -1153,6 +1155,86 @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
 }
 
 
+static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
+                                         int64_t offset,
+                                         int bytes)
+{
+    BDRVNVMeState *s = bs->opaque;
+    NVMeQueuePair *ioq = s->queues[1];
+    NVMeRequest *req;
+    NvmeDsmRange *buf;
+    QEMUIOVector local_qiov;
+    int ret;
+
+    NvmeCmd cmd = {
+        .opcode = NVME_CMD_DSM,
+        .nsid = cpu_to_le32(s->nsid),
+        .cdw10 = cpu_to_le32(0), /*number of ranges - 0 based*/
+        .cdw11 = cpu_to_le32(1 << 2), /*deallocate bit*/
+    };
+
+    NVMeCoData data = {
+        .ctx = bdrv_get_aio_context(bs),
+        .ret = -EINPROGRESS,
+    };
+
+    if (!s->supports_discard) {
+        return -ENOTSUP;
+    }
+
+    assert(s->nr_queues > 1);
+
+    buf = qemu_try_blockalign0(bs, s->page_size);
+    if (!buf) {
+        return -ENOMEM;
+    }
+
+    buf->nlb = cpu_to_le32(bytes >> s->blkshift);
+    buf->slba = cpu_to_le64(offset >> s->blkshift);
+    buf->cattr = 0;
+
+    qemu_iovec_init(&local_qiov, 1);
+    qemu_iovec_add(&local_qiov, buf, 4096);
+
+    req = nvme_get_free_req(ioq);
+    assert(req);
+
+    qemu_co_mutex_lock(&s->dma_map_lock);
+    ret = nvme_cmd_map_qiov(bs, &cmd, req, &local_qiov);
+    qemu_co_mutex_unlock(&s->dma_map_lock);
+
+    if (ret) {
+        req->busy = false;
+        goto out;
+    }
+
+    trace_nvme_dsm(s, offset, bytes);
+
+    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
+
+    data.co = qemu_coroutine_self();
+    while (data.ret == -EINPROGRESS) {
+        qemu_coroutine_yield();
+    }
+
+    qemu_co_mutex_lock(&s->dma_map_lock);
+    ret = nvme_cmd_unmap_qiov(bs, &local_qiov);
+    qemu_co_mutex_unlock(&s->dma_map_lock);
+
+    if (ret) {
+        goto out;
+    }
+
+    ret = data.ret;
+    trace_nvme_dsm_done(s, offset, bytes, ret);
+out:
+    qemu_iovec_destroy(&local_qiov);
+    qemu_vfree(buf);
+    return ret;
+
+}
+
+
 static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
                                BlockReopenQueue *queue, Error **errp)
 {
@@ -1259,6 +1341,7 @@ static BlockDriver bdrv_nvme = {
     .bdrv_co_pwritev          = nvme_co_pwritev,
 
     .bdrv_co_pwrite_zeroes    = nvme_co_pwrite_zeroes,
+    .bdrv_co_pdiscard         = nvme_co_pdiscard,
 
     .bdrv_co_flush_to_disk    = nvme_co_flush,
     .bdrv_reopen_prepare      = nvme_reopen_prepare,
diff --git a/block/trace-events b/block/trace-events
index 8209fbd0c7..7d1d48b502 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -153,6 +153,8 @@ nvme_write_zeros(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offs
 nvme_qiov_unaligned(const void *qiov, int n, void *base, size_t size, int align) "qiov %p n %d base %p size 0x%zx align 0x%x"
 nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset %"PRId64" bytes %"PRId64" niov %d is_write %d"
 nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset %"PRId64" bytes %"PRId64" ret %d"
+nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset %"PRId64" bytes %"PRId64""
+nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset %"PRId64" bytes %"PRId64" ret %d"
 nvme_dma_map_flush(void *s) "s %p"
 nvme_free_req_queue_wait(void *q) "q %p"
 nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d"
-- 
2.17.2



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Qemu-devel] [PATCH 1/2] block/nvme: add support for write zeros
  2019-08-25  7:15 ` [Qemu-devel] [PATCH 1/2] block/nvme: add support for write zeros Maxim Levitsky
@ 2019-08-27 22:22   ` John Snow
  2019-08-28  9:02     ` Maxim Levitsky
  2019-09-13 13:30     ` Maxim Levitsky
  0 siblings, 2 replies; 15+ messages in thread
From: John Snow @ 2019-08-27 22:22 UTC (permalink / raw)
  To: Maxim Levitsky, qemu-devel
  Cc: Fam Zheng, Kevin Wolf, Paolo Bonzini, qemu-block, Max Reitz

Without a commit message, I have no real hope of reviewing this. I was
CC'd, though, so I'll give it a blind shot.

We want to add write_zeroes support for block/nvme, but I can't really
verify any of that is correct or working without a unit test, a spec, or
some instructions to help me verify any of this does what it looks like
it does.

On 8/25/19 3:15 AM, Maxim Levitsky wrote:
> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> ---
>  block/nvme.c         | 72 +++++++++++++++++++++++++++++++++++++++++++-
>  block/trace-events   |  1 +
>  include/block/nvme.h | 19 +++++++++++-
>  3 files changed, 90 insertions(+), 2 deletions(-)
> 
> diff --git a/block/nvme.c b/block/nvme.c
> index 5be3a39b63..f8bd11e19a 100644
> --- a/block/nvme.c
> +++ b/block/nvme.c
> @@ -111,6 +111,8 @@ typedef struct {
>      uint64_t max_transfer;
>      bool plugged;
>  
> +    bool supports_write_zeros;
> +

I suppose the spelling of "zeroes" is not as established as I thought it
was. Actually, what's worse is that the NVME writers apparently couldn't
decide what to name it themselves either:

"Write Zeroes" has 23 hits.
"Write Zeros" has just two, in Figure 114 for the Identify NS Data.

Oh, in QEMU we're not much better:

jhuston@probe (review) ~/s/qemu> git grep -i zeros | wc -l
265
jhuston@probe (review) ~/s/qemu> git grep -i zeroes | wc -l
747

I'm going to suggest that we use 'zeroes' as the spelling here, to match
the existing 'pwrite_zeroes', and then otherwise to match the NVME
spec's usual spelling.

>      CoMutex dma_map_lock;
>      CoQueue dma_flush_queue;
>  
> @@ -421,6 +423,7 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
>      NvmeIdNs *idns;
>      NvmeLBAF *lbaf;
>      uint8_t *resp;
> +    uint16_t oncs;
>      int r;
>      uint64_t iova;
>      NvmeCmd cmd = {
> @@ -458,6 +461,9 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
>      s->max_transfer = MIN_NON_ZERO(s->max_transfer,
>                            s->page_size / sizeof(uint64_t) * s->page_size);
>  
> +    oncs = le16_to_cpu(idctrl->oncs);
> +    s->supports_write_zeros = (oncs & NVME_ONCS_WRITE_ZEROS) != 0;
> +

For other reviewers: oncs is "Optional NVM Command Support".

I think it's better to say `!!(oncs & NVME_ONCS_WRITE_ZEROES)` to remove
doubt over the width of the bitmask.

>      memset(resp, 0, 4096);
>  
>      cmd.cdw10 = 0;
> @@ -470,6 +476,12 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
>      s->nsze = le64_to_cpu(idns->nsze);
>      lbaf = &idns->lbaf[NVME_ID_NS_FLBAS_INDEX(idns->flbas)];
>  
> +    if (NVME_ID_NS_DLFEAT_WRITE_ZEROS(idns->dlfeat) &&
> +            NVME_ID_NS_DLFEAT_READ_BEHAVIOR(idns->dlfeat) ==
> +                    NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROS) {
> +        bs->supported_write_flags |= BDRV_REQ_MAY_UNMAP;
> +    }
> +
>      if (lbaf->ms) {
>          error_setg(errp, "Namespaces with metadata are not yet supported");
>          goto out;
> @@ -764,6 +776,8 @@ static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags,
>      int ret;
>      BDRVNVMeState *s = bs->opaque;
>  
> +    bs->supported_write_flags = BDRV_REQ_FUA;
> +

Is this a related change?

>      opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
>      qemu_opts_absorb_qdict(opts, options, &error_abort);
>      device = qemu_opt_get(opts, NVME_BLOCK_OPT_DEVICE);
> @@ -792,7 +806,6 @@ static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags,
>              goto fail;
>          }
>      }
> -    bs->supported_write_flags = BDRV_REQ_FUA;
>      return 0;
>  fail:
>      nvme_close(bs);
> @@ -1086,6 +1099,60 @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
>  }
>  
>  
> +static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
> +                                              int64_t offset,
> +                                              int bytes,
> +                                              BdrvRequestFlags flags)
> +{
> +    BDRVNVMeState *s = bs->opaque;
> +    NVMeQueuePair *ioq = s->queues[1];

I think it'd be slick to name the queues, but that's not related to this
patch.

> +    NVMeRequest *req;
> +
> +    uint32_t cdw12 = ((bytes >> s->blkshift) - 1) & 0xFFFF;
> +
> +    if (!s->supports_write_zeros) {
> +        return -ENOTSUP;
> +    }
> +
> +    NvmeCmd cmd = {
> +        .opcode = NVME_CMD_WRITE_ZEROS,
> +        .nsid = cpu_to_le32(s->nsid),
> +        .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF),
> +        .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF),
> +    };
> +
> +    NVMeCoData data = {
> +        .ctx = bdrv_get_aio_context(bs),
> +        .ret = -EINPROGRESS,
> +    };
> +
> +    if (flags & BDRV_REQ_MAY_UNMAP) {
> +        cdw12 |= (1 << 25);
> +    }
> +
> +    if (flags & BDRV_REQ_FUA) {
> +        cdw12 |= (1 << 30);
> +    }
> +
> +    cmd.cdw12 = cpu_to_le32(cdw12);
> +
> +    trace_nvme_write_zeros(s, offset, bytes, flags);
> +    assert(s->nr_queues > 1);
> +    req = nvme_get_free_req(ioq);
> +    assert(req);
> +
> +    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
> +
> +    data.co = qemu_coroutine_self();
> +    while (data.ret == -EINPROGRESS) {
> +        qemu_coroutine_yield();
> +    }
> +
> +    trace_nvme_rw_done(s, true, offset, bytes, data.ret);
> +    return data.ret;
> +}
> +
> +
>  static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
>                                 BlockReopenQueue *queue, Error **errp)
>  {
> @@ -1190,6 +1257,9 @@ static BlockDriver bdrv_nvme = {
>  
>      .bdrv_co_preadv           = nvme_co_preadv,
>      .bdrv_co_pwritev          = nvme_co_pwritev,
> +
> +    .bdrv_co_pwrite_zeroes    = nvme_co_pwrite_zeroes,
> +
>      .bdrv_co_flush_to_disk    = nvme_co_flush,
>      .bdrv_reopen_prepare      = nvme_reopen_prepare,
>  
> diff --git a/block/trace-events b/block/trace-events
> index 04209f058d..8209fbd0c7 100644
> --- a/block/trace-events
> +++ b/block/trace-events
> @@ -149,6 +149,7 @@ nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6,
>  nvme_handle_event(void *s) "s %p"
>  nvme_poll_cb(void *s) "s %p"
>  nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset %"PRId64" bytes %"PRId64" flags %d niov %d"
> +nvme_write_zeros(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offset %"PRId64" bytes %"PRId64" flags %d"

I was told once that we wanted trace events to match the name of the
function they occurred within whenever possible.

Maybe we haven't really been very good about enforcing that.

Oh well.

>  nvme_qiov_unaligned(const void *qiov, int n, void *base, size_t size, int align) "qiov %p n %d base %p size 0x%zx align 0x%x"
>  nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset %"PRId64" bytes %"PRId64" niov %d is_write %d"
>  nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset %"PRId64" bytes %"PRId64" ret %d"
> diff --git a/include/block/nvme.h b/include/block/nvme.h
> index 3ec8efcc43..1f5b406344 100644
> --- a/include/block/nvme.h
> +++ b/include/block/nvme.h
> @@ -653,12 +653,29 @@ typedef struct NvmeIdNs {
>      uint8_t     mc;
>      uint8_t     dpc;
>      uint8_t     dps;
> -    uint8_t     res30[98];
> +
> +    uint8_t     nmic;
> +    uint8_t     rescap;
> +    uint8_t     fpi;
> +    uint8_t     dlfeat;
> +
> +    uint8_t     res30[94];

I wouldn't call this "res30" anymore, because now it starts at the 34th
byte instead of the 30th.

>      NvmeLBAF    lbaf[16];
>      uint8_t     res192[192];
>      uint8_t     vs[3712];
>  } NvmeIdNs;
>  

Pre-existing, but what are any of these names supposed to mean?

(I imagine they match the spec, but where?...)

Leaving me some breadcrumbs would greatly reduce the time it takes
someone who doesn't already know NVME to review this, and I suspect
you've looked them up recently, so leaving little notes in the cover
letter at least for relevant sections is very nice for hardware spec
patches like this.

> +
> +/*Deallocate Logical Block Features*/

ah. "dlfeat" --> "Deallocate Logical (Block) Features".

From here:

NVME Express 1.3, Figure 114: Identify - Identify Namespace Data
Structure, NVM Command Set Specific

> +#define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat)       ((dlfeat) & 0x10)

Not used in this patch?

> +#define NVME_ID_NS_DLFEAT_WRITE_ZEROS(dlfeat)     ((dlfeat) & 0x08)
> +
> +#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR(dlfeat)     ((dlfeat) & 0x7)
> +#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR_UNDEFINED   0
> +#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROS       1
> +#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ONES        2
> +
> +
>  #define NVME_ID_NS_NSFEAT_THIN(nsfeat)      ((nsfeat & 0x1))
>  #define NVME_ID_NS_FLBAS_EXTENDED(flbas)    ((flbas >> 4) & 0x1)
>  #define NVME_ID_NS_FLBAS_INDEX(flbas)       ((flbas & 0xf))
> 

Seems good otherwise, but I didn't trace the actual execution of the new
command too far -- I'll assume it works. :)

--js


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2] block/nvme: add support for discard
  2019-08-25  7:15 ` [Qemu-devel] [PATCH 2/2] block/nvme: add support for discard Maxim Levitsky
@ 2019-08-27 22:29   ` John Snow
  2019-08-28  9:03     ` Maxim Levitsky
  0 siblings, 1 reply; 15+ messages in thread
From: John Snow @ 2019-08-27 22:29 UTC (permalink / raw)
  To: Maxim Levitsky, qemu-devel
  Cc: Fam Zheng, Kevin Wolf, Paolo Bonzini, qemu-block, Max Reitz



On 8/25/19 3:15 AM, Maxim Levitsky wrote:
> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> ---
>  block/nvme.c       | 83 ++++++++++++++++++++++++++++++++++++++++++++++
>  block/trace-events |  2 ++
>  2 files changed, 85 insertions(+)
> 
> diff --git a/block/nvme.c b/block/nvme.c
> index f8bd11e19a..dd041f39c9 100644
> --- a/block/nvme.c
> +++ b/block/nvme.c
> @@ -112,6 +112,7 @@ typedef struct {
>      bool plugged;
>  
>      bool supports_write_zeros;
> +    bool supports_discard;
>  
>      CoMutex dma_map_lock;
>      CoQueue dma_flush_queue;
> @@ -463,6 +464,7 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
>  
>      oncs = le16_to_cpu(idctrl->oncs);
>      s->supports_write_zeros = (oncs & NVME_ONCS_WRITE_ZEROS) != 0;
> +    s->supports_discard = (oncs & NVME_ONCS_DSM) != 0;

Same comment -- checking !!(register & FIELD) is nicer than the
negative. (I'm actually not sure even the !! is needed, but it seems to
be a QEMU-ism and I've caught myself using it...)

Rest looks good to me on a skim, but I'm not very well-versed in NVME.

>  
>      memset(resp, 0, 4096);
>  
> @@ -1153,6 +1155,86 @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
>  }
>  
>  
> +static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
> +                                         int64_t offset,
> +                                         int bytes)
> +{
> +    BDRVNVMeState *s = bs->opaque;
> +    NVMeQueuePair *ioq = s->queues[1];
> +    NVMeRequest *req;
> +    NvmeDsmRange *buf;
> +    QEMUIOVector local_qiov;
> +    int ret;
> +
> +    NvmeCmd cmd = {
> +        .opcode = NVME_CMD_DSM,
> +        .nsid = cpu_to_le32(s->nsid),
> +        .cdw10 = cpu_to_le32(0), /*number of ranges - 0 based*/
> +        .cdw11 = cpu_to_le32(1 << 2), /*deallocate bit*/
> +    };
> +
> +    NVMeCoData data = {
> +        .ctx = bdrv_get_aio_context(bs),
> +        .ret = -EINPROGRESS,
> +    };
> +
> +    if (!s->supports_discard) {
> +        return -ENOTSUP;
> +    }
> +
> +    assert(s->nr_queues > 1);
> +
> +    buf = qemu_try_blockalign0(bs, s->page_size);
> +    if (!buf) {
> +        return -ENOMEM;
> +    }
> +
> +    buf->nlb = cpu_to_le32(bytes >> s->blkshift);
> +    buf->slba = cpu_to_le64(offset >> s->blkshift);
> +    buf->cattr = 0;
> +
> +    qemu_iovec_init(&local_qiov, 1);
> +    qemu_iovec_add(&local_qiov, buf, 4096);
> +
> +    req = nvme_get_free_req(ioq);
> +    assert(req);
> +
> +    qemu_co_mutex_lock(&s->dma_map_lock);
> +    ret = nvme_cmd_map_qiov(bs, &cmd, req, &local_qiov);
> +    qemu_co_mutex_unlock(&s->dma_map_lock);
> +
> +    if (ret) {
> +        req->busy = false;
> +        goto out;
> +    }
> +
> +    trace_nvme_dsm(s, offset, bytes);
> +
> +    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
> +
> +    data.co = qemu_coroutine_self();
> +    while (data.ret == -EINPROGRESS) {
> +        qemu_coroutine_yield();
> +    }
> +
> +    qemu_co_mutex_lock(&s->dma_map_lock);
> +    ret = nvme_cmd_unmap_qiov(bs, &local_qiov);
> +    qemu_co_mutex_unlock(&s->dma_map_lock);
> +
> +    if (ret) {
> +        goto out;
> +    }
> +
> +    ret = data.ret;
> +    trace_nvme_dsm_done(s, offset, bytes, ret);
> +out:
> +    qemu_iovec_destroy(&local_qiov);
> +    qemu_vfree(buf);
> +    return ret;
> +
> +}
> +
> +
>  static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
>                                 BlockReopenQueue *queue, Error **errp)
>  {
> @@ -1259,6 +1341,7 @@ static BlockDriver bdrv_nvme = {
>      .bdrv_co_pwritev          = nvme_co_pwritev,
>  
>      .bdrv_co_pwrite_zeroes    = nvme_co_pwrite_zeroes,
> +    .bdrv_co_pdiscard         = nvme_co_pdiscard,
>  
>      .bdrv_co_flush_to_disk    = nvme_co_flush,
>      .bdrv_reopen_prepare      = nvme_reopen_prepare,
> diff --git a/block/trace-events b/block/trace-events
> index 8209fbd0c7..7d1d48b502 100644
> --- a/block/trace-events
> +++ b/block/trace-events
> @@ -153,6 +153,8 @@ nvme_write_zeros(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offs
>  nvme_qiov_unaligned(const void *qiov, int n, void *base, size_t size, int align) "qiov %p n %d base %p size 0x%zx align 0x%x"
>  nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset %"PRId64" bytes %"PRId64" niov %d is_write %d"
>  nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset %"PRId64" bytes %"PRId64" ret %d"
> +nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset %"PRId64" bytes %"PRId64""
> +nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset %"PRId64" bytes %"PRId64" ret %d"
>  nvme_dma_map_flush(void *s) "s %p"
>  nvme_free_req_queue_wait(void *q) "q %p"
>  nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d"
> 


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Qemu-devel] [PATCH 1/2] block/nvme: add support for write zeros
  2019-08-27 22:22   ` John Snow
@ 2019-08-28  9:02     ` Maxim Levitsky
  2019-09-13 13:30     ` Maxim Levitsky
  1 sibling, 0 replies; 15+ messages in thread
From: Maxim Levitsky @ 2019-08-28  9:02 UTC (permalink / raw)
  To: John Snow, qemu-devel
  Cc: Fam Zheng, Kevin Wolf, Paolo Bonzini, qemu-block, Max Reitz

On Tue, 2019-08-27 at 18:22 -0400, John Snow wrote:
> Without a commit message, I have no real hope of reviewing this. I was
> CC'd, though, so I'll give it a blind shot.
> 
> We want to add write_zeroes support for block/nvme, but I can't really
> verify any of that is correct or working without a unit test, a spec, or
> some instructions to help me verify any of this does what it looks like
> it does.

It's a bit problematic to have unit tests for this specific driver,
because it is the first and the only driver which is a driver in the
real sense of the word; that is, it works directly with host hardware,
replacing the in-kernel driver.

I guess some unit tests could be done using a nested VM or so.


Yeah, I was initially confused as well about what write zeros does, but it
does _exactly_ what it says: it just writes zeros to the blocks you ask
for. So it is basically just like a write command, but without data.

Plus, optionally, if the drive declares that after a discard
the blocks read as zeros, you can enable the 'DEALOC' bit in the write zeros
command to specify that, in addition to filling the block with zeros,
it would be nice to discard its flash data as well (this is still a hint,
though).


> 
> On 8/25/19 3:15 AM, Maxim Levitsky wrote:
> > Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> > ---
> >  block/nvme.c         | 72 +++++++++++++++++++++++++++++++++++++++++++-
> >  block/trace-events   |  1 +
> >  include/block/nvme.h | 19 +++++++++++-
> >  3 files changed, 90 insertions(+), 2 deletions(-)
> > 
> > diff --git a/block/nvme.c b/block/nvme.c
> > index 5be3a39b63..f8bd11e19a 100644
> > --- a/block/nvme.c
> > +++ b/block/nvme.c
> > @@ -111,6 +111,8 @@ typedef struct {
> >      uint64_t max_transfer;
> >      bool plugged;
> >  
> > +    bool supports_write_zeros;
> > +
> 
> I suppose the spelling of "zeroes" is not as established as I thought it
> was. Actually, what's worse is that the NVME writers apparently couldn't
> decide what to name it themselves either:
> 
> "Write Zeroes" has 23 hits.
> "Write Zeros" has just two, in Figure 114 for the Identify NS Data.
> 
> Oh, in QEMU we're not much better:
> 
> jhuston@probe (review) ~/s/qemu> git grep -i zeros | wc -l
> 265
> jhuston@probe (review) ~/s/qemu> git grep -i zeroes | wc -l
> 747
> 
> I'm going to suggest that we use 'zeroes' as the spelling here, to match
> the existing 'pwrite_zeroes', and then otherwise to match the NVME
> spec's usual spelling.
No problem, I didn't notice the two spellings, to be honest.


> 
> >      CoMutex dma_map_lock;
> >      CoQueue dma_flush_queue;
> >  
> > @@ -421,6 +423,7 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
> >      NvmeIdNs *idns;
> >      NvmeLBAF *lbaf;
> >      uint8_t *resp;
> > +    uint16_t oncs;
> >      int r;
> >      uint64_t iova;
> >      NvmeCmd cmd = {
> > @@ -458,6 +461,9 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
> >      s->max_transfer = MIN_NON_ZERO(s->max_transfer,
> >                            s->page_size / sizeof(uint64_t) * s->page_size);
> >  
> > +    oncs = le16_to_cpu(idctrl->oncs);
> > +    s->supports_write_zeros = (oncs & NVME_ONCS_WRITE_ZEROS) != 0;
> > +
> 
> For other reviewers: oncs is "Optional NVM Command Support".
> 
> I think it's better to say `!!(oncs & NVME_ONCS_WRITE_ZEROES)` to remove
> doubt over the width of the bitmask.
No problem.


> 
> >      memset(resp, 0, 4096);
> >  
> >      cmd.cdw10 = 0;
> > @@ -470,6 +476,12 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
> >      s->nsze = le64_to_cpu(idns->nsze);
> >      lbaf = &idns->lbaf[NVME_ID_NS_FLBAS_INDEX(idns->flbas)];
> >  
> > +    if (NVME_ID_NS_DLFEAT_WRITE_ZEROS(idns->dlfeat) &&
> > +            NVME_ID_NS_DLFEAT_READ_BEHAVIOR(idns->dlfeat) ==
> > +                    NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROS) {
> > +        bs->supported_write_flags |= BDRV_REQ_MAY_UNMAP;
> > +    }
> > +
> >      if (lbaf->ms) {
> >          error_setg(errp, "Namespaces with metadata are not yet supported");
> >          goto out;
> > @@ -764,6 +776,8 @@ static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags,
> >      int ret;
> >      BDRVNVMeState *s = bs->opaque;
> >  
> > +    bs->supported_write_flags = BDRV_REQ_FUA;
> > +
> 
> Is this a related change?
Yes, this tells qemu that you can 'force unit access' on the write zeros
command, which is a flag (non-optional, thankfully) that tells
the drive to actually write to the flash, and not just cache the result
in some of its caches.

> 
> >      opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
> >      qemu_opts_absorb_qdict(opts, options, &error_abort);
> >      device = qemu_opt_get(opts, NVME_BLOCK_OPT_DEVICE);
> > @@ -792,7 +806,6 @@ static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags,
> >              goto fail;
> >          }
> >      }
> > -    bs->supported_write_flags = BDRV_REQ_FUA;
> >      return 0;
> >  fail:
> >      nvme_close(bs);
> > @@ -1086,6 +1099,60 @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
> >  }
> >  
> >  
> > +static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
> > +                                              int64_t offset,
> > +                                              int bytes,
> > +                                              BdrvRequestFlags flags)
> > +{
> > +    BDRVNVMeState *s = bs->opaque;
> > +    NVMeQueuePair *ioq = s->queues[1];
> 
> I think it'd be slick to name the queues, but that's not related to this
> patch.
You can say that again. There is plenty to do in that driver.
This driver kind of has support for allocating more than one hardware IO queue,
but it only allocates 1 queue. That is actually quite optimal, judging from my work on nvme-mdev.
Note that [1] is because the [0] is the admin queue, so technically it allocates
2 queues, one admin and one IO.

> 
> > +    NVMeRequest *req;
> > +
> > +    uint32_t cdw12 = ((bytes >> s->blkshift) - 1) & 0xFFFF;
> > +
> > +    if (!s->supports_write_zeros) {
> > +        return -ENOTSUP;
> > +    }
> > +
> > +    NvmeCmd cmd = {
> > +        .opcode = NVME_CMD_WRITE_ZEROS,
> > +        .nsid = cpu_to_le32(s->nsid),
> > +        .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF),
> > +        .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF),
> > +    };
> > +
> > +    NVMeCoData data = {
> > +        .ctx = bdrv_get_aio_context(bs),
> > +        .ret = -EINPROGRESS,
> > +    };
> > +
> > +    if (flags & BDRV_REQ_MAY_UNMAP) {
> > +        cdw12 |= (1 << 25);
> > +    }
> > +
> > +    if (flags & BDRV_REQ_FUA) {
> > +        cdw12 |= (1 << 30);
> > +    }
> > +
> > +    cmd.cdw12 = cpu_to_le32(cdw12);
> > +
> > +    trace_nvme_write_zeros(s, offset, bytes, flags);
> > +    assert(s->nr_queues > 1);
> > +    req = nvme_get_free_req(ioq);
> > +    assert(req);
> > +
> > +    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
> > +
> > +    data.co = qemu_coroutine_self();
> > +    while (data.ret == -EINPROGRESS) {
> > +        qemu_coroutine_yield();
> > +    }
> > +
> > +    trace_nvme_rw_done(s, true, offset, bytes, data.ret);
> > +    return data.ret;
> > +}
> > +
> > +
> >  static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
> >                                 BlockReopenQueue *queue, Error **errp)
> >  {
> > @@ -1190,6 +1257,9 @@ static BlockDriver bdrv_nvme = {
> >  
> >      .bdrv_co_preadv           = nvme_co_preadv,
> >      .bdrv_co_pwritev          = nvme_co_pwritev,
> > +
> > +    .bdrv_co_pwrite_zeroes    = nvme_co_pwrite_zeroes,
> > +
> >      .bdrv_co_flush_to_disk    = nvme_co_flush,
> >      .bdrv_reopen_prepare      = nvme_reopen_prepare,
> >  
> > diff --git a/block/trace-events b/block/trace-events
> > index 04209f058d..8209fbd0c7 100644
> > --- a/block/trace-events
> > +++ b/block/trace-events
> > @@ -149,6 +149,7 @@ nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6,
> >  nvme_handle_event(void *s) "s %p"
> >  nvme_poll_cb(void *s) "s %p"
> >  nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset %"PRId64" bytes %"PRId64" flags %d niov %d"
> > +nvme_write_zeros(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offset %"PRId64" bytes %"PRId64" flags %d"
> 
> I was told once that we wanted trace events to match the name of the
> function they occurred within whenever possible.
> 
> Maybe we haven't really been very good about enforcing that.
I didn't know that.


> 
> Oh well.
> 
> >  nvme_qiov_unaligned(const void *qiov, int n, void *base, size_t size, int align) "qiov %p n %d base %p size 0x%zx align 0x%x"
> >  nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset %"PRId64" bytes %"PRId64" niov %d is_write %d"
> >  nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset %"PRId64" bytes %"PRId64" ret %d"
> > diff --git a/include/block/nvme.h b/include/block/nvme.h
> > index 3ec8efcc43..1f5b406344 100644
> > --- a/include/block/nvme.h
> > +++ b/include/block/nvme.h
> > @@ -653,12 +653,29 @@ typedef struct NvmeIdNs {
> >      uint8_t     mc;
> >      uint8_t     dpc;
> >      uint8_t     dps;
> > -    uint8_t     res30[98];
> > +
> > +    uint8_t     nmic;
> > +    uint8_t     rescap;
> > +    uint8_t     fpi;
> > +    uint8_t     dlfeat;
> > +
> > +    uint8_t     res30[94];
> 
> I wouldn't call this "res30" anymore, because now it starts at the 34th
> byte instead of the 30th.

True. That is one of the nice things about keeping a separate userspace nvme driver
(nvme-mdev was made to prevent this): you need to duplicate everything,
and that includes the spec structures that already exist and are up to date in the kernel.

> 
> >      NvmeLBAF    lbaf[16];
> >      uint8_t     res192[192];
> >      uint8_t     vs[3712];
> >  } NvmeIdNs;
> >  
> 
> Pre-existing, but what are any of these names supposed to mean?
> 
> (I imagine they match the spec, but where?...)
lbaf = LBA formats; there are up to 16 LBA formats, where
each of them specifies the LBA block size, metadata size,
and a hint about how fast that format is.
Basically this allows the user to 'low level format' the drive
to several sector sizes (usually 512 and 4K), plus enable/disable metadata
(extra data that the drive can store per sector for checksumming purposes).

vs = vendor specific.

> 
> Leaving me some breadcrumbs would greatly reduce the time it takes
> someone who doesn't already know NVME to review this, and I suspect
> you've looked them up recently, so leaving little notes in the cover
> letter at least for relevant sections is very nice for hardware spec
> patches like this.
> 
> > +
> > +/*Deallocate Logical Block Features*/
> 
> ah. "dlfeat" --> "Deallocate Logical (Block) Features".
> 
> From here:
> 
> NVME Express 1.3, Figure 114: Identify - Identify Namespace Data
> Structure, NVM Command Set Specific
> 
> > +#define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat)       ((dlfeat) & 0x10)
> 
> Not used in this patch?
Yep, this thing is metadata-related, which we don't support yet.

> 
> > +#define NVME_ID_NS_DLFEAT_WRITE_ZEROS(dlfeat)     ((dlfeat) & 0x08)
> > +
> > +#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR(dlfeat)     ((dlfeat) & 0x7)
> > +#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR_UNDEFINED   0
> > +#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROS       1
> > +#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ONES        2
> > +
> > +
> >  #define NVME_ID_NS_NSFEAT_THIN(nsfeat)      ((nsfeat & 0x1))
> >  #define NVME_ID_NS_FLBAS_EXTENDED(flbas)    ((flbas >> 4) & 0x1)
> >  #define NVME_ID_NS_FLBAS_INDEX(flbas)       ((flbas & 0xf))
> > 
> 
> Seems good otherwise, but I didn't trace the actual execution of the new
> command too far -- I'll assume it works. :)

I tested it a bit on the drives I have, seems to work.

Best regards,
	Maxim Levitsky



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2] block/nvme: add support for discard
  2019-08-27 22:29   ` John Snow
@ 2019-08-28  9:03     ` Maxim Levitsky
  2019-09-05 13:24       ` Maxim Levitsky
  0 siblings, 1 reply; 15+ messages in thread
From: Maxim Levitsky @ 2019-08-28  9:03 UTC (permalink / raw)
  To: John Snow, qemu-devel
  Cc: Fam Zheng, Kevin Wolf, Paolo Bonzini, qemu-block, Max Reitz

On Tue, 2019-08-27 at 18:29 -0400, John Snow wrote:
> 
> On 8/25/19 3:15 AM, Maxim Levitsky wrote:
> > Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> > ---
> >  block/nvme.c       | 83 ++++++++++++++++++++++++++++++++++++++++++++++
> >  block/trace-events |  2 ++
> >  2 files changed, 85 insertions(+)
> > 
> > diff --git a/block/nvme.c b/block/nvme.c
> > index f8bd11e19a..dd041f39c9 100644
> > --- a/block/nvme.c
> > +++ b/block/nvme.c
> > @@ -112,6 +112,7 @@ typedef struct {
> >      bool plugged;
> >  
> >      bool supports_write_zeros;
> > +    bool supports_discard;
> >  
> >      CoMutex dma_map_lock;
> >      CoQueue dma_flush_queue;
> > @@ -463,6 +464,7 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
> >  
> >      oncs = le16_to_cpu(idctrl->oncs);
> >      s->supports_write_zeros = (oncs & NVME_ONCS_WRITE_ZEROS) != 0;
> > +    s->supports_discard = (oncs & NVME_ONCS_DSM) != 0;
> 
> Same comment -- checking !!(register & FIELD) is nicer than the
> negative. (I'm actually not sure even the !! is needed, but it seems to
> be a QEMU-ism and I've caught myself using it...)

All right, no problem to use !!

> 
> Rest looks good to me on a skim, but I'm not very well-versed in NVME.
Thanks!


> 
> >  
> >      memset(resp, 0, 4096);
> >  
> > @@ -1153,6 +1155,86 @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
> >  }
> >  
> >  
> > +static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
> > +                                         int64_t offset,
> > +                                         int bytes)
> > +{
> > +    BDRVNVMeState *s = bs->opaque;
> > +    NVMeQueuePair *ioq = s->queues[1];
> > +    NVMeRequest *req;
> > +    NvmeDsmRange *buf;
> > +    QEMUIOVector local_qiov;
> > +    int ret;
> > +
> > +    NvmeCmd cmd = {
> > +        .opcode = NVME_CMD_DSM,
> > +        .nsid = cpu_to_le32(s->nsid),
> > +        .cdw10 = cpu_to_le32(0), /*number of ranges - 0 based*/
> > +        .cdw11 = cpu_to_le32(1 << 2), /*deallocate bit*/
> > +    };
> > +
> > +    NVMeCoData data = {
> > +        .ctx = bdrv_get_aio_context(bs),
> > +        .ret = -EINPROGRESS,
> > +    };
> > +
> > +    if (!s->supports_discard) {
> > +        return -ENOTSUP;
> > +    }
> > +
> > +    assert(s->nr_queues > 1);
> > +
> > +    buf = qemu_try_blockalign0(bs, s->page_size);
> > +    if (!buf) {
> > +        return -ENOMEM;
> > +    }
> > +
> > +    buf->nlb = cpu_to_le32(bytes >> s->blkshift);
> > +    buf->slba = cpu_to_le64(offset >> s->blkshift);
> > +    buf->cattr = 0;
> > +
> > +    qemu_iovec_init(&local_qiov, 1);
> > +    qemu_iovec_add(&local_qiov, buf, 4096);
> > +
> > +    req = nvme_get_free_req(ioq);
> > +    assert(req);
> > +
> > +    qemu_co_mutex_lock(&s->dma_map_lock);
> > +    ret = nvme_cmd_map_qiov(bs, &cmd, req, &local_qiov);
> > +    qemu_co_mutex_unlock(&s->dma_map_lock);
> > +
> > +    if (ret) {
> > +        req->busy = false;
> > +        goto out;
> > +    }
> > +
> > +    trace_nvme_dsm(s, offset, bytes);
> > +
> > +    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
> > +
> > +    data.co = qemu_coroutine_self();
> > +    while (data.ret == -EINPROGRESS) {
> > +        qemu_coroutine_yield();
> > +    }
> > +
> > +    qemu_co_mutex_lock(&s->dma_map_lock);
> > +    ret = nvme_cmd_unmap_qiov(bs, &local_qiov);
> > +    qemu_co_mutex_unlock(&s->dma_map_lock);
> > +
> > +    if (ret) {
> > +        goto out;
> > +    }
> > +
> > +    ret = data.ret;
> > +    trace_nvme_dsm_done(s, offset, bytes, ret);
> > +out:
> > +    qemu_iovec_destroy(&local_qiov);
> > +    qemu_vfree(buf);
> > +    return ret;
> > +
> > +}
> > +
> > +
> >  static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
> >                                 BlockReopenQueue *queue, Error **errp)
> >  {
> > @@ -1259,6 +1341,7 @@ static BlockDriver bdrv_nvme = {
> >      .bdrv_co_pwritev          = nvme_co_pwritev,
> >  
> >      .bdrv_co_pwrite_zeroes    = nvme_co_pwrite_zeroes,
> > +    .bdrv_co_pdiscard         = nvme_co_pdiscard,
> >  
> >      .bdrv_co_flush_to_disk    = nvme_co_flush,
> >      .bdrv_reopen_prepare      = nvme_reopen_prepare,
> > diff --git a/block/trace-events b/block/trace-events
> > index 8209fbd0c7..7d1d48b502 100644
> > --- a/block/trace-events
> > +++ b/block/trace-events
> > @@ -153,6 +153,8 @@ nvme_write_zeros(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offs
> >  nvme_qiov_unaligned(const void *qiov, int n, void *base, size_t size, int align) "qiov %p n %d base %p size 0x%zx align 0x%x"
> >  nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset %"PRId64" bytes %"PRId64" niov %d is_write %d"
> >  nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset %"PRId64" bytes %"PRId64" ret %d"
> > +nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset %"PRId64" bytes %"PRId64""
> > +nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset %"PRId64" bytes %"PRId64" ret %d"
> >  nvme_dma_map_flush(void *s) "s %p"
> >  nvme_free_req_queue_wait(void *q) "q %p"
> >  nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d"
> > 


Best regards,
	Maxim Levitsky



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2] block/nvme: add support for discard
  2019-08-28  9:03     ` Maxim Levitsky
@ 2019-09-05 13:24       ` Maxim Levitsky
  2019-09-05 17:27         ` John Snow
  0 siblings, 1 reply; 15+ messages in thread
From: Maxim Levitsky @ 2019-09-05 13:24 UTC (permalink / raw)
  To: John Snow, qemu-devel
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Max Reitz, John Ferlan, Paolo Bonzini

On Wed, 2019-08-28 at 12:03 +0300, Maxim Levitsky wrote:
> On Tue, 2019-08-27 at 18:29 -0400, John Snow wrote:
> > 
> > On 8/25/19 3:15 AM, Maxim Levitsky wrote:
> > > Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> > > ---
> > >  block/nvme.c       | 83 ++++++++++++++++++++++++++++++++++++++++++++++
> > >  block/trace-events |  2 ++
> > >  2 files changed, 85 insertions(+)
> > > 
> > > diff --git a/block/nvme.c b/block/nvme.c
> > > index f8bd11e19a..dd041f39c9 100644
> > > --- a/block/nvme.c
> > > +++ b/block/nvme.c
> > > @@ -112,6 +112,7 @@ typedef struct {
> > >      bool plugged;
> > >  
> > >      bool supports_write_zeros;
> > > +    bool supports_discard;
> > >  
> > >      CoMutex dma_map_lock;
> > >      CoQueue dma_flush_queue;
> > > @@ -463,6 +464,7 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
> > >  
> > >      oncs = le16_to_cpu(idctrl->oncs);
> > >      s->supports_write_zeros = (oncs & NVME_ONCS_WRITE_ZEROS) != 0;
> > > +    s->supports_discard = (oncs & NVME_ONCS_DSM) != 0;
> > 
> > Same comment -- checking !!(register & FIELD) is nicer than the
> > negative. (I'm actually not sure even the !! is needed, but it seems to
> > be a QEMU-ism and I've caught myself using it...)
> 
> All right, no problem to use !!
> 
> > 
> > Rest looks good to me on a skim, but I'm not very well-versed in NVME.
> 
> Thanks!
> 

Kind ping about this patch series.

Apart from using !!, do you think that this patch series
can be merged, or should I do anything else?
Which tree do you think this should be committed to?

I kind of want to see that merged before the freeze
starts, if there are no objections,
to reduce the amount of pending stuff in my queue.


[...]



Best regards,
	Maxim Levitsky



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2] block/nvme: add support for discard
  2019-09-05 13:24       ` Maxim Levitsky
@ 2019-09-05 17:27         ` John Snow
  2019-09-05 17:32           ` Maxim Levitsky
  2019-09-09  9:25           ` Max Reitz
  0 siblings, 2 replies; 15+ messages in thread
From: John Snow @ 2019-09-05 17:27 UTC (permalink / raw)
  To: Maxim Levitsky, qemu-devel
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Max Reitz, John Ferlan, Paolo Bonzini



On 9/5/19 9:24 AM, Maxim Levitsky wrote:
> On Wed, 2019-08-28 at 12:03 +0300, Maxim Levitsky wrote:
>> On Tue, 2019-08-27 at 18:29 -0400, John Snow wrote:
>>>
>>> On 8/25/19 3:15 AM, Maxim Levitsky wrote:
>>>> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
>>>> ---
>>>>  block/nvme.c       | 83 ++++++++++++++++++++++++++++++++++++++++++++++
>>>>  block/trace-events |  2 ++
>>>>  2 files changed, 85 insertions(+)
>>>>
>>>> diff --git a/block/nvme.c b/block/nvme.c
>>>> index f8bd11e19a..dd041f39c9 100644
>>>> --- a/block/nvme.c
>>>> +++ b/block/nvme.c
>>>> @@ -112,6 +112,7 @@ typedef struct {
>>>>      bool plugged;
>>>>  
>>>>      bool supports_write_zeros;
>>>> +    bool supports_discard;
>>>>  
>>>>      CoMutex dma_map_lock;
>>>>      CoQueue dma_flush_queue;
>>>> @@ -463,6 +464,7 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
>>>>  
>>>>      oncs = le16_to_cpu(idctrl->oncs);
>>>>      s->supports_write_zeros = (oncs & NVME_ONCS_WRITE_ZEROS) != 0;
>>>> +    s->supports_discard = (oncs & NVME_ONCS_DSM) != 0;
>>>
>>> Same comment -- checking !!(register & FIELD) is nicer than the
>>> negative. (I'm actually not sure even the !! is needed, but it seems to
>>> be a QEMU-ism and I've caught myself using it...)
>>
>> All right, no problem to use !!
>>
>>>
>>> Rest looks good to me on a skim, but I'm not very well-versed in NVME.
>>
>> Thanks!
>>
> 
> Kind ping about this patch series.
> 
> Apart from using !!, do you think that this patch series
> can be merged, or should I do anything else?
> Which tree do you think this should be committed to?
> 
> I kind of want to see that merged before the freeze
> starts, if there are no objections,
> to reduce the amount of pending stuff in my queue.
> 

Didn't I ask a few other things?

like not using "res30" because you've moved the fields around, and
trying to be consistent about "zeros" vs "zeroes".

Removing "+#define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat)       ((dlfeat) &
0x10)" because it's unused.

You also probably require review (or at least an ACK) from Keith Busch
who maintains this file.

--js


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2] block/nvme: add support for discard
  2019-09-05 17:27         ` John Snow
@ 2019-09-05 17:32           ` Maxim Levitsky
  2019-09-09  9:25           ` Max Reitz
  1 sibling, 0 replies; 15+ messages in thread
From: Maxim Levitsky @ 2019-09-05 17:32 UTC (permalink / raw)
  To: John Snow, qemu-devel
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Max Reitz, John Ferlan,
	Keith Busch, Paolo Bonzini

On Thu, 2019-09-05 at 13:27 -0400, John Snow wrote:
> 
> On 9/5/19 9:24 AM, Maxim Levitsky wrote:
> > On Wed, 2019-08-28 at 12:03 +0300, Maxim Levitsky wrote:
> > > On Tue, 2019-08-27 at 18:29 -0400, John Snow wrote:
> > > > 
> > > > On 8/25/19 3:15 AM, Maxim Levitsky wrote:
> > > > > Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> > > > > ---
> > > > >  block/nvme.c       | 83 ++++++++++++++++++++++++++++++++++++++++++++++
> > > > >  block/trace-events |  2 ++
> > > > >  2 files changed, 85 insertions(+)
> > > > > 
> > > > > diff --git a/block/nvme.c b/block/nvme.c
> > > > > index f8bd11e19a..dd041f39c9 100644
> > > > > --- a/block/nvme.c
> > > > > +++ b/block/nvme.c
> > > > > @@ -112,6 +112,7 @@ typedef struct {
> > > > >      bool plugged;
> > > > >  
> > > > >      bool supports_write_zeros;
> > > > > +    bool supports_discard;
> > > > >  
> > > > >      CoMutex dma_map_lock;
> > > > >      CoQueue dma_flush_queue;
> > > > > @@ -463,6 +464,7 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
> > > > >  
> > > > >      oncs = le16_to_cpu(idctrl->oncs);
> > > > >      s->supports_write_zeros = (oncs & NVME_ONCS_WRITE_ZEROS) != 0;
> > > > > +    s->supports_discard = (oncs & NVME_ONCS_DSM) != 0;
> > > > 
> > > > Same comment -- checking !!(register & FIELD) is nicer than the
> > > > negative. (I'm actually not sure even the !! is needed, but it seems to
> > > > be a QEMU-ism and I've caught myself using it...)
> > > 
> > > All right, no problem to use !!
> > > 
> > > > 
> > > > Rest looks good to me on a skim, but I'm not very well-versed in NVME.
> > > 
> > > Thanks!
> > > 
> > 
> > Kind ping about this patch series.
> > 
> > Apart from using !!, do you think that this patch series
> > can be merged, or should I do anything else?
> > Which tree do you think this should be committed to?
> > 
> > I kind of want to see that merged before the freeze
> > starts, if there are no objections,
> > to reduce the amount of pending stuff in my queue.
> > 
> 
> Didn't I ask a few other things?
> 
> like not using "res30" because you've moved the fields around, and
> trying to be consistent about "zeros" vs "zeroes".
> 
> Removing "+#define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat)       ((dlfeat) &
> 0x10)" because it's unused.


All right, I forgot about it; that's one of the joys of duplicating a kernel
driver in userspace...


> 
> You also probably require review (or at least an ACK) from Keith Busch
> who maintains this file.
> 
> --js

All right,
	Best regards,
		Maxim Levitsky




^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2] block/nvme: add support for discard
  2019-09-05 17:27         ` John Snow
  2019-09-05 17:32           ` Maxim Levitsky
@ 2019-09-09  9:25           ` Max Reitz
  2019-09-09 17:03             ` John Snow
  1 sibling, 1 reply; 15+ messages in thread
From: Max Reitz @ 2019-09-09  9:25 UTC (permalink / raw)
  To: John Snow, Maxim Levitsky, qemu-devel
  Cc: Fam Zheng, Kevin Wolf, John Ferlan, qemu-block, Paolo Bonzini

[-- Attachment #1.1: Type: text/plain, Size: 265 bytes --]

On 05.09.19 19:27, John Snow wrote:

[...]

> You also probably require review (or at least an ACK) from Keith Busch
> who maintains this file.

Keith actually maintains the NVMe guest device; technically, Fam is the
NVMe block driver maintainer.

Max


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2] block/nvme: add support for discard
  2019-09-09  9:25           ` Max Reitz
@ 2019-09-09 17:03             ` John Snow
  2019-09-10 14:49               ` Paolo Bonzini
  0 siblings, 1 reply; 15+ messages in thread
From: John Snow @ 2019-09-09 17:03 UTC (permalink / raw)
  To: Max Reitz, Maxim Levitsky, qemu-devel
  Cc: Fam Zheng, Kevin Wolf, John Ferlan, qemu-block, Paolo Bonzini



On 9/9/19 5:25 AM, Max Reitz wrote:
> On 05.09.19 19:27, John Snow wrote:
> 
> [...]
> 
>> You also probably require review (or at least an ACK) from Keith Busch
>> who maintains this file.
> 
> Keith actually maintains the NVMe guest device; technically, Fam is the
> NVMe block driver maintainer.

W h o o p s. Thanks for correcting me.

Well, if it's Fam -- he seems a little busier lately -- it's probably
not so crucial to gate on his approval. I thought it'd be nice to at
least get an ACK from someone who has used this module before, because I
haven't -- I was just giving some style review to help push it along.

(On that note, if you felt like my style review was wrong or isn't worth
doing -- it is always perfectly fair to just say so, along with some
reason as to why you won't -- that way patches won't rot on the list
when people may have gotten the impression that a V2 is warranted.)

--js


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2] block/nvme: add support for discard
  2019-09-09 17:03             ` John Snow
@ 2019-09-10 14:49               ` Paolo Bonzini
  2019-09-10 14:57                 ` Maxim Levitsky
  0 siblings, 1 reply; 15+ messages in thread
From: Paolo Bonzini @ 2019-09-10 14:49 UTC (permalink / raw)
  To: John Snow, Max Reitz, Maxim Levitsky, qemu-devel
  Cc: Fam Zheng, Kevin Wolf, qemu-block, John Ferlan

On 09/09/19 19:03, John Snow wrote:
> 
> 
> On 9/9/19 5:25 AM, Max Reitz wrote:
>> On 05.09.19 19:27, John Snow wrote:
>>
>> [...]
>>
>>> You also probably require review (or at least an ACK) from Keith Busch
>>> who maintains this file.
>>
>> Keith actually maintains the NVMe guest device; technically, Fam is the
>> NVMe block driver maintainer.
> 
> W h o o p s. Thanks for correcting me.
> 
> Well, if it's Fam -- he seems a little busier lately -- it's probably
> not so crucial to gate on his approval. I thought it'd be nice to at
> least get an ACK from someone who has used this module before, because I
> haven't -- I was just giving some style review to help push it along.
> 
> (On that note, if you felt like my style review was wrong or isn't worth
> doing -- it is always perfectly fair to just say so, along with some
> reason as to why you won't -- that way patches won't rot on the list
> when people may have gotten the impression that a V2 is warranted.)

Looks good to me with the changes you pointed out (especially res30;
leaving out the unused macros is not so important).

Paolo



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2] block/nvme: add support for discard
  2019-09-10 14:49               ` Paolo Bonzini
@ 2019-09-10 14:57                 ` Maxim Levitsky
  0 siblings, 0 replies; 15+ messages in thread
From: Maxim Levitsky @ 2019-09-10 14:57 UTC (permalink / raw)
  To: Paolo Bonzini, John Snow, Max Reitz, qemu-devel
  Cc: Fam Zheng, Kevin Wolf, qemu-block, John Ferlan

On Tue, 2019-09-10 at 16:49 +0200, Paolo Bonzini wrote:
> On 09/09/19 19:03, John Snow wrote:
> > 
> > 
> > On 9/9/19 5:25 AM, Max Reitz wrote:
> > > On 05.09.19 19:27, John Snow wrote:
> > > 
> > > [...]
> > > 
> > > > You also probably require review (or at least an ACK) from Keith Busch
> > > > who maintains this file.
> > > 
> > > Keith actually maintains the NVMe guest device; technically, Fam is the
> > > NVMe block driver maintainer.
> > 
> > W h o o p s. Thanks for correcting me.
> > 
> > Well, if it's Fam -- he seems a little busier lately -- it's probably
> > not so crucial to gate on his approval. I thought it'd be nice to at
> > least get an ACK from someone who has used this module before, because I
> > haven't -- I was just giving some style review to help push it along.
> > 
> > (On that note, if you felt like my style review was wrong or isn't worth
> > doing -- it is always perfectly fair to just say so, along with some
> > reason as to why you won't -- that way patches won't rot on the list
> > when people may have gotten the impression that a V2 is warranted.)
Absolutely not, your review was fine! I was just a bit lazy about sending the next version of the patches
before getting some kind of indication of whether anything else is needed for this to be merged,
since the module doesn't currently have an active maintainer.


> 
> Looks good to me with the changes you pointed out (especially res30;
> leaving out the unused macros is not so important).

All right, I'll send an updated version of those two patches soon.

Best regards,
	Maxim Levitsky




^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Qemu-devel] [PATCH 1/2] block/nvme: add support for write zeros
  2019-08-27 22:22   ` John Snow
  2019-08-28  9:02     ` Maxim Levitsky
@ 2019-09-13 13:30     ` Maxim Levitsky
  1 sibling, 0 replies; 15+ messages in thread
From: Maxim Levitsky @ 2019-09-13 13:30 UTC (permalink / raw)
  To: John Snow, qemu-devel
  Cc: Fam Zheng, Kevin Wolf, Paolo Bonzini, qemu-block, Max Reitz

On Tue, 2019-08-27 at 18:22 -0400, John Snow wrote:
> Without a commit message, I have no real hope of reviewing this. I was
> CC'd, though, so I'll give it a blind shot.
> 
> We want to add write_zeroes support for block/nvme, but I can't really
> verify any of that is correct or working without a unit test, a spec, or
> some instructions to help me verify any of this does what it looks like
> it does.
> 
> On 8/25/19 3:15 AM, Maxim Levitsky wrote:
> > Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> > ---
> >  block/nvme.c         | 72 +++++++++++++++++++++++++++++++++++++++++++-
> >  block/trace-events   |  1 +
> >  include/block/nvme.h | 19 +++++++++++-
> >  3 files changed, 90 insertions(+), 2 deletions(-)
> > 
> > diff --git a/block/nvme.c b/block/nvme.c
> > index 5be3a39b63..f8bd11e19a 100644
> > --- a/block/nvme.c
> > +++ b/block/nvme.c
> > @@ -111,6 +111,8 @@ typedef struct {
> >      uint64_t max_transfer;
> >      bool plugged;
> >  
> > +    bool supports_write_zeros;
> > +
> 
> I suppose the spelling of "zeroes" is not as established as I thought it
> was. Actually, what's worse is that the NVME writers apparently couldn't
> decide what to name it themselves either:
> 
> "Write Zeroes" has 23 hits.
> "Write Zeros" has just two, in Figure 114 for the Identify NS Data.
> 
> Oh, in QEMU we're not much better:
> 
> jhuston@probe (review) ~/s/qemu> git grep -i zeros | wc -l
> 265
> jhuston@probe (review) ~/s/qemu> git grep -i zeroes | wc -l
> 747
> 
> I'm going to suggest that we use 'zeroes' as the spelling here, to match
> the existing 'pwrite_zeroes', and then otherwise to match the NVME
> spec's usual spelling.
OK.


> 
> >      CoMutex dma_map_lock;
> >      CoQueue dma_flush_queue;
> >  
> > @@ -421,6 +423,7 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
> >      NvmeIdNs *idns;
> >      NvmeLBAF *lbaf;
> >      uint8_t *resp;
> > +    uint16_t oncs;
> >      int r;
> >      uint64_t iova;
> >      NvmeCmd cmd = {
> > @@ -458,6 +461,9 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
> >      s->max_transfer = MIN_NON_ZERO(s->max_transfer,
> >                            s->page_size / sizeof(uint64_t) * s->page_size);
> >  
> > +    oncs = le16_to_cpu(idctrl->oncs);
> > +    s->supports_write_zeros = (oncs & NVME_ONCS_WRITE_ZEROS) != 0;
> > +
> 
> For other reviewers: oncs is "Optional NVM Command Support".
> 
> I think it's better to say `!!(oncs & NVME_ONCS_WRITE_ZEROES)` to remove
> doubt over the width of the bitmask.
OK.
> 
> >      memset(resp, 0, 4096);
> >  
> >      cmd.cdw10 = 0;
> > @@ -470,6 +476,12 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
> >      s->nsze = le64_to_cpu(idns->nsze);
> >      lbaf = &idns->lbaf[NVME_ID_NS_FLBAS_INDEX(idns->flbas)];
> >  
> > +    if (NVME_ID_NS_DLFEAT_WRITE_ZEROS(idns->dlfeat) &&
> > +            NVME_ID_NS_DLFEAT_READ_BEHAVIOR(idns->dlfeat) ==
> > +                    NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROS) {
> > +        bs->supported_write_flags |= BDRV_REQ_MAY_UNMAP;
> > +    }
> > +
> >      if (lbaf->ms) {
> >          error_setg(errp, "Namespaces with metadata are not yet supported");
> >          goto out;
> > @@ -764,6 +776,8 @@ static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags,
> >      int ret;
> >      BDRVNVMeState *s = bs->opaque;
> >  
> > +    bs->supported_write_flags = BDRV_REQ_FUA;
> > +
> 
> Is this a related change?
Yes, because the nvme write zeros command supports this flag (always),
so now that the driver exposes write zeros, it tells qemu that its
write zeros command supports the 'force unit access' flag.

> 
> >      opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
> >      qemu_opts_absorb_qdict(opts, options, &error_abort);
> >      device = qemu_opt_get(opts, NVME_BLOCK_OPT_DEVICE);
> > @@ -792,7 +806,6 @@ static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags,
> >              goto fail;
> >          }
> >      }
> > -    bs->supported_write_flags = BDRV_REQ_FUA;
> >      return 0;
> >  fail:
> >      nvme_close(bs);
> > @@ -1086,6 +1099,60 @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
> >  }
> >  
> >  
> > +static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
> > +                                              int64_t offset,
> > +                                              int bytes,
> > +                                              BdrvRequestFlags flags)
> > +{
> > +    BDRVNVMeState *s = bs->opaque;
> > +    NVMeQueuePair *ioq = s->queues[1];
> 
> I think it'd be slick to name the queues, but that's not related to this
> patch.
Yeah, but let's do it in another patch.

> 
> > +    NVMeRequest *req;
> > +
> > +    uint32_t cdw12 = ((bytes >> s->blkshift) - 1) & 0xFFFF;
> > +
> > +    if (!s->supports_write_zeros) {
> > +        return -ENOTSUP;
> > +    }
> > +
> > +    NvmeCmd cmd = {
> > +        .opcode = NVME_CMD_WRITE_ZEROS,
> > +        .nsid = cpu_to_le32(s->nsid),
> > +        .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF),
> > +        .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF),
> > +    };
> > +
> > +    NVMeCoData data = {
> > +        .ctx = bdrv_get_aio_context(bs),
> > +        .ret = -EINPROGRESS,
> > +    };
> > +
> > +    if (flags & BDRV_REQ_MAY_UNMAP) {
> > +        cdw12 |= (1 << 25);
> > +    }
> > +
> > +    if (flags & BDRV_REQ_FUA) {
> > +        cdw12 |= (1 << 30);
> > +    }
> > +
> > +    cmd.cdw12 = cpu_to_le32(cdw12);
> > +
> > +    trace_nvme_write_zeros(s, offset, bytes, flags);
> > +    assert(s->nr_queues > 1);
> > +    req = nvme_get_free_req(ioq);
> > +    assert(req);
> > +
> > +    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
> > +
> > +    data.co = qemu_coroutine_self();
> > +    while (data.ret == -EINPROGRESS) {
> > +        qemu_coroutine_yield();
> > +    }
> > +
> > +    trace_nvme_rw_done(s, true, offset, bytes, data.ret);
> > +    return data.ret;
> > +}
> > +
> > +
> >  static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
> >                                 BlockReopenQueue *queue, Error **errp)
> >  {
> > @@ -1190,6 +1257,9 @@ static BlockDriver bdrv_nvme = {
> >  
> >      .bdrv_co_preadv           = nvme_co_preadv,
> >      .bdrv_co_pwritev          = nvme_co_pwritev,
> > +
> > +    .bdrv_co_pwrite_zeroes    = nvme_co_pwrite_zeroes,
> > +
> >      .bdrv_co_flush_to_disk    = nvme_co_flush,
> >      .bdrv_reopen_prepare      = nvme_reopen_prepare,
> >  
> > diff --git a/block/trace-events b/block/trace-events
> > index 04209f058d..8209fbd0c7 100644
> > --- a/block/trace-events
> > +++ b/block/trace-events
> > @@ -149,6 +149,7 @@ nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6,
> >  nvme_handle_event(void *s) "s %p"
> >  nvme_poll_cb(void *s) "s %p"
> >  nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset %"PRId64" bytes %"PRId64" flags %d niov %d"
> > +nvme_write_zeros(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offset %"PRId64" bytes %"PRId64" flags %d"
> 
> I was told once that we wanted trace events to match the name of the
> function they occurred within whenever possible.
> 
> Maybe we haven't really been very good about enforcing that.
> 
> Oh well.
I'll keep it as is, to keep it consistent with the rest of the tracing events.


> 
> >  nvme_qiov_unaligned(const void *qiov, int n, void *base, size_t size, int align) "qiov %p n %d base %p size 0x%zx align 0x%x"
> >  nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset %"PRId64" bytes %"PRId64" niov %d is_write %d"
> >  nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset %"PRId64" bytes %"PRId64" ret %d"
> > diff --git a/include/block/nvme.h b/include/block/nvme.h
> > index 3ec8efcc43..1f5b406344 100644
> > --- a/include/block/nvme.h
> > +++ b/include/block/nvme.h
> > @@ -653,12 +653,29 @@ typedef struct NvmeIdNs {
> >      uint8_t     mc;
> >      uint8_t     dpc;
> >      uint8_t     dps;
> > -    uint8_t     res30[98];
> > +
> > +    uint8_t     nmic;
> > +    uint8_t     rescap;
> > +    uint8_t     fpi;
> > +    uint8_t     dlfeat;
> > +
> > +    uint8_t     res30[94];
> 
> I wouldn't call this "res30" anymore, because now it starts at the 34th
> byte instead of the 30th.
Fixed.

> 
> >      NvmeLBAF    lbaf[16];
> >      uint8_t     res192[192];
> >      uint8_t     vs[3712];
> >  } NvmeIdNs;
> >  
> 
> Pre-existing, but what are any of these names supposed to mean?
> 
> (I imagine they match the spec, but where?...)
LBAF - this is the LBA format, which basically tells you the hardware sector
size and the metadata size for each sector. On high-end drives it is possible
to choose one of these formats from this table when formatting a namespace.
vs = vendor specific.

Note that the whole struct (like the other structs here) is very outdated, since fields are
added all the time in newer spec revisions. This is the joy of maintaining a separate
driver, in addition to the kernel driver, which is more or less up to date.

> 
> Leaving me some breadcrumbs would greatly reduce the time it takes
> someone who doesn't already know NVME to review this, and I suspect
> you've looked them up recently, so leaving little notes in the cover
> letter at least for relevant sections is very nice for hardware spec
> patches like this.
> 
> > +
> > +/*Deallocate Logical Block Features*/
> 
> ah. "dlfeat" --> "Deallocate Logical (Block) Features".
> 
> From here:
> 
> NVME Express 1.3, Figure 114: Identify - Identify Namespace Data
> Structure, NVM Command Set Specific
> 
> > +#define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat)       ((dlfeat) & 0x10)
> 
> Not used in this patch?
Yes, but I prefer to at least document all the bits in the field, even
if they are not used.

> 
> > +#define NVME_ID_NS_DLFEAT_WRITE_ZEROS(dlfeat)     ((dlfeat) & 0x08)
> > +
> > +#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR(dlfeat)     ((dlfeat) & 0x7)
> > +#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR_UNDEFINED   0
> > +#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROS       1
> > +#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ONES        2
> > +
> > +
> >  #define NVME_ID_NS_NSFEAT_THIN(nsfeat)      ((nsfeat & 0x1))
> >  #define NVME_ID_NS_FLBAS_EXTENDED(flbas)    ((flbas >> 4) & 0x1)
> >  #define NVME_ID_NS_FLBAS_INDEX(flbas)       ((flbas & 0xf))
> > 
> 
> Seems good otherwise, but I didn't trace the actual execution of the new
> command too far -- I'll assume it works. :)

Best regards,
	Maxim Levitsky




^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, back to index

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-08-25  7:15 [Qemu-devel] [PATCH 0/2] block/nvme: add support for write zeros and discard Maxim Levitsky
2019-08-25  7:15 ` [Qemu-devel] [PATCH 1/2] block/nvme: add support for write zeros Maxim Levitsky
2019-08-27 22:22   ` John Snow
2019-08-28  9:02     ` Maxim Levitsky
2019-09-13 13:30     ` Maxim Levitsky
2019-08-25  7:15 ` [Qemu-devel] [PATCH 2/2] block/nvme: add support for discard Maxim Levitsky
2019-08-27 22:29   ` John Snow
2019-08-28  9:03     ` Maxim Levitsky
2019-09-05 13:24       ` Maxim Levitsky
2019-09-05 17:27         ` John Snow
2019-09-05 17:32           ` Maxim Levitsky
2019-09-09  9:25           ` Max Reitz
2019-09-09 17:03             ` John Snow
2019-09-10 14:49               ` Paolo Bonzini
2019-09-10 14:57                 ` Maxim Levitsky

QEMU-Devel Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/qemu-devel/0 qemu-devel/git/0.git
	git clone --mirror https://lore.kernel.org/qemu-devel/1 qemu-devel/git/1.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 qemu-devel qemu-devel/ https://lore.kernel.org/qemu-devel \
		qemu-devel@nongnu.org qemu-devel@archiver.kernel.org
	public-inbox-index qemu-devel

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.nongnu.qemu-devel


AGPL code for this site: git clone https://public-inbox.org/ public-inbox