All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH] nvme: Implement Write Zeroes
@ 2017-05-05  9:00 Christoph Hellwig
  2017-05-05  9:30 ` Paolo Bonzini
  0 siblings, 1 reply; 5+ messages in thread
From: Christoph Hellwig @ 2017-05-05  9:00 UTC (permalink / raw)
  To: keith.busch, kwolf, mreitz; +Cc: qemu-block, qemu-devel

Signed-off-by: Keith Busch <keith.busch@intel.com>
[hch: ported over from qemu-nvme.git to mainline]
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 hw/block/nvme.c | 26 ++++++++++++++++++++++++++
 hw/block/nvme.h |  1 +
 2 files changed, 27 insertions(+)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index ae303d44e5..3f4d2bf2ba 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -227,6 +227,29 @@ static uint16_t nvme_flush(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
     return NVME_NO_COMPLETE;
 }
 
+static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
+    NvmeRequest *req)
+{
+    NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
+    const uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
+    const uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
+    uint64_t slba = le64_to_cpu(rw->slba);
+    uint32_t nlb  = le16_to_cpu(rw->nlb) + 1;
+    uint64_t aio_slba = slba << (data_shift - BDRV_SECTOR_BITS);
+    uint32_t aio_nlb = nlb << (data_shift - BDRV_SECTOR_BITS);
+
+    if (slba + nlb > ns->id_ns.nsze) {
+        return NVME_LBA_RANGE | NVME_DNR;
+    }
+
+    req->has_sg = false;
+    block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
+                     BLOCK_ACCT_WRITE);
+    req->aiocb = blk_aio_pwrite_zeroes(n->conf.blk, aio_slba, aio_nlb, 0,
+                                        nvme_rw_cb, req);
+    return NVME_NO_COMPLETE;
+}
+
 static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
     NvmeRequest *req)
 {
@@ -279,6 +302,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     switch (cmd->opcode) {
     case NVME_CMD_FLUSH:
         return nvme_flush(n, ns, cmd, req);
+    case NVME_CMD_WRITE_ZEROS:
+        return nvme_write_zeros(n, ns, cmd, req);
     case NVME_CMD_WRITE:
     case NVME_CMD_READ:
         return nvme_rw(n, ns, cmd, req);
@@ -895,6 +920,7 @@ static int nvme_init(PCIDevice *pci_dev)
     id->sqes = (0x6 << 4) | 0x6;
     id->cqes = (0x4 << 4) | 0x4;
     id->nn = cpu_to_le32(n->num_namespaces);
+    id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROS);
     id->psd[0].mp = cpu_to_le16(0x9c4);
     id->psd[0].enlat = cpu_to_le32(0x10);
     id->psd[0].exlat = cpu_to_le32(0x4);
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 8fb0c10756..a0d15649f9 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -179,6 +179,7 @@ enum NvmeIoCommands {
     NVME_CMD_READ               = 0x02,
     NVME_CMD_WRITE_UNCOR        = 0x04,
     NVME_CMD_COMPARE            = 0x05,
+    NVME_CMD_WRITE_ZEROS        = 0x08,
     NVME_CMD_DSM                = 0x09,
 };
 
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [Qemu-devel] [PATCH] nvme: Implement Write Zeroes
  2017-05-05  9:00 [Qemu-devel] [PATCH] nvme: Implement Write Zeroes Christoph Hellwig
@ 2017-05-05  9:30 ` Paolo Bonzini
  2017-05-05  9:51   ` Christoph Hellwig
  0 siblings, 1 reply; 5+ messages in thread
From: Paolo Bonzini @ 2017-05-05  9:30 UTC (permalink / raw)
  To: Christoph Hellwig, keith.busch, kwolf, mreitz; +Cc: qemu-devel, qemu-block



On 05/05/2017 11:00, Christoph Hellwig wrote:
> Signed-off-by: Keith Busch <keith.busch@intel.com>
> [hch: ported over from qemu-nvme.git to mainline]
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  hw/block/nvme.c | 26 ++++++++++++++++++++++++++
>  hw/block/nvme.h |  1 +
>  2 files changed, 27 insertions(+)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index ae303d44e5..3f4d2bf2ba 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -227,6 +227,29 @@ static uint16_t nvme_flush(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
>      return NVME_NO_COMPLETE;
>  }
>  
> +static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
> +    NvmeRequest *req)
> +{
> +    NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
> +    const uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
> +    const uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
> +    uint64_t slba = le64_to_cpu(rw->slba);
> +    uint32_t nlb  = le16_to_cpu(rw->nlb) + 1;
> +    uint64_t aio_slba = slba << (data_shift - BDRV_SECTOR_BITS);
> +    uint32_t aio_nlb = nlb << (data_shift - BDRV_SECTOR_BITS);
> +
> +    if (slba + nlb > ns->id_ns.nsze) {
> +        return NVME_LBA_RANGE | NVME_DNR;
> +    }
> +
> +    req->has_sg = false;
> +    block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
> +                     BLOCK_ACCT_WRITE);
> +    req->aiocb = blk_aio_pwrite_zeroes(n->conf.blk, aio_slba, aio_nlb, 0,
> +                                        nvme_rw_cb, req);

Hi Christoph,

could you pass BDRV_REQ_MAY_UNMAP for the flags here if the deallocate
bit (dword 12 bit 25) is set?

Thanks,

Paolo

> +    return NVME_NO_COMPLETE;
> +}
> +
>  static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
>      NvmeRequest *req)
>  {
> @@ -279,6 +302,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
>      switch (cmd->opcode) {
>      case NVME_CMD_FLUSH:
>          return nvme_flush(n, ns, cmd, req);
> +    case NVME_CMD_WRITE_ZEROS:
> +        return nvme_write_zeros(n, ns, cmd, req);
>      case NVME_CMD_WRITE:
>      case NVME_CMD_READ:
>          return nvme_rw(n, ns, cmd, req);
> @@ -895,6 +920,7 @@ static int nvme_init(PCIDevice *pci_dev)
>      id->sqes = (0x6 << 4) | 0x6;
>      id->cqes = (0x4 << 4) | 0x4;
>      id->nn = cpu_to_le32(n->num_namespaces);
> +    id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROS);
>      id->psd[0].mp = cpu_to_le16(0x9c4);
>      id->psd[0].enlat = cpu_to_le32(0x10);
>      id->psd[0].exlat = cpu_to_le32(0x4);
> diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> index 8fb0c10756..a0d15649f9 100644
> --- a/hw/block/nvme.h
> +++ b/hw/block/nvme.h
> @@ -179,6 +179,7 @@ enum NvmeIoCommands {
>      NVME_CMD_READ               = 0x02,
>      NVME_CMD_WRITE_UNCOR        = 0x04,
>      NVME_CMD_COMPARE            = 0x05,
> +    NVME_CMD_WRITE_ZEROS        = 0x08,
>      NVME_CMD_DSM                = 0x09,
>  };
>  
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [Qemu-devel] [PATCH] nvme: Implement Write Zeroes
  2017-05-05  9:30 ` Paolo Bonzini
@ 2017-05-05  9:51   ` Christoph Hellwig
  2017-05-05 10:03     ` Paolo Bonzini
  0 siblings, 1 reply; 5+ messages in thread
From: Christoph Hellwig @ 2017-05-05  9:51 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Christoph Hellwig, keith.busch, kwolf, mreitz, qemu-devel, qemu-block

On Fri, May 05, 2017 at 11:30:11AM +0200, Paolo Bonzini wrote:
> could you pass BDRV_REQ_MAY_UNMAP for the flags here if the deallocate
> bit (dword 12 bit 25) is set?

In fact we should do that unconditionally.  The deallocate bit is new
in 1.3 (which we don't claim to support) and forces deallocating, but
NVMe already allows for this behavior without the flag (we in fact
explicitly clarified this in an ECN for 1.2.1).

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [Qemu-devel] [PATCH] nvme: Implement Write Zeroes
  2017-05-05  9:51   ` Christoph Hellwig
@ 2017-05-05 10:03     ` Paolo Bonzini
  2017-05-05 10:10       ` Christoph Hellwig
  0 siblings, 1 reply; 5+ messages in thread
From: Paolo Bonzini @ 2017-05-05 10:03 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: keith.busch, kwolf, mreitz, qemu-devel, qemu-block



On 05/05/2017 11:51, Christoph Hellwig wrote:
>> could you pass BDRV_REQ_MAY_UNMAP for the flags here if the deallocate
>> bit (dword 12 bit 25) is set?
> In fact we should do that unconditionally.  The deallocate bit is new
> in 1.3 (which we don't claim to support) and forces deallocating, but
> NVMe already allows for this behavior without the flag (we in fact
> explicitly clarified this in an ECN for 1.2.1).

While that's allowed and it makes sense indeed on SSDs, for QEMU's
typical usage it can lead to fragmentation and worse performance.  On
extent-based file systems, write zeroes without deallocate can be
implemented very efficiently with FALLOC_FL_ZERO_RANGE, and there's no
reason not to do it.

Is there anything backwards-incompatible in 1.3 that would make it hard
to just bump the supported version?

Paolo

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [Qemu-devel] [PATCH] nvme: Implement Write Zeroes
  2017-05-05 10:03     ` Paolo Bonzini
@ 2017-05-05 10:10       ` Christoph Hellwig
  0 siblings, 0 replies; 5+ messages in thread
From: Christoph Hellwig @ 2017-05-05 10:10 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Christoph Hellwig, keith.busch, kwolf, mreitz, qemu-devel, qemu-block

On Fri, May 05, 2017 at 12:03:40PM +0200, Paolo Bonzini wrote:
> While that's allowed and it makes sense indeed on SSDs, for QEMU's
> typical usage it can lead to fragmentation and worse performance.  On
> extent-based file systems, write zeroes without deallocate can be
> implemented very efficiently with FALLOC_FL_ZERO_RANGE, and there's no
> reason not to do it.

Ok..

> Is there anything backwards-incompatible in 1.3 that would make it hard
> to just bump the supported version?

1.2.1 requires a valid NQN in Identify Controller, and 1.3 requires
the new Identify CNS 03h for controller identification.  Otherwise
it's mostly fine tuning corner conditions for which I'd have to audit
the code.

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2017-05-05 10:10 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-05-05  9:00 [Qemu-devel] [PATCH] nvme: Implement Write Zeroes Christoph Hellwig
2017-05-05  9:30 ` Paolo Bonzini
2017-05-05  9:51   ` Christoph Hellwig
2017-05-05 10:03     ` Paolo Bonzini
2017-05-05 10:10       ` Christoph Hellwig

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.