Re: [PATCH v5 2/2] hw/block/nvme: add the dataset management command - Philippe Mathieu-Daudé

From: "Philippe Mathieu-Daudé" <philmd@redhat.com>
To: Klaus Jensen <its@irrelevant.dk>, qemu-devel@nongnu.org
Cc: Kevin Wolf <kwolf@redhat.com>,
	Klaus Jensen <k.jensen@samsung.com>,
	Keith Busch <kbusch@kernel.org>,
	qemu-block@nongnu.org, Max Reitz <mreitz@redhat.com>
Subject: Re: [PATCH v5 2/2] hw/block/nvme: add the dataset management command
Date: Thu, 22 Oct 2020 23:02:04 +0200	[thread overview]
Message-ID: <05c41ab7-039f-f327-c6a0-5864430f5ba6@redhat.com> (raw)
In-Reply-To: <20201022184959.240505-3-its@irrelevant.dk>

Hi Klaus,

On 10/22/20 8:49 PM, Klaus Jensen wrote:
> From: Klaus Jensen <k.jensen@samsung.com>
> 
> Add support for the Dataset Management command and the Deallocate
> attribute. Deallocation results in discards being sent to the underlying
> block device. Whether of not the blocks are actually deallocated is
> affected by the same factors as Write Zeroes (see previous commit).
> 
>       format | discard | dsm (512b)  dsm (4kb)  dsm (64kb)

Please use B/KiB units which are unambiguous (kb is for kbits)
(if you queue this yourself, you can fix when applying, no need
to repost).

>      ------------------------------------------------------
>        qcow2    ignore   n           n          n
>        qcow2    unmap    n           n          y
>        raw      ignore   n           n          n
>        raw      unmap    n           y          y
> 
> Again, a raw format and 4kb LBAs are preferable.
> 
> In order to set the Namespace Preferred Deallocate Granularity and
> Alignment fields (NPDG and NPDA), choose a sane minimum discard
> granularity of 4kb. If we are using a passthru device supporting discard
> at a 512b granularity, user should set the discard_granularity property

Ditto.

> explicitly. NPDG and NPDA will also account for the cluster_size of the
> block driver if required (i.e. for QCOW2).
> 
> See NVM Express 1.3d, Section 6.7 ("Dataset Management command").
> 
> Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
> ---
>   hw/block/nvme.h      |   2 +
>   include/block/nvme.h |   7 ++-
>   hw/block/nvme-ns.c   |  36 +++++++++++++--
>   hw/block/nvme.c      | 101 ++++++++++++++++++++++++++++++++++++++++++-
>   4 files changed, 140 insertions(+), 6 deletions(-)
> 
> diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> index e080a2318a50..574333caa3f9 100644
> --- a/hw/block/nvme.h
> +++ b/hw/block/nvme.h
> @@ -28,6 +28,7 @@ typedef struct NvmeRequest {
>       struct NvmeNamespace    *ns;
>       BlockAIOCB              *aiocb;
>       uint16_t                status;
> +    void                    *opaque;
>       NvmeCqe                 cqe;
>       NvmeCmd                 cmd;
>       BlockAcctCookie         acct;
> @@ -60,6 +61,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc)
>       case NVME_CMD_WRITE:            return "NVME_NVM_CMD_WRITE";
>       case NVME_CMD_READ:             return "NVME_NVM_CMD_READ";
>       case NVME_CMD_WRITE_ZEROES:     return "NVME_NVM_CMD_WRITE_ZEROES";
> +    case NVME_CMD_DSM:              return "NVME_NVM_CMD_DSM";
>       default:                        return "NVME_NVM_CMD_UNKNOWN";
>       }
>   }
> diff --git a/include/block/nvme.h b/include/block/nvme.h
> index 966c3bb304bd..e95ff6ca9b37 100644
> --- a/include/block/nvme.h
> +++ b/include/block/nvme.h
> @@ -990,7 +990,12 @@ typedef struct QEMU_PACKED NvmeIdNs {
>       uint16_t    nabspf;
>       uint16_t    noiob;
>       uint8_t     nvmcap[16];
> -    uint8_t     rsvd64[40];
> +    uint16_t    npwg;
> +    uint16_t    npwa;
> +    uint16_t    npdg;
> +    uint16_t    npda;
> +    uint16_t    nows;
> +    uint8_t     rsvd74[30];
>       uint8_t     nguid[16];
>       uint64_t    eui64;
>       NvmeLBAF    lbaf[16];

If you consider "block/nvme.h" shared by 2 different subsystems,
it is better to add the changes in a separate patch. That way
the changes can be acked individually.

> diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
> index f1cc734c60f5..840651db7256 100644
> --- a/hw/block/nvme-ns.c
> +++ b/hw/block/nvme-ns.c
> @@ -28,10 +28,14 @@
>   #include "nvme.h"
>   #include "nvme-ns.h"
>   
> -static void nvme_ns_init(NvmeNamespace *ns)
> +#define MIN_DISCARD_GRANULARITY (4 * KiB)
> +
> +static int nvme_ns_init(NvmeNamespace *ns, Error **errp)

Hmm the Error* argument could be squashed in "hw/block/nvme:
support multiple namespaces". Else better split patch in dumb
units IMHO (maybe a reviewer's taste).

>   {
> +    BlockDriverInfo bdi;
>       NvmeIdNs *id_ns = &ns->id_ns;
>       int lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
> +    int npdg, ret;
>   
>       ns->id_ns.dlfeat = 0x9;
>   
> @@ -43,8 +47,25 @@ static void nvme_ns_init(NvmeNamespace *ns)
>       id_ns->ncap = id_ns->nsze;
>       id_ns->nuse = id_ns->ncap;
>   
> -    /* support DULBE */
> -    id_ns->nsfeat |= 0x4;
> +    /* support DULBE and I/O optimization fields */
> +    id_ns->nsfeat |= (0x4 | 0x10);

The comment helps, but isn't needed if you use explicit definitions
for these flags. You already introduced the NVME_ID_NS_NSFEAT_DULBE
and NVME_ID_NS_FLBAS_EXTENDED but they are restricted to extract bits.
This is why I personally prefer the registerfields API (see
"hw/registerfields.h").

> +
> +    npdg = ns->blkconf.discard_granularity / ns->blkconf.logical_block_size;
> +
> +    ret = bdrv_get_info(blk_bs(ns->blkconf.blk), &bdi);
> +    if (ret < 0) {
> +        error_setg_errno(errp, -ret, "could not get block driver info");
> +        return ret;
> +    }
> +
> +    if (bdi.cluster_size &&
> +        bdi.cluster_size > ns->blkconf.discard_granularity) {
> +        npdg = bdi.cluster_size / ns->blkconf.logical_block_size;
> +    }
> +
> +    id_ns->npda = id_ns->npdg = npdg - 1;
> +
> +    return 0;
>   }
>   
>   static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
> @@ -59,6 +80,11 @@ static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
>           return -1;
>       }
>   
> +    if (ns->blkconf.discard_granularity == -1) {
> +        ns->blkconf.discard_granularity =
> +            MAX(ns->blkconf.logical_block_size, MIN_DISCARD_GRANULARITY);
> +    }
> +
>       ns->size = blk_getlength(ns->blkconf.blk);
>       if (ns->size < 0) {
>           error_setg_errno(errp, -ns->size, "could not get blockdev size");
> @@ -92,7 +118,9 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
>           return -1;
>       }
>   
> -    nvme_ns_init(ns);
> +    if (nvme_ns_init(ns, errp)) {
> +        return -1;
> +    }
>   
>       if (nvme_register_namespace(n, ns, errp)) {
>           return -1;
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 4ab0705f5a92..7acb9e9dc38a 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -959,6 +959,103 @@ static void nvme_rw_cb(void *opaque, int ret)
>       nvme_enqueue_req_completion(nvme_cq(req), req);
>   }
>   
> +static void nvme_aio_discard_cb(void *opaque, int ret)
> +{
> +    NvmeRequest *req = opaque;
> +    int *discards = req->opaque;
> +
> +    trace_pci_nvme_aio_discard_cb(nvme_cid(req));
> +
> +    if (ret) {
> +        req->status = NVME_INTERNAL_DEV_ERROR;
> +        trace_pci_nvme_err_aio(nvme_cid(req), strerror(ret),
> +                               req->status);
> +    }
> +
> +    if (discards && --(*discards) > 0) {

This line is hard to read.

> +        return;
> +    }
> +
> +    g_free(req->opaque);
> +    req->opaque = NULL;
> +
> +    nvme_enqueue_req_completion(nvme_cq(req), req);
> +}
> +
> +static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
> +{
> +    NvmeNamespace *ns = req->ns;
> +    NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
> +    NvmeDsmRange *range = NULL;

g_autofree?

> +    int *discards = NULL;
> +
> +    uint32_t attr = le32_to_cpu(dsm->attributes);
> +    uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
> +
> +    uint16_t status = NVME_SUCCESS;
> +
> +    trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
> +
> +    if (attr & NVME_DSMGMT_AD) {
> +        int64_t offset;
> +        size_t len;
> +
> +        range = g_new(NvmeDsmRange, nr);
> +
> +        status = nvme_dma(n, (uint8_t *)range, nr * sizeof(NvmeDsmRange),
> +                          DMA_DIRECTION_TO_DEVICE, req);
> +        if (status) {
> +            goto out;
> +        }
> +
> +        discards = g_new(int, 1);
> +        *discards = 1;
> +        req->opaque = discards;

If opaque is a pointer, why not instead using it as an uintptr_t
discards directly without using the heap?

> +
> +        for (int i = 0; i < nr; i++) {
> +            uint64_t slba = le64_to_cpu(range[i].slba);
> +            uint32_t nlb = le32_to_cpu(range[i].nlb);
> +
> +            if (nvme_check_bounds(n, ns, slba, nlb)) {
> +                trace_pci_nvme_err_invalid_lba_range(slba, nlb,
> +                                                     ns->id_ns.nsze);
> +                continue;
> +            }
> +
> +            trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
> +                                          nlb);
> +
> +            offset = nvme_l2b(ns, slba);
> +            len = nvme_l2b(ns, nlb);
> +
> +            while (len) {
> +                size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
> +
> +                (*discards)++;
> +
> +                blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
> +                                 nvme_aio_discard_cb, req);
> +
> +                offset += bytes;
> +                len -= bytes;
> +            }
> +        }
> +
> +        if (--(*discards)) {
> +            status = NVME_NO_COMPLETE;
> +        } else {
> +            g_free(discards);
> +            req->opaque = NULL;
> +            status = req->status;
> +        }
> +    }
> +
> +out:
> +    g_free(range);
> +
> +    return status;
> +}
[...]