All of lore.kernel.org
 help / color / mirror / Atom feed
From: Beata Michalska <beata.michalska@linaro.org>
To: Klaus Jensen <k.jensen@samsung.com>
Cc: Kevin Wolf <kwolf@redhat.com>,
	qemu-block@nongnu.org, QEMU Developers <qemu-devel@nongnu.org>,
	Max Reitz <mreitz@redhat.com>, Keith Busch <kbusch@kernel.org>,
	Javier Gonzalez <javier.gonz@samsung.com>
Subject: Re: [PATCH v4 19/24] nvme: handle dma errors
Date: Thu, 9 Jan 2020 11:35:02 +0000	[thread overview]
Message-ID: <CADSWDzvedgUVQQx0Pc6gw1hZVtDpOFayXRMUny-7PHf+oFga3g@mail.gmail.com> (raw)
In-Reply-To: <20191219130921.309264-20-k.jensen@samsung.com>

Hi Klaus,

On Thu, 19 Dec 2019 at 13:09, Klaus Jensen <k.jensen@samsung.com> wrote:
>
> Handling DMA errors gracefully is required for the device to pass the
> block/011 test ("disable PCI device while doing I/O") in the blktests
> suite.
>
> With this patch the device passes the test by retrying "critical"
> transfers (posting of completion entries and processing of submission
> queue entries).
>
> If DMA errors occur at any other point in the execution of the command
> (say, while mapping the PRPs), the command is aborted with a Data
> Transfer Error status code.
>
> Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
> ---
>  hw/block/nvme.c       | 37 +++++++++++++++++++++++++++++--------
>  hw/block/trace-events |  2 ++
>  include/block/nvme.h  |  2 +-
>  3 files changed, 32 insertions(+), 9 deletions(-)
>
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 56659bbe263a..f6591285b504 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -71,14 +71,14 @@ static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
>      return addr >= low && addr < hi;
>  }
>
> -static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
> +static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
>  {
>      if (n->cmbsz && nvme_addr_is_cmb(n, addr)) {
>          memcpy(buf, (void *) &n->cmbuf[addr - n->ctrl_mem.addr], size);
> -        return;
> +        return 0;
>      }
>
> -    pci_dma_read(&n->parent_obj, addr, buf, size);
> +    return pci_dma_read(&n->parent_obj, addr, buf, size);
>  }
>
>  static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
> @@ -216,7 +216,11 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
>
>              nents = (len + n->page_size - 1) >> n->page_bits;
>              prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
> -            nvme_addr_read(n, prp2, (void *) prp_list, prp_trans);
> +            if (nvme_addr_read(n, prp2, (void *) prp_list, prp_trans)) {
> +                trace_nvme_dev_err_addr_read(prp2);
> +                status = NVME_DATA_TRANSFER_ERROR;
> +                goto unmap;
> +            }
>              while (len != 0) {
>                  uint64_t prp_ent = le64_to_cpu(prp_list[i]);
>
> @@ -235,7 +239,11 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
>                      i = 0;
>                      nents = (len + n->page_size - 1) >> n->page_bits;
>                      prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
> -                    nvme_addr_read(n, prp_ent, (void *) prp_list, prp_trans);
> +                    if (nvme_addr_read(n, prp_ent, (void *) prp_list, prp_trans)) {
> +                        trace_nvme_dev_err_addr_read(prp_ent);
> +                        status = NVME_DATA_TRANSFER_ERROR;
> +                        goto unmap;
> +                    }
>                      prp_ent = le64_to_cpu(prp_list[i]);
>                  }
>
> @@ -456,6 +464,7 @@ static void nvme_post_cqes(void *opaque)
>      NvmeCQueue *cq = opaque;
>      NvmeCtrl *n = cq->ctrl;
>      NvmeRequest *req, *next;
> +    int ret;
>
>      QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
>          NvmeSQueue *sq;
> @@ -471,9 +480,16 @@ static void nvme_post_cqes(void *opaque)
>          req->cqe.sq_id = cpu_to_le16(sq->sqid);
>          req->cqe.sq_head = cpu_to_le16(sq->head);
>          addr = cq->dma_addr + cq->tail * n->cqe_size;
> -        nvme_inc_cq_tail(cq);
> -        pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
> +        ret = pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
>              sizeof(req->cqe));
> +        if (ret) {
> +            trace_nvme_dev_err_addr_write(addr);
> +            QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
> +            timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
> +                100 * SCALE_MS);
> +            break;
> +        }
> +        nvme_inc_cq_tail(cq);
>          QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
>      }
>      if (cq->tail != cq->head) {
> @@ -1595,7 +1611,12 @@ static void nvme_process_sq(void *opaque)
>
>      while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
>          addr = sq->dma_addr + sq->head * n->sqe_size;
> -        nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd));
> +        if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
> +            trace_nvme_dev_err_addr_read(addr);
> +            timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
> +                100 * SCALE_MS);
> +            break;
> +        }

Is there a chance that we end up repeatedly triggering the read error here?
Each retry comes back to the same memory location, since sq->head is not
advanced on this path.


BR
Beata

>          nvme_inc_sq_head(sq);
>
>          req = QTAILQ_FIRST(&sq->req_list);
> diff --git a/hw/block/trace-events b/hw/block/trace-events
> index 90a57fb6099a..09bfb3782dd0 100644
> --- a/hw/block/trace-events
> +++ b/hw/block/trace-events
> @@ -83,6 +83,8 @@ nvme_dev_mmio_shutdown_cleared(void) "shutdown bit cleared"
>  nvme_dev_err_mdts(uint16_t cid, size_t mdts, size_t len) "cid %"PRIu16" mdts %"PRIu64" len %"PRIu64""
>  nvme_dev_err_prinfo(uint16_t cid, uint16_t ctrl) "cid %"PRIu16" ctrl %"PRIu16""
>  nvme_dev_err_aio(uint16_t cid, void *aio, const char *blkname, uint64_t offset, const char *opc, void *req, uint16_t status) "cid %"PRIu16" aio %p blk \"%s\" offset %"PRIu64" opc \"%s\" req %p status 0x%"PRIx16""
> +nvme_dev_err_addr_read(uint64_t addr) "addr 0x%"PRIx64""
> +nvme_dev_err_addr_write(uint64_t addr) "addr 0x%"PRIx64""
>  nvme_dev_err_invalid_dma(void) "PRP/SGL is too small for transfer size"
>  nvme_dev_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or not page aligned: 0x%"PRIx64""
>  nvme_dev_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64""
> diff --git a/include/block/nvme.h b/include/block/nvme.h
> index c1de92179596..a873776d98b8 100644
> --- a/include/block/nvme.h
> +++ b/include/block/nvme.h
> @@ -418,7 +418,7 @@ enum NvmeStatusCodes {
>      NVME_INVALID_OPCODE         = 0x0001,
>      NVME_INVALID_FIELD          = 0x0002,
>      NVME_CID_CONFLICT           = 0x0003,
> -    NVME_DATA_TRAS_ERROR        = 0x0004,
> +    NVME_DATA_TRANSFER_ERROR    = 0x0004,
>      NVME_POWER_LOSS_ABORT       = 0x0005,
>      NVME_INTERNAL_DEV_ERROR     = 0x0006,
>      NVME_CMD_ABORT_REQ          = 0x0007,
> --
> 2.24.1
>


  reply	other threads:[~2020-01-09 11:36 UTC|newest]

Thread overview: 37+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <CGME20191219130934eucas1p256e1c97e6f16a85c8da4a107d9047f53@eucas1p2.samsung.com>
2019-12-19 13:08 ` [PATCH v4 00/24] nvme: support NVMe v1.3d, SGLs and multiple namespaces Klaus Jensen
     [not found]   ` <CGME20191219130934eucas1p118e7071b8d35a48f9c8b660142737a50@eucas1p1.samsung.com>
2019-12-19 13:08     ` [PATCH v4 01/24] nvme: rename trace events to nvme_dev Klaus Jensen
     [not found]   ` <CGME20191219130935eucas1p2acf93b9e802b7b24818362cb2366950d@eucas1p2.samsung.com>
2019-12-19 13:08     ` [PATCH v4 02/24] nvme: remove superfluous breaks Klaus Jensen
     [not found]   ` <CGME20191219130936eucas1p20ba8d3d8df30caad0fea8f2a7001f031@eucas1p2.samsung.com>
2019-12-19 13:09     ` [PATCH v4 03/24] nvme: move device parameters to separate struct Klaus Jensen
     [not found]   ` <CGME20191219130936eucas1p22abefb644922c11e79b3ebe32d565e57@eucas1p2.samsung.com>
2019-12-19 13:09     ` [PATCH v4 04/24] nvme: add missing fields in the identify data structures Klaus Jensen
     [not found]   ` <CGME20191219130937eucas1p26be4db4b75dc87168bceff37f4fd5588@eucas1p2.samsung.com>
2019-12-19 13:09     ` [PATCH v4 05/24] nvme: populate the mandatory subnqn and ver fields Klaus Jensen
     [not found]   ` <CGME20191219130938eucas1p2b962d75ab2df429772bb172f94a01f3d@eucas1p2.samsung.com>
2019-12-19 13:09     ` [PATCH v4 06/24] nvme: refactor nvme_addr_read Klaus Jensen
     [not found]   ` <CGME20191219130939eucas1p2234cbe31c7a027e6ccf1b4ceafd19c30@eucas1p2.samsung.com>
2019-12-19 13:09     ` [PATCH v4 07/24] nvme: add support for the abort command Klaus Jensen
     [not found]   ` <CGME20191219130939eucas1p1faaa8a7ebfd3fc40ca8cd72dca1a6682@eucas1p1.samsung.com>
2019-12-19 13:09     ` [PATCH v4 08/24] nvme: refactor device realization Klaus Jensen
     [not found]   ` <CGME20191219130940eucas1p2ed4053766595fbef6b5ddee13565f22f@eucas1p2.samsung.com>
2019-12-19 13:09     ` [PATCH v4 09/24] nvme: add temperature threshold feature Klaus Jensen
     [not found]   ` <CGME20191219130941eucas1p13cd0170cd47303f6be82e254d536892e@eucas1p1.samsung.com>
2019-12-19 13:09     ` [PATCH v4 10/24] nvme: add support for the get log page command Klaus Jensen
     [not found]   ` <CGME20191219130941eucas1p1ac686ade1ad42395500069d8884f9179@eucas1p1.samsung.com>
2019-12-19 13:09     ` [PATCH v4 11/24] nvme: add support for the asynchronous event request command Klaus Jensen
     [not found]   ` <CGME20191219130942eucas1p1de5bbf57c675e3ea20cd0892e7e03c77@eucas1p1.samsung.com>
2019-12-19 13:09     ` [PATCH v4 12/24] nvme: add missing mandatory features Klaus Jensen
     [not found]   ` <CGME20191219130943eucas1p1dd4a7fd7ff94ee679832838b5d61f611@eucas1p1.samsung.com>
2019-12-19 13:09     ` [PATCH v4 13/24] nvme: additional tracing Klaus Jensen
     [not found]   ` <CGME20191219130944eucas1p2090d2b62171699cd0aa44df8e9f98727@eucas1p2.samsung.com>
2019-12-19 13:09     ` [PATCH v4 14/24] nvme: make sure ncqr and nsqr is valid Klaus Jensen
     [not found]   ` <CGME20191219130944eucas1p1c2c335692dc0a44467e33afa6aa5e558@eucas1p1.samsung.com>
2019-12-19 13:09     ` [PATCH v4 15/24] nvme: bump supported specification to 1.3 Klaus Jensen
     [not found]   ` <CGME20191219130945eucas1p15e2032f73d3f128a6488deabfe710589@eucas1p1.samsung.com>
2019-12-19 13:09     ` [PATCH v4 16/24] nvme: refactor prp mapping Klaus Jensen
     [not found]   ` <CGME20191219130945eucas1p1cfffc6af127586ee24746beccbe993fb@eucas1p1.samsung.com>
2019-12-19 13:09     ` [PATCH v4 17/24] nvme: allow multiple aios per command Klaus Jensen
2020-01-09 11:40       ` Beata Michalska
2020-01-13  9:23         ` Klaus Birkelund Jensen
     [not found]   ` <CGME20191219130946eucas1p1f58691ec4abdc3b1f42a082450758a72@eucas1p1.samsung.com>
2019-12-19 13:09     ` [PATCH v4 18/24] pci: pass along the return value of dma_memory_rw Klaus Jensen
     [not found]   ` <CGME20191219130947eucas1p21cf4cec3d0b63850ab0e35aa5fab62b8@eucas1p2.samsung.com>
2019-12-19 13:09     ` [PATCH v4 19/24] nvme: handle dma errors Klaus Jensen
2020-01-09 11:35       ` Beata Michalska [this message]
2020-01-13  9:16         ` Klaus Birkelund Jensen
     [not found]   ` <CGME20191219130948eucas1p27bdef1318a097df34396a2a18b318064@eucas1p2.samsung.com>
2019-12-19 13:09     ` [PATCH v4 20/24] nvme: add support for scatter gather lists Klaus Jensen
2020-01-09 11:44       ` Beata Michalska
2020-01-13  9:28         ` Klaus Birkelund Jensen
     [not found]   ` <CGME20191219130948eucas1p2cfed0da65071fd76446daecbb217e86e@eucas1p2.samsung.com>
2019-12-19 13:09     ` [PATCH v4 21/24] nvme: support multiple namespaces Klaus Jensen
2019-12-19 15:11       ` Michal Prívozník
2019-12-19 17:26         ` Klaus Birkelund Jensen
     [not found]   ` <CGME20191219130949eucas1p172304dc579f8bda43a8febd234064799@eucas1p1.samsung.com>
2019-12-19 13:09     ` [PATCH v4 22/24] nvme: bump controller pci device id Klaus Jensen
2019-12-19 16:16       ` Keith Busch
2019-12-19 17:24         ` Klaus Birkelund Jensen
2019-12-19 17:46           ` Keith Busch
2019-12-19 18:03             ` Klaus Birkelund Jensen
     [not found]   ` <CGME20191219130950eucas1p2333a6d25cb3539362b4e8dc77ebde1d6@eucas1p2.samsung.com>
2019-12-19 13:09     ` [PATCH v4 23/24] nvme: remove redundant NvmeCmd pointer parameter Klaus Jensen
     [not found]   ` <CGME20191219130950eucas1p14932d144e3e12fb98a6ec685cf616701@eucas1p1.samsung.com>
2019-12-19 13:09     ` [PATCH v4 24/24] nvme: make lba data size configurable Klaus Jensen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=CADSWDzvedgUVQQx0Pc6gw1hZVtDpOFayXRMUny-7PHf+oFga3g@mail.gmail.com \
    --to=beata.michalska@linaro.org \
    --cc=javier.gonz@samsung.com \
    --cc=k.jensen@samsung.com \
    --cc=kbusch@kernel.org \
    --cc=kwolf@redhat.com \
    --cc=mreitz@redhat.com \
    --cc=qemu-block@nongnu.org \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.