* nvme completion handling refactor and fix
@ 2020-08-14 15:15 Christoph Hellwig
  2020-08-14 15:15 ` [PATCH 1/3] nvme: rename and document nvme_end_request Christoph Hellwig
                   ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Christoph Hellwig @ 2020-08-14 15:15 UTC (permalink / raw)
  To: linux-nvme; +Cc: Keith Busch, Sagi Grimberg, Chao Leng

Hi all,

the first two patches refactor the common nvme completion code
to be a little less obfuscated, and the third one is the fix from
Chao to redirect commands to a different path, ported on top of these
changes, which now make the fix very simple.


* [PATCH 1/3] nvme: rename and document nvme_end_request
  2020-08-14 15:15 nvme completion handling refactor and fix Christoph Hellwig
@ 2020-08-14 15:15 ` Christoph Hellwig
  2020-08-14 18:31   ` Sagi Grimberg
  2020-08-14 15:15 ` [PATCH 2/3] nvme: refactor command completion Christoph Hellwig
  2020-08-14 15:15 ` [PATCH 3/3] nvme: redirect commands on dying queue Christoph Hellwig
  2 siblings, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2020-08-14 15:15 UTC (permalink / raw)
  To: linux-nvme; +Cc: Keith Busch, Sagi Grimberg, Chao Leng

nvme_end_request is a bit misnamed, as it wraps around the
blk_mq_complete_* API.  Its semantics are also non-trivial, so give it
a more descriptive name and add a comment explaining the semantics.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
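For illustration, the resulting call pattern in the transport drivers
looks like this (a minimal sketch of the hunks below, using the PCIe
caller as the example):

	/* try to hand completion to blk-mq; fall back to a direct call */
	if (!nvme_try_complete_req(req, cqe->status, cqe->result))
		nvme_pci_complete_rq(req);
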
 Documentation/fault-injection/nvme-fault-injection.rst | 2 +-
 drivers/nvme/host/fc.c                                 | 2 +-
 drivers/nvme/host/nvme.h                               | 8 +++++++-
 drivers/nvme/host/pci.c                                | 2 +-
 drivers/nvme/host/rdma.c                               | 2 +-
 drivers/nvme/host/tcp.c                                | 4 ++--
 drivers/nvme/target/loop.c                             | 2 +-
 7 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/Documentation/fault-injection/nvme-fault-injection.rst b/Documentation/fault-injection/nvme-fault-injection.rst
index cdb2e829228e3e..1d4427890d7591 100644
--- a/Documentation/fault-injection/nvme-fault-injection.rst
+++ b/Documentation/fault-injection/nvme-fault-injection.rst
@@ -3,7 +3,7 @@ NVMe Fault Injection
 Linux's fault injection framework provides a systematic way to support
 error injection via debugfs in the /sys/kernel/debug directory. When
 enabled, the default NVME_SC_INVALID_OPCODE with no retry will be
-injected into the nvme_end_request. Users can change the default status
+injected into the nvme_try_complete_req. Users can change the default status
 code and no retry flag via the debugfs. The list of Generic Command
 Status can be found in include/linux/nvme.h
 
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index eae43bb444e038..ba4f10144274db 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2035,7 +2035,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
 	}
 
 	__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
-	if (!nvme_end_request(rq, status, result))
+	if (!nvme_try_complete_req(rq, status, result))
 		nvme_fc_complete_rq(rq);
 
 check_error:
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ebb8c3ed388554..6d69cc7306d034 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -523,7 +523,13 @@ static inline u32 nvme_bytes_to_numd(size_t len)
 	return (len >> 2) - 1;
 }
 
-static inline bool nvme_end_request(struct request *req, __le16 status,
+/*
+ * Fill in the status and result information from the CQE, and then figure
+ * out if we blk-mq will need to use IPI magic to complete the request, and
+ * if yes do so.  If not let the caller complete the request without an
+ * indirect function call.
+ */
+static inline bool nvme_try_complete_req(struct request *req, __le16 status,
 		union nvme_result result)
 {
 	struct nvme_request *rq = nvme_req(req);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index ba725ae47305ef..f96a69bbda70da 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -961,7 +961,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
 
 	req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id);
 	trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
-	if (!nvme_end_request(req, cqe->status, cqe->result))
+	if (!nvme_try_complete_req(req, cqe->status, cqe->result))
 		nvme_pci_complete_rq(req);
 }
 
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 44c76ffbb264cd..4c5660201009c5 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1189,7 +1189,7 @@ static void nvme_rdma_end_request(struct nvme_rdma_request *req)
 
 	if (!refcount_dec_and_test(&req->ref))
 		return;
-	if (!nvme_end_request(rq, req->status, req->result))
+	if (!nvme_try_complete_req(rq, req->status, req->result))
 		nvme_rdma_complete_rq(rq);
 }
 
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 62fbaecdc960c3..3be4749dbf1127 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -481,7 +481,7 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
 		return -EINVAL;
 	}
 
-	if (!nvme_end_request(rq, cqe->status, cqe->result))
+	if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
 		nvme_complete_rq(rq);
 	queue->nr_cqe++;
 
@@ -672,7 +672,7 @@ static inline void nvme_tcp_end_request(struct request *rq, u16 status)
 {
 	union nvme_result res = {};
 
-	if (!nvme_end_request(rq, cpu_to_le16(status << 1), res))
+	if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res))
 		nvme_complete_rq(rq);
 }
 
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 4884ef1e46a281..0d6008cf66a2ad 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -115,7 +115,7 @@ static void nvme_loop_queue_response(struct nvmet_req *req)
 			return;
 		}
 
-		if (!nvme_end_request(rq, cqe->status, cqe->result))
+		if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
 			nvme_loop_complete_rq(rq);
 	}
 }
-- 
2.28.0



* [PATCH 2/3] nvme: refactor command completion
  2020-08-14 15:15 nvme completion handling refactor and fix Christoph Hellwig
  2020-08-14 15:15 ` [PATCH 1/3] nvme: rename and document nvme_end_request Christoph Hellwig
@ 2020-08-14 15:15 ` Christoph Hellwig
  2020-08-14 18:37   ` Sagi Grimberg
  2020-08-17  3:30   ` Chao Leng
  2020-08-14 15:15 ` [PATCH 3/3] nvme: redirect commands on dying queue Christoph Hellwig
  2 siblings, 2 replies; 18+ messages in thread
From: Christoph Hellwig @ 2020-08-14 15:15 UTC (permalink / raw)
  To: linux-nvme; +Cc: Keith Busch, Sagi Grimberg, Chao Leng

Lift all the code to decide the disposition of a completed command
from nvme_complete_rq and nvme_failover_req into a new helper, which
returns an enum of the potential actions.  nvme_complete_rq then
just switches on those and calls the proper helper for the action.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
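For illustration, the dispositions the new helper picks, in order of
evaluation (a sketch of nvme_decide_disposition below):

	/*
	 * status == 0, noretry, DNR or retries exhausted -> COMPLETE
	 * path error on a REQ_NVME_MPATH request         -> FAILOVER
	 * queue dying                                    -> COMPLETE
	 * anything else                                  -> RETRY
	 */
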
 drivers/nvme/host/core.c      | 76 ++++++++++++++++++++++-------------
 drivers/nvme/host/multipath.c | 47 ++++++----------------
 drivers/nvme/host/nvme.h      | 31 ++++++++++++--
 3 files changed, 90 insertions(+), 64 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 88cff309d8e4f0..8d474adad721fb 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -241,17 +241,6 @@ static blk_status_t nvme_error_status(u16 status)
 	}
 }
 
-static inline bool nvme_req_needs_retry(struct request *req)
-{
-	if (blk_noretry_request(req))
-		return false;
-	if (nvme_req(req)->status & NVME_SC_DNR)
-		return false;
-	if (nvme_req(req)->retries >= nvme_max_retries)
-		return false;
-	return true;
-}
-
 static void nvme_retry_req(struct request *req)
 {
 	struct nvme_ns *ns = req->q->queuedata;
@@ -268,34 +257,67 @@ static void nvme_retry_req(struct request *req)
 	blk_mq_delay_kick_requeue_list(req->q, delay);
 }
 
-void nvme_complete_rq(struct request *req)
+enum nvme_disposition {
+	COMPLETE,
+	RETRY,
+	FAILOVER,
+};
+
+static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
 {
-	blk_status_t status = nvme_error_status(nvme_req(req)->status);
+	u16 status = nvme_req(req)->status & 0x7ff;
 
-	trace_nvme_complete_rq(req);
+	if (likely(status == 0))
+		return COMPLETE;
 
-	nvme_cleanup_cmd(req);
+	if (blk_noretry_request(req) || (status & NVME_SC_DNR) ||
+	    nvme_req(req)->retries >= nvme_max_retries)
+		return COMPLETE;
 
-	if (nvme_req(req)->ctrl->kas)
-		nvme_req(req)->ctrl->comp_seen = true;
+	if (req->cmd_flags & REQ_NVME_MPATH) {
+		if (nvme_is_path_error(status))
+			return FAILOVER;
+	}
 
-	if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
-		if ((req->cmd_flags & REQ_NVME_MPATH) && nvme_failover_req(req))
-			return;
+	if (blk_queue_dying(req->q))
+		return COMPLETE;
 
-		if (!blk_queue_dying(req->q)) {
-			nvme_retry_req(req);
-			return;
-		}
-	} else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
-		   req_op(req) == REQ_OP_ZONE_APPEND) {
+	return RETRY;
+}
+
+static inline void nvme_end_req(struct request *req)
+{
+	blk_status_t status = nvme_error_status(nvme_req(req)->status);
+
+	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+	    req_op(req) == REQ_OP_ZONE_APPEND)
 		req->__sector = nvme_lba_to_sect(req->q->queuedata,
 			le64_to_cpu(nvme_req(req)->result.u64));
-	}
 
 	nvme_trace_bio_complete(req, status);
 	blk_mq_end_request(req, status);
 }
+
+void nvme_complete_rq(struct request *req)
+{
+	trace_nvme_complete_rq(req);
+	nvme_cleanup_cmd(req);
+
+	if (nvme_req(req)->ctrl->kas)
+		nvme_req(req)->ctrl->comp_seen = true;
+
+	switch (nvme_decide_disposition(req)) {
+	case COMPLETE:
+		nvme_end_req(req);
+		return;
+	case RETRY:
+		nvme_retry_req(req);
+		return;
+	case FAILOVER:
+		nvme_failover_req(req);
+		return;
+	}
+}
 EXPORT_SYMBOL_GPL(nvme_complete_rq);
 
 bool nvme_cancel_request(struct request *req, void *data, bool reserved)
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 3ded54d2c9c6ad..abc5bcf7209506 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -65,51 +65,30 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 	}
 }
 
-bool nvme_failover_req(struct request *req)
+void nvme_failover_req(struct request *req)
 {
 	struct nvme_ns *ns = req->q->queuedata;
-	u16 status = nvme_req(req)->status;
+	u16 status = nvme_req(req)->status & 0x7ff;
 	unsigned long flags;
 
-	switch (status & 0x7ff) {
-	case NVME_SC_ANA_TRANSITION:
-	case NVME_SC_ANA_INACCESSIBLE:
-	case NVME_SC_ANA_PERSISTENT_LOSS:
-		/*
-		 * If we got back an ANA error we know the controller is alive,
-		 * but not ready to serve this namespaces.  The spec suggests
-		 * we should update our general state here, but due to the fact
-		 * that the admin and I/O queues are not serialized that is
-		 * fundamentally racy.  So instead just clear the current path,
-		 * mark the the path as pending and kick of a re-read of the ANA
-		 * log page ASAP.
-		 */
-		nvme_mpath_clear_current_path(ns);
-		if (ns->ctrl->ana_log_buf) {
-			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
-			queue_work(nvme_wq, &ns->ctrl->ana_work);
-		}
-		break;
-	case NVME_SC_HOST_PATH_ERROR:
-	case NVME_SC_HOST_ABORTED_CMD:
-		/*
-		 * Temporary transport disruption in talking to the controller.
-		 * Try to send on a new path.
-		 */
-		nvme_mpath_clear_current_path(ns);
-		break;
-	default:
-		/* This was a non-ANA error so follow the normal error path. */
-		return false;
+	nvme_mpath_clear_current_path(ns);
+
+	/*
+	 * If we got back an ANA error, we know the controller is alive but not
+	 * ready to serve this namespace.  Kick of a re-read of the ANA
+	 * information page, and just try any other available path for now.
+	 */
+	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
+		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
+		queue_work(nvme_wq, &ns->ctrl->ana_work);
 	}
 
 	spin_lock_irqsave(&ns->head->requeue_lock, flags);
 	blk_steal_bios(&ns->head->requeue_list, req);
 	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
-	blk_mq_end_request(req, 0);
 
+	blk_mq_end_request(req, 0);
 	kblockd_schedule_work(&ns->head->requeue_work);
-	return true;
 }
 
 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 6d69cc7306d034..dfcdeb318f3ab6 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -523,6 +523,32 @@ static inline u32 nvme_bytes_to_numd(size_t len)
 	return (len >> 2) - 1;
 }
 
+static inline bool nvme_is_ana_error(u16 status)
+{
+	switch (status & 0x7ff) {
+	case NVME_SC_ANA_TRANSITION:
+	case NVME_SC_ANA_INACCESSIBLE:
+	case NVME_SC_ANA_PERSISTENT_LOSS:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static inline bool nvme_is_path_error(u16 status)
+{
+	switch (status & 0x7ff) {
+	case NVME_SC_HOST_PATH_ERROR:
+	case NVME_SC_HOST_ABORTED_CMD:
+	case NVME_SC_ANA_TRANSITION:
+	case NVME_SC_ANA_INACCESSIBLE:
+	case NVME_SC_ANA_PERSISTENT_LOSS:
+		return true;
+	default:
+		return false;
+	}
+}
+
 /*
  * Fill in the status and result information from the CQE, and then figure
  * out if we blk-mq will need to use IPI magic to complete the request, and
@@ -635,7 +661,7 @@ void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys);
 void nvme_mpath_start_freeze(struct nvme_subsystem *subsys);
 void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 			struct nvme_ctrl *ctrl, int *flags);
-bool nvme_failover_req(struct request *req);
+void nvme_failover_req(struct request *req);
 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
 void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
@@ -694,9 +720,8 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 	sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
 }
 
-static inline bool nvme_failover_req(struct request *req)
+static inline void nvme_failover_req(struct request *req)
 {
-	return false;
 }
 static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
 {
-- 
2.28.0



* [PATCH 3/3] nvme: redirect commands on dying queue
  2020-08-14 15:15 nvme completion handling refactor and fix Christoph Hellwig
  2020-08-14 15:15 ` [PATCH 1/3] nvme: rename and document nvme_end_request Christoph Hellwig
  2020-08-14 15:15 ` [PATCH 2/3] nvme: refactor command completion Christoph Hellwig
@ 2020-08-14 15:15 ` Christoph Hellwig
  2020-08-14 18:44   ` Sagi Grimberg
  2020-08-17  3:41   ` Chao Leng
  2 siblings, 2 replies; 18+ messages in thread
From: Christoph Hellwig @ 2020-08-14 15:15 UTC (permalink / raw)
  To: linux-nvme; +Cc: Keith Busch, Sagi Grimberg, Chao Leng

From: Chao Leng <lengchao@huawei.com>

If a command sent through nvme-multipath failed on a dying queue, resend it
on another path.

Signed-off-by: Chao Leng <lengchao@huawei.com>
[hch: rebased on top of the completion refactoring]
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 8d474adad721fb..271cb9bf29dcd0 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -275,13 +275,13 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
 		return COMPLETE;
 
 	if (req->cmd_flags & REQ_NVME_MPATH) {
-		if (nvme_is_path_error(status))
+		if (nvme_is_path_error(status) || blk_queue_dying(req->q))
 			return FAILOVER;
+	} else {
+		if (blk_queue_dying(req->q))
+			return COMPLETE;
 	}
 
-	if (blk_queue_dying(req->q))
-		return COMPLETE;
-
 	return RETRY;
 }
 
-- 
2.28.0



* Re: [PATCH 1/3] nvme: rename and document nvme_end_request
  2020-08-14 15:15 ` [PATCH 1/3] nvme: rename and document nvme_end_request Christoph Hellwig
@ 2020-08-14 18:31   ` Sagi Grimberg
  0 siblings, 0 replies; 18+ messages in thread
From: Sagi Grimberg @ 2020-08-14 18:31 UTC (permalink / raw)
  To: Christoph Hellwig, linux-nvme; +Cc: Keith Busch, Chao Leng

Reviewed-by: Sagi Grimberg <sagi@grimberg.me>


* Re: [PATCH 2/3] nvme: refactor command completion
  2020-08-14 15:15 ` [PATCH 2/3] nvme: refactor command completion Christoph Hellwig
@ 2020-08-14 18:37   ` Sagi Grimberg
  2020-08-15  6:54     ` Christoph Hellwig
  2020-08-17  3:30   ` Chao Leng
  1 sibling, 1 reply; 18+ messages in thread
From: Sagi Grimberg @ 2020-08-14 18:37 UTC (permalink / raw)
  To: Christoph Hellwig, linux-nvme; +Cc: Keith Busch, Chao Leng



On 8/14/20 8:15 AM, Christoph Hellwig wrote:
> Lift all the code to decide the disposition of a completed command
> from nvme_complete_rq and nvme_failover_req into a new helper, which
> returns an enum of the potential actions.  nvme_complete_rq then
> just switches on those and calls the proper helper for the action.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>   drivers/nvme/host/core.c      | 76 ++++++++++++++++++++++-------------
>   drivers/nvme/host/multipath.c | 47 ++++++----------------
>   drivers/nvme/host/nvme.h      | 31 ++++++++++++--
>   3 files changed, 90 insertions(+), 64 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 88cff309d8e4f0..8d474adad721fb 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -241,17 +241,6 @@ static blk_status_t nvme_error_status(u16 status)
>   	}
>   }
>   
> -static inline bool nvme_req_needs_retry(struct request *req)
> -{
> -	if (blk_noretry_request(req))
> -		return false;
> -	if (nvme_req(req)->status & NVME_SC_DNR)
> -		return false;
> -	if (nvme_req(req)->retries >= nvme_max_retries)
> -		return false;
> -	return true;
> -}
> -
>   static void nvme_retry_req(struct request *req)
>   {
>   	struct nvme_ns *ns = req->q->queuedata;
> @@ -268,34 +257,67 @@ static void nvme_retry_req(struct request *req)
>   	blk_mq_delay_kick_requeue_list(req->q, delay);
>   }
>   
> -void nvme_complete_rq(struct request *req)
> +enum nvme_disposition {
> +	COMPLETE,
> +	RETRY,
> +	FAILOVER,
> +};
> +
> +static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
>   {
> -	blk_status_t status = nvme_error_status(nvme_req(req)->status);
> +	u16 status = nvme_req(req)->status & 0x7ff;
>   
> -	trace_nvme_complete_rq(req);
> +	if (likely(status == 0))
> +		return COMPLETE;
>   
> -	nvme_cleanup_cmd(req);
> +	if (blk_noretry_request(req) || (status & NVME_SC_DNR) ||
> +	    nvme_req(req)->retries >= nvme_max_retries)
> +		return COMPLETE;
>   
> -	if (nvme_req(req)->ctrl->kas)
> -		nvme_req(req)->ctrl->comp_seen = true;
> +	if (req->cmd_flags & REQ_NVME_MPATH) {
> +		if (nvme_is_path_error(status))
> +			return FAILOVER;
> +	}
>   
> -	if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
> -		if ((req->cmd_flags & REQ_NVME_MPATH) && nvme_failover_req(req))
> -			return;
> +	if (blk_queue_dying(req->q))
> +		return COMPLETE;
>   
> -		if (!blk_queue_dying(req->q)) {
> -			nvme_retry_req(req);
> -			return;
> -		}
> -	} else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
> -		   req_op(req) == REQ_OP_ZONE_APPEND) {
> +	return RETRY;
> +}
> +
> +static inline void nvme_end_req(struct request *req)
> +{
> +	blk_status_t status = nvme_error_status(nvme_req(req)->status);
> +
> +	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
> +	    req_op(req) == REQ_OP_ZONE_APPEND)
>   		req->__sector = nvme_lba_to_sect(req->q->queuedata,
>   			le64_to_cpu(nvme_req(req)->result.u64));
> -	}
>   
>   	nvme_trace_bio_complete(req, status);
>   	blk_mq_end_request(req, status);
>   }
> +
> +void nvme_complete_rq(struct request *req)
> +{
> +	trace_nvme_complete_rq(req);
> +	nvme_cleanup_cmd(req);
> +
> +	if (nvme_req(req)->ctrl->kas)
> +		nvme_req(req)->ctrl->comp_seen = true;
> +
> +	switch (nvme_decide_disposition(req)) {
> +	case COMPLETE:
> +		nvme_end_req(req);
> +		return;
> +	case RETRY:
> +		nvme_retry_req(req);
> +		return;
> +	case FAILOVER:
> +		nvme_failover_req(req);
> +		return;
> +	}
> +}
>   EXPORT_SYMBOL_GPL(nvme_complete_rq);
>   
>   bool nvme_cancel_request(struct request *req, void *data, bool reserved)
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index 3ded54d2c9c6ad..abc5bcf7209506 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -65,51 +65,30 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
>   	}
>   }
>   
> -bool nvme_failover_req(struct request *req)
> +void nvme_failover_req(struct request *req)
>   {
>   	struct nvme_ns *ns = req->q->queuedata;
> -	u16 status = nvme_req(req)->status;
> +	u16 status = nvme_req(req)->status & 0x7ff;
>   	unsigned long flags;
>   
> -	switch (status & 0x7ff) {
> -	case NVME_SC_ANA_TRANSITION:
> -	case NVME_SC_ANA_INACCESSIBLE:
> -	case NVME_SC_ANA_PERSISTENT_LOSS:
> -		/*
> -		 * If we got back an ANA error we know the controller is alive,
> -		 * but not ready to serve this namespaces.  The spec suggests
> -		 * we should update our general state here, but due to the fact
> -		 * that the admin and I/O queues are not serialized that is
> -		 * fundamentally racy.  So instead just clear the current path,
> -		 * mark the the path as pending and kick of a re-read of the ANA
> -		 * log page ASAP.
> -		 */
> -		nvme_mpath_clear_current_path(ns);
> -		if (ns->ctrl->ana_log_buf) {
> -			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
> -			queue_work(nvme_wq, &ns->ctrl->ana_work);
> -		}
> -		break;
> -	case NVME_SC_HOST_PATH_ERROR:
> -	case NVME_SC_HOST_ABORTED_CMD:
> -		/*
> -		 * Temporary transport disruption in talking to the controller.
> -		 * Try to send on a new path.
> -		 */
> -		nvme_mpath_clear_current_path(ns);
> -		break;
> -	default:
> -		/* This was a non-ANA error so follow the normal error path. */
> -		return false;
> +	nvme_mpath_clear_current_path(ns);
> +
> +	/*
> +	 * If we got back an ANA error, we know the controller is alive but not
> +	 * ready to serve this namespace.  Kick of a re-read of the ANA
> +	 * information page, and just try any other available path for now.
> +	 */
> +	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
> +		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
> +		queue_work(nvme_wq, &ns->ctrl->ana_work);
>   	}
>   
>   	spin_lock_irqsave(&ns->head->requeue_lock, flags);
>   	blk_steal_bios(&ns->head->requeue_list, req);
>   	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
> -	blk_mq_end_request(req, 0);
>   
> +	blk_mq_end_request(req, 0);
>   	kblockd_schedule_work(&ns->head->requeue_work);
> -	return true;
>   }
>   
>   void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 6d69cc7306d034..dfcdeb318f3ab6 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -523,6 +523,32 @@ static inline u32 nvme_bytes_to_numd(size_t len)
>   	return (len >> 2) - 1;
>   }
>   
> +static inline bool nvme_is_ana_error(u16 status)
> +{
> +	switch (status & 0x7ff) {
> +	case NVME_SC_ANA_TRANSITION:
> +	case NVME_SC_ANA_INACCESSIBLE:
> +	case NVME_SC_ANA_PERSISTENT_LOSS:
> +		return true;
> +	default:
> +		return false;
> +	}
> +}
> +
> +static inline bool nvme_is_path_error(u16 status)
> +{
> +	switch (status & 0x7ff) {
> +	case NVME_SC_HOST_PATH_ERROR:
> +	case NVME_SC_HOST_ABORTED_CMD:
> +	case NVME_SC_ANA_TRANSITION:
> +	case NVME_SC_ANA_INACCESSIBLE:
> +	case NVME_SC_ANA_PERSISTENT_LOSS:
> +		return true;
> +	default:
> +		return false;
> +	}
> +}

Would be better to reuse nvme_is_ana_error I think so we
don't need to maintain ana status codes in both places:

static inline bool nvme_is_path_error(u16 status)
{
	if (nvme_is_ana_error(status))
		return true;
	switch (status & 0x7ff) {
	case NVME_SC_HOST_PATH_ERROR:
	case NVME_SC_HOST_ABORTED_CMD:
		return true;
	default:
		return false;
	}
}

Otherwise looks good.


* Re: [PATCH 3/3] nvme: redirect commands on dying queue
  2020-08-14 15:15 ` [PATCH 3/3] nvme: redirect commands on dying queue Christoph Hellwig
@ 2020-08-14 18:44   ` Sagi Grimberg
  2020-08-15  6:55     ` Christoph Hellwig
  2020-08-17  3:41   ` Chao Leng
  1 sibling, 1 reply; 18+ messages in thread
From: Sagi Grimberg @ 2020-08-14 18:44 UTC (permalink / raw)
  To: Christoph Hellwig, linux-nvme; +Cc: Keith Busch, Chao Leng


> If a command sent through nvme-multipath failed on a dying queue, resend it
> on another path.

So this is a race where we got a retry-able status from the controller
(not from the host teardown sequence) and we just happen to see
a dying queue?

I guess that can happen..
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>


> 
> Signed-off-by: Chao Leng <lengchao@huawei.com>
> [hch: rebased on top of the completion refactoring]
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>   drivers/nvme/host/core.c | 8 ++++----
>   1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 8d474adad721fb..271cb9bf29dcd0 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -275,13 +275,13 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
>   		return COMPLETE;
>   
>   	if (req->cmd_flags & REQ_NVME_MPATH) {
> -		if (nvme_is_path_error(status))
> +		if (nvme_is_path_error(status) || blk_queue_dying(req->q))
>   			return FAILOVER;
> +	} else {
> +		if (blk_queue_dying(req->q))
> +			return COMPLETE;
>   	}
>   
> -	if (blk_queue_dying(req->q))
> -		return COMPLETE;
> -
>   	return RETRY;
>   }
>   
> 


* Re: [PATCH 2/3] nvme: refactor command completion
  2020-08-14 18:37   ` Sagi Grimberg
@ 2020-08-15  6:54     ` Christoph Hellwig
  2020-08-17  7:45       ` Sagi Grimberg
  0 siblings, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2020-08-15  6:54 UTC (permalink / raw)
  To: Sagi Grimberg; +Cc: Keith Busch, Christoph Hellwig, linux-nvme, Chao Leng

On Fri, Aug 14, 2020 at 11:37:14AM -0700, Sagi Grimberg wrote:

[fullquote deleted, sigh..]

> Would be better to reuse nvme_is_ana_error I think so we
> don't need to maintain ana status codes in both places:
>
> static inline bool nvme_is_path_error(u16 status)
> {
> 	if (nvme_is_ana_error(status))
> 		return true;
> 	switch (status & 0x7ff) {
> 	case NVME_SC_HOST_PATH_ERROR:
> 	case NVME_SC_HOST_ABORTED_CMD:
> 		return true;
> 	default:
> 		return false;
> 	}
> }

I initially had:

static inline bool nvme_is_path_error(u16 status)
{
	switch (status & 0x7ff) {
	case NVME_SC_HOST_PATH_ERROR:
	case NVME_SC_HOST_ABORTED_CMD:
		return true;
	default:
		return nvme_is_ana_error(status);
	}
}

but found that a little weird, but I can go back to it.


* Re: [PATCH 3/3] nvme: redirect commands on dying queue
  2020-08-14 18:44   ` Sagi Grimberg
@ 2020-08-15  6:55     ` Christoph Hellwig
  2020-08-17  3:54       ` Chao Leng
  0 siblings, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2020-08-15  6:55 UTC (permalink / raw)
  To: Sagi Grimberg; +Cc: Keith Busch, Christoph Hellwig, linux-nvme, Chao Leng

On Fri, Aug 14, 2020 at 11:44:12AM -0700, Sagi Grimberg wrote:
>
>> If a command sent through nvme-multipath failed on a dying queue, resend it
>> on another path.
>
> So this is a race where we got a retry-able status from the controller
> (not from the host teardown sequence) and we just happen to see
> a dying queue?

I think so, maybe Chao can explain the scenario in a little more detail.


* Re: [PATCH 2/3] nvme: refactor command completion
  2020-08-14 15:15 ` [PATCH 2/3] nvme: refactor command completion Christoph Hellwig
  2020-08-14 18:37   ` Sagi Grimberg
@ 2020-08-17  3:30   ` Chao Leng
  1 sibling, 0 replies; 18+ messages in thread
From: Chao Leng @ 2020-08-17  3:30 UTC (permalink / raw)
  To: Christoph Hellwig, linux-nvme; +Cc: Keith Busch, Sagi Grimberg



On 2020/8/14 23:15, Christoph Hellwig wrote:
> Lift all the code to decide the disposition of a completed command
> from nvme_complete_rq and nvme_failover_req into a new helper, which
> returns an enum of the potential actions.  nvme_complete_rq then
> just switches on those and calls the proper helper for the action.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>   drivers/nvme/host/core.c      | 76 ++++++++++++++++++++++-------------
>   drivers/nvme/host/multipath.c | 47 ++++++----------------
>   drivers/nvme/host/nvme.h      | 31 ++++++++++++--
>   3 files changed, 90 insertions(+), 64 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 88cff309d8e4f0..8d474adad721fb 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -241,17 +241,6 @@ static blk_status_t nvme_error_status(u16 status)
>   	}
>   }
>   
> -static inline bool nvme_req_needs_retry(struct request *req)
> -{
> -	if (blk_noretry_request(req))
> -		return false;
> -	if (nvme_req(req)->status & NVME_SC_DNR)
> -		return false;
> -	if (nvme_req(req)->retries >= nvme_max_retries)
> -		return false;
> -	return true;
> -}
> -
>   static void nvme_retry_req(struct request *req)
>   {
>   	struct nvme_ns *ns = req->q->queuedata;
> @@ -268,34 +257,67 @@ static void nvme_retry_req(struct request *req)
>   	blk_mq_delay_kick_requeue_list(req->q, delay);
>   }
>   
> -void nvme_complete_rq(struct request *req)
> +enum nvme_disposition {
> +	COMPLETE,
> +	RETRY,
> +	FAILOVER,
> +};
> +
> +static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
>   {
> -	blk_status_t status = nvme_error_status(nvme_req(req)->status);
> +	u16 status = nvme_req(req)->status & 0x7ff;
>   
> -	trace_nvme_complete_rq(req);
> +	if (likely(status == 0))
> +		return COMPLETE;
>   
> -	nvme_cleanup_cmd(req);
> +	if (blk_noretry_request(req) || (status & NVME_SC_DNR) ||
> +	    nvme_req(req)->retries >= nvme_max_retries)
> +		return COMPLETE;
>   
> -	if (nvme_req(req)->ctrl->kas)
> -		nvme_req(req)->ctrl->comp_seen = true;
> +	if (req->cmd_flags & REQ_NVME_MPATH) {
> +		if (nvme_is_path_error(status))
> +			return FAILOVER;
> +	}
>   
> -	if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
> -		if ((req->cmd_flags & REQ_NVME_MPATH) && nvme_failover_req(req))
> -			return;
> +	if (blk_queue_dying(req->q))
> +		return COMPLETE;
>   
> -		if (!blk_queue_dying(req->q)) {
> -			nvme_retry_req(req);
> -			return;
> -		}
> -	} else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
> -		   req_op(req) == REQ_OP_ZONE_APPEND) {
> +	return RETRY;
> +}
> +
> +static inline void nvme_end_req(struct request *req)
> +{
> +	blk_status_t status = nvme_error_status(nvme_req(req)->status);
> +
> +	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
> +	    req_op(req) == REQ_OP_ZONE_APPEND)
>   		req->__sector = nvme_lba_to_sect(req->q->queuedata,
>   			le64_to_cpu(nvme_req(req)->result.u64));
> -	}
>   
>   	nvme_trace_bio_complete(req, status);
>   	blk_mq_end_request(req, status);
>   }
> +
> +void nvme_complete_rq(struct request *req)
> +{
> +	trace_nvme_complete_rq(req);
> +	nvme_cleanup_cmd(req);
> +
> +	if (nvme_req(req)->ctrl->kas)
> +		nvme_req(req)->ctrl->comp_seen = true;
> +
> +	switch (nvme_decide_disposition(req)) {
> +	case COMPLETE:
> +		nvme_end_req(req);
> +		return;
> +	case RETRY:
> +		nvme_retry_req(req);
> +		return;
> +	case FAILOVER:
> +		nvme_failover_req(req);
> +		return;
> +	}
> +}
>   EXPORT_SYMBOL_GPL(nvme_complete_rq);
>   
>   bool nvme_cancel_request(struct request *req, void *data, bool reserved)
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index 3ded54d2c9c6ad..abc5bcf7209506 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -65,51 +65,30 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
>   	}
>   }
>   
> -bool nvme_failover_req(struct request *req)
> +void nvme_failover_req(struct request *req)
>   {
>   	struct nvme_ns *ns = req->q->queuedata;
> -	u16 status = nvme_req(req)->status;
> +	u16 status = nvme_req(req)->status & 0x7ff;
>   	unsigned long flags;
>   
> -	switch (status & 0x7ff) {
> -	case NVME_SC_ANA_TRANSITION:
> -	case NVME_SC_ANA_INACCESSIBLE:
> -	case NVME_SC_ANA_PERSISTENT_LOSS:
> -		/*
> -		 * If we got back an ANA error we know the controller is alive,
> -		 * but not ready to serve this namespaces.  The spec suggests
> -		 * we should update our general state here, but due to the fact
> -		 * that the admin and I/O queues are not serialized that is
> -		 * fundamentally racy.  So instead just clear the current path,
> -		 * mark the the path as pending and kick of a re-read of the ANA
> -		 * log page ASAP.
> -		 */
> -		nvme_mpath_clear_current_path(ns);
> -		if (ns->ctrl->ana_log_buf) {
> -			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
> -			queue_work(nvme_wq, &ns->ctrl->ana_work);
> -		}
> -		break;
> -	case NVME_SC_HOST_PATH_ERROR:
> -	case NVME_SC_HOST_ABORTED_CMD:
> -		/*
> -		 * Temporary transport disruption in talking to the controller.
> -		 * Try to send on a new path.
> -		 */
> -		nvme_mpath_clear_current_path(ns);
> -		break;
> -	default:
> -		/* This was a non-ANA error so follow the normal error path. */
> -		return false;
> +	nvme_mpath_clear_current_path(ns);
> +
> +	/*
> +	 * If we got back an ANA error, we know the controller is alive but not
> +	 * ready to serve this namespace.  Kick of a re-read of the ANA
> +	 * information page, and just try any other available path for now.
> +	 */
> +	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
> +		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
> +		queue_work(nvme_wq, &ns->ctrl->ana_work);
>   	}
>   
>   	spin_lock_irqsave(&ns->head->requeue_lock, flags);
>   	blk_steal_bios(&ns->head->requeue_list, req);
>   	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
> -	blk_mq_end_request(req, 0);
>   
> +	blk_mq_end_request(req, 0);
>   	kblockd_schedule_work(&ns->head->requeue_work);
> -	return true;
>   }
>   
>   void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 6d69cc7306d034..dfcdeb318f3ab6 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -523,6 +523,32 @@ static inline u32 nvme_bytes_to_numd(size_t len)
>   	return (len >> 2) - 1;
>   }
>   
> +static inline bool nvme_is_ana_error(u16 status)
> +{
> +	switch (status & 0x7ff) {
> +	case NVME_SC_ANA_TRANSITION:
> +	case NVME_SC_ANA_INACCESSIBLE:
> +	case NVME_SC_ANA_PERSISTENT_LOSS:
> +		return true;
> +	default:
> +		return false;
> +	}
> +}
> +
> +static inline bool nvme_is_path_error(u16 status)
> +{
> +	switch (status & 0x7ff) {
> +	case NVME_SC_HOST_PATH_ERROR:
> +	case NVME_SC_HOST_ABORTED_CMD:
> +	case NVME_SC_ANA_TRANSITION:
> +	case NVME_SC_ANA_INACCESSIBLE:
> +	case NVME_SC_ANA_PERSISTENT_LOSS:
Suggest directly using 0x3xx; the scalability may be better.
0x300 and 0x360 are already defined by NVMe 1.4 but are not yet used
in the current code. Furthermore, the NVMe protocol may define more
error status codes for path-related errors.
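
For example, something along these lines (just a sketch; checking the
status code type field instead of listing individual codes):

	static inline bool nvme_is_path_error(u16 status)
	{
		/* status code type 0x3 is "path related status" */
		return (status & 0x700) == 0x300;
	}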

> +		return true;
> +	default:
> +		return false;
> +	}
> +}
> +
>   /*
>    * Fill in the status and result information from the CQE, and then figure
>    * out if we blk-mq will need to use IPI magic to complete the request, and
> @@ -635,7 +661,7 @@ void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys);
>   void nvme_mpath_start_freeze(struct nvme_subsystem *subsys);
>   void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
>   			struct nvme_ctrl *ctrl, int *flags);
> -bool nvme_failover_req(struct request *req);
> +void nvme_failover_req(struct request *req);
>   void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
>   int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
>   void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
> @@ -694,9 +720,8 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
>   	sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
>   }
>   
> -static inline bool nvme_failover_req(struct request *req)
> +static inline void nvme_failover_req(struct request *req)
>   {
> -	return false;
>   }
>   static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
>   {
> 


* Re: [PATCH 3/3] nvme: redirect commands on dying queue
  2020-08-14 15:15 ` [PATCH 3/3] nvme: redirect commands on dying queue Christoph Hellwig
  2020-08-14 18:44   ` Sagi Grimberg
@ 2020-08-17  3:41   ` Chao Leng
  2020-08-17  5:49     ` Chao Leng
  1 sibling, 1 reply; 18+ messages in thread
From: Chao Leng @ 2020-08-17  3:41 UTC (permalink / raw)
  To: Christoph Hellwig, linux-nvme; +Cc: Keith Busch, Sagi Grimberg



On 2020/8/14 23:15, Christoph Hellwig wrote:
> From: Chao Leng <lengchao@huawei.com>
> 
> If a command sent through nvme-multipath failed on a dying queue, resend it
> on another path.
> 
> Signed-off-by: Chao Leng <lengchao@huawei.com>
> [hch: rebased on top of the completion refactoring]
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>   drivers/nvme/host/core.c | 8 ++++----
>   1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 8d474adad721fb..271cb9bf29dcd0 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -275,13 +275,13 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
>   		return COMPLETE;
>   
>   	if (req->cmd_flags & REQ_NVME_MPATH) {
> -		if (nvme_is_path_error(status))
> +		if (nvme_is_path_error(status) || blk_queue_dying(req->q))
>   			return FAILOVER;
> +	} else {
> +		if (blk_queue_dying(req->q))
> +			return COMPLETE;
>   	}
>   
> -	if (blk_queue_dying(req->q))
> -		return COMPLETE;
> -
>   	return RETRY;
>   } 
If it is a path-related error, a retry may fail again, so we should not
retry locally for path-related errors. Suggest doing it like this:
	if (nvme_is_path_error(status) || blk_queue_dying(req->q)) {
		if (req->cmd_flags & REQ_NVME_MPATH)
			return FAILOVER;
		else
			return COMPLETE;
	}
	return RETRY;


* Re: [PATCH 3/3] nvme: redirect commands on dying queue
  2020-08-15  6:55     ` Christoph Hellwig
@ 2020-08-17  3:54       ` Chao Leng
  2020-08-17  7:46         ` Sagi Grimberg
  0 siblings, 1 reply; 18+ messages in thread
From: Chao Leng @ 2020-08-17  3:54 UTC (permalink / raw)
  To: Christoph Hellwig, Sagi Grimberg; +Cc: Keith Busch, linux-nvme



On 2020/8/15 14:55, Christoph Hellwig wrote:
> On Fri, Aug 14, 2020 at 11:44:12AM -0700, Sagi Grimberg wrote:
>>
>>> If a command sent through nvme-multipath failed on a dying queue, resend it
>>> on another path.
>>
>> So this is a race where we got a retry-able status from the controller
>> (not from the host teardown sequence) and we just happen to see
>> a dying queue?
> 
> I think so, maybe Chao can explain the scenario in a little more detail.
> .
The scenario: an I/O has already returned with a non-path error (such as
NVME_SC_CMD_INTERRUPTED or NVME_SC_DATA_XFER_ERROR), but is still waiting
to be processed; at the same time, a controller delete happens. Deleting
the controller may set the QUEUE_FLAG_DYING queue flag when
nvme_remove_namespaces is called. Then, for example, if the fabric is
rdma, deleting the controller calls nvme_rdma_delete_ctrl, which drains
the qp first. Thus the I/O that returned with a non-path error can
neither be failed over to another path nor retried locally, and the I/O
will be interrupted.


* Re: [PATCH 3/3] nvme: redirect commands on dying queue
  2020-08-17  3:41   ` Chao Leng
@ 2020-08-17  5:49     ` Chao Leng
  2020-08-17  8:13       ` Christoph Hellwig
  0 siblings, 1 reply; 18+ messages in thread
From: Chao Leng @ 2020-08-17  5:49 UTC (permalink / raw)
  To: Christoph Hellwig, linux-nvme; +Cc: Keith Busch, Sagi Grimberg



On 2020/8/17 11:41, Chao Leng wrote:
> 
> 
> On 2020/8/14 23:15, Christoph Hellwig wrote:
>> From: Chao Leng <lengchao@huawei.com>
>>
>> If a command sent through nvme-multipath failed on a dying queue, resend it
>> on another path.
>>
>> Signed-off-by: Chao Leng <lengchao@huawei.com>
>> [hch: rebased on top of the completion refactoring]
>> Signed-off-by: Christoph Hellwig <hch@lst.de>
>> ---
>>   drivers/nvme/host/core.c | 8 ++++----
>>   1 file changed, 4 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>> index 8d474adad721fb..271cb9bf29dcd0 100644
>> --- a/drivers/nvme/host/core.c
>> +++ b/drivers/nvme/host/core.c
>> @@ -275,13 +275,13 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
>>           return COMPLETE;
>>       if (req->cmd_flags & REQ_NVME_MPATH) {
>> -        if (nvme_is_path_error(status))
>> +        if (nvme_is_path_error(status) || blk_queue_dying(req->q))
>>               return FAILOVER;
>> +    } else {
>> +        if (blk_queue_dying(req->q))
>> +            return COMPLETE;
>>       }
>> -    if (blk_queue_dying(req->q))
>> -        return COMPLETE;
>> -
>>       return RETRY;
>>   } 
> If it is a path-related error, a retry may fail again, so we should not
> retry locally for path-related errors. Suggest doing it like this:
>      if (nvme_is_path_error(status) || blk_queue_dying(req->q)) {
>          if (req->cmd_flags & REQ_NVME_MPATH)
>              return FAILOVER;
>          else
>              return COMPLETE;
Without any multipath, this will not retry. That may not be what we expect.
It would be great if we could support both NVMe multipathing and
third-party multipathing. In addition to DM-MULTIPATH, third-party
multipathing software is provided by storage vendors, such as
PowerPath (Dell EMC), SecurePath (HP), UltraPath (Huawei), etc.
>      }
>      return RETRY;
> 

* Re: [PATCH 2/3] nvme: refactor command completion
  2020-08-15  6:54     ` Christoph Hellwig
@ 2020-08-17  7:45       ` Sagi Grimberg
  2020-08-17  7:54         ` Christoph Hellwig
  0 siblings, 1 reply; 18+ messages in thread
From: Sagi Grimberg @ 2020-08-17  7:45 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Keith Busch, Chao Leng, linux-nvme


> [fullquote deleted, sigh..]
> 
>> Would be better to reuse nvme_is_ana_error I think so we
>> don't need to maintain ana status codes in both places:
>>
>> static inline bool nvme_is_path_error(u16 status)
>> {
>> 	if (nvme_is_ana_error(status))
>> 		return true;
>> 	switch (status & 0x7ff) {
>> 	case NVME_SC_HOST_PATH_ERROR:
>> 	case NVME_SC_HOST_ABORTED_CMD:
>> 		return true;
>> 	default:
>> 		return false;
>> 	}
>> }
> 
> I initially had:
> 
> static inline bool nvme_is_path_error(u16 status)
> {
> 	switch (status & 0x7ff) {
> 	case NVME_SC_HOST_PATH_ERROR:
> 	case NVME_SC_HOST_ABORTED_CMD:
> 		return true;
> 	default:
> 		return nvme_is_ana_error(status);
> 	}
> }
> 
> but found that a little weird, but I can go back to it.

Seems better to me.


* Re: [PATCH 3/3] nvme: redirect commands on dying queue
  2020-08-17  3:54       ` Chao Leng
@ 2020-08-17  7:46         ` Sagi Grimberg
  0 siblings, 0 replies; 18+ messages in thread
From: Sagi Grimberg @ 2020-08-17  7:46 UTC (permalink / raw)
  To: Chao Leng, Christoph Hellwig; +Cc: Keith Busch, linux-nvme


>>>> If a command sent through nvme-multipath failed on a dying queue,
>>>> resend it
>>>> on another path.
>>>
>>> So this is a race where we got a retry-able status from the controller
>>> (not from the host teardown sequence) and we just happen to see
>>> a dying queue?
>>
>> I think so, maybe Chao can explain the scenario in a little more detail.
>> .
> The scenario: an I/O has already returned with a non-path error (such as
> NVME_SC_CMD_INTERRUPTED or NVME_SC_DATA_XFER_ERROR), but is still waiting
> to be processed; at the same time, a controller delete happens. Deleting
> the controller may set the QUEUE_FLAG_DYING queue flag when
> nvme_remove_namespaces is called. Then, for example, if the fabric is
> rdma, deleting the controller calls nvme_rdma_delete_ctrl, which drains
> the qp first. Thus the I/O that returned with a non-path error can
> neither be failed over to another path nor retried locally, and the I/O
> will be interrupted.

OK


* Re: [PATCH 2/3] nvme: refactor command completion
  2020-08-17  7:45       ` Sagi Grimberg
@ 2020-08-17  7:54         ` Christoph Hellwig
  0 siblings, 0 replies; 18+ messages in thread
From: Christoph Hellwig @ 2020-08-17  7:54 UTC (permalink / raw)
  To: Sagi Grimberg; +Cc: Keith Busch, Christoph Hellwig, linux-nvme, Chao Leng

On Mon, Aug 17, 2020 at 12:45:10AM -0700, Sagi Grimberg wrote:
>> but found that a little weird, but I can go back to it.
>
> Seems better to me.

I think I'll just keep this patch as-is and add an extra one implementing
the suggestion to just switch on the path error status code type.


* Re: [PATCH 3/3] nvme: redirect commands on dying queue
  2020-08-17  5:49     ` Chao Leng
@ 2020-08-17  8:13       ` Christoph Hellwig
  2020-08-19  1:42         ` Chao Leng
  0 siblings, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2020-08-17  8:13 UTC (permalink / raw)
  To: Chao Leng; +Cc: Keith Busch, Christoph Hellwig, linux-nvme, Sagi Grimberg

On Mon, Aug 17, 2020 at 01:49:47PM +0800, Chao Leng wrote:
> Without any multipath, this will not retry.

Which is the right thing to do.

> That may not be what we expect.
> It would be great if we could support both NVMe multipathing and
> third-party multipathing. In addition to DM-MULTIPATH, third-party
> multipathing software is provided by storage vendors, such as
> PowerPath (Dell EMC), SecurePath (HP), UltraPath (Huawei), etc.

For dm-multipath we only guarantee we don't break old behavior.  ANA
was never supported without nvme-multipath so this isn't part of it.

Out of tree junk is completely unsupported per the kernel wide policy,
unrelated to the nvme driver itself.


* Re: [PATCH 3/3] nvme: redirect commands on dying queue
  2020-08-17  8:13       ` Christoph Hellwig
@ 2020-08-19  1:42         ` Chao Leng
  0 siblings, 0 replies; 18+ messages in thread
From: Chao Leng @ 2020-08-19  1:42 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Keith Busch, Sagi Grimberg, linux-nvme



On 2020/8/17 16:13, Christoph Hellwig wrote:
> On Mon, Aug 17, 2020 at 01:49:47PM +0800, Chao Leng wrote:
>> Without any multipath, this will not retry.
> 
> Which is the right thing to do.
Suggestion:
Do not use blk_noretry_request. The local retry mechanism, which is
defined by the NVMe protocol, conflicts with REQ_FAILFAST_TRANSPORT.
blk_noretry_request is not a good choice for NVMe; it is not a good
choice for SCSI either, which is why SCSI does not use this macro.
We could separate REQ_FAILFAST_TRANSPORT from the other FAILFAST flags:
for REQ_FAILFAST_TRANSPORT, do a local retry for non-path errors; for
the other FAILFAST flags, complete the request with an error code
immediately.
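
Roughly something like this in nvme_decide_disposition (an untested
sketch of the idea, reusing the existing block layer REQ_FAILFAST_*
flags):

	if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_FAILFAST_DRIVER))
		return COMPLETE;
	if ((req->cmd_flags & REQ_FAILFAST_TRANSPORT) &&
	    !(req->cmd_flags & REQ_NVME_MPATH) && nvme_is_path_error(status))
		return COMPLETE;
	/* otherwise fall through to the normal failover/retry handling */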
> 
>> This may not be what we expected.
>> If we can support both NVMe multipathing and third-party multipathing,
>> it would be great. In addition to DM-MULTIPATH, third-party multipathing
>> software is provided by storage vendors, such as PowerPath(DELL&EMC),
>> SecurePath(hp), Ultrapath(huawei), etc.
> 
> For dm-multipath we only guarantee we don't break old behavior.  ANA
No, we do not guarantee this now. For example, if the target returns
NVME_SC_NS_NOT_READY or NVME_SC_CMD_INTERRUPTED, we do not retry
locally and translate it to BLK_STS_TARGET; blk_path_error treats that
as "retrying the failover path will not help", which causes the I/O to
be interrupted. This is a serious problem.
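
The relevant part of nvme_error_status is roughly (abbreviated):

	case NVME_SC_CMD_INTERRUPTED:
	case NVME_SC_NS_NOT_READY:
		return BLK_STS_TARGET;
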
  /**
  * blk_path_error - returns true if error may be path related
  * @error: status the request was completed with
  *
  * Description:
  *     This classifies block error status into non-retryable errors and ones
  *     that may be successful if retried on a failover path.
  *
  * Return:
  *     %false - retrying failover path will not help
  *     %true  - may succeed if retried
  */
static inline bool blk_path_error(blk_status_t error)
{
	switch (error) {
	case BLK_STS_NOTSUPP:
	case BLK_STS_NOSPC:
	case BLK_STS_TARGET:
	case BLK_STS_NEXUS:
	case BLK_STS_MEDIUM:
	case BLK_STS_PROTECTION:
		return false;
	}

	/* Anything else could be a path failure, so should be retried */
	return true;
}
> was never supported without nvme-multipath so this isn't part of it.
> 
> Out of tree junk is completely unsupported per the kernel wide policy,
> unrelated to the nvme driver itself.
If we can support third-party multipathing software without extra cost,
and with no bad effect on NVMe, we will get more players, supporters,
and contributors, which will speed up NVMe over Fabrics development.
I will try to do this based on your patch.



Thread overview: 18+ messages
2020-08-14 15:15 nvme completion handling refactor and fix Christoph Hellwig
2020-08-14 15:15 ` [PATCH 1/3] nvme: rename and document nvme_end_request Christoph Hellwig
2020-08-14 18:31   ` Sagi Grimberg
2020-08-14 15:15 ` [PATCH 2/3] nvme: refactor command completion Christoph Hellwig
2020-08-14 18:37   ` Sagi Grimberg
2020-08-15  6:54     ` Christoph Hellwig
2020-08-17  7:45       ` Sagi Grimberg
2020-08-17  7:54         ` Christoph Hellwig
2020-08-17  3:30   ` Chao Leng
2020-08-14 15:15 ` [PATCH 3/3] nvme: redirect commands on dying queue Christoph Hellwig
2020-08-14 18:44   ` Sagi Grimberg
2020-08-15  6:55     ` Christoph Hellwig
2020-08-17  3:54       ` Chao Leng
2020-08-17  7:46         ` Sagi Grimberg
2020-08-17  3:41   ` Chao Leng
2020-08-17  5:49     ` Chao Leng
2020-08-17  8:13       ` Christoph Hellwig
2020-08-19  1:42         ` Chao Leng
