All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3] nvme-fabrics: reject I/O to offline device
@ 2020-02-04 15:49 Victor Gladkov
  2020-02-19 15:28 ` Christoph Hellwig
  0 siblings, 1 reply; 6+ messages in thread
From: Victor Gladkov @ 2020-02-04 15:49 UTC (permalink / raw)
  To: linux-nvme
  Cc: Christoph Hellwig, James Smart, Hannes Reinecke, Mike Snitzer,
	Sagi Grimberg

Commands get stuck while Host NVMe controller (TCP or RDMA) is in reconnect state. NVMe controller enters into reconnect state when it loses connection with the target. It tries to reconnect every 10 seconds (default) until successful reconnection or until reconnect time-out is reached. The default reconnect time out is 10 minutes.

1. Add a new session parameter called "fast_fail_tmo". The timeout is measured in seconds from the controller reconnect; any command beyond that timeout is rejected.
The new parameter value may be passed during 'connect'. Default value of 0 means no timeout (similar to current behavior).
2. Add a controller flag of "failfast_expired".
3. Add a dedicated delayed_work that updates the "failfast_expired" controller flag.
4. When entering CONNECTING, schedule the delayed_work based on failfast timeout value. If transition out of CONNECTING, terminate delayed work item and ensure failfast_expired is false. If delayed work item expires: set "failfast_expired" flag to true.
5. Update the nvmf_fail_nonready_command() and nvme_available_path() functions to check the "failfast_expired" controller flag.

Signed-off-by: Victor Gladkov <victor.gladkov@kioxia.com>
---
 drivers/nvme/host/core.c      | 40 ++++++++++++++++++++++++++++++++++++++--
 drivers/nvme/host/fabrics.c   | 23 ++++++++++++++++++++---
 drivers/nvme/host/fabrics.h   |  5 +++++
 drivers/nvme/host/multipath.c |  4 +++-
 drivers/nvme/host/nvme.h      |  3 +++
 5 files changed, 69 insertions(+), 6 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 6ec0350..c00efb2 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -321,6 +321,37 @@ bool nvme_cancel_request(struct request *req, void *data, bool reserved)
 }
 EXPORT_SYMBOL_GPL(nvme_cancel_request);

+static void nvme_failfast_work(struct work_struct *work)
+{
+	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
+			struct nvme_ctrl, failfast_work);
+
+	spin_lock_irq(&ctrl->lock);
+	if (ctrl->state == NVME_CTRL_CONNECTING) {
+		set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
+		dev_info(ctrl->device, "failfast expired set for controller %s\n", ctrl->opts->subsysnqn);
+		nvme_kick_requeue_lists(ctrl);
+	}
+	spin_unlock_irq(&ctrl->lock);
+}
+
+static void nvme_start_failfast_work(struct nvme_ctrl *ctrl)
+{
+	if (unlikely(ctrl->opts->fail_fast_tmo == 0))
+		return;
+
+	schedule_delayed_work(&ctrl->failfast_work, ctrl->opts->fail_fast_tmo * HZ);
+}
+
+static void nvme_stop_failfast_work(struct nvme_ctrl *ctrl)
+{
+	if (unlikely(ctrl->opts->fail_fast_tmo == 0))
+		return;
+
+	cancel_delayed_work_sync(&ctrl->failfast_work);
+	clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
+}
+
 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 		enum nvme_ctrl_state new_state)
 {
@@ -334,9 +365,10 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 	switch (new_state) {
 	case NVME_CTRL_LIVE:
 		switch (old_state) {
+		case NVME_CTRL_CONNECTING:
+			nvme_stop_failfast_work(ctrl);
 		case NVME_CTRL_NEW:
 		case NVME_CTRL_RESETTING:
-		case NVME_CTRL_CONNECTING:
 			changed = true;
 			/* FALLTHRU */
 		default:
@@ -355,8 +387,9 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 		break;
 	case NVME_CTRL_CONNECTING:
 		switch (old_state) {
-		case NVME_CTRL_NEW:
 		case NVME_CTRL_RESETTING:
+			nvme_start_failfast_work(ctrl);
+		case NVME_CTRL_NEW:
 			changed = true;
 			/* FALLTHRU */
 		default:
@@ -3960,6 +3993,7 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
 {
 	nvme_mpath_stop(ctrl);
 	nvme_stop_keep_alive(ctrl);
+	nvme_stop_failfast_work(ctrl);
 	flush_work(&ctrl->async_event_work);
 	cancel_work_sync(&ctrl->fw_act_work);
 }
@@ -4024,6 +4058,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	int ret;

 	ctrl->state = NVME_CTRL_NEW;
+	clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
 	spin_lock_init(&ctrl->lock);
 	mutex_init(&ctrl->scan_lock);
 	INIT_LIST_HEAD(&ctrl->namespaces);
@@ -4038,6 +4073,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	init_waitqueue_head(&ctrl->state_wq);

 	INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
+	INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
 	memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
 	ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;

diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 74b8818..d5fa25e 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -549,6 +549,7 @@ blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl,
 {
 	if (ctrl->state != NVME_CTRL_DELETING &&
 	    ctrl->state != NVME_CTRL_DEAD &&
+	    !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
 	    !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
 		return BLK_STS_RESOURCE;

@@ -612,6 +613,7 @@ bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
 	{ NVMF_OPT_NR_WRITE_QUEUES,	"nr_write_queues=%d"	},
 	{ NVMF_OPT_NR_POLL_QUEUES,	"nr_poll_queues=%d"	},
 	{ NVMF_OPT_TOS,			"tos=%d"		},
+	{ NVMF_OPT_FAIL_FAST_TMO,	"fail_fast_tmo=%d"	},
 	{ NVMF_OPT_ERR,			NULL			}
 };

@@ -630,6 +632,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 	opts->nr_io_queues = num_online_cpus();
 	opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY;
 	opts->kato = NVME_DEFAULT_KATO;
+	opts->fail_fast_tmo = NVMF_DEF_FAIL_FAST_TMO;
 	opts->duplicate_connect = false;
 	opts->hdr_digest = false;
 	opts->data_digest = false;
@@ -751,6 +754,17 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 				pr_warn("ctrl_loss_tmo < 0 will reconnect forever\n");
 			ctrl_loss_tmo = token;
 			break;
+		case NVMF_OPT_FAIL_FAST_TMO:
+			if (match_int(args, &token)) {
+				ret = -EINVAL;
+				goto out;
+			}
+
+			if (token)
+				pr_warn("fail_fast_tmo != 0, I/O will failed on reconnect controller after %d sec\n", token);
+
+			opts->fail_fast_tmo  = token;
+			break;
 		case NVMF_OPT_HOSTNQN:
 			if (opts->host) {
 				pr_err("hostnqn already user-assigned: %s\n",
@@ -881,11 +895,14 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 		opts->nr_poll_queues = 0;
 		opts->duplicate_connect = true;
 	}
-	if (ctrl_loss_tmo < 0)
+	if (ctrl_loss_tmo < 0) {
 		opts->max_reconnects = -1;
-	else
+	} else {
 		opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
 						opts->reconnect_delay);
+		if (ctrl_loss_tmo < opts->fail_fast_tmo)
+			pr_warn("failfast tmo (%d) larger than controller loss tmo (%d)\n", opts->fail_fast_tmo, ctrl_loss_tmo);
+	}

 	if (!opts->host) {
 		kref_get(&nvmf_default_host->ref);
@@ -985,7 +1002,7 @@ void nvmf_free_options(struct nvmf_ctrl_options *opts)
 #define NVMF_ALLOWED_OPTS	(NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \
 				 NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \
 				 NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\
-				 NVMF_OPT_DISABLE_SQFLOW)
+				 NVMF_OPT_DISABLE_SQFLOW | NVMF_OPT_FAIL_FAST_TMO)

 static struct nvme_ctrl *
 nvmf_create_ctrl(struct device *dev, const char *buf)
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index a0ec40a..7ca62d4 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -15,6 +15,8 @@
 #define NVMF_DEF_RECONNECT_DELAY	10
 /* default to 600 seconds of reconnect attempts before giving up */
 #define NVMF_DEF_CTRL_LOSS_TMO		600
+/* default is 0: the fail fast mechanism is disable  */
+#define NVMF_DEF_FAIL_FAST_TMO		0

 /*
  * Define a host as seen by the target.  We allocate one at boot, but also
@@ -56,6 +58,7 @@ enum {
 	NVMF_OPT_NR_WRITE_QUEUES = 1 << 17,
 	NVMF_OPT_NR_POLL_QUEUES = 1 << 18,
 	NVMF_OPT_TOS		= 1 << 19,
+	NVMF_OPT_FAIL_FAST_TMO	= 1 << 20,
 };

 /**
@@ -89,6 +92,7 @@ enum {
  * @nr_write_queues: number of queues for write I/O
  * @nr_poll_queues: number of queues for polling I/O
  * @tos: type of service
+ * @fast_fail_tmo: fast I/O fail timeout in seconds;
  */
 struct nvmf_ctrl_options {
 	unsigned		mask;
@@ -111,6 +115,7 @@ struct nvmf_ctrl_options {
 	unsigned int		nr_write_queues;
 	unsigned int		nr_poll_queues;
 	int			tos;
+	unsigned int	fail_fast_tmo;
 };

 /*
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 797c183..4edcaf1 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -281,9 +281,11 @@ static bool nvme_available_path(struct nvme_ns_head *head)

 	list_for_each_entry_rcu(ns, &head->list, siblings) {
 		switch (ns->ctrl->state) {
+		case NVME_CTRL_CONNECTING:
+		    if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
+				break;
 		case NVME_CTRL_LIVE:
 		case NVME_CTRL_RESETTING:
-		case NVME_CTRL_CONNECTING:
 			/* fallthru */
 			return true;
 		default:
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 1024fec..b6a199e 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -256,6 +256,7 @@ struct nvme_ctrl {
 	struct work_struct scan_work;
 	struct work_struct async_event_work;
 	struct delayed_work ka_work;
+	struct delayed_work failfast_work;
 	struct nvme_command ka_cmd;
 	struct work_struct fw_act_work;
 	unsigned long events;
@@ -289,6 +290,8 @@ struct nvme_ctrl {
 	u16 icdoff;
 	u16 maxcmd;
 	int nr_reconnects;
+	unsigned long flags;
+#define NVME_CTRL_FAILFAST_EXPIRED	0
 	struct nvmf_ctrl_options *opts;

 	struct page *discard_page;
--
1.8.3.1


_______________________________________________
linux-nvme mailing list
linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v3] nvme-fabrics: reject I/O to offline device
  2020-02-04 15:49 [PATCH v3] nvme-fabrics: reject I/O to offline device Victor Gladkov
@ 2020-02-19 15:28 ` Christoph Hellwig
  2020-02-20  8:34   ` Sagi Grimberg
  0 siblings, 1 reply; 6+ messages in thread
From: Christoph Hellwig @ 2020-02-19 15:28 UTC (permalink / raw)
  To: Victor Gladkov
  Cc: Sagi Grimberg, Mike Snitzer, James Smart, linux-nvme,
	Christoph Hellwig, Hannes Reinecke

On Tue, Feb 04, 2020 at 03:49:48PM +0000, Victor Gladkov wrote:
> Commands get stuck while Host NVMe controller (TCP or RDMA) is in reconnect state. NVMe controller enters into reconnect state when it loses connection with the target. It tries to reconnect every 10 seconds (default) until successful reconnection or until reconnect time-out is reached. The default reconnect time out is 10 minutes.

Please break your lines after 73 characters.  As-is your commit log
is pretty much unreadable.

> 
> 1. Add a new session parameter called "fast_fail_tmo". The timeout is measured in seconds from the controller reconnect, any command beyond that timeout is rejected. 

The SCSI transport classes use fast_io_fail_tmo for this sort of
functionality.  Can't we follow that naming (and also make sure the
semantics match as well as possible).

> index 6ec0350..c00efb2 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -321,6 +321,37 @@ bool nvme_cancel_request(struct request *req, void *data, bool reserved)
>  }
>  EXPORT_SYMBOL_GPL(nvme_cancel_request);
> 
> +static void nvme_failfast_work(struct work_struct *work)
> +{
> +	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
> +			struct nvme_ctrl, failfast_work);
> +
> +	spin_lock_irq(&ctrl->lock);
> +	if (ctrl->state == NVME_CTRL_CONNECTING) {
> +		set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
> +		dev_info(ctrl->device, "failfast expired set for controller %s\n", ctrl->opts->subsysnqn);

Please break up the line.

But looking at the use of NVME_CTRL_FAILFAST_EXPIRED, it almost seems
like this is another controller state?

> +static void nvme_start_failfast_work(struct nvme_ctrl *ctrl)
> +{
> +	if (unlikely(ctrl->opts->fail_fast_tmo == 0))
> +		return;
> +
> +	schedule_delayed_work(&ctrl->failfast_work, ctrl->opts->fail_fast_tmo * HZ);

Another overly long line.

> @@ -334,9 +365,10 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
>  	switch (new_state) {
>  	case NVME_CTRL_LIVE:
>  		switch (old_state) {
> +		case NVME_CTRL_CONNECTING:
> +			nvme_stop_failfast_work(ctrl);

We really should not call functionality in the state machine
verifier.  Try to move this out.  It probably should go with the
nvme_kick_requeue_lists call.

_______________________________________________
linux-nvme mailing list
linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v3] nvme-fabrics: reject I/O to offline device
  2020-02-19 15:28 ` Christoph Hellwig
@ 2020-02-20  8:34   ` Sagi Grimberg
  2020-02-20 17:41     ` James Smart
  0 siblings, 1 reply; 6+ messages in thread
From: Sagi Grimberg @ 2020-02-20  8:34 UTC (permalink / raw)
  To: Christoph Hellwig, Victor Gladkov
  Cc: linux-nvme, James Smart, Mike Snitzer, Hannes Reinecke


>> +static void nvme_failfast_work(struct work_struct *work)
>> +{
>> +	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
>> +			struct nvme_ctrl, failfast_work);
>> +
>> +	spin_lock_irq(&ctrl->lock);
>> +	if (ctrl->state == NVME_CTRL_CONNECTING) {
>> +		set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
>> +		dev_info(ctrl->device, "failfast expired set for controller %s\n", ctrl->opts->subsysnqn);
> 
> Please break up the line.
> 
> But looking at the use of NVME_CTRL_FAILFAST_EXPIRED, it almost seems
> like this is another controller state?

It actually is a controller state. this is just adding another state
without really fitting it into the state machine. I'd personally
would want that to happen, but I know James had some rejects on
this.

If this does end up as a first class controller state we need
to define the transitions.. Maybe something like this:
--
digraph nvme_host {
         node [shape = doublecircle]; new deleting;
         node [shape = circle];

         new;
         connecting;
         live
         resetting;
         failfast;
         deleting;
         dead;

         new -> connecting               [ label = 
"connecting\ncontroller" ]
         connecting -> live              [ label = "controller\nconnected" ]
         live -> resetting               [ label = "error detected\nor 
user initiated" ]
         resetting -> connecting         [ label = 
"reconnecting\ncontroller" ]
         connecting -> live              [ label = "reconnected 
controller\nreconnect_delay <= fast_fail_tmo" ]
         connecting -> failfast          [ label = "reconnect_delay > 
fast_fail_tmo" ]
         connecting -> deleting          [ label = "disconnected 
controller\nctrl_loss_tmo <= fast_fail_tmo\nor user initiated" ]
         failfast -> live                [ label = 
"controller\nreconnected" ]
         failfast -> deleting            [ label = "disconnected 
controller\nctrl_loss_tmo > fast_fail_tmo" ]
         deleting -> dead                [ label = "controller 
died\nduring deletion" ]
}
--

Note: for graphic display put this in a file named f.dot
and view with: dot f.dot -Tsvg | display


Also, I still say that its default changes the existing behavior which
is something we want to avoid.

_______________________________________________
linux-nvme mailing list
linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v3] nvme-fabrics: reject I/O to offline device
  2020-02-20  8:34   ` Sagi Grimberg
@ 2020-02-20 17:41     ` James Smart
  2020-02-26  8:52       ` Victor Gladkov
  0 siblings, 1 reply; 6+ messages in thread
From: James Smart @ 2020-02-20 17:41 UTC (permalink / raw)
  To: Sagi Grimberg, Christoph Hellwig, Victor Gladkov
  Cc: linux-nvme, Mike Snitzer, Hannes Reinecke

On 2/20/2020 12:34 AM, Sagi Grimberg wrote:
>
>>> +static void nvme_failfast_work(struct work_struct *work)
>>> +{
>>> +    struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
>>> +            struct nvme_ctrl, failfast_work);
>>> +
>>> +    spin_lock_irq(&ctrl->lock);
>>> +    if (ctrl->state == NVME_CTRL_CONNECTING) {
>>> +        set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
>>> +        dev_info(ctrl->device, "failfast expired set for controller 
>>> %s\n", ctrl->opts->subsysnqn);
>>
>> Please break up the line.
>>
>> But looking at the use of NVME_CTRL_FAILFAST_EXPIRED, it almost seems
>> like this is another controller state?
>
> It actually is a controller state. this is just adding another state
> without really fitting it into the state machine. I'd personally
> would want that to happen, but I know James had some rejects on
> this.

I don't believe its a controller state - rather an attribute, relative 
to within the Connecting state (which may be in the process of 
retrying), where instead of busying any io request that may be received, 
we start to fail them.    I don't see this time window as something the 
controller has to actually transition through - meaning we aren't 
reconnecting while in this state.


>
> Also, I still say that its default changes the existing behavior which
> is something we want to avoid.

I thought the last patch was the same as existing behavior (e.g. default 
tmo is 0, and if 0, the timer, which sets the now-fail flag, never gets 
scheduled).

in the last review, I was exploring the option if we did want to change 
the default, as the default is a rather long wait (I did vacilate).  But 
I'm fine with keeping things the same.

-- james


_______________________________________________
linux-nvme mailing list
linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [PATCH v3] nvme-fabrics: reject I/O to offline device
  2020-02-20 17:41     ` James Smart
@ 2020-02-26  8:52       ` Victor Gladkov
  2020-03-02  3:30         ` Chaitanya Kulkarni
  0 siblings, 1 reply; 6+ messages in thread
From: Victor Gladkov @ 2020-02-26  8:52 UTC (permalink / raw)
  To: James Smart, Sagi Grimberg, Christoph Hellwig
  Cc: linux-nvme, Mike Snitzer, Hannes Reinecke

Refactoring according to the last reviews:
- Move call functionality out the state machine.
- Break long lines
- Change parameter name from "fast_fail_tmo" to "fast_io_fail_tmo"
-----------------------------------------
Commands get stuck while Host NVMe controller (TCP or RDMA) is in
reconnect state. NVMe controller enters into reconnect state when it loses
connection with the target. It tries to reconnect every 10 seconds
(default) until successful reconnection or until reconnect time-out is
reached. The default reconnect time out is 10 minutes.

1. Add a new session parameter called "fast_io_fail_tmo".
The timeout is measured in seconds from the controller reconnect;
any command beyond that timeout is rejected.
The new parameter value may be passed during 'connect'.
Default value of 0 means no timeout (similar to current behavior).
2. Add a controller flag of "NVME_CTRL_FAILFAST_EXPIRED".
3. Add dedicated delayed_work that update the "NVME_CTRL_FAILFAST_EXPIRED"
controller flag.
4. When entering CONNECTING, schedule the delayed_work based on
failfast timeout value. If transition out of CONNECTING, terminate
delayed work item and ensure failfast_expired is false. If delayed work
item expires: set "NVME_CTRL_FAILFAST_EXPIRED" flag to true.
5. Update the nvmf_fail_nonready_command() and nvme_available_path()
functions to check the "NVME_CTRL_FAILFAST_EXPIRED" controller flag.

Signed-off-by: Victor Gladkov < victor.gladkov at kioxia.com>
---
 drivers/nvme/host/core.c      | 45 ++++++++++++++++++++++++++++++++++++++++++-
 drivers/nvme/host/fabrics.c   | 25 +++++++++++++++++++++---
 drivers/nvme/host/fabrics.h   |  5 +++++
 drivers/nvme/host/multipath.c |  4 +++-
 drivers/nvme/host/nvme.h      |  3 +++
 5 files changed, 77 insertions(+), 5 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 6ec0350..cddcf765 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -321,6 +321,38 @@ bool nvme_cancel_request(struct request *req, void *data, bool reserved)
 }
 EXPORT_SYMBOL_GPL(nvme_cancel_request);

+static void nvme_failfast_work(struct work_struct *work)
+{
+	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
+			struct nvme_ctrl, failfast_work);
+
+	spin_lock_irq(&ctrl->lock);
+	if (ctrl->state == NVME_CTRL_CONNECTING) {
+		set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
+		dev_info(ctrl->device, "failfast expired set for controller %s\n",
+			ctrl->opts->subsysnqn);
+		nvme_kick_requeue_lists(ctrl);
+	}
+	spin_unlock_irq(&ctrl->lock);
+}
+
+static inline void nvme_start_failfast_work(struct nvme_ctrl *ctrl)
+{
+	if (ctrl->opts->fast_io_fail_tmo == 0)
+		return;
+
+	schedule_delayed_work(&ctrl->failfast_work, ctrl->opts->fast_io_fail_tmo * HZ);
+}
+
+static inline void nvme_stop_failfast_work(struct nvme_ctrl *ctrl)
+{
+	if (ctrl->opts->fast_io_fail_tmo == 0)
+		return;
+
+	cancel_delayed_work_sync(&ctrl->failfast_work);
+	clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
+}
+
 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 		enum nvme_ctrl_state new_state)
 {
@@ -393,8 +425,16 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 	}

 	spin_unlock_irqrestore(&ctrl->lock, flags);
-	if (changed && ctrl->state == NVME_CTRL_LIVE)
+	if (changed) {
+		if (ctrl->state == NVME_CTRL_LIVE) {
 			nvme_kick_requeue_lists(ctrl);
+			if (old_state == NVME_CTRL_CONNECTING)
+				nvme_stop_failfast_work(ctrl);
+		} else if (ctrl->state == NVME_CTRL_CONNECTING) {
+			if (old_state == NVME_CTRL_RESETTING)
+				nvme_start_failfast_work(ctrl);
+		}
+	}
 	return changed;
 }
 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
@@ -3960,6 +4000,7 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
 {
 	nvme_mpath_stop(ctrl);
 	nvme_stop_keep_alive(ctrl);
+	nvme_stop_failfast_work(ctrl);
 	flush_work(&ctrl->async_event_work);
 	cancel_work_sync(&ctrl->fw_act_work);
 }
@@ -4024,6 +4065,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	int ret;

 	ctrl->state = NVME_CTRL_NEW;
+	clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
 	spin_lock_init(&ctrl->lock);
 	mutex_init(&ctrl->scan_lock);
 	INIT_LIST_HEAD(&ctrl->namespaces);
@@ -4038,6 +4080,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	init_waitqueue_head(&ctrl->state_wq);

 	INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
+	INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
 	memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
 	ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;

diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 74b8818..de2cb94 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -549,6 +549,7 @@ blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl,
 {
 	if (ctrl->state != NVME_CTRL_DELETING &&
 	    ctrl->state != NVME_CTRL_DEAD &&
+	    !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
 	    !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
 		return BLK_STS_RESOURCE;

@@ -612,6 +613,7 @@ bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
 	{ NVMF_OPT_NR_WRITE_QUEUES,	"nr_write_queues=%d"	},
 	{ NVMF_OPT_NR_POLL_QUEUES,	"nr_poll_queues=%d"	},
 	{ NVMF_OPT_TOS,			"tos=%d"		},
+	{ NVMF_OPT_FAIL_FAST_TMO,	"fast_io_fail_tmo=%d"	},
 	{ NVMF_OPT_ERR,			NULL			}
 };

@@ -630,6 +632,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 	opts->nr_io_queues = num_online_cpus();
 	opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY;
 	opts->kato = NVME_DEFAULT_KATO;
+	opts->fast_io_fail_tmo = NVMF_DEF_FAIL_FAST_TMO;
 	opts->duplicate_connect = false;
 	opts->hdr_digest = false;
 	opts->data_digest = false;
@@ -751,6 +754,18 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 				pr_warn("ctrl_loss_tmo < 0 will reconnect forever\n");
 			ctrl_loss_tmo = token;
 			break;
+		case NVMF_OPT_FAIL_FAST_TMO:
+			if (match_int(args, &token)) {
+				ret = -EINVAL;
+				goto out;
+			}
+
+			if (token)
+				pr_warn("fail_fast_tmo != 0, I/O will failed on reconnect"
+					" controller after %d sec\n", token);
+
+			opts->fast_io_fail_tmo = token;
+			break;
 		case NVMF_OPT_HOSTNQN:
 			if (opts->host) {
 				pr_err("hostnqn already user-assigned: %s\n",
@@ -881,11 +896,15 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 		opts->nr_poll_queues = 0;
 		opts->duplicate_connect = true;
 	}
-	if (ctrl_loss_tmo < 0)
+	if (ctrl_loss_tmo < 0) {
 		opts->max_reconnects = -1;
-	else
+	} else {
 		opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
 						opts->reconnect_delay);
+		if (ctrl_loss_tmo < opts->fast_io_fail_tmo)
+			pr_warn("failfast tmo (%d) larger than controller loss tmo (%d)\n",
+				opts->fast_io_fail_tmo, ctrl_loss_tmo);
+	}

 	if (!opts->host) {
 		kref_get(&nvmf_default_host->ref);
@@ -985,7 +1004,7 @@ void nvmf_free_options(struct nvmf_ctrl_options *opts)
 #define NVMF_ALLOWED_OPTS	(NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \
 				 NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \
 				 NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\
-				 NVMF_OPT_DISABLE_SQFLOW)
+				 NVMF_OPT_DISABLE_SQFLOW | NVMF_OPT_FAIL_FAST_TMO)

 static struct nvme_ctrl *
 nvmf_create_ctrl(struct device *dev, const char *buf)
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index a0ec40a..84a2c8c 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -15,6 +15,8 @@
 #define NVMF_DEF_RECONNECT_DELAY	10
 /* default to 600 seconds of reconnect attempts before giving up */
 #define NVMF_DEF_CTRL_LOSS_TMO		600
+/* default is 0: the fail fast mechanism is disable  */
+#define NVMF_DEF_FAIL_FAST_TMO		0

 /*
  * Define a host as seen by the target.  We allocate one at boot, but also
@@ -56,6 +58,7 @@ enum {
 	NVMF_OPT_NR_WRITE_QUEUES = 1 << 17,
 	NVMF_OPT_NR_POLL_QUEUES = 1 << 18,
 	NVMF_OPT_TOS		= 1 << 19,
+	NVMF_OPT_FAIL_FAST_TMO	= 1 << 20,
 };

 /**
@@ -89,6 +92,7 @@ enum {
  * @nr_write_queues: number of queues for write I/O
  * @nr_poll_queues: number of queues for polling I/O
  * @tos: type of service
+ * @fast_io_fail_tmo: Fast I/O fail timeout in seconds;
  */
 struct nvmf_ctrl_options {
 	unsigned		mask;
@@ -111,6 +115,7 @@ struct nvmf_ctrl_options {
 	unsigned int		nr_write_queues;
 	unsigned int		nr_poll_queues;
 	int			tos;
+	unsigned int	fast_io_fail_tmo;
 };

 /*
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 797c183..a72685a 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -281,9 +281,11 @@ static bool nvme_available_path(struct nvme_ns_head *head)

 	list_for_each_entry_rcu(ns, &head->list, siblings) {
 		switch (ns->ctrl->state) {
+		case NVME_CTRL_CONNECTING:
+		    if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
+				break;
 		case NVME_CTRL_LIVE:
 		case NVME_CTRL_RESETTING:
-		case NVME_CTRL_CONNECTING:
 			/* fallthru */
 			return true;
 		default:
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 1024fec..b6a199e 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -256,6 +256,7 @@ struct nvme_ctrl {
 	struct work_struct scan_work;
 	struct work_struct async_event_work;
 	struct delayed_work ka_work;
+	struct delayed_work failfast_work;
 	struct nvme_command ka_cmd;
 	struct work_struct fw_act_work;
 	unsigned long events;
@@ -289,6 +290,8 @@ struct nvme_ctrl {
 	u16 icdoff;
 	u16 maxcmd;
 	int nr_reconnects;
+	unsigned long flags;
+#define NVME_CTRL_FAILFAST_EXPIRED	0
 	struct nvmf_ctrl_options *opts;

 	struct page *discard_page;
--
1.8.3.1

_______________________________________________
linux-nvme mailing list
linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v3] nvme-fabrics: reject I/O to offline device
  2020-02-26  8:52       ` Victor Gladkov
@ 2020-03-02  3:30         ` Chaitanya Kulkarni
  0 siblings, 0 replies; 6+ messages in thread
From: Chaitanya Kulkarni @ 2020-03-02  3:30 UTC (permalink / raw)
  To: Victor Gladkov, James Smart, Sagi Grimberg, Christoph Hellwig
  Cc: Mike Snitzer, linux-nvme, Hannes Reinecke

When reviewing this patch I found a bug, along with some
general kernel style fixes. Sent out updated version V4
please have a look.

On 02/26/2020 12:53 AM, Victor Gladkov wrote:
> Refactoring according the last reviews:
> - Move call functionality out the state machine.
> - Break long lines
> - Change parameter name from " fast_fail_tmo" to " fast_io_fail_tmo"
> -----------------------------------------
> Commands get stuck while Host NVMe controller (TCP or RDMA) is in
> reconnect state. NVMe controller enters into reconnect state when it loses
> connection with the target. It tries to reconnect every 10 seconds
> (default) until successful reconnection or until reconnect time-out is
> reached. The default reconnect time out is 10 minutes.
>
> 1. Add a new session parameter called "fast_io_fail_tmo".
> The timeout is measured in seconds from the controller reconnect,
> any command beyond that timeout is rejected.
> The new parameter value may be passed during 'connect'.
> Default value of 0 means no timeout (in similar to current behavior).
> 2. Add a controller flag of "NVME_CTRL_FAILFAST_EXPIRED".
> 3. Add dedicated delayed_work that update the "NVME_CTRL_FAILFAST_EXPIRED"
> controller flag.
> 4. When entering CONNECTING, schedule the delayed_work based on
> failfast timeout value. If transition out of CONNECTING, terminate
> delayed work item and ensure failfast_expired is false. If delayed work
> item expires: set "NVME_CTRL_FAILFAST_EXPIRED" flag to true.
> 5. Update nvmf_fail_nonready_command() and nvme_available_path()
> functions with check the "NVME_CTRL_FAILFAST_EXPIRED" controller flag.
>
> Signed-off-by: Victor Gladkov < victor.gladkov at kioxia.com>


_______________________________________________
linux-nvme mailing list
linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2020-03-02  3:31 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-02-04 15:49 [PATCH v3] nvme-fabrics: reject I/O to offline device Victor Gladkov
2020-02-19 15:28 ` Christoph Hellwig
2020-02-20  8:34   ` Sagi Grimberg
2020-02-20 17:41     ` James Smart
2020-02-26  8:52       ` Victor Gladkov
2020-03-02  3:30         ` Chaitanya Kulkarni

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.