From: Victor Gladkov <Victor.Gladkov@kioxia.com>
To: "linux-nvme@lists.infradead.org" <linux-nvme@lists.infradead.org>
Cc: Hannes Reinecke <hare@suse.de>, Sagi Grimberg <sagi@grimberg.me>,
James Smart <james.smart@broadcom.com>
Subject: RE: [PATCH v2] nvme-fabrics: reject I/O to offline device
Date: Wed, 15 Jan 2020 15:42:47 +0000 [thread overview]
Message-ID: <70e5b7e60c504c1e8cba7c6be122dd7f@kioxia.com> (raw)
In-Reply-To: <98e76717-1e98-92e1-0d07-d2dac4bd1d76@broadcom.com>
1. Added multipath support for this patch.
2. Small refactoring (according to the review).
On 1/8/2020 9:47 PM, James Smart wrote:
>
> I can agree with this - default behavior is old behavior - but given we are
> talking many minutes for old behavior, perhaps we should be making 0 be
> immediate failure and not preserve the old behavior ?? Thoughts from
> anyone ?
>
>
> overall - looks good. I'd like to see what the answer is to what our default
> action should be (see above, fail_fast = 0) and agree with the multipath
> comment Hannes had.
James, please see your comment on Tue Dec 17 10:03:26 PST 2019
http://lists.infradead.org/pipermail/linux-nvme/2019-December/028483.html
>> + if (fail_fast_tmo < 0)
>> + opts->fail_fast_tmo_ns = -1;
>would prefer it be set to 0 to mean disabled. [J.S.]
On 1/7/2020 6:18 PM, Hannes Reinecke wrote:
>
> What happens if the controller reconnects _after_ failfast has triggered?
> From my reading it would simply establish the connection, so I/O _could_
> continue to be served from this path.
> But seeing that I/O has already been failed due to failfast tmo the system will
> most likely have taken corrective action, such as failing over to another path
> or informing the cluster manager etc.
> So who's going to inform the upper layers that the path has become live again?
>
The function nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) schedules a requeue_work
for all namespaces on the controller.
See drivers/nvme/host/core.c:
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
enum nvme_ctrl_state new_state)
{
..............................................................................
if (changed && ctrl->state == NVME_CTRL_LIVE)
nvme_kick_requeue_lists(ctrl);
return changed;
}
---------------------------------
---------------------------------
The updated patch:
branch nvme/for-5.5
---------------------------------
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 2a84e14..aa51d6a 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -321,6 +321,37 @@ bool nvme_cancel_request(struct request *req, void *data, bool reserved)
}
EXPORT_SYMBOL_GPL(nvme_cancel_request);
+static void nvme_failfast_work(struct work_struct *work)
+{
+ struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
+ struct nvme_ctrl, failfast_work);
+
+ spin_lock_irq(&ctrl->lock);
+ if (ctrl->state == NVME_CTRL_CONNECTING) {
+ set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
+ dev_info(ctrl->device, "failfast expired set for controller %s\n", ctrl->opts->subsysnqn);
+ nvme_kick_requeue_lists(ctrl);
+ }
+ spin_unlock_irq(&ctrl->lock);
+}
+
+static void nvme_start_failfast_work(struct nvme_ctrl *ctrl)
+{
+ if (unlikely(ctrl->opts->fail_fast_tmo == 0))
+ return;
+
+ schedule_delayed_work(&ctrl->failfast_work, ctrl->opts->fail_fast_tmo * HZ);
+}
+
+static void nvme_stop_failfast_work(struct nvme_ctrl *ctrl)
+{
+ if (unlikely(ctrl->opts->fail_fast_tmo == 0))
+ return;
+
+ cancel_delayed_work_sync(&ctrl->failfast_work);
+ clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
+}
+
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
enum nvme_ctrl_state new_state)
{
@@ -334,9 +365,10 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
switch (new_state) {
case NVME_CTRL_LIVE:
switch (old_state) {
+ case NVME_CTRL_CONNECTING:
+ nvme_stop_failfast_work(ctrl);
case NVME_CTRL_NEW:
case NVME_CTRL_RESETTING:
- case NVME_CTRL_CONNECTING:
changed = true;
/* FALLTHRU */
default:
@@ -355,8 +387,9 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
break;
case NVME_CTRL_CONNECTING:
switch (old_state) {
- case NVME_CTRL_NEW:
case NVME_CTRL_RESETTING:
+ nvme_start_failfast_work(ctrl);
+ case NVME_CTRL_NEW:
changed = true;
/* FALLTHRU */
default:
@@ -3979,6 +4012,7 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
{
nvme_mpath_stop(ctrl);
nvme_stop_keep_alive(ctrl);
+ nvme_stop_failfast_work(ctrl);
flush_work(&ctrl->async_event_work);
cancel_work_sync(&ctrl->fw_act_work);
}
@@ -4043,6 +4077,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
int ret;
ctrl->state = NVME_CTRL_NEW;
+ clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
spin_lock_init(&ctrl->lock);
mutex_init(&ctrl->scan_lock);
INIT_LIST_HEAD(&ctrl->namespaces);
@@ -4057,6 +4092,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
init_waitqueue_head(&ctrl->state_wq);
INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
+ INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 74b8818..16e5464 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -549,6 +549,7 @@ blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl,
{
if (ctrl->state != NVME_CTRL_DELETING &&
ctrl->state != NVME_CTRL_DEAD &&
+ !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
return BLK_STS_RESOURCE;
@@ -612,6 +613,7 @@ bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
{ NVMF_OPT_NR_WRITE_QUEUES, "nr_write_queues=%d" },
{ NVMF_OPT_NR_POLL_QUEUES, "nr_poll_queues=%d" },
{ NVMF_OPT_TOS, "tos=%d" },
+ { NVMF_OPT_FAIL_FAST_TMO, "fail_fast_tmo=%d" },
{ NVMF_OPT_ERR, NULL }
};
@@ -630,6 +632,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
opts->nr_io_queues = num_online_cpus();
opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY;
opts->kato = NVME_DEFAULT_KATO;
+ opts->fail_fast_tmo = NVMF_DEF_FAIL_FAST_TMO;
opts->duplicate_connect = false;
opts->hdr_digest = false;
opts->data_digest = false;
@@ -751,6 +754,17 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
pr_warn("ctrl_loss_tmo < 0 will reconnect forever\n");
ctrl_loss_tmo = token;
break;
+ case NVMF_OPT_FAIL_FAST_TMO:
+ if (match_int(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (token)
+ pr_warn("fail_fast_tmo != 0, I/O will failed on reconnect controller after %d sec\n", token);
+
+ opts->fail_fast_tmo = token;
+ break;
case NVMF_OPT_HOSTNQN:
if (opts->host) {
pr_err("hostnqn already user-assigned: %s\n",
@@ -881,11 +895,14 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
opts->nr_poll_queues = 0;
opts->duplicate_connect = true;
}
- if (ctrl_loss_tmo < 0)
+ if (ctrl_loss_tmo < 0) {
opts->max_reconnects = -1;
- else
+ } else {
opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
opts->reconnect_delay);
+ if(ctrl_loss_tmo < opts->fail_fast_tmo)
+ pr_warn("failfast tmo (%d) larger than controller loss tmo (%d)\n", opts->fail_fast_tmo, ctrl_loss_tmo);
+ }
if (!opts->host) {
kref_get(&nvmf_default_host->ref);
@@ -985,7 +1002,7 @@ void nvmf_free_options(struct nvmf_ctrl_options *opts)
#define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \
NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \
NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\
- NVMF_OPT_DISABLE_SQFLOW)
+ NVMF_OPT_DISABLE_SQFLOW | NVMF_OPT_FAIL_FAST_TMO)
static struct nvme_ctrl *
nvmf_create_ctrl(struct device *dev, const char *buf)
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index a0ec40a..fd8c7dd 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -15,6 +15,8 @@
#define NVMF_DEF_RECONNECT_DELAY 10
/* default to 600 seconds of reconnect attempts before giving up */
#define NVMF_DEF_CTRL_LOSS_TMO 600
+/* default is 0: fail fast mechanism is disabled */
+#define NVMF_DEF_FAIL_FAST_TMO 0
/*
* Define a host as seen by the target. We allocate one at boot, but also
@@ -56,6 +58,7 @@ enum {
NVMF_OPT_NR_WRITE_QUEUES = 1 << 17,
NVMF_OPT_NR_POLL_QUEUES = 1 << 18,
NVMF_OPT_TOS = 1 << 19,
+ NVMF_OPT_FAIL_FAST_TMO = 1 << 20,
};
/**
@@ -89,6 +92,7 @@ enum {
* @nr_write_queues: number of queues for write I/O
* @nr_poll_queues: number of queues for polling I/O
* @tos: type of service
+ * @fail_fast_tmo: Fail I/O fast timeout, in seconds; 0 means disabled
*/
struct nvmf_ctrl_options {
unsigned mask;
@@ -111,6 +115,7 @@ struct nvmf_ctrl_options {
unsigned int nr_write_queues;
unsigned int nr_poll_queues;
int tos;
+ unsigned int fail_fast_tmo;
};
/*
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 797c183..4edcaf1 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -281,9 +281,11 @@ static bool nvme_available_path(struct nvme_ns_head *head)
list_for_each_entry_rcu(ns, &head->list, siblings) {
switch (ns->ctrl->state) {
+ case NVME_CTRL_CONNECTING:
+ if(test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
+ break;
case NVME_CTRL_LIVE:
case NVME_CTRL_RESETTING:
- case NVME_CTRL_CONNECTING:
/* fallthru */
return true;
default:
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 1024fec..b6a199e 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -256,6 +256,7 @@ struct nvme_ctrl {
struct work_struct scan_work;
struct work_struct async_event_work;
struct delayed_work ka_work;
+ struct delayed_work failfast_work;
struct nvme_command ka_cmd;
struct work_struct fw_act_work;
unsigned long events;
@@ -289,6 +290,8 @@ struct nvme_ctrl {
u16 icdoff;
u16 maxcmd;
int nr_reconnects;
+ unsigned long flags;
+#define NVME_CTRL_FAILFAST_EXPIRED 0
struct nvmf_ctrl_options *opts;
struct page *discard_page;
-----------------
Regards,
Victor
_______________________________________________
linux-nvme mailing list
linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme
next prev parent reply other threads:[~2020-01-15 15:43 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-01-01 16:45 [PATCH v2] nvme-fabrics: reject I/O to offline device Victor Gladkov
2020-01-07 16:17 ` Hannes Reinecke
2020-01-08 19:47 ` James Smart
2020-01-15 15:42 ` Victor Gladkov [this message]
2020-01-26 10:06 ` Victor Gladkov
2020-01-30 15:08 ` Christoph Hellwig
2020-02-03 13:40 ` Victor Gladkov
2020-02-03 17:08 ` Mike Snitzer
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=70e5b7e60c504c1e8cba7c6be122dd7f@kioxia.com \
--to=victor.gladkov@kioxia.com \
--cc=hare@suse.de \
--cc=james.smart@broadcom.com \
--cc=linux-nvme@lists.infradead.org \
--cc=sagi@grimberg.me \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).