All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] Added NVMe MP failover policy.user can configure failover progression time. it don't have any conflict with ANA. it's a simple failover policy that will tell multipather how long it should wait before sending command to next available path
@ 2018-09-23 21:45 Susobhan Dey
  2018-09-24  4:06 ` Sagi Grimberg
  0 siblings, 1 reply; 5+ messages in thread
From: Susobhan Dey @ 2018-09-23 21:45 UTC (permalink / raw)


---
 drivers/nvme/host/multipath.c | 59 +++++++++++++++++++++++++++++++++++
 drivers/nvme/host/nvme.h      | 19 ++++++++++-
 2 files changed, 77 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 5a9562881d4e..dbe5d24a3a14 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -20,6 +20,11 @@ module_param(multipath, bool, 0444);
 MODULE_PARM_DESC(multipath,
 	"turn on native support for multiple controllers per subsystem");
 
+unsigned int failover_tt = 0;
+module_param(failover_tt, uint, 0644);
+MODULE_PARM_DESC(failover_tt,
+	"failover progression time.");
+
 inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
 {
 	return multipath && ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
@@ -47,6 +52,22 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 	}
 }
 
+/*
+ * when fo_work executes that eventually means that failover_tt is
+ * over. Now multipather can send IOs to next live path.
+ * hence inside the function we clear failover progress bit of mpath
+ * so that multipath_make_request can choose current path and send
+ * IOs down
+ */
+static void nvme_mpath_fo_work(struct work_struct *work)
+{
+	struct nvme_ns_head *head =
+		container_of(to_delayed_work(work), struct nvme_ns_head, fo_work);
+	nvme_mpath_clear_fo_in_progress(head);
+	/* kick requeue list as path is now available */
+	kblockd_schedule_work(&head->requeue_work);
+}
+
 void nvme_failover_req(struct request *req)
 {
 	struct nvme_ns *ns = req->q->queuedata;
@@ -58,6 +79,22 @@ void nvme_failover_req(struct request *req)
 	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
 	blk_mq_end_request(req, 0);
 
+	/*
+	 * if user configure failover_tt, meaning next path is
+	 * not immediately available to serve IOs. In that case 
+	 * multipather has to wait till failover_tt before pushing
+	 * IOs to next live path.
+	 */
+	if (failover_tt)
+	{
+		/* set failover progression bit of mpath NS */
+		if (!nvme_mpath_set_fo_in_progress(ns->head))
+		{
+			INIT_DELAYED_WORK(&ns->head->fo_work, nvme_mpath_fo_work);
+			schedule_delayed_work(&ns->head->fo_work,failover_tt*HZ);
+		}
+	}
+
 	switch (status & 0x7ff) {
 	case NVME_SC_ANA_TRANSITION:
 	case NVME_SC_ANA_INACCESSIBLE:
@@ -141,12 +178,34 @@ static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
 		ns->ana_state == NVME_ANA_OPTIMIZED;
 }
 
+static inline bool nvme_fo_is_in_progress(struct nvme_ns_head *head)
+{
+	/*
+	 * if failover_tt is not given, implicitly assume progression
+	 * time is 0. so multipather immediately choose next path
+	 * start sending IOs to that.
+	 */
+	if (!failover_tt)
+		return false;
+	if (test_bit(NVME_NS_FAILOVER_IN_PROGRESS, &head->flags))
+		return true;
+	return false;
+}
+
 inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
 {
 	struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu);
+	/*
+	 * if failover_tt is configured then it will check failover
+	 * progression status. default it has no effect.
+	 */
+	if (nvme_fo_is_in_progress(head))
+		return NULL;
 
 	if (unlikely(!ns || !nvme_path_is_optimized(ns)))
+	{
 		ns = __nvme_find_path(head);
+	}
 	return ns;
 }
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index bb4a2003c097..c23d5f46db6e 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -283,7 +283,10 @@ struct nvme_ns_head {
 	struct bio_list		requeue_list;
 	spinlock_t		requeue_lock;
 	struct work_struct	requeue_work;
+	struct delayed_work     fo_work;
 	struct mutex		lock;
+#define NVME_NS_FAILOVER_IN_PROGRESS 4
+	unsigned long flags;
 #endif
 	struct list_head	list;
 	struct srcu_struct      srcu;
@@ -478,10 +481,17 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl);
 static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
 {
 	struct nvme_ns_head *head = ns->head;
-
 	if (head && ns == rcu_access_pointer(head->current_path))
 		rcu_assign_pointer(head->current_path, NULL);
 }
+static inline int nvme_mpath_set_fo_in_progress(struct nvme_ns_head *head)
+{
+	return (test_and_set_bit(NVME_NS_FAILOVER_IN_PROGRESS, &head->flags));
+}
+static inline void nvme_mpath_clear_fo_in_progress(struct nvme_ns_head *head)
+{
+	test_and_clear_bit(NVME_NS_FAILOVER_IN_PROGRESS, &head->flags);
+}
 struct nvme_ns *nvme_find_path(struct nvme_ns_head *head);
 
 static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
@@ -534,6 +544,13 @@ static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
 static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
 {
 }
+static inline int nvme_mpath_set_fo_in_progress(struct nvme_ns_head *head)
+{
+	return 0
+}
+static inline void nvme_mpath_clear_fo_in_progress(struct nvme_ns_head *head)
+{
+}
 static inline int nvme_mpath_init(struct nvme_ctrl *ctrl,
 		struct nvme_id_ctrl *id)
 {
-- 
2.17.1

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH] Added NVMe MP failover policy.user can configure failover progression time. it don't have any conflict with ANA. it's a simple failover policy that will tell multipather how long it should wait before sending command to next available path
  2018-09-23 21:45 [PATCH] Added NVMe MP failover policy.user can configure failover progression time. it don't have any conflict with ANA. it's a simple failover policy that will tell multipather how long it should wait before sending command to next available path Susobhan Dey
@ 2018-09-24  4:06 ` Sagi Grimberg
  2018-09-24  7:31   ` [PATCH v2] NVMe multipath Added configarable failover progression time Susobhan Dey
  0 siblings, 1 reply; 5+ messages in thread
From: Sagi Grimberg @ 2018-09-24  4:06 UTC (permalink / raw)


Hi Susobhan,

Please fix split your change log to a title and body.

Please explain what is this trying to achieve instead of
mentioning "it don't have any conflict with ANA".

What is the point in delaying failover if we have a different
path available?

You are also adding an atomic operation to the hot path for
a questionable rational.

In a different note, I suggest we stop adding bit flags for namespace
states.

And modparam is not the way to go here at all.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH v2] NVMe multipath Added configarable failover progression time
  2018-09-24  4:06 ` Sagi Grimberg
@ 2018-09-24  7:31   ` Susobhan Dey
  2018-09-24  9:16     ` Hannes Reinecke
  2018-09-24  9:40     ` Sagi Grimberg
  0 siblings, 2 replies; 5+ messages in thread
From: Susobhan Dey @ 2018-09-24  7:31 UTC (permalink / raw)


Added a module parameter failover_tt to tell multipather
how long it should wait before sending commands to next
available path. The whole reason of adding it, not all
architecture can handle IOs in active/active fashion even
though next path is live/accessible it can't handle IOs
immediately. We should give some time to underneath controller
to make itself active. ANA gives them the provision,using ANA
it is possible to advertise path status. Say for example if
path is in transition then host will requeue the commands till
path become optimized/unoptimized if no other path is optimized.
So this is a host side additional provision where multipather
will wait for sometime for example 10 sec and by that time controller
should be active to handle IOs. Whole point is controller is live
does not necessarily mean that controller is active.

Signed-off-by: Susobhan Dey <susobhan.dey at gmail.com>
---
 drivers/nvme/host/multipath.c | 59 +++++++++++++++++++++++++++++++++++
 drivers/nvme/host/nvme.h      | 19 ++++++++++-
 2 files changed, 77 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 5a9562881d4e..dbe5d24a3a14 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -20,6 +20,11 @@ module_param(multipath, bool, 0444);
 MODULE_PARM_DESC(multipath,
 	"turn on native support for multiple controllers per subsystem");
 
+unsigned int failover_tt = 0;
+module_param(failover_tt, uint, 0644);
+MODULE_PARM_DESC(failover_tt,
+	"failover progression time.");
+
 inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
 {
 	return multipath && ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
@@ -47,6 +52,22 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 	}
 }
 
+/*
+ * when fo_work executes that eventually means that failover_tt is
+ * over. Now multipather can send IOs to next live path.
+ * hence inside the function we clear failover progress bit of mpath
+ * so that multipath_make_request can choose current path and send
+ * IOs down
+ */
+static void nvme_mpath_fo_work(struct work_struct *work)
+{
+	struct nvme_ns_head *head =
+		container_of(to_delayed_work(work), struct nvme_ns_head, fo_work);
+	nvme_mpath_clear_fo_in_progress(head);
+	/* kick requeue list as path is now available */
+	kblockd_schedule_work(&head->requeue_work);
+}
+
 void nvme_failover_req(struct request *req)
 {
 	struct nvme_ns *ns = req->q->queuedata;
@@ -58,6 +79,22 @@ void nvme_failover_req(struct request *req)
 	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
 	blk_mq_end_request(req, 0);
 
+	/*
+	 * if user configure failover_tt, meaning next path is
+	 * not immediately available to serve IOs. In that case 
+	 * multipather has to wait till failover_tt before pushing
+	 * IOs to next live path.
+	 */
+	if (failover_tt)
+	{
+		/* set failover progression bit of mpath NS */
+		if (!nvme_mpath_set_fo_in_progress(ns->head))
+		{
+			INIT_DELAYED_WORK(&ns->head->fo_work, nvme_mpath_fo_work);
+			schedule_delayed_work(&ns->head->fo_work,failover_tt*HZ);
+		}
+	}
+
 	switch (status & 0x7ff) {
 	case NVME_SC_ANA_TRANSITION:
 	case NVME_SC_ANA_INACCESSIBLE:
@@ -141,12 +178,34 @@ static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
 		ns->ana_state == NVME_ANA_OPTIMIZED;
 }
 
+static inline bool nvme_fo_is_in_progress(struct nvme_ns_head *head)
+{
+	/*
+	 * if failover_tt is not given, implicitly assume progression
+	 * time is 0. so multipather immediately choose next path
+	 * start sending IOs to that.
+	 */
+	if (!failover_tt)
+		return false;
+	if (test_bit(NVME_NS_FAILOVER_IN_PROGRESS, &head->flags))
+		return true;
+	return false;
+}
+
 inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
 {
 	struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu);
+	/*
+	 * if failover_tt is configured then it will check failover
+	 * progression status. default it has no effect.
+	 */
+	if (nvme_fo_is_in_progress(head))
+		return NULL;
 
 	if (unlikely(!ns || !nvme_path_is_optimized(ns)))
+	{
 		ns = __nvme_find_path(head);
+	}
 	return ns;
 }
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index bb4a2003c097..c23d5f46db6e 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -283,7 +283,10 @@ struct nvme_ns_head {
 	struct bio_list		requeue_list;
 	spinlock_t		requeue_lock;
 	struct work_struct	requeue_work;
+	struct delayed_work     fo_work;
 	struct mutex		lock;
+#define NVME_NS_FAILOVER_IN_PROGRESS 4
+	unsigned long flags;
 #endif
 	struct list_head	list;
 	struct srcu_struct      srcu;
@@ -478,10 +481,17 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl);
 static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
 {
 	struct nvme_ns_head *head = ns->head;
-
 	if (head && ns == rcu_access_pointer(head->current_path))
 		rcu_assign_pointer(head->current_path, NULL);
 }
+static inline int nvme_mpath_set_fo_in_progress(struct nvme_ns_head *head)
+{
+	return (test_and_set_bit(NVME_NS_FAILOVER_IN_PROGRESS, &head->flags));
+}
+static inline void nvme_mpath_clear_fo_in_progress(struct nvme_ns_head *head)
+{
+	test_and_clear_bit(NVME_NS_FAILOVER_IN_PROGRESS, &head->flags);
+}
 struct nvme_ns *nvme_find_path(struct nvme_ns_head *head);
 
 static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
@@ -534,6 +544,13 @@ static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
 static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
 {
 }
+static inline int nvme_mpath_set_fo_in_progress(struct nvme_ns_head *head)
+{
+	return 0
+}
+static inline void nvme_mpath_clear_fo_in_progress(struct nvme_ns_head *head)
+{
+}
 static inline int nvme_mpath_init(struct nvme_ctrl *ctrl,
 		struct nvme_id_ctrl *id)
 {
-- 
2.17.1

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH v2] NVMe multipath Added configarable failover progression time
  2018-09-24  7:31   ` [PATCH v2] NVMe multipath Added configarable failover progression time Susobhan Dey
@ 2018-09-24  9:16     ` Hannes Reinecke
  2018-09-24  9:40     ` Sagi Grimberg
  1 sibling, 0 replies; 5+ messages in thread
From: Hannes Reinecke @ 2018-09-24  9:16 UTC (permalink / raw)


On 9/24/18 9:31 AM, Susobhan Dey wrote:
> Added a module parameter failover_tt to tell multipather
> how long it should wait before sending commands to next
> available path. The whole reason of adding it, not all
> architecture can handle IOs in active/active fashion even
> though next path is live/accessible it can't handle IOs
> immediately. We should give some time to underneath controller
> to make itself active. ANA gives them the provision,using ANA
> it is possible to advertise path status. Say for example if
> path is in transition then host will requeue the commands till
> path become optimized/unoptimized if no other path is optimized.
> So this is a host side additional provision where multipather
> will wait for sometime for example 10 sec and by that time controller
> should be active to handle IOs. Whole point is controller is live
> does not necessarily mean that controller is active.
> 
> Signed-off-by: Susobhan Dey <susobhan.dey at gmail.com>
> ---
>   drivers/nvme/host/multipath.c | 59 +++++++++++++++++++++++++++++++++++
>   drivers/nvme/host/nvme.h      | 19 ++++++++++-
>   2 files changed, 77 insertions(+), 1 deletion(-)
> 
That all sounds very dodgy.

If a controller is not able to accept I/Os immediately after switching
(Remember: this is NVMe-oF, where the _controller_ is responsible for 
switching paths, _not_ the host) it really should be setting the state 
to 'transitioning'.
And set the final state once it's ready to service I/Os.
Yes, this might result in all paths being set to 'transitioning', but 
that's well within scope.
Any controller which signals 'active/optimized' but is not able to 
service I/O really should be declared dead, and not worked around.
Please update the controller firmware to return 'transitioning' if it's 
not ready to service I/O after failover.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare at suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 N?rnberg
GF: F. Imend?rffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG N?rnberg)

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH v2] NVMe multipath Added configarable failover progression time
  2018-09-24  7:31   ` [PATCH v2] NVMe multipath Added configarable failover progression time Susobhan Dey
  2018-09-24  9:16     ` Hannes Reinecke
@ 2018-09-24  9:40     ` Sagi Grimberg
  1 sibling, 0 replies; 5+ messages in thread
From: Sagi Grimberg @ 2018-09-24  9:40 UTC (permalink / raw)



> Added a module parameter failover_tt to tell multipather
> how long it should wait before sending commands to next
> available path. The whole reason of adding it, not all
> architecture can handle IOs in active/active fashion even
> though next path is live/accessible it can't handle IOs
> immediately.

Then it shouldn't be exposed as a path to the host.

> We should give some time to underneath controller
> to make itself active.

If its not active, don't have the host connected to it
in the first place. Otherwise, block the I/O on the controller
end until it gets into shape. non-optimized state semantics tells the
host that I/O service will not be optimal through that path, so
it should be well expected.

> ANA gives them the provision,using ANA
> it is possible to advertise path status.

Its not status, its state.

> Say for example if
> path is in transition then host will requeue the commands till
> path become optimized/unoptimized if no other path is optimized.
> So this is a host side additional provision where multipather
> will wait for sometime for example 10 sec and by that time controller
> should be active to handle IOs.

I'm sorry, but there is no justification for this sort of bogus setting.
If you have a proposal to address some architecture that asks the host
to block I/O although it has an available path, please propose a
technical proposal and make it a standard extension that can be
interrogated by the host.

Adding a global knob in the form of a modparam is a non-starter.

> Whole point is controller is live
> does not necessarily mean that controller is active.

Then what does it mean? its enabled, allocated I/O queues, contains
namespaces, exposes ANA states etc.. It should be capable of servicing
I/O.

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2018-09-24  9:40 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-09-23 21:45 [PATCH] Added NVMe MP failover policy.user can configure failover progression time. it don't have any conflict with ANA. it's a simple failover policy that will tell multipather how long it should wait before sending command to next available path Susobhan Dey
2018-09-24  4:06 ` Sagi Grimberg
2018-09-24  7:31   ` [PATCH v2] NVMe multipath Added configarable failover progression time Susobhan Dey
2018-09-24  9:16     ` Hannes Reinecke
2018-09-24  9:40     ` Sagi Grimberg

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.