Linux-NVME Archive on lore.kernel.org
 help / color / Atom feed
* [PATCHv2] nvme-mpath: delete disk after last connection
@ 2021-04-16  6:24 Hannes Reinecke
  2021-04-20  8:05 ` Christoph Hellwig
  2021-04-20 14:16 ` Keith Busch
  0 siblings, 2 replies; 12+ messages in thread
From: Hannes Reinecke @ 2021-04-16  6:24 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Sagi Grimberg, Keith Busch, linux-nvme, Keith Busch,
	Hannes Reinecke, Daniel Wagner

From: Keith Busch <kbusch@kernel.org>

The multipath code currently deletes the disk only after all references
to it are dropped rather than when the last path to that disk is lost.
This has been reported to cause problems with some usage, like MD RAID.

Delete the disk when the last path is gone. This is the same behavior we
currently have with non-multipathed nvme devices.

The following is just a simple example that demonstrates what is currently
observed using a simple nvme loop back (loop setup file not shown):

 # nvmetcli restore loop.json
 [   31.156452] nvmet: adding nsid 1 to subsystem testnqn1
 [   31.159140] nvmet: adding nsid 1 to subsystem testnqn2

 # nvme connect -t loop -n testnqn1 -q hostnqn
 [   36.866302] nvmet: creating controller 1 for subsystem testnqn1 for NQN hostnqn.
 [   36.872926] nvme nvme3: new ctrl: "testnqn1"

 # nvme connect -t loop -n testnqn1 -q hostnqn
 [   38.227186] nvmet: creating controller 2 for subsystem testnqn1 for NQN hostnqn.
 [   38.234450] nvme nvme4: new ctrl: "testnqn1"

 # nvme connect -t loop -n testnqn2 -q hostnqn
 [   43.902761] nvmet: creating controller 3 for subsystem testnqn2 for NQN hostnqn.
 [   43.907401] nvme nvme5: new ctrl: "testnqn2"

 # nvme connect -t loop -n testnqn2 -q hostnqn
 [   44.627689] nvmet: creating controller 4 for subsystem testnqn2 for NQN hostnqn.
 [   44.641773] nvme nvme6: new ctrl: "testnqn2"

 # mdadm --create /dev/md0 --level=mirror --raid-devices=2 /dev/nvme3n1 /dev/nvme5n1
 [   53.497038] md/raid1:md0: active with 2 out of 2 mirrors
 [   53.501717] md0: detected capacity change from 0 to 66060288

 # cat /proc/mdstat
 Personalities : [raid1]
 md0 : active raid1 nvme5n1[1] nvme3n1[0]
       64512 blocks super 1.2 [2/2] [UU]

Now delete all paths to one of the namespaces:

 # echo 1 > /sys/class/nvme/nvme3/delete_controller
 # echo 1 > /sys/class/nvme/nvme4/delete_controller

We have no path, but mdstat says:

 # cat /proc/mdstat
 Personalities : [raid1]
 md0 : active (auto-read-only) raid1 nvme5n1[1]
       64512 blocks super 1.2 [2/1] [_U]

And this is reported to cause a problem.

With the proposed patch, the following messages appear:

 [  227.516807] md/raid1:md0: Disk failure on nvme3n1, disabling device.
 [  227.516807] md/raid1:md0: Operation continuing on 1 devices.

And mdstat shows only the viable members:

 # cat /proc/mdstat
 Personalities : [raid1]
 md0 : active (auto-read-only) raid1 nvme5n1[1]
       64512 blocks super 1.2 [2/1] [_U]

Reported-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Daniel Wagner <daniel.wagner@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/core.c      |  2 +-
 drivers/nvme/host/multipath.c |  1 -
 drivers/nvme/host/nvme.h      | 11 ++++++++++-
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 40f08e6325ef..e89ec2522ca6 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -542,7 +542,7 @@ static void nvme_free_ns_head(struct kref *ref)
 	struct nvme_ns_head *head =
 		container_of(ref, struct nvme_ns_head, ref);
 
-	nvme_mpath_remove_disk(head);
+	nvme_mpath_put_disk(head);
 	ida_simple_remove(&head->subsys->ns_ida, head->instance);
 	cleanup_srcu_struct(&head->srcu);
 	nvme_put_subsystem(head->subsys);
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 68918ea1d3d0..86178cb220c9 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -729,7 +729,6 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head)
 		 */
 		head->disk->queue = NULL;
 	}
-	put_disk(head->disk);
 }
 
 int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index c6102ce83bb4..4140789a3628 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -693,12 +693,18 @@ bool nvme_mpath_clear_current_path(struct nvme_ns *ns);
 void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl);
 struct nvme_ns *nvme_find_path(struct nvme_ns_head *head);
 
+static inline void nvme_mpath_put_disk(struct nvme_ns_head *head)
+{
+	if (head->disk)
+		put_disk(head->disk);
+}
+
 static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
 {
 	struct nvme_ns_head *head = ns->head;
 
 	if (head->disk && list_empty(&head->list))
-		kblockd_schedule_work(&head->requeue_work);
+		nvme_mpath_remove_disk(head);
 }
 
 static inline void nvme_trace_bio_complete(struct request *req)
@@ -738,6 +744,9 @@ static inline void nvme_mpath_add_disk(struct nvme_ns *ns,
 		struct nvme_id_ns *id)
 {
 }
+static inline void nvme_mpath_put_disk(struct nvme_ns_head *head)
+{
+}
 static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
 {
 }
-- 
2.26.2


_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCHv2] nvme-mpath: delete disk after last connection
  2021-04-16  6:24 [PATCHv2] nvme-mpath: delete disk after last connection Hannes Reinecke
@ 2021-04-20  8:05 ` Christoph Hellwig
       [not found]   ` <91f25f95-e5a0-466f-b410-6f6dafdec0a0@email.android.com>
  2021-04-20 13:19   ` Hannes Reinecke
  2021-04-20 14:16 ` Keith Busch
  1 sibling, 2 replies; 12+ messages in thread
From: Christoph Hellwig @ 2021-04-20  8:05 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Christoph Hellwig, Sagi Grimberg, Keith Busch, linux-nvme,
	Keith Busch, Daniel Wagner

On Fri, Apr 16, 2021 at 08:24:11AM +0200, Hannes Reinecke wrote:
> With the proposed patch, the following messages appear:
> 
>  [  227.516807] md/raid1:md0: Disk failure on nvme3n1, disabling device.
>  [  227.516807] md/raid1:md0: Operation continuing on 1 devices.

So how is this going to work for e.g. a case where the device
disappears due to resets or fabrics connection problems?  This now
directly tears down the device.

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCHv2] nvme-mpath: delete disk after last connection
       [not found]   ` <91f25f95-e5a0-466f-b410-6f6dafdec0a0@email.android.com>
@ 2021-04-20  9:54     ` Christoph Hellwig
  0 siblings, 0 replies; 12+ messages in thread
From: Christoph Hellwig @ 2021-04-20  9:54 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Christoph Hellwig, Sagi Grimberg, Keith Busch, linux-nvme,
	Keith Busch, Daniel Wagner

On Tue, Apr 20, 2021 at 11:41:23AM +0200, Hannes Reinecke wrote:
> <div dir='auto'><div dir="auto"><div dir="auto"><br></div><div><br><div class="elided-text">Am 20.04.2021 10:05 schrieb Christoph Hellwig &lt;hch@lst.de&gt;:<br type="attribution"><blockquote style="margin:0 0 0 0.8ex;border-left:1px #ccc solid;padding-left:1ex"><p dir="ltr">On Fri, Apr 16, 2021 at 08:24:11AM +0200, Hannes Reinecke wrote:

I'm unable to read this garbage unfortunately.

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCHv2] nvme-mpath: delete disk after last connection
  2021-04-20  8:05 ` Christoph Hellwig
       [not found]   ` <91f25f95-e5a0-466f-b410-6f6dafdec0a0@email.android.com>
@ 2021-04-20 13:19   ` Hannes Reinecke
  2021-04-20 14:14     ` Keith Busch
  1 sibling, 1 reply; 12+ messages in thread
From: Hannes Reinecke @ 2021-04-20 13:19 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Sagi Grimberg, Keith Busch, linux-nvme, Keith Busch, Daniel Wagner

On 4/20/21 10:05 AM, Christoph Hellwig wrote:
> On Fri, Apr 16, 2021 at 08:24:11AM +0200, Hannes Reinecke wrote:
>> With the proposed patch, the following messages appear:
>>
>>   [  227.516807] md/raid1:md0: Disk failure on nvme3n1, disabling device.
>>   [  227.516807] md/raid1:md0: Operation continuing on 1 devices.
> 
> So how is this going to work for e.g. a case where the device
> disappear due to resets or fabrics connection problems?  This now
> directly teards down the device.
> 
Yes, that is correct; the nshead will be removed once the last path is 
_removed_. But the key point here is that once the system finds itself in 
that situation it's impossible to recover, as the refcounts are messed up.
Even a manual connect call with the same parameters will _not_ restore 
operation, but rather result in a new namespace.
So with this patch we change from stalled I/O (where the user has to
reboot the machine to restore operation) to I/O errors (giving the user 
at least a _chance_ to recover).

I can easily consider adding a 'queue_if_no_path' option to allow I/O to 
be held even once all paths are disconnected, but that will be another 
patch.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                Kernel Storage Architect
hare@suse.de                              +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCHv2] nvme-mpath: delete disk after last connection
  2021-04-20 13:19   ` Hannes Reinecke
@ 2021-04-20 14:14     ` Keith Busch
  2021-04-20 14:39       ` Christoph Hellwig
  2021-04-20 16:21       ` Hannes Reinecke
  0 siblings, 2 replies; 12+ messages in thread
From: Keith Busch @ 2021-04-20 14:14 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Christoph Hellwig, Sagi Grimberg, Keith Busch, linux-nvme, Daniel Wagner

On Tue, Apr 20, 2021 at 03:19:10PM +0200, Hannes Reinecke wrote:
> On 4/20/21 10:05 AM, Christoph Hellwig wrote:
> > On Fri, Apr 16, 2021 at 08:24:11AM +0200, Hannes Reinecke wrote:
> > > With the proposed patch, the following messages appear:
> > > 
> > >   [  227.516807] md/raid1:md0: Disk failure on nvme3n1, disabling device.
> > >   [  227.516807] md/raid1:md0: Operation continuing on 1 devices.
> > 
> > So how is this going to work for e.g. a case where the device
> > disappear due to resets or fabrics connection problems?  This now
> > directly teards down the device.
> > 
> Yes, that is correct; the nshead will be removed once the last path is
> _removed_.

The end result is also how non-multipath nvme behaves, so I think that's
what users have come to expect.

> But key point here is that once the system finds itself in that
> situation it's impossible to recover, as the refcounts are messed.
> Even a manual connect call with the same parameter will _not_ restore
> operation, but rather result in a new namespace.

I haven't looked at this yet, but is it really not possible to restore
the original namespace upon the reestablished connection?

> So with this patch we change from stalled I/O (where the user has to
> reboot the machine to restore operation) to I/O errors (giving the user at
> least a _chance_ to recover).
> 
> I can easily consider adding a 'queue_if_no_path' option to allow I/O to be
> held even once all paths are disconnected, but that will be enother patch.

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCHv2] nvme-mpath: delete disk after last connection
  2021-04-16  6:24 [PATCHv2] nvme-mpath: delete disk after last connection Hannes Reinecke
  2021-04-20  8:05 ` Christoph Hellwig
@ 2021-04-20 14:16 ` Keith Busch
  2021-05-01 11:59   ` Hannes Reinecke
  1 sibling, 1 reply; 12+ messages in thread
From: Keith Busch @ 2021-04-20 14:16 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Christoph Hellwig, Sagi Grimberg, Keith Busch, linux-nvme, Daniel Wagner

On Fri, Apr 16, 2021 at 08:24:11AM +0200, Hannes Reinecke wrote:
> Now delete all paths to one of the namespaces:
> 
>  # echo 1 > /sys/class/nvme/nvme3/delete_controller
>  # echo 1 > /sys/class/nvme/nvme4/delete_controller
> 
> We have no path, but mdstat says:
> 
>  # cat /proc/mdstat
>  Personalities : [raid1]
>  md0 : active (auto-read-only) raid1 nvme5n1[1]
>        64512 blocks super 1.2 [2/1] [_U]
> 
> And this is reported to cause a problem.
> 
> With the proposed patch, the following messages appear:
> 
>  [  227.516807] md/raid1:md0: Disk failure on nvme3n1, disabling device.
>  [  227.516807] md/raid1:md0: Operation continuing on 1 devices.
> 
> And mdstat shows only the viable members:
> 
>  # cat /proc/mdstat
>  Personalities : [raid1]
>  md0 : active (auto-read-only) raid1 nvme5n1[1]
>        64512 blocks super 1.2 [2/1] [_U]

I think there's a copy-paste error in this change log (probably my
mistake). The first mdstat should have shown nvme3n1 was still there,
right?

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCHv2] nvme-mpath: delete disk after last connection
  2021-04-20 14:14     ` Keith Busch
@ 2021-04-20 14:39       ` Christoph Hellwig
  2021-04-20 17:02         ` Hannes Reinecke
  2021-04-20 20:05         ` Martin Wilck
  2021-04-20 16:21       ` Hannes Reinecke
  1 sibling, 2 replies; 12+ messages in thread
From: Christoph Hellwig @ 2021-04-20 14:39 UTC (permalink / raw)
  To: Keith Busch
  Cc: Hannes Reinecke, Christoph Hellwig, Sagi Grimberg, Keith Busch,
	linux-nvme, Daniel Wagner

On Tue, Apr 20, 2021 at 11:14:36PM +0900, Keith Busch wrote:
> On Tue, Apr 20, 2021 at 03:19:10PM +0200, Hannes Reinecke wrote:
> > On 4/20/21 10:05 AM, Christoph Hellwig wrote:
> > > On Fri, Apr 16, 2021 at 08:24:11AM +0200, Hannes Reinecke wrote:
> > > > With the proposed patch, the following messages appear:
> > > > 
> > > >   [  227.516807] md/raid1:md0: Disk failure on nvme3n1, disabling device.
> > > >   [  227.516807] md/raid1:md0: Operation continuing on 1 devices.
> > > 
> > > So how is this going to work for e.g. a case where the device
> > > disappear due to resets or fabrics connection problems?  This now
> > > directly teards down the device.
> > > 
> > Yes, that is correct; the nshead will be removed once the last path is
> > _removed_.
> 
> The end result is also how non-multipath nvme behaves, so I think that's
> what users have come to expect.

I'm not sure that is what users expect.  At least the SCSI multipath
setups I've worked do not expect it and ensure the queue_if_no_path
option is set.

> > But key point here is that once the system finds itself in that
> > situation it's impossible to recover, as the refcounts are messed.
> > Even a manual connect call with the same parameter will _not_ restore
> > operation, but rather result in a new namespace.
> 
> I haven't looked at this yet, but is it really not possible to restore
> the original namespace upon the reestablished connection?

It is possible, and in fact is what we do.

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCHv2] nvme-mpath: delete disk after last connection
  2021-04-20 14:14     ` Keith Busch
  2021-04-20 14:39       ` Christoph Hellwig
@ 2021-04-20 16:21       ` Hannes Reinecke
  1 sibling, 0 replies; 12+ messages in thread
From: Hannes Reinecke @ 2021-04-20 16:21 UTC (permalink / raw)
  To: Keith Busch
  Cc: Christoph Hellwig, Sagi Grimberg, Keith Busch, linux-nvme, Daniel Wagner

On 4/20/21 4:14 PM, Keith Busch wrote:
> On Tue, Apr 20, 2021 at 03:19:10PM +0200, Hannes Reinecke wrote:
>> On 4/20/21 10:05 AM, Christoph Hellwig wrote:
>>> On Fri, Apr 16, 2021 at 08:24:11AM +0200, Hannes Reinecke wrote:
>>>> With the proposed patch, the following messages appear:
>>>>
>>>>    [  227.516807] md/raid1:md0: Disk failure on nvme3n1, disabling device.
>>>>    [  227.516807] md/raid1:md0: Operation continuing on 1 devices.
>>>
>>> So how is this going to work for e.g. a case where the device
>>> disappear due to resets or fabrics connection problems?  This now
>>> directly teards down the device.
>>>
>> Yes, that is correct; the nshead will be removed once the last path is
>> _removed_.
> 
> The end result is also how non-multipath nvme behaves, so I think that's
> what users have come to expect.
> 
Nope. It doesn't.

This patch precisely _aligns_ the behaviour between multipathed (or, to 
be precise, namespaces with CMIC != 0) and non-multipathed ones.

When CMIC == 0 this patch doesn't change the behaviour at all.

>> But key point here is that once the system finds itself in that
>> situation it's impossible to recover, as the refcounts are messed.
>> Even a manual connect call with the same parameter will _not_ restore
>> operation, but rather result in a new namespace.
> 
> I haven't looked at this yet, but is it really not possible to restore
> the original namespace upon the reestablished connection?
> 

No. As mentioned, refcount is messed up.

With the current behaviour, once the last path drops, the namespace will 
_vanish_ from the list of namespaces.
Once you reconnect the original namespace 'magically' reappears in 'nvme 
list', but with size '0', _and_ the new namespace beside it:

Needless to say, the current opener (mount, MD, you name it) still refers 
to the original namespace, and you can't correct the situation, as I/O is 
still pending which you have no way to abort.

(Funny, though; it was you who suggested the patch in the first place; 
feels kinda strange to explain the patch to its author ...)

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                Kernel Storage Architect
hare@suse.de                              +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCHv2] nvme-mpath: delete disk after last connection
  2021-04-20 14:39       ` Christoph Hellwig
@ 2021-04-20 17:02         ` Hannes Reinecke
  2021-04-20 17:16           ` Keith Busch
  2021-04-20 20:05         ` Martin Wilck
  1 sibling, 1 reply; 12+ messages in thread
From: Hannes Reinecke @ 2021-04-20 17:02 UTC (permalink / raw)
  To: Christoph Hellwig, Keith Busch
  Cc: Sagi Grimberg, Keith Busch, linux-nvme, Daniel Wagner

On 4/20/21 4:39 PM, Christoph Hellwig wrote:
> On Tue, Apr 20, 2021 at 11:14:36PM +0900, Keith Busch wrote:
>> On Tue, Apr 20, 2021 at 03:19:10PM +0200, Hannes Reinecke wrote:
>>> On 4/20/21 10:05 AM, Christoph Hellwig wrote:
>>>> On Fri, Apr 16, 2021 at 08:24:11AM +0200, Hannes Reinecke wrote:
>>>>> With the proposed patch, the following messages appear:
>>>>>
>>>>>    [  227.516807] md/raid1:md0: Disk failure on nvme3n1, disabling device.
>>>>>    [  227.516807] md/raid1:md0: Operation continuing on 1 devices.
>>>>
>>>> So how is this going to work for e.g. a case where the device
>>>> disappear due to resets or fabrics connection problems?  This now
>>>> directly teards down the device.
>>>>
>>> Yes, that is correct; the nshead will be removed once the last path is
>>> _removed_.
>>
>> The end result is also how non-multipath nvme behaves, so I think that's
>> what users have come to expect.
> 
> I'm not sure that is what users expect.  At least the SCSI multipath
> setups I've worked do not expect it and ensure the queue_if_no_path
> option is set.
> 
Yes, sure. And as I said, I'm happy to implement this option for NVMe, too.
But that is _not_ what this patch is about.

NVMe since day one has _removed_ the namespace if the controller goes 
away (ie if you do a PCI hotplug). So customer rightly expects this 
behaviour to continue.

And this is what the patch does; _aligning_ the behaviour from 
multipathed to non-multipathed controllers when the last path is gone.

Non-multipathed (ie CMIC==0) controllers will remove the namespace once 
the last _reference_ to that namespace drops (ie the PCI hotplug case).
Multipathed (ie CMIC!=0) controllers will remove the namespace once the 
last _opener_ goes away.
The refcount is long gone by that time.

>>> But key point here is that once the system finds itself in that
>>> situation it's impossible to recover, as the refcounts are messed.
>>> Even a manual connect call with the same parameter will _not_ restore
>>> operation, but rather result in a new namespace.
>>
>> I haven't looked at this yet, but is it really not possible to restore
>> the original namespace upon the reestablished connection?
> 
> It is possible, and in fact is what we do.
> 
It is _not_ once the namespace is mounted.
Or MD has claimed the device.
And the problem is that the refcount already _is_ zero, so we are 
already in teardown. We're just waiting for the reference to the gendisk 
to drop.
Which it never will, as we would have to umount (or detach) the device 
for that, but I/O is still pending which cannot be flushed, so that'll fail.
And if we try to connect the same namespace again, nvme_find_ns_head() 
will not return the existing ns_head as the refcount is zero.
Causing a new ns_head to be created.

If you manage to get this working with the current code please show me 
from the testcase in the description what we should have done differently.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                Kernel Storage Architect
hare@suse.de                              +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCHv2] nvme-mpath: delete disk after last connection
  2021-04-20 17:02         ` Hannes Reinecke
@ 2021-04-20 17:16           ` Keith Busch
  0 siblings, 0 replies; 12+ messages in thread
From: Keith Busch @ 2021-04-20 17:16 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Christoph Hellwig, Sagi Grimberg, Keith Busch, linux-nvme, Daniel Wagner

On Tue, Apr 20, 2021 at 07:02:32PM +0200, Hannes Reinecke wrote:
> On 4/20/21 4:39 PM, Christoph Hellwig wrote:
> > On Tue, Apr 20, 2021 at 11:14:36PM +0900, Keith Busch wrote:
> > > On Tue, Apr 20, 2021 at 03:19:10PM +0200, Hannes Reinecke wrote:
> > > > On 4/20/21 10:05 AM, Christoph Hellwig wrote:
> > > > > On Fri, Apr 16, 2021 at 08:24:11AM +0200, Hannes Reinecke wrote:
> > > > > > With the proposed patch, the following messages appear:
> > > > > > 
> > > > > >    [  227.516807] md/raid1:md0: Disk failure on nvme3n1, disabling device.
> > > > > >    [  227.516807] md/raid1:md0: Operation continuing on 1 devices.
> > > > > 
> > > > > So how is this going to work for e.g. a case where the device
> > > > > disappear due to resets or fabrics connection problems?  This now
> > > > > directly teards down the device.
> > > > > 
> > > > Yes, that is correct; the nshead will be removed once the last path is
> > > > _removed_.
> > > 
> > > The end result is also how non-multipath nvme behaves, so I think that's
> > > what users have come to expect.
> > 
> > I'm not sure that is what users expect.  At least the SCSI multipath
> > setups I've worked do not expect it and ensure the queue_if_no_path
> > option is set.
> > 
> Yes, sure. And as I said, I'm happy to implement this option for NVMe, too.
> But that is _not_ what this patch is about.
> 
> NVMe since day one has _removed_ the namespace if the controller goes away
> (ie if you do a PCI hotplug). So customer rightly expects this behaviour to
> continue.
> 
> And this is what the patch does; _aligning_ the behaviour from multipathed
> to non-multipathed controllers when the last path is gone.
> 
> Non-multipathed (ie CMIC==0) controllers will remove the namespace once the
> last _reference_ to that namespace drops (ie the PCI hotplug case).
> Multipathed (ie CMIC!=0) controllers will remove the namespace once the last
> _opener_ goes away.
> The refcount is long gone by that time.
> 
> > > > But key point here is that once the system finds itself in that
> > > > situation it's impossible to recover, as the refcounts are messed.
> > > > Even a manual connect call with the same parameter will _not_ restore
> > > > operation, but rather result in a new namespace.
> > > 
> > > I haven't looked at this yet, but is it really not possible to restore
> > > the original namespace upon the reestablished connection?
> > 
> > It is possible, and in fact is what we do.
> > 
> It is _not_ once the namespace is mounted.
> Or MD has claimed the device.
> And the problem is that the refcount already _is_ zero, so we are already in
> teardown. We're just waiting for the reference to the gendisk to drop.
> Which is never will, as we would have to umount (or detach) the device for
> that, but I/O is still pending which cannot be flushed, so that'll fail.
> And if we try to connect the same namespace again, nvme_find_ns_head() will
> not return the existing ns_head as the refcount is zero.
> Causing a new ns_head to be created.
> 
> If you manage to get this working with the current code please show me from
> the testcase in the description what we should have done differently.

I see what you mean. I think we can untangle this, but it's not as
straightforward as I hoped.

As far as *this* patch goes, I think you and I are aligned on the
behavior, and I still think it's good.

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCHv2] nvme-mpath: delete disk after last connection
  2021-04-20 14:39       ` Christoph Hellwig
  2021-04-20 17:02         ` Hannes Reinecke
@ 2021-04-20 20:05         ` Martin Wilck
  1 sibling, 0 replies; 12+ messages in thread
From: Martin Wilck @ 2021-04-20 20:05 UTC (permalink / raw)
  To: Christoph Hellwig, Keith Busch
  Cc: Hannes Reinecke, Sagi Grimberg, Keith Busch, linux-nvme, Daniel Wagner

On Tue, 2021-04-20 at 16:39 +0200, Christoph Hellwig wrote:
> On Tue, Apr 20, 2021 at 11:14:36PM +0900, Keith Busch wrote:
> > 
> > The end result is also how non-multipath nvme behaves, so I think
> > that's
> > what users have come to expect.
> 
> I'm not sure that is what users expect.  At least the SCSI multipath
> setups I've worked do not expect it and ensure the queue_if_no_path
> option is set.

For dm-multipath, this only holds for the case that all paths are
failed. But if the last path is actually *deleted*, multipathd flushes
the map (tries to, at least).
 
Martin



_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCHv2] nvme-mpath: delete disk after last connection
  2021-04-20 14:16 ` Keith Busch
@ 2021-05-01 11:59   ` Hannes Reinecke
  0 siblings, 0 replies; 12+ messages in thread
From: Hannes Reinecke @ 2021-05-01 11:59 UTC (permalink / raw)
  To: linux-nvme

On 4/20/21 4:16 PM, Keith Busch wrote:
> On Fri, Apr 16, 2021 at 08:24:11AM +0200, Hannes Reinecke wrote:
>> Now delete all paths to one of the namespaces:
>>
>>   # echo 1 > /sys/class/nvme/nvme3/delete_controller
>>   # echo 1 > /sys/class/nvme/nvme4/delete_controller
>>
>> We have no path, but mdstat says:
>>
>>   # cat /proc/mdstat
>>   Personalities : [raid1]
>>   md0 : active (auto-read-only) raid1 nvme5n1[1]
>>         64512 blocks super 1.2 [2/1] [_U]
>>
>> And this is reported to cause a problem.
>>
>> With the proposed patch, the following messages appear:
>>
>>   [  227.516807] md/raid1:md0: Disk failure on nvme3n1, disabling device.
>>   [  227.516807] md/raid1:md0: Operation continuing on 1 devices.
>>
>> And mdstat shows only the viable members:
>>
>>   # cat /proc/mdstat
>>   Personalities : [raid1]
>>   md0 : active (auto-read-only) raid1 nvme5n1[1]
>>         64512 blocks super 1.2 [2/1] [_U]
> 
> I think there's a copy-paste error in this change log (probably my
> mistake). The first mdstat should have shown nvme3n1 was still there,
> right?
> 
Possibly. Will be resending.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                Kernel Storage Architect
hare@suse.de                              +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, back to index

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-16  6:24 [PATCHv2] nvme-mpath: delete disk after last connection Hannes Reinecke
2021-04-20  8:05 ` Christoph Hellwig
     [not found]   ` <91f25f95-e5a0-466f-b410-6f6dafdec0a0@email.android.com>
2021-04-20  9:54     ` Christoph Hellwig
2021-04-20 13:19   ` Hannes Reinecke
2021-04-20 14:14     ` Keith Busch
2021-04-20 14:39       ` Christoph Hellwig
2021-04-20 17:02         ` Hannes Reinecke
2021-04-20 17:16           ` Keith Busch
2021-04-20 20:05         ` Martin Wilck
2021-04-20 16:21       ` Hannes Reinecke
2021-04-20 14:16 ` Keith Busch
2021-05-01 11:59   ` Hannes Reinecke

Linux-NVME Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-nvme/0 linux-nvme/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-nvme linux-nvme/ https://lore.kernel.org/linux-nvme \
		linux-nvme@lists.infradead.org
	public-inbox-index linux-nvme

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.infradead.lists.linux-nvme


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git