* [PATCHv6] nvme: allow to re-attach namespaces after all paths are down
@ 2021-06-09 15:01 Hannes Reinecke
  2021-06-21  6:38 ` Christoph Hellwig
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Hannes Reinecke @ 2021-06-09 15:01 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Sagi Grimberg, Keith Busch, linux-nvme, Hannes Reinecke

We should only remove the ns_head from the per-subsystem list of heads
when its reference count drops to zero. That cleans up the reference
counting, and allows us to call del_gendisk() once the last path is
removed (as the ns_head should be removed at that point anyway).
As this introduces a (theoretical) race window where I/O might have
been requeued before the last path went down, we also need to check in
nvme_ns_head_submit_bio() whether the gendisk is still present, and
fail the I/O if it is not.
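
For context, this is roughly how the bio submission path then behaves
(a condensed sketch of nvme_ns_head_submit_bio(), not part of this
patch; details vary by kernel version). Once the GENHD_FL_UP checks
below make nvme_find_path() and nvme_available_path() report that the
disk is gone, a requeued bio falls through to the failure branch:

blk_qc_t nvme_ns_head_submit_bio(struct bio *bio)
{
        struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
        struct nvme_ns *ns;
        int srcu_idx;

        srcu_idx = srcu_read_lock(&head->srcu);
        ns = nvme_find_path(head);
        if (likely(ns)) {
                /* remap and submit the bio down the selected path */
        } else if (nvme_available_path(head)) {
                /* park the bio on head->requeue_list until a path returns */
        } else {
                /* no usable path and no live gendisk: fail the I/O */
                bio->bi_status = BLK_STS_IOERR;
                bio_endio(bio);
        }
        srcu_read_unlock(&head->srcu, srcu_idx);
        return BLK_QC_T_NONE;
}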

Changes to v5:
- Synchronize between nvme_init_ns_head() and nvme_mpath_check_last_path()
- Check for removed gendisk in nvme_ns_head_submit_bio()
Changes to v4:
- Call del_gendisk() in nvme_mpath_check_last_path() to avoid deadlock
Changes to v3:
- Simplify if() clause to detect duplicate namespaces
Changes to v2:
- Drop memcpy() statement
Changes to v1:
- Always check NSIDs after reattach

Signed-off-by: Hannes Reinecke <hare@suse.de>
---
 drivers/nvme/host/core.c      |  9 ++++-----
 drivers/nvme/host/multipath.c | 30 +++++++++++++++++++++++++-----
 drivers/nvme/host/nvme.h      | 11 ++---------
 3 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 177cae44b612..6d7c2958b3e2 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -566,6 +566,9 @@ static void nvme_free_ns_head(struct kref *ref)
 	struct nvme_ns_head *head =
 		container_of(ref, struct nvme_ns_head, ref);
 
+	mutex_lock(&head->subsys->lock);
+	list_del_init(&head->entry);
+	mutex_unlock(&head->subsys->lock);
 	nvme_mpath_remove_disk(head);
 	ida_simple_remove(&head->subsys->ns_ida, head->instance);
 	cleanup_srcu_struct(&head->srcu);
@@ -3806,8 +3809,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
  out_unlink_ns:
 	mutex_lock(&ctrl->subsys->lock);
 	list_del_rcu(&ns->siblings);
-	if (list_empty(&ns->head->list))
-		list_del_init(&ns->head->entry);
 	mutex_unlock(&ctrl->subsys->lock);
 	nvme_put_ns_head(ns->head);
  out_free_queue:
@@ -3828,8 +3829,6 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 
 	mutex_lock(&ns->ctrl->subsys->lock);
 	list_del_rcu(&ns->siblings);
-	if (list_empty(&ns->head->list))
-		list_del_init(&ns->head->entry);
 	mutex_unlock(&ns->ctrl->subsys->lock);
 
 	synchronize_rcu(); /* guarantee not available in head->list */
@@ -3849,7 +3848,7 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 	list_del_init(&ns->list);
 	up_write(&ns->ctrl->namespaces_rwsem);
 
-	nvme_mpath_check_last_path(ns);
+	nvme_mpath_check_last_path(ns->head);
 	nvme_put_ns(ns);
 }
 
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 23573fe3fc7d..31153f6ec582 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -266,6 +266,8 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
 	int node = numa_node_id();
 	struct nvme_ns *ns;
 
+	if (!(head->disk->flags & GENHD_FL_UP))
+		return NULL;
 	ns = srcu_dereference(head->current_path[node], &head->srcu);
 	if (unlikely(!ns))
 		return __nvme_find_path(head, node);
@@ -281,6 +283,8 @@ static bool nvme_available_path(struct nvme_ns_head *head)
 {
 	struct nvme_ns *ns;
 
+	if (!(head->disk->flags & GENHD_FL_UP))
+		return false;
 	list_for_each_entry_rcu(ns, &head->list, siblings) {
 		if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
 			continue;
@@ -771,20 +775,36 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
 #endif
 }
 
-void nvme_mpath_remove_disk(struct nvme_ns_head *head)
+void nvme_mpath_check_last_path(struct nvme_ns_head *head)
 {
+	bool last_path = false;
 	if (!head->disk)
 		return;
-	if (head->disk->flags & GENHD_FL_UP) {
-		nvme_cdev_del(&head->cdev, &head->cdev_device);
-		del_gendisk(head->disk);
+
+	/* Synchronize with nvme_init_ns_head() */
+	mutex_lock(&head->subsys->lock);
+	if (list_empty(&head->list))
+		last_path = true;
+	mutex_unlock(&head->subsys->lock);
+	if (last_path) {
+		kblockd_schedule_work(&head->requeue_work);
+		if (head->disk->flags & GENHD_FL_UP) {
+			nvme_cdev_del(&head->cdev, &head->cdev_device);
+			del_gendisk(head->disk);
+		}
 	}
+}
+
+void nvme_mpath_remove_disk(struct nvme_ns_head *head)
+{
+	if (!head->disk)
+		return;
 	blk_set_queue_dying(head->disk->queue);
 	/* make sure all pending bios are cleaned up */
 	kblockd_schedule_work(&head->requeue_work);
 	flush_work(&head->requeue_work);
 	blk_cleanup_queue(head->disk->queue);
-	if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
+	if (!test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
 		/*
 		 * if device_add_disk wasn't called, prevent
 		 * disk release to put a bogus reference on the
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 1f397ecba16c..812fc1d273e3 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -716,14 +716,7 @@ void nvme_mpath_uninit(struct nvme_ctrl *ctrl);
 void nvme_mpath_stop(struct nvme_ctrl *ctrl);
 bool nvme_mpath_clear_current_path(struct nvme_ns *ns);
 void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl);
-
-static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
-{
-	struct nvme_ns_head *head = ns->head;
-
-	if (head->disk && list_empty(&head->list))
-		kblockd_schedule_work(&head->requeue_work);
-}
+void nvme_mpath_check_last_path(struct nvme_ns_head *head);
 
 static inline void nvme_trace_bio_complete(struct request *req)
 {
@@ -772,7 +765,7 @@ static inline bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
 static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
 {
 }
-static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
+static inline void nvme_mpath_check_last_path(struct nvme_ns_head *head)
 {
 }
 static inline void nvme_trace_bio_complete(struct request *req)
-- 
2.26.2


* Re: [PATCHv6] nvme: allow to re-attach namespaces after all paths are down
  2021-06-09 15:01 [PATCHv6] nvme: allow to re-attach namespaces after all paths are down Hannes Reinecke
@ 2021-06-21  6:38 ` Christoph Hellwig
  2021-06-21  7:33   ` Hannes Reinecke
  2021-06-21 17:26 ` Keith Busch
  2021-06-21 18:13 ` Sagi Grimberg
  2 siblings, 1 reply; 7+ messages in thread
From: Christoph Hellwig @ 2021-06-21  6:38 UTC (permalink / raw)
  To: Hannes Reinecke; +Cc: Christoph Hellwig, Sagi Grimberg, Keith Busch, linux-nvme

On Wed, Jun 09, 2021 at 05:01:18PM +0200, Hannes Reinecke wrote:
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 177cae44b612..6d7c2958b3e2 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -566,6 +566,9 @@ static void nvme_free_ns_head(struct kref *ref)
>  	struct nvme_ns_head *head =
>  		container_of(ref, struct nvme_ns_head, ref);
>  
> +	mutex_lock(&head->subsys->lock);
> +	list_del_init(&head->entry);
> +	mutex_unlock(&head->subsys->lock);
>  	nvme_mpath_remove_disk(head);
>  	ida_simple_remove(&head->subsys->ns_ida, head->instance);
>  	cleanup_srcu_struct(&head->srcu);
> @@ -3806,8 +3809,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
>   out_unlink_ns:
>  	mutex_lock(&ctrl->subsys->lock);
>  	list_del_rcu(&ns->siblings);
> -	if (list_empty(&ns->head->list))
> -		list_del_init(&ns->head->entry);
>  	mutex_unlock(&ctrl->subsys->lock);
>  	nvme_put_ns_head(ns->head);
>   out_free_queue:
> @@ -3828,8 +3829,6 @@ static void nvme_ns_remove(struct nvme_ns *ns)
>  
>  	mutex_lock(&ns->ctrl->subsys->lock);
>  	list_del_rcu(&ns->siblings);
> -	if (list_empty(&ns->head->list))
> -		list_del_init(&ns->head->entry);
>  	mutex_unlock(&ns->ctrl->subsys->lock);
>  
>  	synchronize_rcu(); /* guarantee not available in head->list */
> @@ -3849,7 +3848,7 @@ static void nvme_ns_remove(struct nvme_ns *ns)
>  	list_del_init(&ns->list);
>  	up_write(&ns->ctrl->namespaces_rwsem);
>  
> -	nvme_mpath_check_last_path(ns);
> +	nvme_mpath_check_last_path(ns->head);
>  	nvme_put_ns(ns);
>  }
>  
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index 23573fe3fc7d..31153f6ec582 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -266,6 +266,8 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
>  	int node = numa_node_id();
>  	struct nvme_ns *ns;
>  
> +	if (!(head->disk->flags & GENHD_FL_UP))
> +		return NULL;
>  	ns = srcu_dereference(head->current_path[node], &head->srcu);
>  	if (unlikely(!ns))
>  		return __nvme_find_path(head, node);
> @@ -281,6 +283,8 @@ static bool nvme_available_path(struct nvme_ns_head *head)
>  {
>  	struct nvme_ns *ns;
>  
> +	if (!(head->disk->flags & GENHD_FL_UP))
> +		return false;
>  	list_for_each_entry_rcu(ns, &head->list, siblings) {
>  		if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
>  			continue;
> @@ -771,20 +775,36 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
>  #endif
>  }
>  
> -void nvme_mpath_remove_disk(struct nvme_ns_head *head)
> +void nvme_mpath_check_last_path(struct nvme_ns_head *head)
>  {
> +	bool last_path = false;
>  	if (!head->disk)
>  		return;
> +
> +	/* Synchronize with nvme_init_ns_head() */
> +	mutex_lock(&head->subsys->lock);
> +	if (list_empty(&head->list))
> +		last_path = true;
> +	mutex_unlock(&head->subsys->lock);
> +	if (last_path) {
> +		kblockd_schedule_work(&head->requeue_work);
> +		if (head->disk->flags & GENHD_FL_UP) {
> +			nvme_cdev_del(&head->cdev, &head->cdev_device);
> +			del_gendisk(head->disk);
> +		}
>  	}
> +}
> +
> +void nvme_mpath_remove_disk(struct nvme_ns_head *head)
> +{
> +	if (!head->disk)
> +		return;
>  	blk_set_queue_dying(head->disk->queue);
>  	/* make sure all pending bios are cleaned up */
>  	kblockd_schedule_work(&head->requeue_work);
>  	flush_work(&head->requeue_work);
>  	blk_cleanup_queue(head->disk->queue);
> -	if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
> +	if (!test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
>  		/*
>  		 * if device_add_disk wasn't called, prevent
>  		 * disk release to put a bogus reference on the

So if an nvme_mpath_set_live() comes in between nvme_mpath_check_last_path()
and nvme_mpath_remove_disk(), we'll still end up without a gendisk, won't we?
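
For reference, this is roughly what nvme_mpath_set_live() does
(condensed; details vary by kernel version). In the scenario above
NVME_NSHEAD_DISK_LIVE would still be set, so test_and_set_bit() would
skip re-adding the gendisk that nvme_mpath_check_last_path() just
deleted:

static void nvme_mpath_set_live(struct nvme_ns *ns)
{
        struct nvme_ns_head *head = ns->head;

        if (!head->disk)
                return;

        if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags))
                device_add_disk(&head->subsys->dev, head->disk,
                                nvme_ns_id_attr_groups);
        /* ... then update current_path and kick head->requeue_work ... */
}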


* Re: [PATCHv6] nvme: allow to re-attach namespaces after all paths are down
  2021-06-21  6:38 ` Christoph Hellwig
@ 2021-06-21  7:33   ` Hannes Reinecke
  0 siblings, 0 replies; 7+ messages in thread
From: Hannes Reinecke @ 2021-06-21  7:33 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Sagi Grimberg, Keith Busch, linux-nvme

On 6/21/21 8:38 AM, Christoph Hellwig wrote:
> On Wed, Jun 09, 2021 at 05:01:18PM +0200, Hannes Reinecke wrote:
>> [...]
>> @@ -771,20 +775,36 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
>>   #endif
>>   }
>>   
>> -void nvme_mpath_remove_disk(struct nvme_ns_head *head)
>> +void nvme_mpath_check_last_path(struct nvme_ns_head *head)
>>   {
>> +	bool last_path = false;
>>   	if (!head->disk)
>>   		return;
>> +
>> +	/* Synchronize with nvme_init_ns_head() */
>> +	mutex_lock(&head->subsys->lock);
>> +	if (list_empty(&head->list))
>> +		last_path = true;
>> +	mutex_unlock(&head->subsys->lock);
>> +	if (last_path) {
>> +		kblockd_schedule_work(&head->requeue_work);
>> +		if (head->disk->flags & GENHD_FL_UP) {
>> +			nvme_cdev_del(&head->cdev, &head->cdev_device);
>> +			del_gendisk(head->disk);
>> +		}
>>   	}
>> +}
>> +
>> +void nvme_mpath_remove_disk(struct nvme_ns_head *head)
>> +{
>> +	if (!head->disk)
>> +		return;
>>   	blk_set_queue_dying(head->disk->queue);
>>   	/* make sure all pending bios are cleaned up */
>>   	kblockd_schedule_work(&head->requeue_work);
>>   	flush_work(&head->requeue_work);
>>   	blk_cleanup_queue(head->disk->queue);
>> -	if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
>> +	if (!test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
>>   		/*
>>   		 * if device_add_disk wasn't called, prevent
>>   		 * disk release to put a bogus reference on the
> 
> So if a nvme_mpath_set_live comes in between nvme_mpath_check_last_path
> and nvme_mpath_remove_disk we'll end up without a gendisk still, don't we?
> 
I can't see how we can end up there.
If we call nvme_mpath_set_live() we must already hold a reference on the
ns_head; consequently nvme_mpath_remove_disk() will not be called for
that ns_head, as that function is only invoked from the kref release
callback once all references have been dropped.
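
For reference, a condensed view of the put path being described here
(the release-callback pattern from core.c):

void nvme_put_ns_head(struct nvme_ns_head *head)
{
        /*
         * nvme_free_ns_head(), and with it nvme_mpath_remove_disk(),
         * only runs once the last reference has been dropped.
         */
        kref_put(&head->ref, nvme_free_ns_head);
}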

Am I missing something?

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                Kernel Storage Architect
hare@suse.de                              +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer


* Re: [PATCHv6] nvme: allow to re-attach namespaces after all paths are down
  2021-06-09 15:01 [PATCHv6] nvme: allow to re-attach namespaces after all paths are down Hannes Reinecke
  2021-06-21  6:38 ` Christoph Hellwig
@ 2021-06-21 17:26 ` Keith Busch
  2021-06-22  6:21   ` Hannes Reinecke
  2021-06-21 18:13 ` Sagi Grimberg
  2 siblings, 1 reply; 7+ messages in thread
From: Keith Busch @ 2021-06-21 17:26 UTC (permalink / raw)
  To: Hannes Reinecke; +Cc: Christoph Hellwig, Sagi Grimberg, Keith Busch, linux-nvme

On Wed, Jun 09, 2021 at 05:01:18PM +0200, Hannes Reinecke wrote:
> -void nvme_mpath_remove_disk(struct nvme_ns_head *head)
> +void nvme_mpath_check_last_path(struct nvme_ns_head *head)
>  {
> +	bool last_path = false;
>  	if (!head->disk)
>  		return;
> -	if (head->disk->flags & GENHD_FL_UP) {
> -		nvme_cdev_del(&head->cdev, &head->cdev_device);
> -		del_gendisk(head->disk);
> +
> +	/* Synchronize with nvme_init_ns_head() */
> +	mutex_lock(&head->subsys->lock);
> +	if (list_empty(&head->list))
> +		last_path = true;
> +	mutex_unlock(&head->subsys->lock);
> +	if (last_path) {
> +		kblockd_schedule_work(&head->requeue_work);
> +		if (head->disk->flags & GENHD_FL_UP) {
> +			nvme_cdev_del(&head->cdev, &head->cdev_device);
> +			del_gendisk(head->disk);
> +		}
>  	}
> +}

This is being called when a path is removed. If it is the last path, the
head's block device is deleted. If a live head reference is still held
and a path later reconnects, the code will find the previous head, but
its disk is gone and not coming back, so the new connection can't be
used.
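
The lookup in question happens via nvme_find_ns_head(), called from
nvme_init_ns_head() when a new path is scanned; a condensed sketch
(details vary by kernel version):

static struct nvme_ns_head *nvme_find_ns_head(struct nvme_subsystem *subsys,
                unsigned nsid)
{
        struct nvme_ns_head *h;

        lockdep_assert_held(&subsys->lock);

        list_for_each_entry(h, &subsys->nsheads, entry) {
                /*
                 * With this patch the head stays on subsys->nsheads until
                 * its refcount hits zero, so a reconnecting path can still
                 * find it here even though del_gendisk() has already been
                 * called on head->disk.
                 */
                if (h->ns_id == nsid && kref_get_unless_zero(&h->ref))
                        return h;
        }

        return NULL;
}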


* Re: [PATCHv6] nvme: allow to re-attach namespaces after all paths are down
  2021-06-09 15:01 [PATCHv6] nvme: allow to re-attach namespaces after all paths are down Hannes Reinecke
  2021-06-21  6:38 ` Christoph Hellwig
  2021-06-21 17:26 ` Keith Busch
@ 2021-06-21 18:13 ` Sagi Grimberg
  2021-06-22  6:31   ` Hannes Reinecke
  2 siblings, 1 reply; 7+ messages in thread
From: Sagi Grimberg @ 2021-06-21 18:13 UTC (permalink / raw)
  To: Hannes Reinecke, Christoph Hellwig; +Cc: Keith Busch, linux-nvme



On 6/9/21 8:01 AM, Hannes Reinecke wrote:
> We should only remove the ns head from the list of heads per
> subsystem if the reference count drops to zero. That cleans up
> reference counting, and allows us to call del_gendisk() once the last
> path is removed (as then the ns_head should be removed anyway).
> As this introduces a (theoretical) race condition where I/O might have
> been requeued before the last path went down we also should be checking
> if the gendisk is still present in nvme_ns_head_submit_bio(),
> and failing I/O if so.
> 
> [...]
> @@ -281,6 +283,8 @@ static bool nvme_available_path(struct nvme_ns_head *head)
>   {
>   	struct nvme_ns *ns;
>   
> +	if (!(head->disk->flags & GENHD_FL_UP))
> +		return false;

nvme_available_path should have no business looking at the head gendisk;
it should just tell us whether a PATH (a.k.a. a controller) exists.

IMO, the fact that it does should tell us that we need to take a step
back and think about this. We are trying to keep a zombie nshead around
just for the possibility that the host will reconnect (not as part of
error recovery, but as a brand new connect). Why shouldn't we just
remove it and restore it as a brand new nshead when the host attaches
again?

And IMO, for queue_if_no_path we should add special handling so that
nvme_ns_head_submit_bio() does something explicit for that special
case, like the following:
--
        ns = nvme_find_path(head);
        if (likely(ns)) {
                /* submit */
        } else if (nvme_available_path(head)) {
                /* requeue */
        } else if (nvme_queue_if_no_path(head) &&
                   nvme_zombie_head(head)) {
                /* requeue */
        } else {
                /* fail */
        }
--

And it should hopefully make it easier to debug.


* Re: [PATCHv6] nvme: allow to re-attach namespaces after all paths are down
  2021-06-21 17:26 ` Keith Busch
@ 2021-06-22  6:21   ` Hannes Reinecke
  0 siblings, 0 replies; 7+ messages in thread
From: Hannes Reinecke @ 2021-06-22  6:21 UTC (permalink / raw)
  To: Keith Busch; +Cc: Christoph Hellwig, Sagi Grimberg, Keith Busch, linux-nvme

On 6/21/21 7:26 PM, Keith Busch wrote:
> On Wed, Jun 09, 2021 at 05:01:18PM +0200, Hannes Reinecke wrote:
>> -void nvme_mpath_remove_disk(struct nvme_ns_head *head)
>> +void nvme_mpath_check_last_path(struct nvme_ns_head *head)
>>   {
>> +	bool last_path = false;
>>   	if (!head->disk)
>>   		return;
>> -	if (head->disk->flags & GENHD_FL_UP) {
>> -		nvme_cdev_del(&head->cdev, &head->cdev_device);
>> -		del_gendisk(head->disk);
>> +
>> +	/* Synchronize with nvme_init_ns_head() */
>> +	mutex_lock(&head->subsys->lock);
>> +	if (list_empty(&head->list))
>> +		last_path = true;
>> +	mutex_unlock(&head->subsys->lock);
>> +	if (last_path) {
>> +		kblockd_schedule_work(&head->requeue_work);
>> +		if (head->disk->flags & GENHD_FL_UP) {
>> +			nvme_cdev_del(&head->cdev, &head->cdev_device);
>> +			del_gendisk(head->disk);
>> +		}
>>   	}
>> +}
> 
> This is being called when a path is removed. If it is the last path, the
> head's block device is being deleted. If a live head reference is held,
> and the a path reconnects, the code will find the previous head, but the
> disk is gone and it's not coming back, so the new connection can't be
> used.
> 
Good point. But that can be fixed by removing the ns_head from the 
subsystem list at this check rather than in nvme_free_ns_head().
Will be sending an updated version.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                Kernel Storage Architect
hare@suse.de                              +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer


* Re: [PATCHv6] nvme: allow to re-attach namespaces after all paths are down
  2021-06-21 18:13 ` Sagi Grimberg
@ 2021-06-22  6:31   ` Hannes Reinecke
  0 siblings, 0 replies; 7+ messages in thread
From: Hannes Reinecke @ 2021-06-22  6:31 UTC (permalink / raw)
  To: Sagi Grimberg, Christoph Hellwig; +Cc: Keith Busch, linux-nvme

On 6/21/21 8:13 PM, Sagi Grimberg wrote:
> 
> 
> On 6/9/21 8:01 AM, Hannes Reinecke wrote:
>> We should only remove the ns head from the list of heads per
>> subsystem if the reference count drops to zero. That cleans up
>> reference counting, and allows us to call del_gendisk() once the last
>> path is removed (as then the ns_head should be removed anyway).
>> As this introduces a (theoretical) race condition where I/O might have
>> been requeued before the last path went down we also should be checking
>> if the gendisk is still present in nvme_ns_head_submit_bio(),
>> and failing I/O if so.
>>
>> [...]
>> @@ -281,6 +283,8 @@ static bool nvme_available_path(struct 
>> nvme_ns_head *head)
>>   {
>>       struct nvme_ns *ns;
>> +    if (!(head->disk->flags & GENHD_FL_UP))
>> +        return false;
> 
> nvme_available_path should have no business looking at the head gendisk,
> it should just understand if a PATH (a.k.a a controller) exists.
> 
Agreed. I was only overly cautious here; will be dropping this check.

> IMO, the fact that it does should tell that we should take a step back
> and think about this. We are trying to keep an zombie nshead around
> just for the possibility the host will reconnect (not as part of
> error recovery, but as a brand new connect). Why shouldn't we just
> remove it and restore it as a brand new nshead when the host attaches
> again?
> 
This patch has now evolved quite a bit, and in fact diverged slightly 
from the description. The original intent indeed was to keep the nshead 
around until the last reference drops, such that if a controller gets 
reattached it will be able to connect the namespaces to the correct 
(existing) ns_head.
However, as it turned out this was just a band-aid; the real fix is to
get the reference counting between 'struct nvme_ns' and 'struct
nvme_ns_head' right: when the last path to an ns_head drops, we should
tear down the ns_head by calling del_gendisk() and remove it from the
list of ns_heads.

As noted by Keith, the first part is done correctly in this patch
(del_gendisk() is called when the last path drops), but the second
part, detaching the ns_head from the list of ns_heads, is _not_ done
correctly. Both should happen at the same time, under the same
subsys->lock critical section, to avoid any race conditions.
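
A minimal sketch of what that combined step could look like, based on
the v6 code above (illustrative only, not the updated patch):

void nvme_mpath_check_last_path(struct nvme_ns_head *head)
{
        bool last_path = false;

        if (!head->disk)
                return;

        /* Synchronize with nvme_init_ns_head() */
        mutex_lock(&head->subsys->lock);
        if (list_empty(&head->list)) {
                /* drop the head from subsys->nsheads in the same step */
                list_del_init(&head->entry);
                last_path = true;
        }
        mutex_unlock(&head->subsys->lock);

        if (last_path) {
                kblockd_schedule_work(&head->requeue_work);
                if (head->disk->flags & GENHD_FL_UP) {
                        nvme_cdev_del(&head->cdev, &head->cdev_device);
                        del_gendisk(head->disk);
                }
        }
}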

Will be sending an updated patch.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                Kernel Storage Architect
hare@suse.de                              +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer
