All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/2] nvme: fixup crash in device_add_disk()
@ 2019-02-19 12:13 Hannes Reinecke
  2019-02-19 12:13 ` [PATCH 1/2] nvme: return error from nvme_alloc_ns() Hannes Reinecke
  2019-02-19 12:13 ` [PATCH 2/2] nvme: protect against race condition in nvme_validate_ns() Hannes Reinecke
  0 siblings, 2 replies; 8+ messages in thread
From: Hannes Reinecke @ 2019-02-19 12:13 UTC (permalink / raw)


Hi all,

during testing we've run into an issue where the system would crash
in device_add_disk(); analysis showed that there is a race condition
in nvme_validate_ns() if called simultaneously for the same controller.
This patchset tries to fix it up.

As usual, comments and reviews are appreciated.

Hannes Reinecke (2):
  nvme: return error from nvme_alloc_ns()
  nvme: protect against race condition in nvme_validate_ns()

 drivers/nvme/host/core.c | 51 ++++++++++++++++++++++++++++++++++++------------
 1 file changed, 38 insertions(+), 13 deletions(-)

-- 
2.16.4

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 1/2] nvme: return error from nvme_alloc_ns()
  2019-02-19 12:13 [PATCH 0/2] nvme: fixup crash in device_add_disk() Hannes Reinecke
@ 2019-02-19 12:13 ` Hannes Reinecke
  2019-02-19 19:42   ` Sagi Grimberg
  2019-02-20 14:21   ` Christoph Hellwig
  2019-02-19 12:13 ` [PATCH 2/2] nvme: protect against race condition in nvme_validate_ns() Hannes Reinecke
  1 sibling, 2 replies; 8+ messages in thread
From: Hannes Reinecke @ 2019-02-19 12:13 UTC (permalink / raw)


nvme_alloc_ns() might fail, so we should be returning an error code.

Signed-off-by: Hannes Reinecke <hare at suse.com>
---
 drivers/nvme/host/core.c | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index f2f75831decd..9c6f6a4db60a 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3214,21 +3214,23 @@ static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns)
 	return 0;
 }
 
-static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
+static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 {
 	struct nvme_ns *ns;
 	struct gendisk *disk;
 	struct nvme_id_ns *id;
 	char disk_name[DISK_NAME_LEN];
-	int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT;
+	int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT, ret;
 
 	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
 	if (!ns)
-		return;
+		return -ENOMEM;
 
 	ns->queue = blk_mq_init_queue(ctrl->tagset);
-	if (IS_ERR(ns->queue))
+	if (IS_ERR(ns->queue)) {
+		ret = PTR_ERR(ns->queue);
 		goto out_free_ns;
+	}
 
 	blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
 	if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)
@@ -3244,20 +3246,27 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	nvme_set_queue_limits(ctrl, ns->queue);
 
 	id = nvme_identify_ns(ctrl, nsid);
-	if (!id)
+	if (!id) {
+		ret = -EIO;
 		goto out_free_queue;
+	}
 
-	if (id->ncap == 0)
+	if (id->ncap == 0) {
+		ret = -EINVAL;
 		goto out_free_id;
+	}
 
-	if (nvme_init_ns_head(ns, nsid, id))
+	ret = nvme_init_ns_head(ns, nsid, id);
+	if (ret)
 		goto out_free_id;
 	nvme_setup_streams_ns(ctrl, ns);
 	nvme_set_disk_name(disk_name, ns, ctrl, &flags);
 
 	disk = alloc_disk_node(0, node);
-	if (!disk)
+	if (!disk) {
+		ret = -ENOMEM;
 		goto out_unlink_ns;
+	}
 
 	disk->fops = &nvme_fops;
 	disk->private_data = ns;
@@ -3269,7 +3278,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	__nvme_revalidate_disk(disk, id);
 
 	if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
-		if (nvme_nvm_register(ns, disk_name, node)) {
+		ret = nvme_nvm_register(ns, disk_name, node);
+		if (ret) {
 			dev_warn(ctrl->device, "LightNVM init failure\n");
 			goto out_put_disk;
 		}
@@ -3287,7 +3297,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	nvme_fault_inject_init(ns);
 	kfree(id);
 
-	return;
+	return 0;
  out_put_disk:
 	put_disk(ns->disk);
  out_unlink_ns:
@@ -3300,6 +3310,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	blk_cleanup_queue(ns->queue);
  out_free_ns:
 	kfree(ns);
+	return ret;
 }
 
 static void nvme_ns_remove(struct nvme_ns *ns)
-- 
2.16.4

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 2/2] nvme: protect against race condition in nvme_validate_ns()
  2019-02-19 12:13 [PATCH 0/2] nvme: fixup crash in device_add_disk() Hannes Reinecke
  2019-02-19 12:13 ` [PATCH 1/2] nvme: return error from nvme_alloc_ns() Hannes Reinecke
@ 2019-02-19 12:13 ` Hannes Reinecke
  2019-02-19 19:44   ` Sagi Grimberg
  1 sibling, 1 reply; 8+ messages in thread
From: Hannes Reinecke @ 2019-02-19 12:13 UTC (permalink / raw)


When subsystems are rapidly reconfigured (or sending out several AENs)
we might end up in a situation where several instances of nvme_scan_work()
are running. Each of which might be trying to register the same nsid,
so nvme_find_get_ns() in nvme_validate_ns() will return 0 for both,
resulting in a crash in nvme_alloc_ns() as both are registering a
gendisk with the same name.

Signed-off-by: Hannes Reinecke <hare at suse.com>
---
 drivers/nvme/host/core.c | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 9c6f6a4db60a..7cf710e8d98d 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3216,7 +3216,7 @@ static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns)
 
 static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 {
-	struct nvme_ns *ns;
+	struct nvme_ns *ns, *tmp;
 	struct gendisk *disk;
 	struct nvme_id_ns *id;
 	char disk_name[DISK_NAME_LEN];
@@ -3286,6 +3286,15 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	}
 
 	down_write(&ctrl->namespaces_rwsem);
+	list_for_each_entry(tmp, &ctrl->namespaces, list) {
+		if (nsid == tmp->head->ns_id) {
+			up_write(&ctrl->namespaces_rwsem);
+			dev_warn(ctrl->device,
+				 "Duplicate ns %d, rescanning", nsid);
+			ret = -EAGAIN;
+			goto out_put_disk;
+		}
+	}
 	list_add_tail(&ns->list, &ctrl->namespaces);
 	up_write(&ctrl->namespaces_rwsem);
 
@@ -3343,14 +3352,19 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 {
 	struct nvme_ns *ns;
+	int ret;
 
+rescan:
 	ns = nvme_find_get_ns(ctrl, nsid);
 	if (ns) {
 		if (ns->disk && revalidate_disk(ns->disk))
 			nvme_ns_remove(ns);
 		nvme_put_ns(ns);
-	} else
-		nvme_alloc_ns(ctrl, nsid);
+	} else {
+		ret = nvme_alloc_ns(ctrl, nsid);
+		if (ret == -EAGAIN)
+			goto rescan;
+	}
 }
 
 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
-- 
2.16.4

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 1/2] nvme: return error from nvme_alloc_ns()
  2019-02-19 12:13 ` [PATCH 1/2] nvme: return error from nvme_alloc_ns() Hannes Reinecke
@ 2019-02-19 19:42   ` Sagi Grimberg
  2019-02-20 14:21   ` Christoph Hellwig
  1 sibling, 0 replies; 8+ messages in thread
From: Sagi Grimberg @ 2019-02-19 19:42 UTC (permalink / raw)


Reviewed-by: Sagi Grimberg <sagi at grimberg.me>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 2/2] nvme: protect against race condition in nvme_validate_ns()
  2019-02-19 12:13 ` [PATCH 2/2] nvme: protect against race condition in nvme_validate_ns() Hannes Reinecke
@ 2019-02-19 19:44   ` Sagi Grimberg
  2019-02-19 19:54     ` Keith Busch
  0 siblings, 1 reply; 8+ messages in thread
From: Sagi Grimberg @ 2019-02-19 19:44 UTC (permalink / raw)




On 2/19/19 4:13 AM, Hannes Reinecke wrote:
> When subsystems are rapidly reconfigured (or sending out several AENs)
> we might end up in a situation where several instances of nvme_scan_work()
> are running. Each of which might be trying to register the same nsid,
> so nvme_find_get_ns() in nvme_validate_ns() will return 0 for both,
> resulting in a crash in nvme_alloc_ns() as both are registering a
> gendisk with the same name.

Wouldn't it be better to serialize nvme_scan_work such that it doesn't
run multiple times in parallel?

> Signed-off-by: Hannes Reinecke <hare at suse.com>
> ---
>   drivers/nvme/host/core.c | 20 +++++++++++++++++---
>   1 file changed, 17 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 9c6f6a4db60a..7cf710e8d98d 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -3216,7 +3216,7 @@ static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns)
>   
>   static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
>   {
> -	struct nvme_ns *ns;
> +	struct nvme_ns *ns, *tmp;
>   	struct gendisk *disk;
>   	struct nvme_id_ns *id;
>   	char disk_name[DISK_NAME_LEN];
> @@ -3286,6 +3286,15 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
>   	}
>   
>   	down_write(&ctrl->namespaces_rwsem);
> +	list_for_each_entry(tmp, &ctrl->namespaces, list) {
> +		if (nsid == tmp->head->ns_id) {
> +			up_write(&ctrl->namespaces_rwsem);
> +			dev_warn(ctrl->device,
> +				 "Duplicate ns %d, rescanning", nsid);

Can you move this print to the caller where the actual rescanning happens.

> +			ret = -EAGAIN;
> +			goto out_put_disk;
> +		}
> +	}
>   	list_add_tail(&ns->list, &ctrl->namespaces);
>   	up_write(&ctrl->namespaces_rwsem);
>   
> @@ -3343,14 +3352,19 @@ static void nvme_ns_remove(struct nvme_ns *ns)
>   static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
>   {
>   	struct nvme_ns *ns;
> +	int ret;
>   
> +rescan:
>   	ns = nvme_find_get_ns(ctrl, nsid);
>   	if (ns) {
>   		if (ns->disk && revalidate_disk(ns->disk))
>   			nvme_ns_remove(ns);
>   		nvme_put_ns(ns);
> -	} else
> -		nvme_alloc_ns(ctrl, nsid);
> +	} else {
> +		ret = nvme_alloc_ns(ctrl, nsid);
> +		if (ret == -EAGAIN)
> +			goto rescan;
> +	}
>   }
>   
>   static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
> 

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 2/2] nvme: protect against race condition in nvme_validate_ns()
  2019-02-19 19:44   ` Sagi Grimberg
@ 2019-02-19 19:54     ` Keith Busch
  2019-02-20  6:52       ` Hannes Reinecke
  0 siblings, 1 reply; 8+ messages in thread
From: Keith Busch @ 2019-02-19 19:54 UTC (permalink / raw)


On Tue, Feb 19, 2019@11:44:41AM -0800, Sagi Grimberg wrote:
> On 2/19/19 4:13 AM, Hannes Reinecke wrote:
> > When subsystems are rapidly reconfigured (or sending out several AENs)
> > we might end up in a situation where several instances of nvme_scan_work()
> > are running. Each of which might be trying to register the same nsid,
> > so nvme_find_get_ns() in nvme_validate_ns() will return 0 for both,
> > resulting in a crash in nvme_alloc_ns() as both are registering a
> > gendisk with the same name.
> 
> Wouldn't it be better to serialize nvme_scan_work such that it doesn't
> run multiple times in parallel?

Doesn't the work queue already serialize individual ctrl's scan_work?

There is also a recently added mutex to synchronize scan work with
command effects handling, which would force an nvme_ctrl's scan_work to
be serialized:

  https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e7ad43c3eda6a1690c4c3c341f95dc1c6898da83

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 2/2] nvme: protect against race condition in nvme_validate_ns()
  2019-02-19 19:54     ` Keith Busch
@ 2019-02-20  6:52       ` Hannes Reinecke
  0 siblings, 0 replies; 8+ messages in thread
From: Hannes Reinecke @ 2019-02-20  6:52 UTC (permalink / raw)


On 2/19/19 8:54 PM, Keith Busch wrote:
> On Tue, Feb 19, 2019@11:44:41AM -0800, Sagi Grimberg wrote:
>> On 2/19/19 4:13 AM, Hannes Reinecke wrote:
>>> When subsystems are rapidly reconfigured (or sending out several AENs)
>>> we might end up in a situation where several instances of nvme_scan_work()
>>> are running. Each of which might be trying to register the same nsid,
>>> so nvme_find_get_ns() in nvme_validate_ns() will return 0 for both,
>>> resulting in a crash in nvme_alloc_ns() as both are registering a
>>> gendisk with the same name.
>>
>> Wouldn't it be better to serialize nvme_scan_work such that it doesn't
>> run multiple times in parallel?
> 
> Doesn't the work queue already serialize individual ctrl's scan_work?
> 
> There is also a recently added mutex to synchronize scan work with
> command effects handling, which would force an nvme_ctrl's scan_work to
> be serialized:
> 
>    https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e7ad43c3eda6a1690c4c3c341f95dc1c6898da83
> 
Ah. Hmm.
Probably.
And indeed, the tests were done without this patch.
I'll check if that patch is sufficient.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare at suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 1/2] nvme: return error from nvme_alloc_ns()
  2019-02-19 12:13 ` [PATCH 1/2] nvme: return error from nvme_alloc_ns() Hannes Reinecke
  2019-02-19 19:42   ` Sagi Grimberg
@ 2019-02-20 14:21   ` Christoph Hellwig
  1 sibling, 0 replies; 8+ messages in thread
From: Christoph Hellwig @ 2019-02-20 14:21 UTC (permalink / raw)


Thanks,

applied to nvme-5.1.

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2019-02-20 14:21 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-02-19 12:13 [PATCH 0/2] nvme: fixup crash in device_add_disk() Hannes Reinecke
2019-02-19 12:13 ` [PATCH 1/2] nvme: return error from nvme_alloc_ns() Hannes Reinecke
2019-02-19 19:42   ` Sagi Grimberg
2019-02-20 14:21   ` Christoph Hellwig
2019-02-19 12:13 ` [PATCH 2/2] nvme: protect against race condition in nvme_validate_ns() Hannes Reinecke
2019-02-19 19:44   ` Sagi Grimberg
2019-02-19 19:54     ` Keith Busch
2019-02-20  6:52       ` Hannes Reinecke

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.