* [PATCH/RFC] NVMe: add support for doing offline/online of attached namespaces
@ 2015-01-23 15:24 Jens Axboe
  2015-01-23 17:11 ` Christoph Hellwig
  2015-01-27 20:59 ` Keith Busch
  0 siblings, 2 replies; 11+ messages in thread
From: Jens Axboe @ 2015-01-23 15:24 UTC (permalink / raw)


Add two new ioctls for the char control device:

- NVME_IOCTL_OFFLINE_DISK will offline namespaces/disks and leave only
  the admin queue and char dev running.

- NVME_IOCTL_ONLINE_DISK will do discovery and add namespaces and disks
  back into the system.

The intended use case is for safely doing a format or firmware upgrade,
where device and hardware characteristics could change substantially
beyond just device size and block size.
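
As a rough illustration (not part of the patch), a format tool could
drive this from userspace roughly as below. The device path, the
Format NVM command fields, and the error handling are assumptions made
up for the sketch:

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/nvme.h>

int main(void)
{
	/* the controller char device, not the per-namespace block device */
	int fd = open("/dev/nvme0", O_RDWR);
	struct nvme_admin_cmd cmd;

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* detach all namespaces/bdevs, leaving only the admin queue */
	if (ioctl(fd, NVME_IOCTL_OFFLINE_DISK) < 0) {
		perror("NVME_IOCTL_OFFLINE_DISK");
		return 1;
	}

	/*
	 * Do the destructive work over the admin queue. Opcode 0x80 is
	 * Format NVM; the nsid here is just a placeholder for whatever
	 * the tool actually wants to do (firmware activate, etc).
	 */
	memset(&cmd, 0, sizeof(cmd));
	cmd.opcode = 0x80;
	cmd.nsid = 0xffffffff;
	if (ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd) < 0)
		perror("NVME_IOCTL_ADMIN_CMD");

	/* rediscover namespaces and re-add the disks */
	if (ioctl(fd, NVME_IOCTL_ONLINE_DISK) < 0) {
		perror("NVME_IOCTL_ONLINE_DISK");
		return 1;
	}

	close(fd);
	return 0;
}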

This is an RFC patch. The goal is as listed, being able to detach any
namespaces and bdevs from a device, so we just have the admin part left.
I'm open to alternative suggestions on how to handle this. Solutions
that include rmmod/modprobe of the driver won't work if you have more
than one NVMe device in a system.

I've run this through various stress testing, and it seems to hold up
fine.

Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 3eaa0becc52d..5d83292276c8 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -80,6 +80,7 @@ static struct notifier_block nvme_nb;
 
 static void nvme_reset_failed_dev(struct work_struct *ws);
 static int nvme_process_cq(struct nvme_queue *nvmeq);
+static void nvme_set_irq_hints(struct nvme_dev *dev);
 
 struct async_cmd_info {
 	struct kthread_work work;
@@ -1808,6 +1809,21 @@ static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
 #define nvme_compat_ioctl	NULL
 #endif
 
+static int put_ns(struct nvme_ns *ns)
+{
+	if (atomic_dec_and_test(&ns->ref)) {
+		kfree(ns);
+		return 1;
+	}
+
+	return 0;
+}
+
+static void get_ns(struct nvme_ns *ns)
+{
+	atomic_inc(&ns->ref);
+}
+
 static int nvme_open(struct block_device *bdev, fmode_t mode)
 {
 	int ret = 0;
@@ -1819,6 +1835,10 @@ static int nvme_open(struct block_device *bdev, fmode_t mode)
 		ret = -ENXIO;
 	else if (!kref_get_unless_zero(&ns->dev->kref))
 		ret = -ENXIO;
+
+	if (!ret)
+		get_ns(ns);
+
 	spin_unlock(&dev_list_lock);
 
 	return ret;
@@ -1832,6 +1852,7 @@ static void nvme_release(struct gendisk *disk, fmode_t mode)
 	struct nvme_dev *dev = ns->dev;
 
 	kref_put(&dev->kref, nvme_free_dev);
+	put_ns(ns);
 }
 
 static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo)
@@ -1956,6 +1977,7 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
 	queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, ns->queue);
 	ns->dev = dev;
 	ns->queue->queuedata = ns;
+	atomic_set(&ns->ref, 1);
 
 	disk = alloc_disk_node(0, node);
 	if (!disk)
@@ -1992,7 +2014,7 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
  out_free_queue:
 	blk_cleanup_queue(ns->queue);
  out_free_ns:
-	kfree(ns);
+	put_ns(ns);
 	return NULL;
 }
 
@@ -2467,12 +2489,16 @@ static void nvme_dev_remove(struct nvme_dev *dev)
 {
 	struct nvme_ns *ns;
 
+	mutex_lock(&dev->dev_lock);
+
 	list_for_each_entry(ns, &dev->namespaces, list) {
 		if (ns->disk->flags & GENHD_FL_UP)
 			del_gendisk(ns->disk);
 		if (!blk_queue_dying(ns->queue))
 			blk_cleanup_queue(ns->queue);
 	}
+
+	mutex_unlock(&dev->dev_lock);
 }
 
 static int nvme_setup_prp_pools(struct nvme_dev *dev)
@@ -2528,20 +2554,39 @@ static void nvme_release_instance(struct nvme_dev *dev)
 	spin_unlock(&dev_list_lock);
 }
 
+static void nvme_remove_free_ns(struct nvme_ns *ns)
+{
+	struct gendisk *disk = ns->disk;
+	int free_disk;
+
+	spin_lock(&dev_list_lock);
+
+	free_disk = put_ns(ns);
+	if (free_disk)
+		disk->private_data = NULL;
+
+	spin_unlock(&dev_list_lock);
+
+	if (free_disk)
+		put_disk(disk);
+}
+
 static void nvme_free_namespaces(struct nvme_dev *dev)
 {
-	struct nvme_ns *ns, *next;
+	mutex_lock(&dev->dev_lock);
 
-	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
-		list_del(&ns->list);
+	while (!list_empty(&dev->namespaces)) {
+		struct nvme_ns *ns;
 
-		spin_lock(&dev_list_lock);
-		ns->disk->private_data = NULL;
-		spin_unlock(&dev_list_lock);
+		ns = list_first_entry(&dev->namespaces, struct nvme_ns, list);
+		list_del(&ns->list);
 
-		put_disk(ns->disk);
-		kfree(ns);
+		mutex_unlock(&dev->dev_lock);
+		nvme_remove_free_ns(ns);
+		mutex_lock(&dev->dev_lock);
 	}
+
+	mutex_unlock(&dev->dev_lock);
 }
 
 static void nvme_free_dev(struct kref *kref)
@@ -2552,6 +2597,7 @@ static void nvme_free_dev(struct kref *kref)
 	nvme_free_namespaces(dev);
 	nvme_release_instance(dev);
 	blk_mq_free_tag_set(&dev->tagset);
+	mutex_destroy(&dev->dev_lock);
 	kfree(dev->queues);
 	kfree(dev->entry);
 	kfree(dev);
@@ -2573,6 +2619,94 @@ static int nvme_dev_release(struct inode *inode, struct file *f)
 	return 0;
 }
 
+static int nvme_dev_online_disk(struct nvme_dev *dev)
+{
+	int result;
+
+	mutex_lock(&dev->dev_lock);
+
+	if (dev->queue_count != 1) {
+		result = -EBUSY;
+		goto err;
+	}
+
+	result = nvme_setup_io_queues(dev);
+	if (result)
+		goto err;
+
+	nvme_set_irq_hints(dev);
+
+	result = nvme_dev_add(dev);
+	if (result < 0)
+		nvme_free_queues(dev, 1);
+
+err:
+	mutex_unlock(&dev->dev_lock);
+	return result;
+}
+
+static int nvme_offline_ns(struct nvme_ns *ns)
+{
+	struct request_queue *q = ns->queue;
+
+	if (blk_queue_dying(q))
+		return -ENODEV;
+
+	blk_mq_freeze_queue(q);
+	queue_flag_set_unlocked(QUEUE_FLAG_DYING, q);
+	blk_mq_unfreeze_queue(q);
+
+	if (ns->disk->flags & GENHD_FL_UP)
+		del_gendisk(ns->disk);
+
+	blk_cleanup_queue(ns->queue);
+	nvme_remove_free_ns(ns);
+	return 0;
+}
+
+static int nvme_dev_offline_disk(struct nvme_dev *dev)
+{
+	struct nvme_ns *ns;
+	int i, ns_free, q_free;
+
+	ns_free = q_free = 0;
+
+	mutex_lock(&dev->dev_lock);
+
+	while (!list_empty(&dev->namespaces)) {
+		ns = list_first_entry(&dev->namespaces, struct nvme_ns, list);
+		list_del(&ns->list);
+
+		if (!nvme_offline_ns(ns))
+			ns_free++;
+	}
+
+	for (i = dev->queue_count - 1; i >= 1; i--) {
+		struct nvme_queue *nvmeq = dev->queues[i];
+
+		nvme_suspend_queue(nvmeq);
+		nvme_clear_queue(nvmeq);
+		q_free++;
+	}
+
+	nvme_free_queues(dev, 1);
+
+	/*
+	 * If we freed namespaces, we know we have an allocated tag set.
+	 */
+	if (dev->tagset.nr_hw_queues && ns_free) {
+		blk_mq_free_tag_set(&dev->tagset);
+		dev->tagset.nr_hw_queues = 0;
+	}
+
+	mutex_unlock(&dev->dev_lock);
+
+	if (q_free || ns_free)
+		return 0;
+
+	return -ENXIO;
+}
+
 static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 {
 	struct nvme_dev *dev = f->private_data;
@@ -2582,10 +2716,18 @@ static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 	case NVME_IOCTL_ADMIN_CMD:
 		return nvme_user_cmd(dev, NULL, (void __user *)arg);
 	case NVME_IOCTL_IO_CMD:
-		if (list_empty(&dev->namespaces))
+		mutex_lock(&dev->dev_lock);
+		if (list_empty(&dev->namespaces)) {
+			mutex_unlock(&dev->dev_lock);
 			return -ENOTTY;
+		}
 		ns = list_first_entry(&dev->namespaces, struct nvme_ns, list);
+		mutex_unlock(&dev->dev_lock);
 		return nvme_user_cmd(dev, ns, (void __user *)arg);
+	case NVME_IOCTL_ONLINE_DISK:
+		return nvme_dev_online_disk(dev);
+	case NVME_IOCTL_OFFLINE_DISK:
+		return nvme_dev_offline_disk(dev);
 	default:
 		return -ENOTTY;
 	}
@@ -2750,6 +2892,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		goto free;
 
 	INIT_LIST_HEAD(&dev->namespaces);
+	mutex_init(&dev->dev_lock);
 	dev->reset_workfn = nvme_reset_failed_dev;
 	INIT_WORK(&dev->reset_work, nvme_reset_workfn);
 	dev->pci_dev = pci_dev_get(pdev);
@@ -2767,8 +2910,11 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (result)
 		goto release_pools;
 
-	if (dev->online_queues > 1)
+	if (dev->online_queues > 1) {
+		mutex_lock(&dev->dev_lock);
 		result = nvme_dev_add(dev);
+		mutex_unlock(&dev->dev_lock);
+	}
 	if (result)
 		goto shutdown;
 
@@ -2799,6 +2945,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	nvme_release_instance(dev);
  put_pci:
 	pci_dev_put(dev->pci_dev);
+	mutex_destroy(&dev->dev_lock);
  free:
 	kfree(dev->queues);
 	kfree(dev->entry);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 19a5d4b23209..78c9f60c09df 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -90,6 +90,7 @@ struct nvme_dev {
 	struct msix_entry *entry;
 	struct nvme_bar __iomem *bar;
 	struct list_head namespaces;
+	struct mutex dev_lock;
 	struct kref kref;
 	struct miscdevice miscdev;
 	work_func_t reset_workfn;
@@ -123,6 +124,8 @@ struct nvme_ns {
 	int ms;
 	u64 mode_select_num_blocks;
 	u32 mode_select_block_len;
+
+	atomic_t ref;
 };
 
 /*
diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h
index 26386cf3db44..c42b2b296614 100644
--- a/include/uapi/linux/nvme.h
+++ b/include/uapi/linux/nvme.h
@@ -555,5 +555,8 @@ struct nvme_passthru_cmd {
 #define NVME_IOCTL_ADMIN_CMD	_IOWR('N', 0x41, struct nvme_admin_cmd)
 #define NVME_IOCTL_SUBMIT_IO	_IOW('N', 0x42, struct nvme_user_io)
 #define NVME_IOCTL_IO_CMD	_IOWR('N', 0x43, struct nvme_passthru_cmd)
+#define NVME_IOCTL_ONLINE_DISK	_IO('N', 0x44)
+#define NVME_IOCTL_OFFLINE_DISK	_IO('N', 0x45)
+
 
 #endif /* _UAPI_LINUX_NVME_H */

-- 
Jens Axboe

* [PATCH/RFC] NVMe: add support for doing offline/online of attached namespaces
  2015-01-23 15:24 [PATCH/RFC] NVMe: add support for doing offline/online of attached namespaces Jens Axboe
@ 2015-01-23 17:11 ` Christoph Hellwig
  2015-01-23 17:24   ` Jens Axboe
  2015-01-27 20:59 ` Keith Busch
  1 sibling, 1 reply; 11+ messages in thread
From: Christoph Hellwig @ 2015-01-23 17:11 UTC (permalink / raw)


On Fri, Jan 23, 2015 at 08:24:30AM -0700, Jens Axboe wrote:
> This is an RFC patch. The goal is as listed, being able to detach any
> namespaces and bdevs from a device, so we just have the admin part left.
> I'm open to alternative suggestions on how to handle this. Solutions
> that include rmmod/modprobe of the driver won't work if you have more
> than one NVMe device in a system.

Always use the SCSI to NVMe translation and run the thing through the
SCSI layer where all these issues are already handled?  I'm only half
joking, with features like multipathing and reservations coming up
this actually does seem like the best long term direction once we've
cut the remaining fat from the SCSI I/O path.

* [PATCH/RFC] NVMe: add support for doing offline/online of attached namespaces
  2015-01-23 17:11 ` Christoph Hellwig
@ 2015-01-23 17:24   ` Jens Axboe
  2015-01-23 17:31     ` Christoph Hellwig
  0 siblings, 1 reply; 11+ messages in thread
From: Jens Axboe @ 2015-01-23 17:24 UTC (permalink / raw)


On 01/23/2015 10:11 AM, Christoph Hellwig wrote:
> On Fri, Jan 23, 2015 at 08:24:30AM -0700, Jens Axboe wrote:
>> This is an RFC patch. The goal is as listed, being able to detach any
>> namespaces and bdevs from a device, so we just have the admin part left.
>> I'm open to alternative suggestions on how to handle this. Solutions
>> that include rmmod/modprobe of the driver won't work if you have more
>> than one NVMe device in a system.
>
> Always use the SCSI to NVMe translation and run the thing through the
> SCSI layer where all these issues are already handled?  I'm only half
> joking, with features like multipathing and reservations coming up
> this actually does seem like the best long term direction once we've
> cut the remaining fat from the SCSI I/O path.

It'd be an interesting experiment in seeing how close scsi-mq is to raw 
performance; I'd be worried there's still quite a gap, though. Maybe when 
we're further along it starts to become a more viable option; at least I 
would not rule it out.

At the same time, this is a feature we use/need now. So I don't think a 
potential switch in future direction should preclude that from being 
done now.

-- 
Jens Axboe

* [PATCH/RFC] NVMe: add support for doing offline/online of attached namespaces
  2015-01-23 17:24   ` Jens Axboe
@ 2015-01-23 17:31     ` Christoph Hellwig
  2015-01-23 17:39       ` Jens Axboe
  0 siblings, 1 reply; 11+ messages in thread
From: Christoph Hellwig @ 2015-01-23 17:31 UTC (permalink / raw)


On Fri, Jan 23, 2015 at 10:24:45AM -0700, Jens Axboe wrote:
> It'd be an interesting experiment in seeing how close scsi-mq is to raw
> performance; I'd be worried there's still quite a gap, though. Maybe when
> we're further along it starts to become a more viable option; at least I
> would not rule it out.

I'd love to try it.  On the submission side SCSI for some drivers
actually is more efficient than NVMe in terms of memory allocations,
as we don't require any allocation for common I/O (your patch closes
that gap for the smallest possible I/O), although the per-host and
per-device shost_busy/sdev_busy counters will show some issues with
enough sockets.  On the completion side SCSI might still be a bit
worse, as we haven't really started any optimizations yet.

> At the same time, this is a feature we use/need now. So I don't think a
> potential switch in future direction should preclude that from being done
> now.

Sure, I was just answering your question for potential alternatives.

* [PATCH/RFC] NVMe: add support for doing offline/online of attached namespaces
  2015-01-23 17:31     ` Christoph Hellwig
@ 2015-01-23 17:39       ` Jens Axboe
  2015-01-23 17:44         ` Christoph Hellwig
  0 siblings, 1 reply; 11+ messages in thread
From: Jens Axboe @ 2015-01-23 17:39 UTC (permalink / raw)


On 01/23/2015 10:31 AM, Christoph Hellwig wrote:
> On Fri, Jan 23, 2015 at 10:24:45AM -0700, Jens Axboe wrote:
>> It'd be an interesting experiment in seeing how close scsi-mq is to raw
>> performance; I'd be worried there's still quite a gap, though. Maybe when
>> we're further along it starts to become a more viable option; at least I
>> would not rule it out.
>
> I'd love to try it.  On the submission side SCSI for some drivers
> actually is more efficient than NVMe in terms of memory allocations,
> as we don't require any allocation for common I/O (your patch closes
> that gap for the smallest possible I/O), although the per-host and
> per-device shost_busy/sdev_busy counters will show some issues with
> enough sockets.  On the completion side SCSI might still be a bit
> worse, as we haven't really started any optimizations yet.

For the per dev host/sdev busy counters, those could be further improved 
if they turn out to be a problem. I've got various "things" here that'll 
do 1-2M IOPS per dev, so some test/profiling would not be a problem to do.

First issue is support for > 1 hw queue in SCSI, though.

>> At the same time, this is a feature we use/need now. So I don't think a
>> potential switch in future direction should preclude that from being done
>> now.
>
> Sure, I was just answering your question for potential alternatives.

Great, thanks.

-- 
Jens Axboe

* [PATCH/RFC] NVMe: add support for doing offline/online of attached namespaces
  2015-01-23 17:39       ` Jens Axboe
@ 2015-01-23 17:44         ` Christoph Hellwig
  2015-01-23 17:46           ` Jens Axboe
  0 siblings, 1 reply; 11+ messages in thread
From: Christoph Hellwig @ 2015-01-23 17:44 UTC (permalink / raw)


On Fri, Jan 23, 2015 at 10:39:38AM -0700, Jens Axboe wrote:
> For the per dev host/sdev busy counters, those could be further improved if
> they turn out to be a problem. I've got various "things" here that'll do
> 1-2M IOPS per dev, so some test/profiling would not be a problem to do.

We've tested up to 3M IOPS and it wasn't an issue, but none of these
systems had more than two sockets.

> First issue is support for > 1 hw queue in SCSI, though.

We've added > 1 queue support for virtio-scsi and srp, but most HBAs lack
the hardware support for it.

* [PATCH/RFC] NVMe: add support for doing offline/online of attached namespaces
  2015-01-23 17:44         ` Christoph Hellwig
@ 2015-01-23 17:46           ` Jens Axboe
  2015-01-23 17:51             ` Christoph Hellwig
  0 siblings, 1 reply; 11+ messages in thread
From: Jens Axboe @ 2015-01-23 17:46 UTC (permalink / raw)


On 01/23/2015 10:44 AM, Christoph Hellwig wrote:
> On Fri, Jan 23, 2015 at 10:39:38AM -0700, Jens Axboe wrote:
>> For the per dev host/sdev busy counters, those could be further improved if
>> they turn out to be a problem. I've got various "things" here that'll do
>> 1-2M IOPS per dev, so some test/profiling would not be a problem to do.
>
> We've tested up to 3M IOPS and it wasn't an issue, but none of these
> systems had more than two sockets.

I've got an older 4 socket here that loves to prove these are issues :-)

>> First issue is support for > 1 hw queue in SCSI, though.
>
> We've added > 1 queue support for virtio-scsi and srp, but most HBAs lack
> the hardware support for it.

Is that queued up for 3.20?


-- 
Jens Axboe

* [PATCH/RFC] NVMe: add support for doing offline/online of attached namespaces
  2015-01-23 17:46           ` Jens Axboe
@ 2015-01-23 17:51             ` Christoph Hellwig
  2015-01-23 17:52               ` Jens Axboe
  0 siblings, 1 reply; 11+ messages in thread
From: Christoph Hellwig @ 2015-01-23 17:51 UTC (permalink / raw)


On Fri, Jan 23, 2015 at 10:46:19AM -0700, Jens Axboe wrote:
> >We've added > 1 queue support for virtio-scsi and srp, but most HBAs lack
> >the hardware support for it.
> 
> Is that queued up for 3.20?

All that work already is in 3.19-rc.

* [PATCH/RFC] NVMe: add support for doing offline/online of attached namespaces
  2015-01-23 17:51             ` Christoph Hellwig
@ 2015-01-23 17:52               ` Jens Axboe
  0 siblings, 0 replies; 11+ messages in thread
From: Jens Axboe @ 2015-01-23 17:52 UTC (permalink / raw)


On 01/23/2015 10:51 AM, Christoph Hellwig wrote:
> On Fri, Jan 23, 2015 at 10:46:19AM -0700, Jens Axboe wrote:
>>> We've added > 1 queue support for virtio-scsi and srp, but most HBAs lack
>>> the hardware support for it.
>>
>> Is that queued up for 3.20?
>
> All that work already is in 3.19-rc.

Oh, guess I had missed that... Thanks!

-- 
Jens Axboe

* [PATCH/RFC] NVMe: add support for doing offline/online of attached namespaces
  2015-01-23 15:24 [PATCH/RFC] NVMe: add support for doing offline/online of attached namespaces Jens Axboe
  2015-01-23 17:11 ` Christoph Hellwig
@ 2015-01-27 20:59 ` Keith Busch
  2015-01-28 16:13   ` Jens Axboe
  1 sibling, 1 reply; 11+ messages in thread
From: Keith Busch @ 2015-01-27 20:59 UTC (permalink / raw)


On Fri, 23 Jan 2015, Jens Axboe wrote:
> Add two new ioctls for the char control device:
>
> - NVME_IOCTL_OFFLINE_DISK will offline namespaces/disks and leave only
>  the admin queue and char dev running.
>
> - NVME_IOCTL_ONLINE_DISK will do discovery and add namespaces and disks
>  back into the system.
>
> The intended use case is for safely doing a format or firmware upgrade,
> where device and hardware characteristics could change substantially
> beyond just device size and block size.
>
> This is an RFC patch. The goal is as listed, being able to detach any
> namespaces and bdevs from a device, so we just have the admin part left.
> I'm open to alternative suggestions on how to handle this. Solutions
> that include rmmod/modprobe of the driver won't work if you have more
> than one NVMe device in a system.
>
> I've run this through various stress testing, and it seems to hold up
> fine.

Looks pretty good; I think we need something like this. We trust a user
too much not to do something bad when changing things like block sizes,
so it'd be good to have a mechanism to make this safer.

Should these actions be IOCTLs? I think 'sysfs' might be easier to use,
and easier for adding additional actions in the future.

Could we try for something with more control? For example, I might change
the block format on namespace 2, but namespace 1 is my root partition
so I don't want to delete that disk.

I don't think you need to reference count the namespace opens. The
controller reference is also taken, and the last namespace reference
can't be released until the last controller reference is also released,
so it seems redundant.

That's all I got right now. Thanks!

* [PATCH/RFC] NVMe: add support for doing offline/online of attached namespaces
  2015-01-27 20:59 ` Keith Busch
@ 2015-01-28 16:13   ` Jens Axboe
  0 siblings, 0 replies; 11+ messages in thread
From: Jens Axboe @ 2015-01-28 16:13 UTC (permalink / raw)


On 01/27/2015 01:59 PM, Keith Busch wrote:
> On Fri, 23 Jan 2015, Jens Axboe wrote:
>> Add two new ioctls for the char control device:
>>
>> - NVME_IOCTL_OFFLINE_DISK will offline namespaces/disks and leave only
>>  the admin queue and char dev running.
>>
>> - NVME_IOCTL_ONLINE_DISK will do discovery and add namespaces and disks
>>  back into the system.
>>
>> The intended use case is for safely doing a format or firmware upgrade,
>> where device and hardware characteristics could change substantially
>> beyond just device size and block size.
>>
>> This is an RFC patch. The goal is as listed, being able to detach any
>> namespaces and bdevs from a device, so we just have the admin part left.
>> I'm open to alternative suggestions on how to handle this. Solutions
>> that include rmmod/modprobe of the driver won't work if you have more
>> than one NVMe device in a system.
>>
>> I've run this through various stress testing, and it seems to hold up
>> fine.
>
> Looks pretty good; I think we need something like this. We trust a user
> too much not to do something bad when changing things like block sizes,
> so it'd be good to have a mechanism to make this safer.
>
> Should these actions be IOCTLs? I think 'sysfs' might be easier to use,
> and easier for adding additional actions in the future.

Might not hurt to have both entry points for it. From a format (or
other) tool, the ioctl path would be more convenient as it could just do
it for you. I don't see the sysfs path being used much - I could be
wrong, of course. In any case, not a big deal to me and would not really
change the patch as both paths would hook in the same place.
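
A minimal sketch of what that shared hook could look like - the
attribute name, where it hangs off, and the drvdata wiring are
assumptions here; it would just call the same helpers the ioctls use:

static ssize_t nvme_online_store(struct device *d,
				 struct device_attribute *attr,
				 const char *buf, size_t count)
{
	struct nvme_dev *dev = dev_get_drvdata(d);
	int ret;

	/* "1" re-adds the namespaces, "0" tears them down */
	if (sysfs_streq(buf, "1"))
		ret = nvme_dev_online_disk(dev);
	else if (sysfs_streq(buf, "0"))
		ret = nvme_dev_offline_disk(dev);
	else
		ret = -EINVAL;

	return ret ? ret : count;
}

/* would still need a device_create_file() somewhere in probe */
static DEVICE_ATTR(online, S_IWUSR, NULL, nvme_online_store);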

> Could we try for something with more control? For example, I might change
> the block format on namespace 2, but namespace 1 is my root partition
> so I don't want to delete that disk.

That's not a bad idea... I'll update it to do that, or at least with the
option of doing that.
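
The per-namespace variant could be as simple as the sketch below,
reusing nvme_offline_ns() from this patch. The ioctl number and the
helper name are made up for the sketch, and it assumes matching on
ns->ns_id:

/* hypothetical: offline a single namespace by NSID */
#define NVME_IOCTL_OFFLINE_NS	_IOW('N', 0x46, __u32)

static int nvme_dev_offline_one_ns(struct nvme_dev *dev, u32 nsid)
{
	struct nvme_ns *ns, *found = NULL;

	mutex_lock(&dev->dev_lock);
	list_for_each_entry(ns, &dev->namespaces, list) {
		if (ns->ns_id == nsid) {
			list_del(&ns->list);
			found = ns;
			break;
		}
	}
	mutex_unlock(&dev->dev_lock);

	if (!found)
		return -ENODEV;

	/* same teardown path the all-namespaces offline uses */
	return nvme_offline_ns(found);
}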

> I don't think you need to reference count the namespace opens. The
> controller reference is also taken, and the last namespace reference
> can't be released until the last controller reference is also released,
> so it seems redundant.

I'll re-run it, but I had issues with not referencing the ns opens
(which is why it exists in the patch). Could be a sign of something else
being wrong, of course. It's been a while since I last did the
exhaustive testing, so perhaps the recent hotplug changes help it out.
>
> That's all I got right now. Thanks!


-- 
Jens Axboe
