* [PATCH] NVMe: Skip namespaces with interleaved meta-data
From: Keith Busch @ 2015-01-27 18:07 UTC


The block layer does not support extended block sizes that require data
buffers to interleave metadata in host memory. The driver was allowing
these under the assumption that metadata was contained in a separate
region, but that's not necessarily the format, so we'd inadvertently
corrupt data and memory by allowing these namespaces.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
Hi Christoph,

This is another example where we'd have inaccessible namespaces through
block devices. I'd like to be able to use them with passthrough, and
while the block layer can't deal with this format, a userspace program
could. This has more merit for a h/w vendor than for a typical end user,
but I'm happy vendors choose Linux for development.

 drivers/block/nvme-core.c |   46 +++++++++++++++++++++++++++++++++------------
 include/uapi/linux/nvme.h |    2 ++
 2 files changed, 36 insertions(+), 12 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index d826bf3..6bd66f9 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -1825,13 +1825,23 @@ static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo)
 	return 0;
 }
 
+static void nvme_ns_cleanup(struct nvme_ns *ns)
+{
+	if (ns->disk->flags & GENHD_FL_UP)
+		del_gendisk(ns->disk);
+	if (!blk_queue_dying(ns->queue)) {
+		blk_mq_abort_requeue_list(ns->queue);
+		blk_cleanup_queue(ns->queue);
+	}
+}
+
 static int nvme_revalidate_disk(struct gendisk *disk)
 {
 	struct nvme_ns *ns = disk->private_data;
 	struct nvme_dev *dev = ns->dev;
 	struct nvme_id_ns *id;
 	dma_addr_t dma_addr;
-	int lbaf;
+	int lbaf, ms;
 
 	id = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr,
 								GFP_KERNEL);
@@ -1845,7 +1855,15 @@ static int nvme_revalidate_disk(struct gendisk *disk)
 		goto free;
 
 	lbaf = id->flbas & 0xf;
+	ms = le16_to_cpu(id->lbaf[lbaf].ms);
+
+	if ((id->flbas & NVME_NS_FLBAS_META_EXT) && ms) {
+		nvme_ns_cleanup(ns);
+		return 0;
+	}
+
 	ns->lba_shift = id->lbaf[lbaf].ds;
+	ns->ms = ms;
 
 	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
 	set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
@@ -1922,11 +1940,16 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
 	struct nvme_ns *ns;
 	struct gendisk *disk;
 	int node = dev_to_node(&dev->pci_dev->dev);
-	int lbaf;
+	int lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
+	int ms = le16_to_cpu(id->lbaf[lbaf].ms);
 
 	if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
 		return NULL;
 
+	/* block layer does not support interleaved memory for extended block formats */
+	if ((id->flbas & NVME_NS_FLBAS_META_EXT) && ms)
+		return NULL;
+
 	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
 	if (!ns)
 		return NULL;
@@ -1945,9 +1968,8 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
 
 	ns->ns_id = nsid;
 	ns->disk = disk;
-	lbaf = id->flbas & 0xf;
 	ns->lba_shift = id->lbaf[lbaf].ds;
-	ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
+	ns->ms = ms;
 	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
 	if (dev->max_hw_sectors)
 		blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
@@ -2424,6 +2446,9 @@ static void nvme_freeze_queues(struct nvme_dev *dev)
 	struct nvme_ns *ns;
 
 	list_for_each_entry(ns, &dev->namespaces, list) {
+		if (blk_queue_dying(ns->queue))
+			continue;
+
 		blk_mq_freeze_queue_start(ns->queue);
 
 		spin_lock(ns->queue->queue_lock);
@@ -2440,6 +2465,9 @@ static void nvme_unfreeze_queues(struct nvme_dev *dev)
 	struct nvme_ns *ns;
 
 	list_for_each_entry(ns, &dev->namespaces, list) {
+		if (blk_queue_dying(ns->queue))
+			continue;
+
 		queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue);
 		blk_mq_unfreeze_queue(ns->queue);
 		blk_mq_start_stopped_hw_queues(ns->queue, true);
@@ -2477,14 +2505,8 @@ static void nvme_dev_remove(struct nvme_dev *dev)
 {
 	struct nvme_ns *ns;
 
-	list_for_each_entry(ns, &dev->namespaces, list) {
-		if (ns->disk->flags & GENHD_FL_UP)
-			del_gendisk(ns->disk);
-		if (!blk_queue_dying(ns->queue)) {
-			blk_mq_abort_requeue_list(ns->queue);
-			blk_cleanup_queue(ns->queue);
-		}
-	}
+	list_for_each_entry(ns, &dev->namespaces, list)
+		nvme_ns_cleanup(ns);
 }
 
 static int nvme_setup_prp_pools(struct nvme_dev *dev)
diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h
index 26386cf..7cc0faa 100644
--- a/include/uapi/linux/nvme.h
+++ b/include/uapi/linux/nvme.h
@@ -124,6 +124,8 @@ struct nvme_id_ns {
 
 enum {
 	NVME_NS_FEAT_THIN	= 1 << 0,
+	NVME_NS_FLBAS_LBA_MASK	= 0xf,
+	NVME_NS_FLBAS_META_EXT	= 0x10,
 	NVME_LBAF_RP_BEST	= 0,
 	NVME_LBAF_RP_BETTER	= 1,
 	NVME_LBAF_RP_GOOD	= 2,
-- 
1.7.10.4


* [PATCH] NVMe: Skip namespaces with interleaved meta-data
From: David Darrington @ 2015-01-27 21:57 UTC


What happens if a namespace includes metadata that is not interleaved (flbas bit 4 = 0)? Won't reads/writes through the normal block I/O path cause metadata to be read/written to/from MPTR, which is not set in nvme_submit_iod() and so will point to address 0?


* [PATCH] NVMe: Skip namespaces with interleaved meta-data
From: Keith Busch @ 2015-01-27 22:09 UTC


On Tue, 27 Jan 2015, David Darrington wrote:
> What happens if a namespace includes metadata that is not interleaved (flbas
> bit 4 = 0)? Won't the reads/writes through the normal block io path cause
> metadata to be read/written to/from MPTR, which is not set in
> nvme_submit_iod() so will point to address 0.

You're absolutely right. I'm not sure what happens (I don't have a
device with separate metadata capabilities), but I was hoping the device
wouldn't try to DMA to/from 0, or that the transfer would be aborted by
the root complex if it made it that far. Wishful thinking, probably not
aligned with reality.

The nvme_submit_io() path enforces correct MPTR usage, though!


* [PATCH] NVMe: Skip namespaces with interleaved meta-data
From: Keith Busch @ 2015-01-28  1:21 UTC


Hi Jens and Martin,

Slightly unrelated to the original thread, but this got me rethinking
adding protection information support to NVMe to make use of metadata.

I can't register with the blk-integrity extensions until after add_disk
is completed, because the disk's kobj needs to be initialized before
blk_integrity_register. The 'add_disk' itself submits a lot of reads,
though. Am I missing something, or how could those succeed when the
block device's required integrity format hasn't been set up?

Thanks,
Keith

On Tue, 27 Jan 2015, Keith Busch wrote:
> On Tue, 27 Jan 2015, David Darrington wrote:
>> What happens if a namespace includes metadata that is not interleaved 
>> (flbas
>> bit 4 = 0)? Won't the reads/writes through the normal block io path cause
>> metadata to be read/written to/from MPTR, which is not set in
>> nvme_submit_iod() so will point to address 0.
>
> You're absolutely right. I'm not sure what happens (I don't have a device
> with separate metadata capabilities) but was hoping the device wouldn't
> try to DMA to/from 0, or it would be aborted by the root complex if it
> made it that far. Wishful thinking, probably not aligned with reality.
>
> The nvme_submit_io() path enforces correct MPTR usage, though!


* [PATCH] NVMe: Skip namespaces with interleaved meta-data
From: Martin K. Petersen @ 2015-01-28  1:39 UTC


>>>>> "Keith" == Keith Busch <keith.busch at intel.com> writes:

Keith,

Keith> I can't register with the blk-integrity extensions until after
Keith> add_disk is completed, because the disk's kobj needs to be
Keith> initialized before blk_integrity_register. The 'add_disk' itself
Keith> submits a lot of reads, though. Am I missing something, or how
Keith> could those succeed when the block device's required integrity
Keith> format hasn't been set up?

Protection is optional and enabled on a per-I/O basis. Even if the
block layer's auto protection is enabled, it is normal to see a flutter
of unprotected reads while partitions are being scanned.

-- 
Martin K. Petersen	Oracle Linux Engineering


* [PATCH] NVMe: Skip namespaces with interleaved meta-data
From: Keith Busch @ 2015-01-28 15:11 UTC


On Tue, 27 Jan 2015, Martin K. Petersen wrote:
>>>>>> "Keith" == Keith Busch <keith.busch at intel.com> writes:
> Keith,
>
> Keith> I can't register with the blk-integrity extensions until after
> Keith> add_disk is completed, because the disk's kobj needs to be
> Keith> initialized before blk_integrity_register. The 'add_disk' itself
> Keith> submits a lot of reads, though. Am I missing something, or how
> Keith> could those succeed when the block device's required integrity
> Keith> format hasn't been set up?
>
> Protection is optional and enabled on a per-I/O basis. Even if the block
> layer's auto protection is enabled it is normal to see a flutter of
> unprotected reads while partitions are being scanned.

Thanks for the info. NVMe doesn't appear to allow disabling metadata
per-I/O except for a subset of formats. If that's correct, this driver
would have to provide a valid protection buffer for all I/O. I'm
checking with committee comrades for clarification.


* [PATCH] NVMe: Skip namespaces with interleaved meta-data
From: Paul Grabinar @ 2015-01-28 20:50 UTC


On 28/01/15 15:11, Keith Busch wrote:
>
> Thanks for the info. NVMe doesn't appear to allow disabling metadata
> per-io except for a subset of formats. If that's correct, this driver
> would have to provide a valid protection buffer for all IO. Checking
> with committee comrades for clarification.
>

It is indeed a shame that they didn't define a null meta-data pointer as
meaning there is no meta-data. Maybe 1.3?

I got around this as follows.

Limit the maximum transfer so that all the meta-data fits into a single
page. As a side note, the protection information from the kernel needs
to be set to a maximum of one page anyway, as there is only a single PRP
entry for the meta-data and I don't think PRP lists are allowed here.

Allocate two 1 page DMA buffers per device, one will be used for reads
and one for writes.
Fill the write buffer with 0xFF, which means no protection information
is available.
When a read occurs and there is no kernel supplied protection
information, set the meta-data pointer to the read buffer.
When a write occurs and there is no kernel supplied protection
information, set the meta-data pointer to the write buffer. This will
write all the meta-data as 0xFF.


* [PATCH] NVMe: Skip namespaces with interleaved meta-data
From: Keith Busch @ 2015-01-28 21:16 UTC


On Wed, 28 Jan 2015, Paul Grabinar wrote:
> On 28/01/15 15:11, Keith Busch wrote:
>> Thanks for the info. NVMe doesn't appear to allow disabling metadata
>> per-io except for a subset of formats. If that's correct, this driver
>> would have to provide a valid protection buffer for all IO. Checking
>> with committee comrades for clarification.
>>
>
> It is indeed a shame that they didn't define a null meta-data pointer as
> meaning there is no meta-data. Maybe 1.3?
>
> I got around this as follows.
>
> Limit the maximum transfer so that all the meta-data fits into a single
> page. As a side note, the protection information from the kernel needs
> to be set to a maximum of one page anyway, as there is only a single PRP
> entry for the meta-data and I don't think PRP lists are allowed here.

Correct, meta-data requires a single physically contiguous buffer,
so no PRP lists here. It doesn't necessarily need to fit in a single
page, though. We can have the block layer split things accordingly if
we register with this:

 	blk_queue_max_integrity_segments(ns->queue, 1);

This way no command requiring two metadata buffers will be received.
I was trying something similar to SCSI, and I think Christoph may
approve. Instead of all the logic deciding which namespace to attach
and which to bail, attach all namespaces from NSID 1 to the max, then:

 	set_capacity(ns->disk, 0);

Wait to initialize the format in nvme_revalidate_disk. If it's a usable
format, set the capacity and sector size accordingly, and leave the
capacity at 0 if not. For metadata formats, leave the capacity at 0 until
block integrity extensions can be registered, then rescan.

> Allocate two 1 page DMA buffers per device, one will be used for reads
> and one for writes.
> Fill the write buffer with 0xFF, which means no protection information
> is available.
> When a read occurs and there is no kernel supplied protection
> information, set the meta-data pointer to the read buffer.
> When a write occurs and there is no kernel supplied protection
> information, set the meta-data pointer to the write buffer. This will
> write all the meta-data as 0xFF.


* [PATCH] NVMe: Skip namespaces with interleaved meta-data
From: Paul Grabinar @ 2015-01-28 21:46 UTC


On 28/01/15 21:16, Keith Busch wrote:
> I was trying something similar to SCSI, and I think Christoph may
> approve. Instead of all the logic deciding which namespace to attach
> and which to bail, attach all namespaces from NSID 1 to the max, then:
>
>     set_capacity(ns->disk, 0);
>
> Wait to initialize the format in nvme_revalidate_disk. If it's a usable
> format, set the capacity and sector size accordingly, and leave the
> capacity at 0 if not. For metadata formats, leave the capacity at 0 until
> block integrity extensions can be registered, then rescan.

What happens if you switch on and off meta-data dynamically with
/sys/block/<bdev>/integrity/write_generate and
/sys/block/<bdev>/integrity/read_verify?


* [PATCH] NVMe: Skip namespaces with interleaved meta-data
From: Martin K. Petersen @ 2015-01-28 22:08 UTC


>>>>> "Paul" == Paul Grabinar <paul.grabinar at ranbarg.com> writes:

Paul> What happens if you switch on and off meta-data dynamically with
Paul> /sys/block/<bdev>/integrity/write_generate and
Paul> /sys/block/<bdev>/integrity/read_verify?

I think the best way to go about dealing with all this is to set PRACT=1
if no bip is attached to a bio bound for a namespace formatted with PI.

That's essentially how it works in SCSI (albeit for entirely different
reasons -- the DIF nexus is independent of the DIX ditto).

In retrospect I wish we had switched the polarity of PRACT so that you'd
have to explicitly request the PI to be passed to the OS.

-- 
Martin K. Petersen	Oracle Linux Engineering


* [PATCH] NVMe: Skip namespaces with interleaved meta-data
From: Keith Busch @ 2015-01-28 22:17 UTC


On Wed, 28 Jan 2015, Martin K. Petersen wrote:
>>>>>> "Paul" == Paul Grabinar <paul.grabinar at ranbarg.com> writes:
>
> Paul> What happens if you switch on and off meta-data dynamically with
> Paul> /sys/block/<bdev>/integrity/write_generate and
> Paul> /sys/block/<bdev>/integrity/read_verify?
>
> I think the best way to go about dealing with all this is to set PRACT=1
> if no bip is attached to a bio bound for a name space formatted with PI.

That's an option for the subset of formats I alluded to before. It will
have the controller generate/strip if the metadata size is 8 bytes with
DPS settings, but it is ignored otherwise. We still need a buffer if the
metadata is not used for PI, or if the metadata size is more than 8 bytes.


* [PATCH] NVMe: Skip namespaces with interleaved meta-data
From: Andrey Kuzmin @ 2015-01-28 22:17 UTC


On Wed, Jan 28, 2015 at 6:11 PM, Keith Busch <keith.busch@intel.com> wrote:
> On Tue, 27 Jan 2015, Martin K. Petersen wrote:
>>>>>>> "Keith" == Keith Busch <keith.busch at intel.com> writes:
>>
>> Keith,
>>
>> Keith> I can't register with the blk-integrity extensions until after
>> Keith> add_disk is completed, because the disk's kobj needs to be
>> Keith> initialized before blk_integrity_register. The 'add_disk' itself
>> Keith> submits a lot of reads, though. Am I missing something, or how
>> Keith> could those succeed when the block device's required integrity
>> Keith> format hasn't been set up?
>>
>> Protection is optional and enabled on a per-I/O basis. Even if the block
>> layer's auto protection is enabled it is normal to see a flutter of
>> unprotected reads while partitions are being scanned.
>
>
> Thanks for the info. NVMe doesn't appear to allow disabling metadata
> per-io

PRACT gives you this opportunity on a per-command basis
(Read/Write/Compare), although I see Martin has already suggested this
workaround of setting PRACT to 1 (strip).

Regards,
Andrey


> except for a subset of formats. If that's correct, this driver
> would have to provide a valid protection buffer for all IO. Checking
> with committee comrades for clarification.


* [PATCH] NVMe: Skip namespaces with interleaved meta-data
From: Martin K. Petersen @ 2015-01-28 22:28 UTC


>>>>> "Keith" == Keith Busch <keith.busch at intel.com> writes:

>> I think the best way to go about dealing with all this is to set
>> PRACT=1 if no bip is attached to a bio bound for a namespace
>> formatted with PI.

Keith> That's an option for the subset of formats I alluded to before.
Keith> It will have the controller generate/strip if the metadata size
Keith> is 8 bytes with DPS settings, but it is ignored otherwise. We
Keith> still need a buffer if the metadata is not used for PI, or if
Keith> the metadata size is more than 8 bytes.

I agree it's a deficiency in the NVMe spec that there are no explicit
handling flags for non-PI metadata in split mode.

-- 
Martin K. Petersen	Oracle Linux Engineering


* [PATCH] NVMe: Skip namespaces with interleaved meta-data
From: Keith Busch @ 2015-01-29  0:09 UTC


On Wed, 28 Jan 2015, Martin K. Petersen wrote:
>>>>>> "Keith" == Keith Busch <keith.busch at intel.com> writes:
> Keith> That's an option for the subset of formats I alluded to before.
> Keith> It will have the controller generate/strip if the metadata size
> Keith> is 8 bytes with DPS settings, but it is ignored otherwise. We
> Keith> still need a buffer if the metadata is not used for PI, or if
> Keith> the metadata size is more than 8 bytes.
>
> I agree it's a deficiency in the NVMe spec that there are no explicit
> handling flags for non-PI metadata in split mode.

Yeah, it is pretty bad. Pressing forward, though: I'm working on
something that might be suitable, but I'm hitting another problem using
integrity extensions with blk-mq.

Jens,
Integrity verify is broken in here. blk_mq_end_request() calls
blk_update_request(), which moves the bio iterator to the end.
When we get to bio_integrity_process(), there's nothing for it to do,
so verification is skipped.

Should I look into this, or do you have a good idea how to fix that?


* [PATCH] NVMe: Skip namespaces with interleaved meta-data
From: Martin K. Petersen @ 2015-01-29  0:57 UTC


>>>>> "Keith" == Keith Busch <keith.busch at intel.com> writes:

Keith> Integrity verify is broken in here. blk_mq_end_request() calls
Keith> blk_update_request(), which moves the bio iterator to the end.
Keith> When we get to bio_integrity_process(), there's nothing for it to
Keith> do, so verification is skipped.

Looks like 594416a72032 inadvertently broke this.

Does using __bio_for_each_segment() help?

-- 
Martin K. Petersen	Oracle Linux Engineering


* [PATCH] NVMe: Skip namespaces with interleaved meta-data
From: Keith Busch @ 2015-01-29 15:44 UTC


On Wed, 28 Jan 2015, Martin K. Petersen wrote:
>>>>>> "Keith" == Keith Busch <keith.busch at intel.com> writes:
>
> Keith> Integrity verify is broken in here. blk_mq_end_request() calls
> Keith> blk_update_request(), which moves the bio iterator to the end.
> Keith> When we get to bio_integrity_process(), there's nothing for it to
> Keith> do, so verification is skipped.
>
> Looks like 594416a72032 inadvertently broke this.

Good call, reverting that at least addresses my problem.

> Does using __bio_for_each_segment() help?

Sorry, I don't follow. That still takes an iterator, but the bio's has
already been advanced to the end, and I don't see how to get it back to
the original start.


* [PATCH] NVMe: Skip namespaces with interleaved meta-data
From: Martin K. Petersen @ 2015-01-30  0:41 UTC


>>>>> "Keith" == Keith Busch <keith.busch at intel.com> writes:

>> Looks like 594416a72032 inadvertently broke this.

Keith> Good call, reverting that at least addresses my problem.

Yeah, I can't seem to make sense of that commit now that I look at it
again.

Darrick, only the topmost bio should have bio_integrity_process() run on
it. It should never be done to a clone.

My test setup is tied up with a qual cycle right now. I'll take a look
at this once it's done tomorrow morning.

-- 
Martin K. Petersen	Oracle Linux Engineering


* [PATCH] NVMe: Skip namespaces with interleaved meta-data
From: Keith Busch @ 2015-01-30  0:57 UTC

On Thu, 29 Jan 2015, Martin K. Petersen wrote:
>>>>>> "Keith" == Keith Busch <keith.busch at intel.com> writes:
>>> Looks like 594416a72032 inadvertently broke this.
>
> Keith> Good call, reverting that at least addresses my problem.
>
> Yeah, I can't seem to make sense of that commit now that I look at it
> again.
>
> Darrick, only the topmost bio should have bio_integrity_process() run on
> it. It should never be done to a clone.
>
> My test setup is tied up with a qual cycle right now. I'll take a look
> at this once it's done tomorrow morning.

While reverting it fixed the symptoms I observed, I'm not sure it's
really at fault. The comments for blk_update_request() say it's for
stacked-driver usage, so isn't blk-mq using it wrong from
blk_mq_end_request()?

