All of lore.kernel.org
 help / color / mirror / Atom feed
* pNFS SCSI layout support V2
@ 2016-03-04 19:46 Christoph Hellwig
  2016-03-04 19:46 ` [PATCH 1/4] nfs4.h: add SCSI layout defintions Christoph Hellwig
                   ` (3 more replies)
  0 siblings, 4 replies; 17+ messages in thread
From: Christoph Hellwig @ 2016-03-04 19:46 UTC (permalink / raw)
  To: trond.myklebust, bfields; +Cc: linux-nfs

This series adds support for the pNFS SCSI layout to both the NFS server
and client.  It's a fairly simple extension to the existing block
layout drivers, which relies on the Persistent Reservation API in
the block layer merged a while ago.

On the server side we will now always export SCSI devices as a SCSI
layout.  I thought about allowing exports of multiple layout types
for the same file system, but both the not really production ready
nature of block layout fencing, and the horrors of passing options
to NFS exports made me go with the simple way for now.
 

Changes since V1:
 - removed __exit annotation from bl_cleanup_pipefs
 - introduced separate config options for each layout driver in nfsd
 - added a file documenting the SCSI layout support


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH 1/4] nfs4.h: add SCSI layout defintions
  2016-03-04 19:46 pNFS SCSI layout support V2 Christoph Hellwig
@ 2016-03-04 19:46 ` Christoph Hellwig
  2016-03-04 19:46 ` [PATCH 2/4] nfs/blocklayout: add SCSI layout support Christoph Hellwig
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 17+ messages in thread
From: Christoph Hellwig @ 2016-03-04 19:46 UTC (permalink / raw)
  To: trond.myklebust, bfields; +Cc: linux-nfs

Based on draft-ietf-nfsv4-scsi-layout-05 after the WG last call.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/nfs4.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index d6f9b4e..0114334 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -529,6 +529,7 @@ enum pnfs_layouttype {
 	LAYOUT_OSD2_OBJECTS = 2,
 	LAYOUT_BLOCK_VOLUME = 3,
 	LAYOUT_FLEX_FILES = 4,
+	LAYOUT_SCSI = 5,
 	LAYOUT_TYPE_MAX
 };
 
@@ -555,6 +556,7 @@ enum pnfs_block_volume_type {
 	PNFS_BLOCK_VOLUME_SLICE		= 1,
 	PNFS_BLOCK_VOLUME_CONCAT	= 2,
 	PNFS_BLOCK_VOLUME_STRIPE	= 3,
+	PNFS_BLOCK_VOLUME_SCSI		= 4,
 };
 
 enum pnfs_block_extent_state {
@@ -568,6 +570,23 @@ enum pnfs_block_extent_state {
 #define PNFS_BLOCK_EXTENT_SIZE \
 	(7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE)
 
+/* on the wire size of a scsi commit range */
+#define PNFS_SCSI_RANGE_SIZE \
+	(4 * sizeof(__be32))
+
+enum scsi_code_set {
+	PS_CODE_SET_BINARY	= 1,
+	PS_CODE_SET_ASCII	= 2,
+	PS_CODE_SET_UTF8	= 3
+};
+
+enum scsi_designator_type {
+	PS_DESIGNATOR_T10	= 1,
+	PS_DESIGNATOR_EUI64	= 2,
+	PS_DESIGNATOR_NAA	= 3,
+	PS_DESIGNATOR_NAME	= 8
+};
+
 #define NFL4_UFLG_MASK			0x0000003F
 #define NFL4_UFLG_DENSE			0x00000001
 #define NFL4_UFLG_COMMIT_THRU_MDS	0x00000002
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 2/4] nfs/blocklayout: add SCSI layout support
  2016-03-04 19:46 pNFS SCSI layout support V2 Christoph Hellwig
  2016-03-04 19:46 ` [PATCH 1/4] nfs4.h: add SCSI layout defintions Christoph Hellwig
@ 2016-03-04 19:46 ` Christoph Hellwig
  2016-03-08 22:07   ` J. Bruce Fields
  2016-03-08 22:09   ` J. Bruce Fields
  2016-03-04 19:46 ` [PATCH 3/4] nfsd: add a new config option for the block layout driver Christoph Hellwig
  2016-03-04 19:46 ` [PATCH 4/4] nfsd: add SCSI layout support Christoph Hellwig
  3 siblings, 2 replies; 17+ messages in thread
From: Christoph Hellwig @ 2016-03-04 19:46 UTC (permalink / raw)
  To: trond.myklebust, bfields; +Cc: linux-nfs

This is a trivial extension to the block layout driver to support the
new SCSI layouts draft.  There are three changes:

 - device identification through the SCSI VPD page.  This allows us to
   directly use the udev generated persistent device names instead of
   requiring an expensive lookup by crawling every block device node
   in /dev and reading a signature for it.
 - use of SCSI persistent reservations to protect device access and
   allow for robust fencing.  On the client side this just means
   registering and unregistering a server supplied key.
 - an optimized LAYOUTCOMMIT payload that doesn't send unnecessary
   fields to the server.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/nfs/blocklayout/blocklayout.c |  59 ++++++++++++++--
 fs/nfs/blocklayout/blocklayout.h |  14 +++-
 fs/nfs/blocklayout/dev.c         | 144 ++++++++++++++++++++++++++++++++++++++-
 fs/nfs/blocklayout/extent_tree.c |  44 ++++++++----
 fs/nfs/blocklayout/rpc_pipefs.c  |   2 +-
 5 files changed, 238 insertions(+), 25 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index ddd0138..b27c409 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -446,8 +446,8 @@ static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
 	kfree(bl);
 }
 
-static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
-						   gfp_t gfp_flags)
+static struct pnfs_layout_hdr *__bl_alloc_layout_hdr(struct inode *inode,
+		gfp_t gfp_flags, bool is_scsi_layout)
 {
 	struct pnfs_block_layout *bl;
 
@@ -460,9 +460,22 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
 	bl->bl_ext_ro = RB_ROOT;
 	spin_lock_init(&bl->bl_ext_lock);
 
+	bl->bl_scsi_layout = is_scsi_layout;
 	return &bl->bl_layout;
 }
 
+static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
+						   gfp_t gfp_flags)
+{
+	return __bl_alloc_layout_hdr(inode, gfp_flags, false);
+}
+
+static struct pnfs_layout_hdr *sl_alloc_layout_hdr(struct inode *inode,
+						   gfp_t gfp_flags)
+{
+	return __bl_alloc_layout_hdr(inode, gfp_flags, true);
+}
+
 static void bl_free_lseg(struct pnfs_layout_segment *lseg)
 {
 	dprintk("%s enter\n", __func__);
@@ -888,22 +901,53 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
 	.sync				= pnfs_generic_sync,
 };
 
+static struct pnfs_layoutdriver_type scsilayout_type = {
+	.id				= LAYOUT_SCSI,
+	.name				= "LAYOUT_SCSI",
+	.owner				= THIS_MODULE,
+	.flags				= PNFS_LAYOUTRET_ON_SETATTR |
+					  PNFS_READ_WHOLE_PAGE,
+	.read_pagelist			= bl_read_pagelist,
+	.write_pagelist			= bl_write_pagelist,
+	.alloc_layout_hdr		= sl_alloc_layout_hdr,
+	.free_layout_hdr		= bl_free_layout_hdr,
+	.alloc_lseg			= bl_alloc_lseg,
+	.free_lseg			= bl_free_lseg,
+	.return_range			= bl_return_range,
+	.prepare_layoutcommit		= bl_prepare_layoutcommit,
+	.cleanup_layoutcommit		= bl_cleanup_layoutcommit,
+	.set_layoutdriver		= bl_set_layoutdriver,
+	.alloc_deviceid_node		= bl_alloc_deviceid_node,
+	.free_deviceid_node		= bl_free_deviceid_node,
+	.pg_read_ops			= &bl_pg_read_ops,
+	.pg_write_ops			= &bl_pg_write_ops,
+	.sync				= pnfs_generic_sync,
+};
+
+
 static int __init nfs4blocklayout_init(void)
 {
 	int ret;
 
 	dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
 
-	ret = pnfs_register_layoutdriver(&blocklayout_type);
+	ret = bl_init_pipefs();
 	if (ret)
 		goto out;
-	ret = bl_init_pipefs();
+
+	ret = pnfs_register_layoutdriver(&blocklayout_type);
 	if (ret)
-		goto out_unregister;
+		goto out_cleanup_pipe;
+
+	ret = pnfs_register_layoutdriver(&scsilayout_type);
+	if (ret)
+		goto out_unregister_block;
 	return 0;
 
-out_unregister:
+out_unregister_block:
 	pnfs_unregister_layoutdriver(&blocklayout_type);
+out_cleanup_pipe:
+	bl_cleanup_pipefs();
 out:
 	return ret;
 }
@@ -913,8 +957,9 @@ static void __exit nfs4blocklayout_exit(void)
 	dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
 	       __func__);
 
-	bl_cleanup_pipefs();
+	pnfs_unregister_layoutdriver(&scsilayout_type);
 	pnfs_unregister_layoutdriver(&blocklayout_type);
+	bl_cleanup_pipefs();
 }
 
 MODULE_ALIAS("nfs-layouttype4-3");
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index c556640..bc21205 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -55,7 +55,6 @@ struct pnfs_block_dev;
  */
 #define PNFS_BLOCK_UUID_LEN	128
 
-
 struct pnfs_block_volume {
 	enum pnfs_block_volume_type	type;
 	union {
@@ -82,6 +81,13 @@ struct pnfs_block_volume {
 			u32		volumes_count;
 			u32		volumes[PNFS_BLOCK_MAX_DEVICES];
 		} stripe;
+		struct {
+			enum scsi_code_set		code_set;
+			enum scsi_designator_type	designator_type;
+			int				designator_len;
+			u8				designator[256];
+			u64				pr_key;
+		} scsi;
 	};
 };
 
@@ -106,6 +112,9 @@ struct pnfs_block_dev {
 	struct block_device		*bdev;
 	u64				disk_offset;
 
+	u64				pr_key;
+	bool				pr_registered;
+
 	bool (*map)(struct pnfs_block_dev *dev, u64 offset,
 			struct pnfs_block_dev_map *map);
 };
@@ -131,6 +140,7 @@ struct pnfs_block_layout {
 	struct rb_root		bl_ext_rw;
 	struct rb_root		bl_ext_ro;
 	spinlock_t		bl_ext_lock;   /* Protects list manipulation */
+	bool			bl_scsi_layout;
 };
 
 static inline struct pnfs_block_layout *
@@ -182,6 +192,6 @@ void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
 dev_t bl_resolve_deviceid(struct nfs_server *server,
 		struct pnfs_block_volume *b, gfp_t gfp_mask);
 int __init bl_init_pipefs(void);
-void __exit bl_cleanup_pipefs(void);
+void bl_cleanup_pipefs(void);
 
 #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index a861bbd..31b0f6b 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 2014 Christoph Hellwig.
+ * Copyright (c) 2014-2016 Christoph Hellwig.
  */
 #include <linux/sunrpc/svc.h>
 #include <linux/blkdev.h>
 #include <linux/nfs4.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_xdr.h>
+#include <linux/pr.h>
 
 #include "blocklayout.h"
 
@@ -21,6 +22,17 @@ bl_free_device(struct pnfs_block_dev *dev)
 			bl_free_device(&dev->children[i]);
 		kfree(dev->children);
 	} else {
+		if (dev->pr_registered) {
+			const struct pr_ops *ops = 
+				dev->bdev->bd_disk->fops->pr_ops;
+			int error;
+
+			error = ops->pr_register(dev->bdev, dev->pr_key, 0,
+				false);
+			if (error)
+				pr_err("failed to unregister PR key.\n");
+		}
+
 		if (dev->bdev)
 			blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE);
 	}
@@ -113,6 +125,24 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
 		for (i = 0; i < b->stripe.volumes_count; i++)
 			b->stripe.volumes[i] = be32_to_cpup(p++);
 		break;
+	case PNFS_BLOCK_VOLUME_SCSI:
+		p = xdr_inline_decode(xdr, 4 + 4 + 4);
+		if (!p)
+			return -EIO;
+		b->scsi.code_set = be32_to_cpup(p++);
+		b->scsi.designator_type = be32_to_cpup(p++);
+		b->scsi.designator_len = be32_to_cpup(p++);
+		p = xdr_inline_decode(xdr, b->scsi.designator_len);
+		if (!p)
+			return -EIO;
+		if (b->scsi.designator_len > 256)
+			return -EIO;
+		memcpy(&b->scsi.designator, p, b->scsi.designator_len);
+		p = xdr_inline_decode(xdr, 8);
+		if (!p)
+			return -EIO;
+		p = xdr_decode_hyper(p, &b->scsi.pr_key);
+		break;
 	default:
 		dprintk("unknown volume type!\n");
 		return -EIO;
@@ -216,6 +246,116 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
 	return 0;
 }
 
+static bool
+bl_validate_designator(struct pnfs_block_volume *v)
+{
+	switch (v->scsi.designator_type) {
+	case PS_DESIGNATOR_EUI64:
+		if (v->scsi.code_set != PS_CODE_SET_BINARY)
+			return false;
+
+		if (v->scsi.designator_len != 8 &&
+		    v->scsi.designator_len != 10 &&
+		    v->scsi.designator_len != 16)
+			return false;
+
+		return true;
+	case PS_DESIGNATOR_NAA:
+		if (v->scsi.code_set != PS_CODE_SET_BINARY)
+			return false;
+
+		if (v->scsi.designator_len != 8 &&
+		    v->scsi.designator_len != 16)
+			return false;
+
+		return true;
+	case PS_DESIGNATOR_T10:
+	case PS_DESIGNATOR_NAME:
+		pr_err("pNFS: unsupported designator "
+			"(code set %d, type %d, len %d.\n",
+			v->scsi.code_set,
+			v->scsi.designator_type,
+			v->scsi.designator_len);
+		return false;
+	default:
+		pr_err("pNFS: invalid designator "
+			"(code set %d, type %d, len %d.\n",
+			v->scsi.code_set,
+			v->scsi.designator_type,
+			v->scsi.designator_len);
+		return false;
+	}
+}
+
+static int
+bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *v = &volumes[idx];
+	const struct pr_ops *ops;
+	const char *devname;
+	int error;
+
+	if (!bl_validate_designator(v))
+		return -EINVAL;
+
+	switch (v->scsi.designator_len) {
+	case 8:
+		devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%8phN",
+				v->scsi.designator);
+		break;
+	case 12:
+		devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%12phN",
+				v->scsi.designator);
+		break;
+	case 16:
+		devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%16phN",
+				v->scsi.designator);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	d->bdev = blkdev_get_by_path(devname, FMODE_READ, NULL);
+	if (IS_ERR(d->bdev)) {
+		pr_warn("pNFS: failed to open device %s (%ld)\n",
+			devname, PTR_ERR(d->bdev));
+		kfree(devname);
+		return PTR_ERR(d->bdev);
+	}
+
+	kfree(devname);
+
+	d->len = i_size_read(d->bdev->bd_inode);
+	d->map = bl_map_simple;
+	d->pr_key = v->scsi.pr_key;
+
+	pr_info("pNFS: using block device %s (reservation key 0x%llx)\n",
+		d->bdev->bd_disk->disk_name, d->pr_key);
+
+	ops = d->bdev->bd_disk->fops->pr_ops;
+	if (!ops) {
+		pr_err("pNFS: block device %s does not support reservations.",
+				d->bdev->bd_disk->disk_name);
+		error = -EINVAL;
+		goto out_blkdev_put;
+	}
+
+	error = ops->pr_register(d->bdev, 0, d->pr_key, true);
+	if (error) {
+		pr_err("pNFS: failed to register key for block device %s.",
+				d->bdev->bd_disk->disk_name);
+		goto out_blkdev_put;
+	}
+
+	d->pr_registered = true;
+	return 0;
+
+out_blkdev_put:
+	blkdev_put(d->bdev, FMODE_READ);
+	return error;
+}
+
 static int
 bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
 		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
@@ -303,6 +443,8 @@ bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
 		return bl_parse_concat(server, d, volumes, idx, gfp_mask);
 	case PNFS_BLOCK_VOLUME_STRIPE:
 		return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
+	case PNFS_BLOCK_VOLUME_SCSI:
+		return bl_parse_scsi(server, d, volumes, idx, gfp_mask);
 	default:
 		dprintk("unsupported volume type: %d\n", volumes[idx].type);
 		return -EIO;
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index c59a59c..df366fc 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014 Christoph Hellwig.
+ * Copyright (c) 2014-2016 Christoph Hellwig.
  */
 
 #include <linux/vmalloc.h>
@@ -462,10 +462,12 @@ out:
 	return err;
 }
 
-static size_t ext_tree_layoutupdate_size(size_t count)
+static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout *bl, size_t count)
 {
-	return sizeof(__be32) /* number of entries */ +
-		PNFS_BLOCK_EXTENT_SIZE * count;
+	if (bl->bl_scsi_layout)
+		return sizeof(__be32) + PNFS_SCSI_RANGE_SIZE * count;
+	else
+		return sizeof(__be32) + PNFS_BLOCK_EXTENT_SIZE * count;
 }
 
 static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
@@ -482,6 +484,23 @@ static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
 	}
 }
 
+static __be32 *encode_block_extent(struct pnfs_block_extent *be, __be32 *p)
+{
+	p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
+			NFS4_DEVICEID4_SIZE);
+	p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
+	p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
+	p = xdr_encode_hyper(p, 0LL);
+	*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
+	return p;
+}
+
+static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
+{
+	p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
+	return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
+}
+
 static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
 		size_t buffer_size, size_t *count)
 {
@@ -495,19 +514,16 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
 			continue;
 
 		(*count)++;
-		if (ext_tree_layoutupdate_size(*count) > buffer_size) {
+		if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) {
 			/* keep counting.. */
 			ret = -ENOSPC;
 			continue;
 		}
 
-		p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
-				NFS4_DEVICEID4_SIZE);
-		p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
-		p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
-		p = xdr_encode_hyper(p, 0LL);
-		*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
-
+		if (bl->bl_scsi_layout)
+			p = encode_scsi_range(be, p);
+		else
+			p = encode_block_extent(be, p);
 		be->be_tag = EXTENT_COMMITTING;
 	}
 	spin_unlock(&bl->bl_ext_lock);
@@ -536,7 +552,7 @@ retry:
 	if (unlikely(ret)) {
 		ext_tree_free_commitdata(arg, buffer_size);
 
-		buffer_size = ext_tree_layoutupdate_size(count);
+		buffer_size = ext_tree_layoutupdate_size(bl, count);
 		count = 0;
 
 		arg->layoutupdate_pages =
@@ -555,7 +571,7 @@ retry:
 	}
 
 	*start_p = cpu_to_be32(count);
-	arg->layoutupdate_len = ext_tree_layoutupdate_size(count);
+	arg->layoutupdate_len = ext_tree_layoutupdate_size(bl, count);
 
 	if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
 		void *p = start_p, *end = p + arg->layoutupdate_len;
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
index dbe5839..9fb067a6 100644
--- a/fs/nfs/blocklayout/rpc_pipefs.c
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -281,7 +281,7 @@ out:
 	return ret;
 }
 
-void __exit bl_cleanup_pipefs(void)
+void bl_cleanup_pipefs(void)
 {
 	rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
 	unregister_pernet_subsys(&nfs4blocklayout_net_ops);
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 3/4] nfsd: add a new config option for the block layout driver
  2016-03-04 19:46 pNFS SCSI layout support V2 Christoph Hellwig
  2016-03-04 19:46 ` [PATCH 1/4] nfs4.h: add SCSI layout defintions Christoph Hellwig
  2016-03-04 19:46 ` [PATCH 2/4] nfs/blocklayout: add SCSI layout support Christoph Hellwig
@ 2016-03-04 19:46 ` Christoph Hellwig
  2016-03-04 19:46 ` [PATCH 4/4] nfsd: add SCSI layout support Christoph Hellwig
  3 siblings, 0 replies; 17+ messages in thread
From: Christoph Hellwig @ 2016-03-04 19:46 UTC (permalink / raw)
  To: trond.myklebust, bfields; +Cc: linux-nfs

Split the config symbols into a generic pNFS one, which is invisible
and gets selected by the layout drivers, and one for the block layout
driver.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/nfsd/Kconfig       | 13 +++++++++----
 fs/nfsd/Makefile      |  3 ++-
 fs/nfsd/nfs4layouts.c |  4 ++++
 fs/nfsd/pnfs.h        |  2 ++
 fs/xfs/Makefile       |  2 +-
 fs/xfs/xfs_export.c   |  2 +-
 fs/xfs/xfs_pnfs.h     |  2 +-
 7 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index a0b77fc..eb70d91 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -84,12 +84,17 @@ config NFSD_V4
 	  If unsure, say N.
 
 config NFSD_PNFS
-	bool "NFSv4.1 server support for Parallel NFS (pNFS)"
+	bool
+
+config NFSD_BLOCKLAYOUT
+	bool "NFSv4.1 server support for pNFS block layouts"
 	depends on NFSD_V4
+	select NFSD_PNFS
 	help
-	  This option enables support for the parallel NFS features of the
-	  minor version 1 of the NFSv4 protocol (RFC5661) in the kernel's NFS
-	  server.
+	  This option enables support for exporting pNFS block layouts
+	  in the kernel's NFS server. The pNFS block layout enables NFS
+	  clients to directly perform I/O to block devices accessible to both
+	  the server and the clients.  See RFC 5663 for more details.
 
 	  If unsure, say N.
 
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 9a6028e..679cdc6 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -17,4 +17,5 @@ nfsd-$(CONFIG_NFSD_V3)	+= nfs3proc.o nfs3xdr.o
 nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
 nfsd-$(CONFIG_NFSD_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
 			   nfs4acl.o nfs4callback.o nfs4recover.o
-nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o blocklayout.o blocklayoutxdr.o
+nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
+nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index ce2d010..4e4def7 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -26,7 +26,9 @@ static const struct nfsd4_callback_ops nfsd4_cb_layout_ops;
 static const struct lock_manager_operations nfsd4_layouts_lm_ops;
 
 const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] =  {
+#ifdef CONFIG_NFSD_BLOCKLAYOUT
 	[LAYOUT_BLOCK_VOLUME]	= &bl_layout_ops,
+#endif
 };
 
 /* pNFS device ID to export fsid mapping */
@@ -121,10 +123,12 @@ void nfsd4_setup_layout_type(struct svc_export *exp)
 	if (!(exp->ex_flags & NFSEXP_PNFS))
 		return;
 
+#ifdef CONFIG_NFSD_BLOCKLAYOUT
 	if (sb->s_export_op->get_uuid &&
 	    sb->s_export_op->map_blocks &&
 	    sb->s_export_op->commit_blocks)
 		exp->ex_layout_type = LAYOUT_BLOCK_VOLUME;
+#endif
 }
 
 static void
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
index d4c4453..ff50bfa 100644
--- a/fs/nfsd/pnfs.h
+++ b/fs/nfsd/pnfs.h
@@ -35,7 +35,9 @@ struct nfsd4_layout_ops {
 };
 
 extern const struct nfsd4_layout_ops *nfsd4_layout_ops[];
+#ifdef CONFIG_NFSD_BLOCKLAYOUT
 extern const struct nfsd4_layout_ops bl_layout_ops;
+#endif
 
 __be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
 		struct nfsd4_compound_state *cstate, stateid_t *stateid,
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index f646391..d68b62a 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -121,4 +121,4 @@ xfs-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o
 xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o
 xfs-$(CONFIG_SYSCTL)		+= xfs_sysctl.o
 xfs-$(CONFIG_COMPAT)		+= xfs_ioctl32.o
-xfs-$(CONFIG_NFSD_PNFS)		+= xfs_pnfs.o
+xfs-$(CONFIG_NFSD_BLOCKLAYOUT)	+= xfs_pnfs.o
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 652cd3c..0492b82 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -246,7 +246,7 @@ const struct export_operations xfs_export_operations = {
 	.fh_to_parent		= xfs_fs_fh_to_parent,
 	.get_parent		= xfs_fs_get_parent,
 	.commit_metadata	= xfs_fs_nfs_commit_metadata,
-#ifdef CONFIG_NFSD_PNFS
+#ifdef CONFIG_NFSD_BLOCKLAYOUT
 	.get_uuid		= xfs_fs_get_uuid,
 	.map_blocks		= xfs_fs_map_blocks,
 	.commit_blocks		= xfs_fs_commit_blocks,
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
index 8147ac1..d85529c 100644
--- a/fs/xfs/xfs_pnfs.h
+++ b/fs/xfs/xfs_pnfs.h
@@ -1,7 +1,7 @@
 #ifndef _XFS_PNFS_H
 #define _XFS_PNFS_H 1
 
-#ifdef CONFIG_NFSD_PNFS
+#ifdef CONFIG_NFSD_BLOCKLAYOUT
 int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset);
 int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
 		struct iomap *iomap, bool write, u32 *device_generation);
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 4/4] nfsd: add SCSI layout support
  2016-03-04 19:46 pNFS SCSI layout support V2 Christoph Hellwig
                   ` (2 preceding siblings ...)
  2016-03-04 19:46 ` [PATCH 3/4] nfsd: add a new config option for the block layout driver Christoph Hellwig
@ 2016-03-04 19:46 ` Christoph Hellwig
  2016-03-08 22:15   ` J. Bruce Fields
                     ` (3 more replies)
  3 siblings, 4 replies; 17+ messages in thread
From: Christoph Hellwig @ 2016-03-04 19:46 UTC (permalink / raw)
  To: trond.myklebust, bfields; +Cc: linux-nfs

This is a simple extension to the block layout driver to use SCSI
persistent reservations for access control and fencing, as well as
SCSI VPD pages for device identification.

For this we need to pass the nfs4_client to the proc_getdeviceinfo method
to generate the reservation key, and add a new fence_client method
to allow for fence actions in the layout driver.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 Documentation/filesystems/nfs/pnfs-scsi-server.txt |  22 ++
 fs/nfsd/Kconfig                                    |  13 +
 fs/nfsd/Makefile                                   |   1 +
 fs/nfsd/blocklayout.c                              | 298 ++++++++++++++++++---
 fs/nfsd/blocklayoutxdr.c                           |  65 ++++-
 fs/nfsd/blocklayoutxdr.h                           |  14 +
 fs/nfsd/nfs4layouts.c                              |  27 +-
 fs/nfsd/nfs4proc.c                                 |   6 +-
 fs/nfsd/pnfs.h                                     |   6 +
 fs/xfs/Makefile                                    |   1 +
 fs/xfs/xfs_pnfs.h                                  |   2 +-
 11 files changed, 407 insertions(+), 48 deletions(-)
 create mode 100644 Documentation/filesystems/nfs/pnfs-scsi-server.txt

diff --git a/Documentation/filesystems/nfs/pnfs-scsi-server.txt b/Documentation/filesystems/nfs/pnfs-scsi-server.txt
new file mode 100644
index 0000000..4150979
--- /dev/null
+++ b/Documentation/filesystems/nfs/pnfs-scsi-server.txt
@@ -0,0 +1,22 @@
+
+pNFS SCSI layout server user guide
+==================================
+
+This document describes support for pNFS SCSI layouts in the Linux NFS server.
+With pNFS SCSI layouts, the NFS server acts as Metadata Server (MDS) for pNFS,
+which in addition to handling all the metadata access to the NFS export,
+also hands out layouts to the clients so that they can directly access the
+underlying SCSI LUNs that are shared with the client.
+
+To use pNFS SCSI layouts with the Linux NFS server, the exported file
+system needs to support the pNFS SCSI layouts (currently just XFS), and the
+file system must sit on a SCSI LUN that is accessible to the clients in
+addition to the MDS.  As of now the file system needs to sit directly on the
+exported LUN; striping or concatenation of LUNs on the MDS and clients
+is not supported yet.
+
+On the server, pNFS SCSI volume support is automatically enabled if the
+file system is exported using the "pnfs" option and the underlying SCSI
+device supports persistent reservations.  On the client make sure the kernel
+has the CONFIG_PNFS_BLOCK option enabled, and the file system is mounted
+using the NFSv4.1 protocol version (mount -o vers=4.1).
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index eb70d91..a30a313 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -98,6 +98,19 @@ config NFSD_BLOCKLAYOUT
 
 	  If unsure, say N.
 
+config NFSD_SCSILAYOUT
+	bool "NFSv4.1 server support for pNFS SCSI layouts"
+	depends on NFSD_V4
+	select NFSD_PNFS
+	help
+	  This option enables support for exporting pNFS SCSI layouts
+	  in the kernel's NFS server. The pNFS SCSI layout enables NFS
+	  clients to directly perform I/O to SCSI devices accessible to both
+	  the server and the clients.  See draft-ietf-nfsv4-scsi-layout for
+	  more details.
+
+	  If unsure, say N.
+
 config NFSD_V4_SECURITY_LABEL
 	bool "Provide Security Label support for NFSv4 server"
 	depends on NFSD_V4 && SECURITY
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 679cdc6..3ae5f3c 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -19,3 +19,4 @@ nfsd-$(CONFIG_NFSD_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
 			   nfs4acl.o nfs4callback.o nfs4recover.o
 nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
 nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
+nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index c29d942..0e87e3e 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -1,11 +1,14 @@
 /*
- * Copyright (c) 2014 Christoph Hellwig.
+ * Copyright (c) 2014-2016 Christoph Hellwig.
  */
 #include <linux/exportfs.h>
 #include <linux/genhd.h>
 #include <linux/slab.h>
+#include <linux/pr.h>
 
 #include <linux/nfsd/debug.h>
+#include <scsi/scsi_proto.h>
+#include <scsi/scsi_common.h>
 
 #include "blocklayoutxdr.h"
 #include "pnfs.h"
@@ -13,37 +16,6 @@
 #define NFSDDBG_FACILITY	NFSDDBG_PNFS
 
 
-static int
-nfsd4_block_get_device_info_simple(struct super_block *sb,
-		struct nfsd4_getdeviceinfo *gdp)
-{
-	struct pnfs_block_deviceaddr *dev;
-	struct pnfs_block_volume *b;
-
-	dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
-		      sizeof(struct pnfs_block_volume), GFP_KERNEL);
-	if (!dev)
-		return -ENOMEM;
-	gdp->gd_device = dev;
-
-	dev->nr_volumes = 1;
-	b = &dev->volumes[0];
-
-	b->type = PNFS_BLOCK_VOLUME_SIMPLE;
-	b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
-	return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
-			&b->simple.offset);
-}
-
-static __be32
-nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
-		struct nfsd4_getdeviceinfo *gdp)
-{
-	if (sb->s_bdev != sb->s_bdev->bd_contains)
-		return nfserr_inval;
-	return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
-}
-
 static __be32
 nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
 		struct nfsd4_layoutget *args)
@@ -141,20 +113,13 @@ out_layoutunavailable:
 }
 
 static __be32
-nfsd4_block_proc_layoutcommit(struct inode *inode,
-		struct nfsd4_layoutcommit *lcp)
+nfsd4_block_commit_blocks(struct inode *inode, struct nfsd4_layoutcommit *lcp,
+		struct iomap *iomaps, int nr_iomaps)
 {
 	loff_t new_size = lcp->lc_last_wr + 1;
 	struct iattr iattr = { .ia_valid = 0 };
-	struct iomap *iomaps;
-	int nr_iomaps;
 	int error;
 
-	nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
-			lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
-	if (nr_iomaps < 0)
-		return nfserrno(nr_iomaps);
-
 	if (lcp->lc_mtime.tv_nsec == UTIME_NOW ||
 	    timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0)
 		lcp->lc_mtime = current_fs_time(inode->i_sb);
@@ -172,6 +137,54 @@ nfsd4_block_proc_layoutcommit(struct inode *inode,
 	return nfserrno(error);
 }
 
+#ifdef CONFIG_NFSD_BLOCKLAYOUT
+static int
+nfsd4_block_get_device_info_simple(struct super_block *sb,
+		struct nfsd4_getdeviceinfo *gdp)
+{
+	struct pnfs_block_deviceaddr *dev;
+	struct pnfs_block_volume *b;
+
+	dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
+		      sizeof(struct pnfs_block_volume), GFP_KERNEL);
+	if (!dev)
+		return -ENOMEM;
+	gdp->gd_device = dev;
+
+	dev->nr_volumes = 1;
+	b = &dev->volumes[0];
+
+	b->type = PNFS_BLOCK_VOLUME_SIMPLE;
+	b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
+	return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
+			&b->simple.offset);
+}
+
+static __be32
+nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
+		struct nfs4_client *clp,
+		struct nfsd4_getdeviceinfo *gdp)
+{
+	if (sb->s_bdev != sb->s_bdev->bd_contains)
+		return nfserr_inval;
+	return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
+}
+
+static __be32
+nfsd4_block_proc_layoutcommit(struct inode *inode,
+		struct nfsd4_layoutcommit *lcp)
+{
+	struct iomap *iomaps;
+	int nr_iomaps;
+
+	nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
+			lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
+	if (nr_iomaps < 0)
+		return nfserrno(nr_iomaps);
+
+	return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
+}
+
 const struct nfsd4_layout_ops bl_layout_ops = {
 	/*
 	 * Pretend that we send notification to the client.  This is a blatant
@@ -190,3 +203,206 @@ const struct nfsd4_layout_ops bl_layout_ops = {
 	.encode_layoutget	= nfsd4_block_encode_layoutget,
 	.proc_layoutcommit	= nfsd4_block_proc_layoutcommit,
 };
+#endif /* CONFIG_NFSD_BLOCKLAYOUT */
+
+#ifdef CONFIG_NFSD_SCSILAYOUT
+static int nfsd4_scsi_identify_device(struct block_device *bdev,
+		struct pnfs_block_volume *b)
+{
+	struct request_queue *q = bdev->bd_disk->queue;
+	struct request *rq;
+	size_t bufflen = 252, len, id_len;
+	u8 *buf, *d, type, assoc;
+	int error;
+
+	buf = kzalloc(bufflen, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	rq = blk_get_request(q, READ, GFP_KERNEL);
+	if (IS_ERR(rq)) {
+		error = -ENOMEM;
+		goto out_free_buf;
+	}
+	blk_rq_set_block_pc(rq);
+
+	error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL);
+	if (error)
+		goto out_put_request;
+
+	rq->cmd[0] = INQUIRY;
+	rq->cmd[1] = 1;
+	rq->cmd[2] = 0x83;
+	rq->cmd[3] = bufflen >> 8;
+	rq->cmd[4] = bufflen & 0xff;
+	rq->cmd_len = COMMAND_SIZE(INQUIRY);
+
+	error = blk_execute_rq(rq->q, NULL, rq, 1);
+	if (error) {
+		pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
+			rq->errors);
+		goto out_put_request;
+	}
+
+	len = (buf[2] << 8) + buf[3] + 4;
+	if (len > bufflen) {
+		pr_err("pNFS: INQUIRY 0x83 response invalid (len = %zd)\n",
+			len);
+		goto out_put_request;
+	}
+
+	d = buf + 4;
+	for (d = buf + 4; d < buf + len; d += id_len + 4) {
+		id_len = d[3];
+		type = d[1] & 0xf;
+		assoc = (d[1] >> 4) & 0x3;
+
+		/*
+		 * We only care about a EUI-64 and NAA designator types
+		 * with LU association.
+		 */
+		if (assoc != 0x00)
+			continue;
+		if (type != 0x02 && type != 0x03)
+			continue;
+		if (id_len != 8 && id_len != 12 && id_len != 16)
+			continue;
+
+		b->scsi.code_set = PS_CODE_SET_BINARY;
+		b->scsi.designator_type = type == 0x02 ?
+			PS_DESIGNATOR_EUI64 : PS_DESIGNATOR_NAA;
+		b->scsi.designator_len = id_len;
+		memcpy(b->scsi.designator, d + 4, id_len);
+
+		/*
+		 * If we found a 8 or 12 byte descriptor continue on to
+		 * see if a 16 byte one is available.  If we find a
+		 * 16 byte descriptor we're done.
+		 */
+		if (id_len == 16)
+			break;
+	}
+
+out_put_request:
+	blk_put_request(rq);
+out_free_buf:
+	kfree(buf);
+	return error;
+}
+
+#define NFSD_MDS_PR_KEY		0x0100000000000000
+
+/*
+ * We use the client ID as a unique key for the reservations.
+ * This allows us to easily fence a client when recalls fail.
+ */
+static u64 nfsd4_scsi_pr_key(struct nfs4_client *clp)
+{
+	return ((u64)clp->cl_clientid.cl_boot << 32) | clp->cl_clientid.cl_id;
+}
+
+static int
+nfsd4_block_get_device_info_scsi(struct super_block *sb,
+		struct nfs4_client *clp,
+		struct nfsd4_getdeviceinfo *gdp)
+{
+	struct pnfs_block_deviceaddr *dev;
+	struct pnfs_block_volume *b;
+	const struct pr_ops *ops;
+	int error;
+
+	dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
+		      sizeof(struct pnfs_block_volume), GFP_KERNEL);
+	if (!dev)
+		return -ENOMEM;
+	gdp->gd_device = dev;
+
+	dev->nr_volumes = 1;
+	b = &dev->volumes[0];
+
+	b->type = PNFS_BLOCK_VOLUME_SCSI;
+	b->scsi.pr_key = nfsd4_scsi_pr_key(clp);
+
+	error = nfsd4_scsi_identify_device(sb->s_bdev, b);
+	if (error)
+		return error;
+
+	ops = sb->s_bdev->bd_disk->fops->pr_ops;
+	if (!ops) {
+		pr_err("pNFS: device %s does not support PRs.\n",
+			sb->s_id);
+		return -EINVAL;
+	}
+
+	error = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true);
+	if (error) {
+		pr_err("pNFS: failed to register key for device %s.\n",
+			sb->s_id);
+		return -EINVAL;
+	}
+
+	error = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY,
+			PR_EXCLUSIVE_ACCESS_REG_ONLY, 0);
+	if (error) {
+		pr_err("pNFS: failed to reserve device %s.\n",
+			sb->s_id);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static __be32
+nfsd4_scsi_proc_getdeviceinfo(struct super_block *sb,
+		struct nfs4_client *clp,
+		struct nfsd4_getdeviceinfo *gdp)
+{
+	if (sb->s_bdev != sb->s_bdev->bd_contains)
+		return nfserr_inval;
+	return nfserrno(nfsd4_block_get_device_info_scsi(sb, clp, gdp));
+}
+static __be32
+nfsd4_scsi_proc_layoutcommit(struct inode *inode,
+		struct nfsd4_layoutcommit *lcp)
+{
+	struct iomap *iomaps;
+	int nr_iomaps;
+
+	nr_iomaps = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout,
+			lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
+	if (nr_iomaps < 0)
+		return nfserrno(nr_iomaps);
+
+	return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
+}
+
+static void
+nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls)
+{
+	struct nfs4_client *clp = ls->ls_stid.sc_client;
+	struct block_device *bdev = ls->ls_file->f_path.mnt->mnt_sb->s_bdev;
+
+	bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY,
+			nfsd4_scsi_pr_key(clp), 0, true);
+}
+
+const struct nfsd4_layout_ops scsi_layout_ops = {
+	/*
+	 * Pretend that we send notification to the client.  This is a blatant
+	 * lie to force recent Linux clients to cache our device IDs.
+	 * We rarely ever change the device ID, so the harm of leaking deviceids
+	 * for a while isn't too bad.  Unfortunately RFC5661 is a complete mess
+	 * in this regard, but I filed errata 4119 for this a while ago, and
+	 * hopefully the Linux client will eventually start caching deviceids
+	 * without this again.
+	 */
+	.notify_types		=
+			NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE,
+	.proc_getdeviceinfo	= nfsd4_scsi_proc_getdeviceinfo,
+	.encode_getdeviceinfo	= nfsd4_block_encode_getdeviceinfo,
+	.proc_layoutget		= nfsd4_block_proc_layoutget,
+	.encode_layoutget	= nfsd4_block_encode_layoutget,
+	.proc_layoutcommit	= nfsd4_scsi_proc_layoutcommit,
+	.fence_client		= nfsd4_scsi_fence_client,
+};
+#endif /* CONFIG_NFSD_SCSILAYOUT */
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 6d834dc..ca18836 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014 Christoph Hellwig.
+ * Copyright (c) 2014-2016 Christoph Hellwig.
  */
 #include <linux/sunrpc/svc.h>
 #include <linux/exportfs.h>
@@ -53,6 +53,18 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
 		p = xdr_encode_hyper(p, b->simple.offset);
 		p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len);
 		break;
+	case PNFS_BLOCK_VOLUME_SCSI:
+		len = 4 + 4 + 4 + 4 + b->scsi.designator_len + 8;
+		p = xdr_reserve_space(xdr, len);
+		if (!p)
+			return -ETOOSMALL;
+
+		*p++ = cpu_to_be32(b->type);
+		*p++ = cpu_to_be32(b->scsi.code_set);
+		*p++ = cpu_to_be32(b->scsi.designator_type);
+		p = xdr_encode_opaque(p, b->scsi.designator, b->scsi.designator_len);
+		p = xdr_encode_hyper(p, b->scsi.pr_key);
+		break;
 	default:
 		return -ENOTSUPP;
 	}
@@ -155,3 +167,54 @@ fail:
 	kfree(iomaps);
 	return -EINVAL;
 }
+
+int
+nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
+		u32 block_size)
+{
+	struct iomap *iomaps;
+	u32 nr_iomaps, expected, i;
+
+	if (len < sizeof(u32)) {
+		dprintk("%s: extent array too small: %u\n", __func__, len);
+		return -EINVAL;
+	}
+
+	nr_iomaps = be32_to_cpup(p++);
+	expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE;
+	if (len != expected) {
+		dprintk("%s: extent array size mismatch: %u/%u\n",
+			__func__, len, expected);
+		return -EINVAL;
+	}
+
+	iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL);
+	if (!iomaps) {
+		dprintk("%s: failed to allocate extent array\n", __func__);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < nr_iomaps; i++) {
+		u64 val;
+
+		p = xdr_decode_hyper(p, &val);
+		if (val & (block_size - 1)) {
+			dprintk("%s: unaligned offset 0x%llx\n", __func__, val);
+			goto fail;
+		}
+		iomaps[i].offset = val;
+
+		p = xdr_decode_hyper(p, &val);
+		if (val & (block_size - 1)) {
+			dprintk("%s: unaligned length 0x%llx\n", __func__, val);
+			goto fail;
+		}
+		iomaps[i].length = val;
+	}
+
+	*iomapp = iomaps;
+	return nr_iomaps;
+fail:
+	kfree(iomaps);
+	return -EINVAL;
+}
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
index 6de925f..397bc75 100644
--- a/fs/nfsd/blocklayoutxdr.h
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -15,6 +15,11 @@ struct pnfs_block_extent {
 	enum pnfs_block_extent_state	es;
 };
 
+struct pnfs_block_range {
+	u64				foff;
+	u64				len;
+};
+
 /*
  * Random upper cap for the uuid length to avoid unbounded allocation.
  * Not actually limited by the protocol.
@@ -29,6 +34,13 @@ struct pnfs_block_volume {
 			u32		sig_len;
 			u8		sig[PNFS_BLOCK_UUID_LEN];
 		} simple;
+		struct {
+			enum scsi_code_set		code_set;
+			enum scsi_designator_type	designator_type;
+			int				designator_len;
+			u8				designator[256];
+			u64				pr_key;
+		} scsi;
 	};
 };
 
@@ -43,5 +55,7 @@ __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
 		struct nfsd4_layoutget *lgp);
 int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
 		u32 block_size);
+int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
+		u32 block_size);
 
 #endif /* _NFSD_BLOCKLAYOUTXDR_H */
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 4e4def7..cbd804e 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2014 Christoph Hellwig.
  */
+#include <linux/blkdev.h>
 #include <linux/kmod.h>
 #include <linux/file.h>
 #include <linux/jhash.h>
@@ -29,6 +30,9 @@ const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] =  {
 #ifdef CONFIG_NFSD_BLOCKLAYOUT
 	[LAYOUT_BLOCK_VOLUME]	= &bl_layout_ops,
 #endif
+#ifdef CONFIG_NFSD_SCSILAYOUT
+	[LAYOUT_SCSI]		= &scsi_layout_ops,
+#endif
 };
 
 /* pNFS device ID to export fsid mapping */
@@ -123,12 +127,24 @@ void nfsd4_setup_layout_type(struct svc_export *exp)
 	if (!(exp->ex_flags & NFSEXP_PNFS))
 		return;
 
+	/*
+	 * Check if the file system supports exporting a block-like layout.
+	 * If the block device supports reservations prefer the SCSI layout,
+	 * else advertise the block layout.
+	 */
 #ifdef CONFIG_NFSD_BLOCKLAYOUT
 	if (sb->s_export_op->get_uuid &&
 	    sb->s_export_op->map_blocks &&
 	    sb->s_export_op->commit_blocks)
 		exp->ex_layout_type = LAYOUT_BLOCK_VOLUME;
 #endif
+#ifdef CONFIG_NFSD_SCSILAYOUT
+	/* overwrite block layout selection if needed */
+	if (sb->s_export_op->map_blocks &&
+	    sb->s_export_op->commit_blocks &&
+	    sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops)
+		exp->ex_layout_type = LAYOUT_SCSI;
+#endif
 }
 
 static void
@@ -594,8 +610,6 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
 
 	rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str));
 
-	trace_layout_recall_fail(&ls->ls_stid.sc_stateid);
-
 	printk(KERN_WARNING
 		"nfsd: client %s failed to respond to layout recall. "
 		"  Fencing..\n", addr_str);
@@ -630,6 +644,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
 		container_of(cb, struct nfs4_layout_stateid, ls_recall);
 	struct nfsd_net *nn;
 	ktime_t now, cutoff;
+	const struct nfsd4_layout_ops *ops;
 	LIST_HEAD(reaplist);
 
 
@@ -665,7 +680,13 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
 		/*
 		 * Unknown error or non-responding client, we'll need to fence.
 		 */
-		nfsd4_cb_layout_fail(ls);
+		trace_layout_recall_fail(&ls->ls_stid.sc_stateid);
+
+		ops = nfsd4_layout_ops[ls->ls_layout_type];
+		if (ops->fence_client)
+			ops->fence_client(ls);
+		else
+			nfsd4_cb_layout_fail(ls);
 		return -1;
 	}
 }
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 4cba786..629443e 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1269,8 +1269,10 @@ nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
 		goto out;
 
 	nfserr = nfs_ok;
-	if (gdp->gd_maxcount != 0)
-		nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp);
+	if (gdp->gd_maxcount != 0) {
+		nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb,
+					cstate->session->se_client, gdp);
+	}
 
 	gdp->gd_notify_types &= ops->notify_types;
 out:
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
index ff50bfa..7d073b9 100644
--- a/fs/nfsd/pnfs.h
+++ b/fs/nfsd/pnfs.h
@@ -21,6 +21,7 @@ struct nfsd4_layout_ops {
 	u32		notify_types;
 
 	__be32 (*proc_getdeviceinfo)(struct super_block *sb,
+			struct nfs4_client *clp,
 			struct nfsd4_getdeviceinfo *gdevp);
 	__be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr,
 			struct nfsd4_getdeviceinfo *gdevp);
@@ -32,12 +33,17 @@ struct nfsd4_layout_ops {
 
 	__be32 (*proc_layoutcommit)(struct inode *inode,
 			struct nfsd4_layoutcommit *lcp);
+
+	void (*fence_client)(struct nfs4_layout_stateid *ls);
 };
 
 extern const struct nfsd4_layout_ops *nfsd4_layout_ops[];
 #ifdef CONFIG_NFSD_BLOCKLAYOUT
 extern const struct nfsd4_layout_ops bl_layout_ops;
 #endif
+#ifdef CONFIG_NFSD_SCSILAYOUT
+extern const struct nfsd4_layout_ops scsi_layout_ops;
+#endif
 
 __be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
 		struct nfsd4_compound_state *cstate, stateid_t *stateid,
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index d68b62a..3542d94 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -122,3 +122,4 @@ xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o
 xfs-$(CONFIG_SYSCTL)		+= xfs_sysctl.o
 xfs-$(CONFIG_COMPAT)		+= xfs_ioctl32.o
 xfs-$(CONFIG_NFSD_BLOCKLAYOUT)	+= xfs_pnfs.o
+xfs-$(CONFIG_NFSD_SCSILAYOUT)	+= xfs_pnfs.o
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
index d85529c..93f7485 100644
--- a/fs/xfs/xfs_pnfs.h
+++ b/fs/xfs/xfs_pnfs.h
@@ -1,7 +1,7 @@
 #ifndef _XFS_PNFS_H
 #define _XFS_PNFS_H 1
 
-#ifdef CONFIG_NFSD_BLOCKLAYOUT
+#if defined(CONFIG_NFSD_BLOCKLAYOUT) || defined(CONFIG_NFSD_SCSILAYOUT)
 int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset);
 int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
 		struct iomap *iomap, bool write, u32 *device_generation);
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/4] nfs/blocklayout: add SCSI layout support
  2016-03-04 19:46 ` [PATCH 2/4] nfs/blocklayout: add SCSI layout support Christoph Hellwig
@ 2016-03-08 22:07   ` J. Bruce Fields
  2016-03-08 22:42     ` Trond Myklebust
  2016-03-08 22:09   ` J. Bruce Fields
  1 sibling, 1 reply; 17+ messages in thread
From: J. Bruce Fields @ 2016-03-08 22:07 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: trond.myklebust, linux-nfs

Trond, OK if I take this through the nfsd tree?

Or I'm OK with doing this however.

--b.

On Fri, Mar 04, 2016 at 08:46:15PM +0100, Christoph Hellwig wrote:
> This is a trivial extension to the block layout driver to support the
> new SCSI layouts draft.  There are three changes:
> 
>  - device identification through the SCSI VPD page.  This allows us to
>    directly use the udev generated persistent device names instead of
>    requiring an expensive lookup by crawling every block device node
>    in /dev and reading a signature for it.
>  - use of SCSI persistent reservations to protect device access and
>    allow for robust fencing.  On the client sides this just means
>    registering and unregistering a server supplied key.
>  - an optimized LAYOUTCOMMIT payload that doesn't send unnecessary
>    fields to the server.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/nfs/blocklayout/blocklayout.c |  59 ++++++++++++++--
>  fs/nfs/blocklayout/blocklayout.h |  14 +++-
>  fs/nfs/blocklayout/dev.c         | 144 ++++++++++++++++++++++++++++++++++++++-
>  fs/nfs/blocklayout/extent_tree.c |  44 ++++++++----
>  fs/nfs/blocklayout/rpc_pipefs.c  |   2 +-
>  5 files changed, 238 insertions(+), 25 deletions(-)
> 
> diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
> index ddd0138..b27c409 100644
> --- a/fs/nfs/blocklayout/blocklayout.c
> +++ b/fs/nfs/blocklayout/blocklayout.c
> @@ -446,8 +446,8 @@ static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
>  	kfree(bl);
>  }
>  
> -static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
> -						   gfp_t gfp_flags)
> +static struct pnfs_layout_hdr *__bl_alloc_layout_hdr(struct inode *inode,
> +		gfp_t gfp_flags, bool is_scsi_layout)
>  {
>  	struct pnfs_block_layout *bl;
>  
> @@ -460,9 +460,22 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
>  	bl->bl_ext_ro = RB_ROOT;
>  	spin_lock_init(&bl->bl_ext_lock);
>  
> +	bl->bl_scsi_layout = is_scsi_layout;
>  	return &bl->bl_layout;
>  }
>  
> +static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
> +						   gfp_t gfp_flags)
> +{
> +	return __bl_alloc_layout_hdr(inode, gfp_flags, false);
> +}
> +
> +static struct pnfs_layout_hdr *sl_alloc_layout_hdr(struct inode *inode,
> +						   gfp_t gfp_flags)
> +{
> +	return __bl_alloc_layout_hdr(inode, gfp_flags, true);
> +}
> +
>  static void bl_free_lseg(struct pnfs_layout_segment *lseg)
>  {
>  	dprintk("%s enter\n", __func__);
> @@ -888,22 +901,53 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
>  	.sync				= pnfs_generic_sync,
>  };
>  
> +static struct pnfs_layoutdriver_type scsilayout_type = {
> +	.id				= LAYOUT_SCSI,
> +	.name				= "LAYOUT_SCSI",
> +	.owner				= THIS_MODULE,
> +	.flags				= PNFS_LAYOUTRET_ON_SETATTR |
> +					  PNFS_READ_WHOLE_PAGE,
> +	.read_pagelist			= bl_read_pagelist,
> +	.write_pagelist			= bl_write_pagelist,
> +	.alloc_layout_hdr		= sl_alloc_layout_hdr,
> +	.free_layout_hdr		= bl_free_layout_hdr,
> +	.alloc_lseg			= bl_alloc_lseg,
> +	.free_lseg			= bl_free_lseg,
> +	.return_range			= bl_return_range,
> +	.prepare_layoutcommit		= bl_prepare_layoutcommit,
> +	.cleanup_layoutcommit		= bl_cleanup_layoutcommit,
> +	.set_layoutdriver		= bl_set_layoutdriver,
> +	.alloc_deviceid_node		= bl_alloc_deviceid_node,
> +	.free_deviceid_node		= bl_free_deviceid_node,
> +	.pg_read_ops			= &bl_pg_read_ops,
> +	.pg_write_ops			= &bl_pg_write_ops,
> +	.sync				= pnfs_generic_sync,
> +};
> +
> +
>  static int __init nfs4blocklayout_init(void)
>  {
>  	int ret;
>  
>  	dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
>  
> -	ret = pnfs_register_layoutdriver(&blocklayout_type);
> +	ret = bl_init_pipefs();
>  	if (ret)
>  		goto out;
> -	ret = bl_init_pipefs();
> +
> +	ret = pnfs_register_layoutdriver(&blocklayout_type);
>  	if (ret)
> -		goto out_unregister;
> +		goto out_cleanup_pipe;
> +
> +	ret = pnfs_register_layoutdriver(&scsilayout_type);
> +	if (ret)
> +		goto out_unregister_block;
>  	return 0;
>  
> -out_unregister:
> +out_unregister_block:
>  	pnfs_unregister_layoutdriver(&blocklayout_type);
> +out_cleanup_pipe:
> +	bl_cleanup_pipefs();
>  out:
>  	return ret;
>  }
> @@ -913,8 +957,9 @@ static void __exit nfs4blocklayout_exit(void)
>  	dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
>  	       __func__);
>  
> -	bl_cleanup_pipefs();
> +	pnfs_unregister_layoutdriver(&scsilayout_type);
>  	pnfs_unregister_layoutdriver(&blocklayout_type);
> +	bl_cleanup_pipefs();
>  }
>  
>  MODULE_ALIAS("nfs-layouttype4-3");
> diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
> index c556640..bc21205 100644
> --- a/fs/nfs/blocklayout/blocklayout.h
> +++ b/fs/nfs/blocklayout/blocklayout.h
> @@ -55,7 +55,6 @@ struct pnfs_block_dev;
>   */
>  #define PNFS_BLOCK_UUID_LEN	128
>  
> -
>  struct pnfs_block_volume {
>  	enum pnfs_block_volume_type	type;
>  	union {
> @@ -82,6 +81,13 @@ struct pnfs_block_volume {
>  			u32		volumes_count;
>  			u32		volumes[PNFS_BLOCK_MAX_DEVICES];
>  		} stripe;
> +		struct {
> +			enum scsi_code_set		code_set;
> +			enum scsi_designator_type	designator_type;
> +			int				designator_len;
> +			u8				designator[256];
> +			u64				pr_key;
> +		} scsi;
>  	};
>  };
>  
> @@ -106,6 +112,9 @@ struct pnfs_block_dev {
>  	struct block_device		*bdev;
>  	u64				disk_offset;
>  
> +	u64				pr_key;
> +	bool				pr_registered;
> +
>  	bool (*map)(struct pnfs_block_dev *dev, u64 offset,
>  			struct pnfs_block_dev_map *map);
>  };
> @@ -131,6 +140,7 @@ struct pnfs_block_layout {
>  	struct rb_root		bl_ext_rw;
>  	struct rb_root		bl_ext_ro;
>  	spinlock_t		bl_ext_lock;   /* Protects list manipulation */
> +	bool			bl_scsi_layout;
>  };
>  
>  static inline struct pnfs_block_layout *
> @@ -182,6 +192,6 @@ void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
>  dev_t bl_resolve_deviceid(struct nfs_server *server,
>  		struct pnfs_block_volume *b, gfp_t gfp_mask);
>  int __init bl_init_pipefs(void);
> -void __exit bl_cleanup_pipefs(void);
> +void bl_cleanup_pipefs(void);
>  
>  #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
> diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
> index a861bbd..31b0f6b 100644
> --- a/fs/nfs/blocklayout/dev.c
> +++ b/fs/nfs/blocklayout/dev.c
> @@ -1,11 +1,12 @@
>  /*
> - * Copyright (c) 2014 Christoph Hellwig.
> + * Copyright (c) 2014-2016 Christoph Hellwig.
>   */
>  #include <linux/sunrpc/svc.h>
>  #include <linux/blkdev.h>
>  #include <linux/nfs4.h>
>  #include <linux/nfs_fs.h>
>  #include <linux/nfs_xdr.h>
> +#include <linux/pr.h>
>  
>  #include "blocklayout.h"
>  
> @@ -21,6 +22,17 @@ bl_free_device(struct pnfs_block_dev *dev)
>  			bl_free_device(&dev->children[i]);
>  		kfree(dev->children);
>  	} else {
> +		if (dev->pr_registered) {
> +			const struct pr_ops *ops = 
> +				dev->bdev->bd_disk->fops->pr_ops;
> +			int error;
> +
> +			error = ops->pr_register(dev->bdev, dev->pr_key, 0,
> +				false);
> +			if (error)
> +				pr_err("failed to unregister PR key.\n");
> +		}
> +
>  		if (dev->bdev)
>  			blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE);
>  	}
> @@ -113,6 +125,24 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
>  		for (i = 0; i < b->stripe.volumes_count; i++)
>  			b->stripe.volumes[i] = be32_to_cpup(p++);
>  		break;
> +	case PNFS_BLOCK_VOLUME_SCSI:
> +		p = xdr_inline_decode(xdr, 4 + 4 + 4);
> +		if (!p)
> +			return -EIO;
> +		b->scsi.code_set = be32_to_cpup(p++);
> +		b->scsi.designator_type = be32_to_cpup(p++);
> +		b->scsi.designator_len = be32_to_cpup(p++);
> +		p = xdr_inline_decode(xdr, b->scsi.designator_len);
> +		if (!p)
> +			return -EIO;
> +		if (b->scsi.designator_len > 256)
> +			return -EIO;
> +		memcpy(&b->scsi.designator, p, b->scsi.designator_len);
> +		p = xdr_inline_decode(xdr, 8);
> +		if (!p)
> +			return -EIO;
> +		p = xdr_decode_hyper(p, &b->scsi.pr_key);
> +		break;
>  	default:
>  		dprintk("unknown volume type!\n");
>  		return -EIO;
> @@ -216,6 +246,116 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
>  	return 0;
>  }
>  
> +static bool
> +bl_validate_designator(struct pnfs_block_volume *v)
> +{
> +	switch (v->scsi.designator_type) {
> +	case PS_DESIGNATOR_EUI64:
> +		if (v->scsi.code_set != PS_CODE_SET_BINARY)
> +			return false;
> +
> +		if (v->scsi.designator_len != 8 &&
> +		    v->scsi.designator_len != 10 &&
> +		    v->scsi.designator_len != 16)
> +			return false;
> +
> +		return true;
> +	case PS_DESIGNATOR_NAA:
> +		if (v->scsi.code_set != PS_CODE_SET_BINARY)
> +			return false;
> +
> +		if (v->scsi.designator_len != 8 &&
> +		    v->scsi.designator_len != 16)
> +			return false;
> +
> +		return true;
> +	case PS_DESIGNATOR_T10:
> +	case PS_DESIGNATOR_NAME:
> +		pr_err("pNFS: unsupported designator "
> +			"(code set %d, type %d, len %d.\n",
> +			v->scsi.code_set,
> +			v->scsi.designator_type,
> +			v->scsi.designator_len);
> +		return false;
> +	default:
> +		pr_err("pNFS: invalid designator "
> +			"(code set %d, type %d, len %d.\n",
> +			v->scsi.code_set,
> +			v->scsi.designator_type,
> +			v->scsi.designator_len);
> +		return false;
> +	}
> +}
> +
> +static int
> +bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
> +		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
> +{
> +	struct pnfs_block_volume *v = &volumes[idx];
> +	const struct pr_ops *ops;
> +	const char *devname;
> +	int error;
> +
> +	if (!bl_validate_designator(v))
> +		return -EINVAL;
> +
> +	switch (v->scsi.designator_len) {
> +	case 8:
> +		devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%8phN",
> +				v->scsi.designator);
> +		break;
> +	case 12:
> +		devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%12phN",
> +				v->scsi.designator);
> +		break;
> +	case 16:
> +		devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%16phN",
> +				v->scsi.designator);
> +		break;
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	d->bdev = blkdev_get_by_path(devname, FMODE_READ, NULL);
> +	if (IS_ERR(d->bdev)) {
> +		pr_warn("pNFS: failed to open device %s (%ld)\n",
> +			devname, PTR_ERR(d->bdev));
> +		kfree(devname);
> +		return PTR_ERR(d->bdev);
> +	}
> +
> +	kfree(devname);
> +
> +	d->len = i_size_read(d->bdev->bd_inode);
> +	d->map = bl_map_simple;
> +	d->pr_key = v->scsi.pr_key;
> +
> +	pr_info("pNFS: using block device %s (reservation key 0x%llx)\n",
> +		d->bdev->bd_disk->disk_name, d->pr_key);
> +
> +	ops = d->bdev->bd_disk->fops->pr_ops;
> +	if (!ops) {
> +		pr_err("pNFS: block device %s does not support reservations.",
> +				d->bdev->bd_disk->disk_name);
> +		error = -EINVAL;
> +		goto out_blkdev_put;
> +	}
> +
> +	error = ops->pr_register(d->bdev, 0, d->pr_key, true);
> +	if (error) {
> +		pr_err("pNFS: failed to register key for block device %s.",
> +				d->bdev->bd_disk->disk_name);
> +		goto out_blkdev_put;
> +	}
> +
> +	d->pr_registered = true;
> +	return 0;
> +
> +out_blkdev_put:
> +	blkdev_put(d->bdev, FMODE_READ);
> +	return error;
> +}
> +
>  static int
>  bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
>  		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
> @@ -303,6 +443,8 @@ bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
>  		return bl_parse_concat(server, d, volumes, idx, gfp_mask);
>  	case PNFS_BLOCK_VOLUME_STRIPE:
>  		return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
> +	case PNFS_BLOCK_VOLUME_SCSI:
> +		return bl_parse_scsi(server, d, volumes, idx, gfp_mask);
>  	default:
>  		dprintk("unsupported volume type: %d\n", volumes[idx].type);
>  		return -EIO;
> diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
> index c59a59c..df366fc 100644
> --- a/fs/nfs/blocklayout/extent_tree.c
> +++ b/fs/nfs/blocklayout/extent_tree.c
> @@ -1,5 +1,5 @@
>  /*
> - * Copyright (c) 2014 Christoph Hellwig.
> + * Copyright (c) 2014-2016 Christoph Hellwig.
>   */
>  
>  #include <linux/vmalloc.h>
> @@ -462,10 +462,12 @@ out:
>  	return err;
>  }
>  
> -static size_t ext_tree_layoutupdate_size(size_t count)
> +static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout *bl, size_t count)
>  {
> -	return sizeof(__be32) /* number of entries */ +
> -		PNFS_BLOCK_EXTENT_SIZE * count;
> +	if (bl->bl_scsi_layout)
> +		return sizeof(__be32) + PNFS_SCSI_RANGE_SIZE * count;
> +	else
> +		return sizeof(__be32) + PNFS_BLOCK_EXTENT_SIZE * count;
>  }
>  
>  static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
> @@ -482,6 +484,23 @@ static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
>  	}
>  }
>  
> +static __be32 *encode_block_extent(struct pnfs_block_extent *be, __be32 *p)
> +{
> +	p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
> +			NFS4_DEVICEID4_SIZE);
> +	p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
> +	p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
> +	p = xdr_encode_hyper(p, 0LL);
> +	*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
> +	return p;
> +}
> +
> +static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
> +{
> +	p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
> +	return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
> +}
> +
>  static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
>  		size_t buffer_size, size_t *count)
>  {
> @@ -495,19 +514,16 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
>  			continue;
>  
>  		(*count)++;
> -		if (ext_tree_layoutupdate_size(*count) > buffer_size) {
> +		if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) {
>  			/* keep counting.. */
>  			ret = -ENOSPC;
>  			continue;
>  		}
>  
> -		p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
> -				NFS4_DEVICEID4_SIZE);
> -		p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
> -		p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
> -		p = xdr_encode_hyper(p, 0LL);
> -		*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
> -
> +		if (bl->bl_scsi_layout)
> +			p = encode_scsi_range(be, p);
> +		else
> +			p = encode_block_extent(be, p);
>  		be->be_tag = EXTENT_COMMITTING;
>  	}
>  	spin_unlock(&bl->bl_ext_lock);
> @@ -536,7 +552,7 @@ retry:
>  	if (unlikely(ret)) {
>  		ext_tree_free_commitdata(arg, buffer_size);
>  
> -		buffer_size = ext_tree_layoutupdate_size(count);
> +		buffer_size = ext_tree_layoutupdate_size(bl, count);
>  		count = 0;
>  
>  		arg->layoutupdate_pages =
> @@ -555,7 +571,7 @@ retry:
>  	}
>  
>  	*start_p = cpu_to_be32(count);
> -	arg->layoutupdate_len = ext_tree_layoutupdate_size(count);
> +	arg->layoutupdate_len = ext_tree_layoutupdate_size(bl, count);
>  
>  	if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
>  		void *p = start_p, *end = p + arg->layoutupdate_len;
> diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
> index dbe5839..9fb067a6 100644
> --- a/fs/nfs/blocklayout/rpc_pipefs.c
> +++ b/fs/nfs/blocklayout/rpc_pipefs.c
> @@ -281,7 +281,7 @@ out:
>  	return ret;
>  }
>  
> -void __exit bl_cleanup_pipefs(void)
> +void bl_cleanup_pipefs(void)
>  {
>  	rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
>  	unregister_pernet_subsys(&nfs4blocklayout_net_ops);
> -- 
> 2.1.4

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/4] nfs/blocklayout: add SCSI layout support
  2016-03-04 19:46 ` [PATCH 2/4] nfs/blocklayout: add SCSI layout support Christoph Hellwig
  2016-03-08 22:07   ` J. Bruce Fields
@ 2016-03-08 22:09   ` J. Bruce Fields
  1 sibling, 0 replies; 17+ messages in thread
From: J. Bruce Fields @ 2016-03-08 22:09 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: trond.myklebust, linux-nfs

On Fri, Mar 04, 2016 at 08:46:15PM +0100, Christoph Hellwig wrote:
> This is a trivial extension to the block layout driver to support the
> new SCSI layouts draft.  There are three changes:
> 
>  - device identification through the SCSI VPD page.  This allows us to
>    directly use the udev generated persistent device names instead of
>    requiring an expensive lookup by crawling every block device node
>    in /dev and reading a signature for it.

And looks like this means there's no change required to the userspace
support code since it all happens in-kernel.

--b.

>  - use of SCSI persistent reservations to protect device access and
>    allow for robust fencing.  On the client side this just means
>    registering and unregistering a server supplied key.
>  - an optimized LAYOUTCOMMIT payload that doesn't send unnecessary
>    fields to the server.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/nfs/blocklayout/blocklayout.c |  59 ++++++++++++++--
>  fs/nfs/blocklayout/blocklayout.h |  14 +++-
>  fs/nfs/blocklayout/dev.c         | 144 ++++++++++++++++++++++++++++++++++++++-
>  fs/nfs/blocklayout/extent_tree.c |  44 ++++++++----
>  fs/nfs/blocklayout/rpc_pipefs.c  |   2 +-
>  5 files changed, 238 insertions(+), 25 deletions(-)
> 
> diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
> index ddd0138..b27c409 100644
> --- a/fs/nfs/blocklayout/blocklayout.c
> +++ b/fs/nfs/blocklayout/blocklayout.c
> @@ -446,8 +446,8 @@ static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
>  	kfree(bl);
>  }
>  
> -static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
> -						   gfp_t gfp_flags)
> +static struct pnfs_layout_hdr *__bl_alloc_layout_hdr(struct inode *inode,
> +		gfp_t gfp_flags, bool is_scsi_layout)
>  {
>  	struct pnfs_block_layout *bl;
>  
> @@ -460,9 +460,22 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
>  	bl->bl_ext_ro = RB_ROOT;
>  	spin_lock_init(&bl->bl_ext_lock);
>  
> +	bl->bl_scsi_layout = is_scsi_layout;
>  	return &bl->bl_layout;
>  }
>  
> +static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
> +						   gfp_t gfp_flags)
> +{
> +	return __bl_alloc_layout_hdr(inode, gfp_flags, false);
> +}
> +
> +static struct pnfs_layout_hdr *sl_alloc_layout_hdr(struct inode *inode,
> +						   gfp_t gfp_flags)
> +{
> +	return __bl_alloc_layout_hdr(inode, gfp_flags, true);
> +}
> +
>  static void bl_free_lseg(struct pnfs_layout_segment *lseg)
>  {
>  	dprintk("%s enter\n", __func__);
> @@ -888,22 +901,53 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
>  	.sync				= pnfs_generic_sync,
>  };
>  
> +static struct pnfs_layoutdriver_type scsilayout_type = {
> +	.id				= LAYOUT_SCSI,
> +	.name				= "LAYOUT_SCSI",
> +	.owner				= THIS_MODULE,
> +	.flags				= PNFS_LAYOUTRET_ON_SETATTR |
> +					  PNFS_READ_WHOLE_PAGE,
> +	.read_pagelist			= bl_read_pagelist,
> +	.write_pagelist			= bl_write_pagelist,
> +	.alloc_layout_hdr		= sl_alloc_layout_hdr,
> +	.free_layout_hdr		= bl_free_layout_hdr,
> +	.alloc_lseg			= bl_alloc_lseg,
> +	.free_lseg			= bl_free_lseg,
> +	.return_range			= bl_return_range,
> +	.prepare_layoutcommit		= bl_prepare_layoutcommit,
> +	.cleanup_layoutcommit		= bl_cleanup_layoutcommit,
> +	.set_layoutdriver		= bl_set_layoutdriver,
> +	.alloc_deviceid_node		= bl_alloc_deviceid_node,
> +	.free_deviceid_node		= bl_free_deviceid_node,
> +	.pg_read_ops			= &bl_pg_read_ops,
> +	.pg_write_ops			= &bl_pg_write_ops,
> +	.sync				= pnfs_generic_sync,
> +};
> +
> +
>  static int __init nfs4blocklayout_init(void)
>  {
>  	int ret;
>  
>  	dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
>  
> -	ret = pnfs_register_layoutdriver(&blocklayout_type);
> +	ret = bl_init_pipefs();
>  	if (ret)
>  		goto out;
> -	ret = bl_init_pipefs();
> +
> +	ret = pnfs_register_layoutdriver(&blocklayout_type);
>  	if (ret)
> -		goto out_unregister;
> +		goto out_cleanup_pipe;
> +
> +	ret = pnfs_register_layoutdriver(&scsilayout_type);
> +	if (ret)
> +		goto out_unregister_block;
>  	return 0;
>  
> -out_unregister:
> +out_unregister_block:
>  	pnfs_unregister_layoutdriver(&blocklayout_type);
> +out_cleanup_pipe:
> +	bl_cleanup_pipefs();
>  out:
>  	return ret;
>  }
> @@ -913,8 +957,9 @@ static void __exit nfs4blocklayout_exit(void)
>  	dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
>  	       __func__);
>  
> -	bl_cleanup_pipefs();
> +	pnfs_unregister_layoutdriver(&scsilayout_type);
>  	pnfs_unregister_layoutdriver(&blocklayout_type);
> +	bl_cleanup_pipefs();
>  }
>  
>  MODULE_ALIAS("nfs-layouttype4-3");
> diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
> index c556640..bc21205 100644
> --- a/fs/nfs/blocklayout/blocklayout.h
> +++ b/fs/nfs/blocklayout/blocklayout.h
> @@ -55,7 +55,6 @@ struct pnfs_block_dev;
>   */
>  #define PNFS_BLOCK_UUID_LEN	128
>  
> -
>  struct pnfs_block_volume {
>  	enum pnfs_block_volume_type	type;
>  	union {
> @@ -82,6 +81,13 @@ struct pnfs_block_volume {
>  			u32		volumes_count;
>  			u32		volumes[PNFS_BLOCK_MAX_DEVICES];
>  		} stripe;
> +		struct {
> +			enum scsi_code_set		code_set;
> +			enum scsi_designator_type	designator_type;
> +			int				designator_len;
> +			u8				designator[256];
> +			u64				pr_key;
> +		} scsi;
>  	};
>  };
>  
> @@ -106,6 +112,9 @@ struct pnfs_block_dev {
>  	struct block_device		*bdev;
>  	u64				disk_offset;
>  
> +	u64				pr_key;
> +	bool				pr_registered;
> +
>  	bool (*map)(struct pnfs_block_dev *dev, u64 offset,
>  			struct pnfs_block_dev_map *map);
>  };
> @@ -131,6 +140,7 @@ struct pnfs_block_layout {
>  	struct rb_root		bl_ext_rw;
>  	struct rb_root		bl_ext_ro;
>  	spinlock_t		bl_ext_lock;   /* Protects list manipulation */
> +	bool			bl_scsi_layout;
>  };
>  
>  static inline struct pnfs_block_layout *
> @@ -182,6 +192,6 @@ void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
>  dev_t bl_resolve_deviceid(struct nfs_server *server,
>  		struct pnfs_block_volume *b, gfp_t gfp_mask);
>  int __init bl_init_pipefs(void);
> -void __exit bl_cleanup_pipefs(void);
> +void bl_cleanup_pipefs(void);
>  
>  #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
> diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
> index a861bbd..31b0f6b 100644
> --- a/fs/nfs/blocklayout/dev.c
> +++ b/fs/nfs/blocklayout/dev.c
> @@ -1,11 +1,12 @@
>  /*
> - * Copyright (c) 2014 Christoph Hellwig.
> + * Copyright (c) 2014-2016 Christoph Hellwig.
>   */
>  #include <linux/sunrpc/svc.h>
>  #include <linux/blkdev.h>
>  #include <linux/nfs4.h>
>  #include <linux/nfs_fs.h>
>  #include <linux/nfs_xdr.h>
> +#include <linux/pr.h>
>  
>  #include "blocklayout.h"
>  
> @@ -21,6 +22,17 @@ bl_free_device(struct pnfs_block_dev *dev)
>  			bl_free_device(&dev->children[i]);
>  		kfree(dev->children);
>  	} else {
> +		if (dev->pr_registered) {
> +			const struct pr_ops *ops = 
> +				dev->bdev->bd_disk->fops->pr_ops;
> +			int error;
> +
> +			error = ops->pr_register(dev->bdev, dev->pr_key, 0,
> +				false);
> +			if (error)
> +				pr_err("failed to unregister PR key.\n");
> +		}
> +
>  		if (dev->bdev)
>  			blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE);
>  	}
> @@ -113,6 +125,24 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
>  		for (i = 0; i < b->stripe.volumes_count; i++)
>  			b->stripe.volumes[i] = be32_to_cpup(p++);
>  		break;
> +	case PNFS_BLOCK_VOLUME_SCSI:
> +		p = xdr_inline_decode(xdr, 4 + 4 + 4);
> +		if (!p)
> +			return -EIO;
> +		b->scsi.code_set = be32_to_cpup(p++);
> +		b->scsi.designator_type = be32_to_cpup(p++);
> +		b->scsi.designator_len = be32_to_cpup(p++);
> +		p = xdr_inline_decode(xdr, b->scsi.designator_len);
> +		if (!p)
> +			return -EIO;
> +		if (b->scsi.designator_len > 256)
> +			return -EIO;
> +		memcpy(&b->scsi.designator, p, b->scsi.designator_len);
> +		p = xdr_inline_decode(xdr, 8);
> +		if (!p)
> +			return -EIO;
> +		p = xdr_decode_hyper(p, &b->scsi.pr_key);
> +		break;
>  	default:
>  		dprintk("unknown volume type!\n");
>  		return -EIO;
> @@ -216,6 +246,116 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
>  	return 0;
>  }
>  
> +static bool
> +bl_validate_designator(struct pnfs_block_volume *v)
> +{
> +	switch (v->scsi.designator_type) {
> +	case PS_DESIGNATOR_EUI64:
> +		if (v->scsi.code_set != PS_CODE_SET_BINARY)
> +			return false;
> +
> +		if (v->scsi.designator_len != 8 &&
> +		    v->scsi.designator_len != 10 &&
> +		    v->scsi.designator_len != 16)
> +			return false;
> +
> +		return true;
> +	case PS_DESIGNATOR_NAA:
> +		if (v->scsi.code_set != PS_CODE_SET_BINARY)
> +			return false;
> +
> +		if (v->scsi.designator_len != 8 &&
> +		    v->scsi.designator_len != 16)
> +			return false;
> +
> +		return true;
> +	case PS_DESIGNATOR_T10:
> +	case PS_DESIGNATOR_NAME:
> +		pr_err("pNFS: unsupported designator "
> +			"(code set %d, type %d, len %d.\n",
> +			v->scsi.code_set,
> +			v->scsi.designator_type,
> +			v->scsi.designator_len);
> +		return false;
> +	default:
> +		pr_err("pNFS: invalid designator "
> +			"(code set %d, type %d, len %d.\n",
> +			v->scsi.code_set,
> +			v->scsi.designator_type,
> +			v->scsi.designator_len);
> +		return false;
> +	}
> +}
> +
> +static int
> +bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
> +		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
> +{
> +	struct pnfs_block_volume *v = &volumes[idx];
> +	const struct pr_ops *ops;
> +	const char *devname;
> +	int error;
> +
> +	if (!bl_validate_designator(v))
> +		return -EINVAL;
> +
> +	switch (v->scsi.designator_len) {
> +	case 8:
> +		devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%8phN",
> +				v->scsi.designator);
> +		break;
> +	case 12:
> +		devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%12phN",
> +				v->scsi.designator);
> +		break;
> +	case 16:
> +		devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%16phN",
> +				v->scsi.designator);
> +		break;
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	d->bdev = blkdev_get_by_path(devname, FMODE_READ, NULL);
> +	if (IS_ERR(d->bdev)) {
> +		pr_warn("pNFS: failed to open device %s (%ld)\n",
> +			devname, PTR_ERR(d->bdev));
> +		kfree(devname);
> +		return PTR_ERR(d->bdev);
> +	}
> +
> +	kfree(devname);
> +
> +	d->len = i_size_read(d->bdev->bd_inode);
> +	d->map = bl_map_simple;
> +	d->pr_key = v->scsi.pr_key;
> +
> +	pr_info("pNFS: using block device %s (reservation key 0x%llx)\n",
> +		d->bdev->bd_disk->disk_name, d->pr_key);
> +
> +	ops = d->bdev->bd_disk->fops->pr_ops;
> +	if (!ops) {
> +		pr_err("pNFS: block device %s does not support reservations.",
> +				d->bdev->bd_disk->disk_name);
> +		error = -EINVAL;
> +		goto out_blkdev_put;
> +	}
> +
> +	error = ops->pr_register(d->bdev, 0, d->pr_key, true);
> +	if (error) {
> +		pr_err("pNFS: failed to register key for block device %s.",
> +				d->bdev->bd_disk->disk_name);
> +		goto out_blkdev_put;
> +	}
> +
> +	d->pr_registered = true;
> +	return 0;
> +
> +out_blkdev_put:
> +	blkdev_put(d->bdev, FMODE_READ);
> +	return error;
> +}
> +
>  static int
>  bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
>  		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
> @@ -303,6 +443,8 @@ bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
>  		return bl_parse_concat(server, d, volumes, idx, gfp_mask);
>  	case PNFS_BLOCK_VOLUME_STRIPE:
>  		return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
> +	case PNFS_BLOCK_VOLUME_SCSI:
> +		return bl_parse_scsi(server, d, volumes, idx, gfp_mask);
>  	default:
>  		dprintk("unsupported volume type: %d\n", volumes[idx].type);
>  		return -EIO;
> diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
> index c59a59c..df366fc 100644
> --- a/fs/nfs/blocklayout/extent_tree.c
> +++ b/fs/nfs/blocklayout/extent_tree.c
> @@ -1,5 +1,5 @@
>  /*
> - * Copyright (c) 2014 Christoph Hellwig.
> + * Copyright (c) 2014-2016 Christoph Hellwig.
>   */
>  
>  #include <linux/vmalloc.h>
> @@ -462,10 +462,12 @@ out:
>  	return err;
>  }
>  
> -static size_t ext_tree_layoutupdate_size(size_t count)
> +static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout *bl, size_t count)
>  {
> -	return sizeof(__be32) /* number of entries */ +
> -		PNFS_BLOCK_EXTENT_SIZE * count;
> +	if (bl->bl_scsi_layout)
> +		return sizeof(__be32) + PNFS_SCSI_RANGE_SIZE * count;
> +	else
> +		return sizeof(__be32) + PNFS_BLOCK_EXTENT_SIZE * count;
>  }
>  
>  static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
> @@ -482,6 +484,23 @@ static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
>  	}
>  }
>  
> +static __be32 *encode_block_extent(struct pnfs_block_extent *be, __be32 *p)
> +{
> +	p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
> +			NFS4_DEVICEID4_SIZE);
> +	p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
> +	p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
> +	p = xdr_encode_hyper(p, 0LL);
> +	*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
> +	return p;
> +}
> +
> +static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
> +{
> +	p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
> +	return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
> +}
> +
>  static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
>  		size_t buffer_size, size_t *count)
>  {
> @@ -495,19 +514,16 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
>  			continue;
>  
>  		(*count)++;
> -		if (ext_tree_layoutupdate_size(*count) > buffer_size) {
> +		if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) {
>  			/* keep counting.. */
>  			ret = -ENOSPC;
>  			continue;
>  		}
>  
> -		p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
> -				NFS4_DEVICEID4_SIZE);
> -		p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
> -		p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
> -		p = xdr_encode_hyper(p, 0LL);
> -		*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
> -
> +		if (bl->bl_scsi_layout)
> +			p = encode_scsi_range(be, p);
> +		else
> +			p = encode_block_extent(be, p);
>  		be->be_tag = EXTENT_COMMITTING;
>  	}
>  	spin_unlock(&bl->bl_ext_lock);
> @@ -536,7 +552,7 @@ retry:
>  	if (unlikely(ret)) {
>  		ext_tree_free_commitdata(arg, buffer_size);
>  
> -		buffer_size = ext_tree_layoutupdate_size(count);
> +		buffer_size = ext_tree_layoutupdate_size(bl, count);
>  		count = 0;
>  
>  		arg->layoutupdate_pages =
> @@ -555,7 +571,7 @@ retry:
>  	}
>  
>  	*start_p = cpu_to_be32(count);
> -	arg->layoutupdate_len = ext_tree_layoutupdate_size(count);
> +	arg->layoutupdate_len = ext_tree_layoutupdate_size(bl, count);
>  
>  	if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
>  		void *p = start_p, *end = p + arg->layoutupdate_len;
> diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
> index dbe5839..9fb067a6 100644
> --- a/fs/nfs/blocklayout/rpc_pipefs.c
> +++ b/fs/nfs/blocklayout/rpc_pipefs.c
> @@ -281,7 +281,7 @@ out:
>  	return ret;
>  }
>  
> -void __exit bl_cleanup_pipefs(void)
> +void bl_cleanup_pipefs(void)
>  {
>  	rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
>  	unregister_pernet_subsys(&nfs4blocklayout_net_ops);
> -- 
> 2.1.4

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 4/4] nfsd: add SCSI layout support
  2016-03-04 19:46 ` [PATCH 4/4] nfsd: add SCSI layout support Christoph Hellwig
@ 2016-03-08 22:15   ` J. Bruce Fields
  2016-03-09 14:48     ` Christoph Hellwig
  2016-03-10 22:26   ` J. Bruce Fields
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 17+ messages in thread
From: J. Bruce Fields @ 2016-03-08 22:15 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: trond.myklebust, linux-nfs

On Fri, Mar 04, 2016 at 08:46:17PM +0100, Christoph Hellwig wrote:
> This is a simple extension to the block layout driver to use SCSI
> persistent reservations for access control and fencing, as well as
> SCSI VPD pages for device identification.
> 
> For this we need to pass the nfs4_client to the proc_getdeviceinfo method
> to generate the reservation key, and add a new fence_client method
> to allow for fence actions in the layout driver.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  Documentation/filesystems/nfs/pnfs-scsi-server.txt |  22 ++
>  fs/nfsd/Kconfig                                    |  13 +
>  fs/nfsd/Makefile                                   |   1 +
>  fs/nfsd/blocklayout.c                              | 298 ++++++++++++++++++---
>  fs/nfsd/blocklayoutxdr.c                           |  65 ++++-
>  fs/nfsd/blocklayoutxdr.h                           |  14 +
>  fs/nfsd/nfs4layouts.c                              |  27 +-
>  fs/nfsd/nfs4proc.c                                 |   6 +-
>  fs/nfsd/pnfs.h                                     |   6 +
>  fs/xfs/Makefile                                    |   1 +
>  fs/xfs/xfs_pnfs.h                                  |   2 +-
>  11 files changed, 407 insertions(+), 48 deletions(-)
>  create mode 100644 Documentation/filesystems/nfs/pnfs-scsi-server.txt
> 
> diff --git a/Documentation/filesystems/nfs/pnfs-scsi-server.txt b/Documentation/filesystems/nfs/pnfs-scsi-server.txt
> new file mode 100644
> index 0000000..4150979
> --- /dev/null
> +++ b/Documentation/filesystems/nfs/pnfs-scsi-server.txt
> @@ -0,0 +1,22 @@
> +
> +pNFS SCSI layout server user guide
> +==================================
> +
> +This document describes support for pNFS SCSI layouts in the Linux NFS server.
> +With pNFS SCSI layouts, the NFS server acts as Metadata Server (MDS) for pNFS,
> +which in addition to handling all the metadata access to the NFS export,
> +also hands out layouts to the clients so that they can directly access the
> +underlying SCSI LUNs that are shared with the client.
> +
> +To use pNFS SCSI layouts with the Linux NFS server, the exported file
> +system needs to support the pNFS SCSI layouts (currently just XFS), and the
> +file system must sit on a SCSI LUN that is accessible to the clients in
> +addition to the MDS.  As of now the file system needs to sit directly on the
> +exported LUN, striping or concatenation of LUNs on the MDS and clients
> +is not supported yet.
> +
> +On the server, pNFS SCSI volume support is automatically enabled if the
> +file system is exported using the "pnfs" option and the underlying SCSI
> +device supports persistent reservations.  On the client make sure the kernel
> +has the CONFIG_PNFS_BLOCK option enabled, and the file system is mounted
> +using the NFSv4.1 protocol version (mount -o vers=4.1).

May as well document the server-side config there too, I guess; I'm
editing that like:

	On {+a server built with CONFIG_NFSD_SCSI,+} the[-server,-] pNFS
	SCSI volume support is

--b.

> diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
> index eb70d91..a30a313 100644
> --- a/fs/nfsd/Kconfig
> +++ b/fs/nfsd/Kconfig
> @@ -98,6 +98,19 @@ config NFSD_BLOCKLAYOUT
>  
>  	  If unsure, say N.
>  
> +config NFSD_SCSILAYOUT
> +	bool "NFSv4.1 server support for pNFS SCSI layouts"
> +	depends on NFSD_V4
> +	select NFSD_PNFS
> +	help
> +	  This option enables support for exporting pNFS SCSI layouts
> +	  in the kernel's NFS server. The pNFS SCSI layout enables NFS
> +	  clients to directly perform I/O to SCSI devices accessible to both
> +	  the server and the clients.  See draft-ietf-nfsv4-scsi-layout for
> +	  more details.
> +
> +	  If unsure, say N.
> +
>  config NFSD_V4_SECURITY_LABEL
>  	bool "Provide Security Label support for NFSv4 server"
>  	depends on NFSD_V4 && SECURITY
> diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
> index 679cdc6..3ae5f3c 100644
> --- a/fs/nfsd/Makefile
> +++ b/fs/nfsd/Makefile
> @@ -19,3 +19,4 @@ nfsd-$(CONFIG_NFSD_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
>  			   nfs4acl.o nfs4callback.o nfs4recover.o
>  nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
>  nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
> +nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o
> diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
> index c29d942..0e87e3e 100644
> --- a/fs/nfsd/blocklayout.c
> +++ b/fs/nfsd/blocklayout.c
> @@ -1,11 +1,14 @@
>  /*
> - * Copyright (c) 2014 Christoph Hellwig.
> + * Copyright (c) 2014-2016 Christoph Hellwig.
>   */
>  #include <linux/exportfs.h>
>  #include <linux/genhd.h>
>  #include <linux/slab.h>
> +#include <linux/pr.h>
>  
>  #include <linux/nfsd/debug.h>
> +#include <scsi/scsi_proto.h>
> +#include <scsi/scsi_common.h>
>  
>  #include "blocklayoutxdr.h"
>  #include "pnfs.h"
> @@ -13,37 +16,6 @@
>  #define NFSDDBG_FACILITY	NFSDDBG_PNFS
>  
>  
> -static int
> -nfsd4_block_get_device_info_simple(struct super_block *sb,
> -		struct nfsd4_getdeviceinfo *gdp)
> -{
> -	struct pnfs_block_deviceaddr *dev;
> -	struct pnfs_block_volume *b;
> -
> -	dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
> -		      sizeof(struct pnfs_block_volume), GFP_KERNEL);
> -	if (!dev)
> -		return -ENOMEM;
> -	gdp->gd_device = dev;
> -
> -	dev->nr_volumes = 1;
> -	b = &dev->volumes[0];
> -
> -	b->type = PNFS_BLOCK_VOLUME_SIMPLE;
> -	b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
> -	return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
> -			&b->simple.offset);
> -}
> -
> -static __be32
> -nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
> -		struct nfsd4_getdeviceinfo *gdp)
> -{
> -	if (sb->s_bdev != sb->s_bdev->bd_contains)
> -		return nfserr_inval;
> -	return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
> -}
> -
>  static __be32
>  nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
>  		struct nfsd4_layoutget *args)
> @@ -141,20 +113,13 @@ out_layoutunavailable:
>  }
>  
>  static __be32
> -nfsd4_block_proc_layoutcommit(struct inode *inode,
> -		struct nfsd4_layoutcommit *lcp)
> +nfsd4_block_commit_blocks(struct inode *inode, struct nfsd4_layoutcommit *lcp,
> +		struct iomap *iomaps, int nr_iomaps)
>  {
>  	loff_t new_size = lcp->lc_last_wr + 1;
>  	struct iattr iattr = { .ia_valid = 0 };
> -	struct iomap *iomaps;
> -	int nr_iomaps;
>  	int error;
>  
> -	nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
> -			lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
> -	if (nr_iomaps < 0)
> -		return nfserrno(nr_iomaps);
> -
>  	if (lcp->lc_mtime.tv_nsec == UTIME_NOW ||
>  	    timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0)
>  		lcp->lc_mtime = current_fs_time(inode->i_sb);
> @@ -172,6 +137,54 @@ nfsd4_block_proc_layoutcommit(struct inode *inode,
>  	return nfserrno(error);
>  }
>  
> +#ifdef CONFIG_NFSD_BLOCKLAYOUT
> +static int
> +nfsd4_block_get_device_info_simple(struct super_block *sb,
> +		struct nfsd4_getdeviceinfo *gdp)
> +{
> +	struct pnfs_block_deviceaddr *dev;
> +	struct pnfs_block_volume *b;
> +
> +	dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
> +		      sizeof(struct pnfs_block_volume), GFP_KERNEL);
> +	if (!dev)
> +		return -ENOMEM;
> +	gdp->gd_device = dev;
> +
> +	dev->nr_volumes = 1;
> +	b = &dev->volumes[0];
> +
> +	b->type = PNFS_BLOCK_VOLUME_SIMPLE;
> +	b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
> +	return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
> +			&b->simple.offset);
> +}
> +
> +static __be32
> +nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
> +		struct nfs4_client *clp,
> +		struct nfsd4_getdeviceinfo *gdp)
> +{
> +	if (sb->s_bdev != sb->s_bdev->bd_contains)
> +		return nfserr_inval;
> +	return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
> +}
> +
> +static __be32
> +nfsd4_block_proc_layoutcommit(struct inode *inode,
> +		struct nfsd4_layoutcommit *lcp)
> +{
> +	struct iomap *iomaps;
> +	int nr_iomaps;
> +
> +	nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
> +			lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
> +	if (nr_iomaps < 0)
> +		return nfserrno(nr_iomaps);
> +
> +	return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
> +}
> +
>  const struct nfsd4_layout_ops bl_layout_ops = {
>  	/*
>  	 * Pretend that we send notification to the client.  This is a blatant
> @@ -190,3 +203,206 @@ const struct nfsd4_layout_ops bl_layout_ops = {
>  	.encode_layoutget	= nfsd4_block_encode_layoutget,
>  	.proc_layoutcommit	= nfsd4_block_proc_layoutcommit,
>  };
> +#endif /* CONFIG_NFSD_BLOCKLAYOUT */
> +
> +#ifdef CONFIG_NFSD_SCSILAYOUT
> +static int nfsd4_scsi_identify_device(struct block_device *bdev,
> +		struct pnfs_block_volume *b)
> +{
> +	struct request_queue *q = bdev->bd_disk->queue;
> +	struct request *rq;
> +	size_t bufflen = 252, len, id_len;
> +	u8 *buf, *d, type, assoc;
> +	int error;
> +
> +	buf = kzalloc(bufflen, GFP_KERNEL);
> +	if (!buf)
> +		return -ENOMEM;
> +
> +	rq = blk_get_request(q, READ, GFP_KERNEL);
> +	if (IS_ERR(rq)) {
> +		error = -ENOMEM;
> +		goto out_free_buf;
> +	}
> +	blk_rq_set_block_pc(rq);
> +
> +	error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL);
> +	if (error)
> +		goto out_put_request;
> +
> +	rq->cmd[0] = INQUIRY;
> +	rq->cmd[1] = 1;
> +	rq->cmd[2] = 0x83;
> +	rq->cmd[3] = bufflen >> 8;
> +	rq->cmd[4] = bufflen & 0xff;
> +	rq->cmd_len = COMMAND_SIZE(INQUIRY);
> +
> +	error = blk_execute_rq(rq->q, NULL, rq, 1);
> +	if (error) {
> +		pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
> +			rq->errors);
> +		
> +	}
> +
> +	len = (buf[2] << 8) + buf[3] + 4;
> +	if (len > bufflen) {
> +		pr_err("pNFS: INQUIRY 0x83 response invalid (len = %zd)\n",
> +			len);
> +		goto out_put_request;
> +	}
> +
> +	d = buf + 4;
> +	for (d = buf + 4; d < buf + len; d += id_len + 4) {
> +		id_len = d[3];
> +		type = d[1] & 0xf;
> +		assoc = (d[1] >> 4) & 0x3;
> +
> +		/*
> +		 * We only care about EUI-64 and NAA designator types
> +		 * with LU association.
> +		 */
> +		if (assoc != 0x00)
> +			continue;
> +		if (type != 0x02 && type != 0x03)
> +			continue;
> +		if (id_len != 8 && id_len != 12 && id_len != 16)
> +			continue;
> +
> +		b->scsi.code_set = PS_CODE_SET_BINARY;
> +		b->scsi.designator_type = type == 0x02 ?
> +			PS_DESIGNATOR_EUI64 : PS_DESIGNATOR_NAA;
> +		b->scsi.designator_len = id_len;
> +		memcpy(b->scsi.designator, d + 4, id_len);
> +
> +		/*
> +		 * If we found a 8 or 12 byte descriptor continue on to
> +		 * see if a 16 byte one is available.  If we find a
> +		 * 16 byte descriptor we're done.
> +		 */
> +		if (id_len == 16)
> +			break;
> +	}
> +
> +out_put_request:
> +	blk_put_request(rq);
> +out_free_buf:
> +	kfree(buf);
> +	return error;
> +}
> +
> +#define NFSD_MDS_PR_KEY		0x0100000000000000
> +
> +/*
> + * We use the client ID as a unique key for the reservations.
> + * This allows us to easily fence a client when recalls fail.
> + */
> +static u64 nfsd4_scsi_pr_key(struct nfs4_client *clp)
> +{
> +	return ((u64)clp->cl_clientid.cl_boot << 32) | clp->cl_clientid.cl_id;
> +}
> +
> +static int
> +nfsd4_block_get_device_info_scsi(struct super_block *sb,
> +		struct nfs4_client *clp,
> +		struct nfsd4_getdeviceinfo *gdp)
> +{
> +	struct pnfs_block_deviceaddr *dev;
> +	struct pnfs_block_volume *b;
> +	const struct pr_ops *ops;
> +	int error;
> +
> +	dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
> +		      sizeof(struct pnfs_block_volume), GFP_KERNEL);
> +	if (!dev)
> +		return -ENOMEM;
> +	gdp->gd_device = dev;
> +
> +	dev->nr_volumes = 1;
> +	b = &dev->volumes[0];
> +
> +	b->type = PNFS_BLOCK_VOLUME_SCSI;
> +	b->scsi.pr_key = nfsd4_scsi_pr_key(clp);
> +
> +	error = nfsd4_scsi_identify_device(sb->s_bdev, b);
> +	if (error)
> +		return error;
> +
> +	ops = sb->s_bdev->bd_disk->fops->pr_ops;
> +	if (!ops) {
> +		pr_err("pNFS: device %s does not support PRs.\n",
> +			sb->s_id);
> +		return -EINVAL;
> +	}
> +
> +	error = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true);
> +	if (error) {
> +		pr_err("pNFS: failed to register key for device %s.\n",
> +			sb->s_id);
> +		return -EINVAL;
> +	}
> +
> +	error = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY,
> +			PR_EXCLUSIVE_ACCESS_REG_ONLY, 0);
> +	if (error) {
> +		pr_err("pNFS: failed to reserve device %s.\n",
> +			sb->s_id);
> +		return -EINVAL;
> +	}
> +
> +	return 0;
> +}
> +
> +static __be32
> +nfsd4_scsi_proc_getdeviceinfo(struct super_block *sb,
> +		struct nfs4_client *clp,
> +		struct nfsd4_getdeviceinfo *gdp)
> +{
> +	if (sb->s_bdev != sb->s_bdev->bd_contains)
> +		return nfserr_inval;
> +	return nfserrno(nfsd4_block_get_device_info_scsi(sb, clp, gdp));
> +}
> +static __be32
> +nfsd4_scsi_proc_layoutcommit(struct inode *inode,
> +		struct nfsd4_layoutcommit *lcp)
> +{
> +	struct iomap *iomaps;
> +	int nr_iomaps;
> +
> +	nr_iomaps = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout,
> +			lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
> +	if (nr_iomaps < 0)
> +		return nfserrno(nr_iomaps);
> +
> +	return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
> +}
> +
> +static void
> +nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls)
> +{
> +	struct nfs4_client *clp = ls->ls_stid.sc_client;
> +	struct block_device *bdev = ls->ls_file->f_path.mnt->mnt_sb->s_bdev;
> +
> +	bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY,
> +			nfsd4_scsi_pr_key(clp), 0, true);
> +}
> +
> +const struct nfsd4_layout_ops scsi_layout_ops = {
> +	/*
> +	 * Pretend that we send notification to the client.  This is a blatant
> +	 * lie to force recent Linux clients to cache our device IDs.
> +	 * We rarely ever change the device ID, so the harm of leaking deviceids
> +	 * for a while isn't too bad.  Unfortunately RFC5661 is a complete mess
> +	 * in this regard, but I filed errata 4119 for this a while ago, and
> +	 * hopefully the Linux client will eventually start caching deviceids
> +	 * without this again.
> +	 */
> +	.notify_types		=
> +			NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE,
> +	.proc_getdeviceinfo	= nfsd4_scsi_proc_getdeviceinfo,
> +	.encode_getdeviceinfo	= nfsd4_block_encode_getdeviceinfo,
> +	.proc_layoutget		= nfsd4_block_proc_layoutget,
> +	.encode_layoutget	= nfsd4_block_encode_layoutget,
> +	.proc_layoutcommit	= nfsd4_scsi_proc_layoutcommit,
> +	.fence_client		= nfsd4_scsi_fence_client,
> +};
> +#endif /* CONFIG_NFSD_SCSILAYOUT */
> diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
> index 6d834dc..ca18836 100644
> --- a/fs/nfsd/blocklayoutxdr.c
> +++ b/fs/nfsd/blocklayoutxdr.c
> @@ -1,5 +1,5 @@
>  /*
> - * Copyright (c) 2014 Christoph Hellwig.
> + * Copyright (c) 2014-2016 Christoph Hellwig.
>   */
>  #include <linux/sunrpc/svc.h>
>  #include <linux/exportfs.h>
> @@ -53,6 +53,18 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
>  		p = xdr_encode_hyper(p, b->simple.offset);
>  		p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len);
>  		break;
> +	case PNFS_BLOCK_VOLUME_SCSI:
> +		len = 4 + 4 + 4 + 4 + b->scsi.designator_len + 8;
> +		p = xdr_reserve_space(xdr, len);
> +		if (!p)
> +			return -ETOOSMALL;
> +
> +		*p++ = cpu_to_be32(b->type);
> +		*p++ = cpu_to_be32(b->scsi.code_set);
> +		*p++ = cpu_to_be32(b->scsi.designator_type);
> +		p = xdr_encode_opaque(p, b->scsi.designator, b->scsi.designator_len);
> +		p = xdr_encode_hyper(p, b->scsi.pr_key);
> +		break;
>  	default:
>  		return -ENOTSUPP;
>  	}
> @@ -155,3 +167,54 @@ fail:
>  	kfree(iomaps);
>  	return -EINVAL;
>  }
> +
> +int
> +nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
> +		u32 block_size)
> +{
> +	struct iomap *iomaps;
> +	u32 nr_iomaps, expected, i;
> +
> +	if (len < sizeof(u32)) {
> +		dprintk("%s: extent array too small: %u\n", __func__, len);
> +		return -EINVAL;
> +	}
> +
> +	nr_iomaps = be32_to_cpup(p++);
> +	expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE;
> +	if (len != expected) {
> +		dprintk("%s: extent array size mismatch: %u/%u\n",
> +			__func__, len, expected);
> +		return -EINVAL;
> +	}
> +
> +	iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL);
> +	if (!iomaps) {
> +		dprintk("%s: failed to allocate extent array\n", __func__);
> +		return -ENOMEM;
> +	}
> +
> +	for (i = 0; i < nr_iomaps; i++) {
> +		u64 val;
> +
> +		p = xdr_decode_hyper(p, &val);
> +		if (val & (block_size - 1)) {
> +			dprintk("%s: unaligned offset 0x%llx\n", __func__, val);
> +			goto fail;
> +		}
> +		iomaps[i].offset = val;
> +
> +		p = xdr_decode_hyper(p, &val);
> +		if (val & (block_size - 1)) {
> +			dprintk("%s: unaligned length 0x%llx\n", __func__, val);
> +			goto fail;
> +		}
> +		iomaps[i].length = val;
> +	}
> +
> +	*iomapp = iomaps;
> +	return nr_iomaps;
> +fail:
> +	kfree(iomaps);
> +	return -EINVAL;
> +}
> diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
> index 6de925f..397bc75 100644
> --- a/fs/nfsd/blocklayoutxdr.h
> +++ b/fs/nfsd/blocklayoutxdr.h
> @@ -15,6 +15,11 @@ struct pnfs_block_extent {
>  	enum pnfs_block_extent_state	es;
>  };
>  
> +struct pnfs_block_range {
> +	u64				foff;
> +	u64				len;
> +};
> +
>  /*
>   * Random upper cap for the uuid length to avoid unbounded allocation.
>   * Not actually limited by the protocol.
> @@ -29,6 +34,13 @@ struct pnfs_block_volume {
>  			u32		sig_len;
>  			u8		sig[PNFS_BLOCK_UUID_LEN];
>  		} simple;
> +		struct {
> +			enum scsi_code_set		code_set;
> +			enum scsi_designator_type	designator_type;
> +			int				designator_len;
> +			u8				designator[256];
> +			u64				pr_key;
> +		} scsi;
>  	};
>  };
>  
> @@ -43,5 +55,7 @@ __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
>  		struct nfsd4_layoutget *lgp);
>  int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
>  		u32 block_size);
> +int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
> +		u32 block_size);
>  
>  #endif /* _NFSD_BLOCKLAYOUTXDR_H */
> diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
> index 4e4def7..cbd804e 100644
> --- a/fs/nfsd/nfs4layouts.c
> +++ b/fs/nfsd/nfs4layouts.c
> @@ -1,6 +1,7 @@
>  /*
>   * Copyright (c) 2014 Christoph Hellwig.
>   */
> +#include <linux/blkdev.h>
>  #include <linux/kmod.h>
>  #include <linux/file.h>
>  #include <linux/jhash.h>
> @@ -29,6 +30,9 @@ const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] =  {
>  #ifdef CONFIG_NFSD_BLOCKLAYOUT
>  	[LAYOUT_BLOCK_VOLUME]	= &bl_layout_ops,
>  #endif
> +#ifdef CONFIG_NFSD_SCSILAYOUT
> +	[LAYOUT_SCSI]		= &scsi_layout_ops,
> +#endif
>  };
>  
>  /* pNFS device ID to export fsid mapping */
> @@ -123,12 +127,24 @@ void nfsd4_setup_layout_type(struct svc_export *exp)
>  	if (!(exp->ex_flags & NFSEXP_PNFS))
>  		return;
>  
> +	/*
> +	 * Check if the file system supports exporting a block-like layout.
> +	 * If the block device supports reservations prefer the SCSI layout,
> +	 * else advertise the block layout.
> +	 */
>  #ifdef CONFIG_NFSD_BLOCKLAYOUT
>  	if (sb->s_export_op->get_uuid &&
>  	    sb->s_export_op->map_blocks &&
>  	    sb->s_export_op->commit_blocks)
>  		exp->ex_layout_type = LAYOUT_BLOCK_VOLUME;
>  #endif
> +#ifdef CONFIG_NFSD_SCSILAYOUT
> +	/* overwrite block layout selection if needed */
> +	if (sb->s_export_op->map_blocks &&
> +	    sb->s_export_op->commit_blocks &&
> +	    sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops)
> +		exp->ex_layout_type = LAYOUT_SCSI;
> +#endif
>  }
>  
>  static void
> @@ -594,8 +610,6 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
>  
>  	rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str));
>  
> -	trace_layout_recall_fail(&ls->ls_stid.sc_stateid);
> -
>  	printk(KERN_WARNING
>  		"nfsd: client %s failed to respond to layout recall. "
>  		"  Fencing..\n", addr_str);
> @@ -630,6 +644,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
>  		container_of(cb, struct nfs4_layout_stateid, ls_recall);
>  	struct nfsd_net *nn;
>  	ktime_t now, cutoff;
> +	const struct nfsd4_layout_ops *ops;
>  	LIST_HEAD(reaplist);
>  
>  
> @@ -665,7 +680,13 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
>  		/*
>  		 * Unknown error or non-responding client, we'll need to fence.
>  		 */
> -		nfsd4_cb_layout_fail(ls);
> +		trace_layout_recall_fail(&ls->ls_stid.sc_stateid);
> +
> +		ops = nfsd4_layout_ops[ls->ls_layout_type];
> +		if (ops->fence_client)
> +			ops->fence_client(ls);
> +		else
> +			nfsd4_cb_layout_fail(ls);
>  		return -1;
>  	}
>  }
> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> index 4cba786..629443e 100644
> --- a/fs/nfsd/nfs4proc.c
> +++ b/fs/nfsd/nfs4proc.c
> @@ -1269,8 +1269,10 @@ nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
>  		goto out;
>  
>  	nfserr = nfs_ok;
> -	if (gdp->gd_maxcount != 0)
> -		nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp);
> +	if (gdp->gd_maxcount != 0) {
> +		nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb,
> +					cstate->session->se_client, gdp);
> +	}
>  
>  	gdp->gd_notify_types &= ops->notify_types;
>  out:
> diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
> index ff50bfa..7d073b9 100644
> --- a/fs/nfsd/pnfs.h
> +++ b/fs/nfsd/pnfs.h
> @@ -21,6 +21,7 @@ struct nfsd4_layout_ops {
>  	u32		notify_types;
>  
>  	__be32 (*proc_getdeviceinfo)(struct super_block *sb,
> +			struct nfs4_client *clp,
>  			struct nfsd4_getdeviceinfo *gdevp);
>  	__be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr,
>  			struct nfsd4_getdeviceinfo *gdevp);
> @@ -32,12 +33,17 @@ struct nfsd4_layout_ops {
>  
>  	__be32 (*proc_layoutcommit)(struct inode *inode,
>  			struct nfsd4_layoutcommit *lcp);
> +
> +	void (*fence_client)(struct nfs4_layout_stateid *ls);
>  };
>  
>  extern const struct nfsd4_layout_ops *nfsd4_layout_ops[];
>  #ifdef CONFIG_NFSD_BLOCKLAYOUT
>  extern const struct nfsd4_layout_ops bl_layout_ops;
>  #endif
> +#ifdef CONFIG_NFSD_SCSILAYOUT
> +extern const struct nfsd4_layout_ops scsi_layout_ops;
> +#endif
>  
>  __be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
>  		struct nfsd4_compound_state *cstate, stateid_t *stateid,
> diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> index d68b62a..3542d94 100644
> --- a/fs/xfs/Makefile
> +++ b/fs/xfs/Makefile
> @@ -122,3 +122,4 @@ xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o
>  xfs-$(CONFIG_SYSCTL)		+= xfs_sysctl.o
>  xfs-$(CONFIG_COMPAT)		+= xfs_ioctl32.o
>  xfs-$(CONFIG_NFSD_BLOCKLAYOUT)	+= xfs_pnfs.o
> +xfs-$(CONFIG_NFSD_SCSILAYOUT)	+= xfs_pnfs.o
> diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
> index d85529c..93f7485 100644
> --- a/fs/xfs/xfs_pnfs.h
> +++ b/fs/xfs/xfs_pnfs.h
> @@ -1,7 +1,7 @@
>  #ifndef _XFS_PNFS_H
>  #define _XFS_PNFS_H 1
>  
> -#ifdef CONFIG_NFSD_BLOCKLAYOUT
> +#if defined(CONFIG_NFSD_BLOCKLAYOUT) || defined(CONFIG_NFSD_SCSILAYOUT)
>  int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset);
>  int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
>  		struct iomap *iomap, bool write, u32 *device_generation);
> -- 
> 2.1.4

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/4] nfs/blocklayout: add SCSI layout support
  2016-03-08 22:07   ` J. Bruce Fields
@ 2016-03-08 22:42     ` Trond Myklebust
  2016-03-17 21:01       ` J. Bruce Fields
  0 siblings, 1 reply; 17+ messages in thread
From: Trond Myklebust @ 2016-03-08 22:42 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Christoph Hellwig, Linux NFS Mailing List

On Tue, Mar 8, 2016 at 5:07 PM, J. Bruce Fields <bfields@fieldses.org> wrote:
>
> Trond, OK if I take this through the nfsd tree?
>
> Or I'm OK with doing this however.

I'm fine with that. I don't think there should be any conflicts with
what is queued up in my 'linux-next' branch.

>
> --b.
>
> On Fri, Mar 04, 2016 at 08:46:15PM +0100, Christoph Hellwig wrote:
> > This is a trivial extension to the block layout driver to support the
> > new SCSI layouts draft.  There are three changes:
> >
> >  - device identification through the SCSI VPD page.  This allows us to
> >    directly use the udev generated persistent device names instead of
> >    requiring an expensive lookup by crawling every block device node
> >    in /dev and reading a signature for it.
> >  - use of SCSI persistent reservations to protect device access and
> >    allow for robust fencing.  On the client side this just means
> >    registering and unregistering a server supplied key.
> >  - an optimized LAYOUTCOMMIT payload that doesn't send unnecessary
> >    fields to the server.
> >
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > ---
> >  fs/nfs/blocklayout/blocklayout.c |  59 ++++++++++++++--
> >  fs/nfs/blocklayout/blocklayout.h |  14 +++-
> >  fs/nfs/blocklayout/dev.c         | 144 ++++++++++++++++++++++++++++++++++++++-
> >  fs/nfs/blocklayout/extent_tree.c |  44 ++++++++----
> >  fs/nfs/blocklayout/rpc_pipefs.c  |   2 +-
> >  5 files changed, 238 insertions(+), 25 deletions(-)
> >
> > diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
> > index ddd0138..b27c409 100644
> > --- a/fs/nfs/blocklayout/blocklayout.c
> > +++ b/fs/nfs/blocklayout/blocklayout.c
> > @@ -446,8 +446,8 @@ static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
> >       kfree(bl);
> >  }
> >
> > -static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
> > -                                                gfp_t gfp_flags)
> > +static struct pnfs_layout_hdr *__bl_alloc_layout_hdr(struct inode *inode,
> > +             gfp_t gfp_flags, bool is_scsi_layout)
> >  {
> >       struct pnfs_block_layout *bl;
> >
> > @@ -460,9 +460,22 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
> >       bl->bl_ext_ro = RB_ROOT;
> >       spin_lock_init(&bl->bl_ext_lock);
> >
> > +     bl->bl_scsi_layout = is_scsi_layout;
> >       return &bl->bl_layout;
> >  }
> >
> > +static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
> > +                                                gfp_t gfp_flags)
> > +{
> > +     return __bl_alloc_layout_hdr(inode, gfp_flags, false);
> > +}
> > +
> > +static struct pnfs_layout_hdr *sl_alloc_layout_hdr(struct inode *inode,
> > +                                                gfp_t gfp_flags)
> > +{
> > +     return __bl_alloc_layout_hdr(inode, gfp_flags, true);
> > +}
> > +
> >  static void bl_free_lseg(struct pnfs_layout_segment *lseg)
> >  {
> >       dprintk("%s enter\n", __func__);
> > @@ -888,22 +901,53 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
> >       .sync                           = pnfs_generic_sync,
> >  };
> >
> > +static struct pnfs_layoutdriver_type scsilayout_type = {
> > +     .id                             = LAYOUT_SCSI,
> > +     .name                           = "LAYOUT_SCSI",
> > +     .owner                          = THIS_MODULE,
> > +     .flags                          = PNFS_LAYOUTRET_ON_SETATTR |
> > +                                       PNFS_READ_WHOLE_PAGE,
> > +     .read_pagelist                  = bl_read_pagelist,
> > +     .write_pagelist                 = bl_write_pagelist,
> > +     .alloc_layout_hdr               = sl_alloc_layout_hdr,
> > +     .free_layout_hdr                = bl_free_layout_hdr,
> > +     .alloc_lseg                     = bl_alloc_lseg,
> > +     .free_lseg                      = bl_free_lseg,
> > +     .return_range                   = bl_return_range,
> > +     .prepare_layoutcommit           = bl_prepare_layoutcommit,
> > +     .cleanup_layoutcommit           = bl_cleanup_layoutcommit,
> > +     .set_layoutdriver               = bl_set_layoutdriver,
> > +     .alloc_deviceid_node            = bl_alloc_deviceid_node,
> > +     .free_deviceid_node             = bl_free_deviceid_node,
> > +     .pg_read_ops                    = &bl_pg_read_ops,
> > +     .pg_write_ops                   = &bl_pg_write_ops,
> > +     .sync                           = pnfs_generic_sync,
> > +};
> > +
> > +
> >  static int __init nfs4blocklayout_init(void)
> >  {
> >       int ret;
> >
> >       dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
> >
> > -     ret = pnfs_register_layoutdriver(&blocklayout_type);
> > +     ret = bl_init_pipefs();
> >       if (ret)
> >               goto out;
> > -     ret = bl_init_pipefs();
> > +
> > +     ret = pnfs_register_layoutdriver(&blocklayout_type);
> >       if (ret)
> > -             goto out_unregister;
> > +             goto out_cleanup_pipe;
> > +
> > +     ret = pnfs_register_layoutdriver(&scsilayout_type);
> > +     if (ret)
> > +             goto out_unregister_block;
> >       return 0;
> >
> > -out_unregister:
> > +out_unregister_block:
> >       pnfs_unregister_layoutdriver(&blocklayout_type);
> > +out_cleanup_pipe:
> > +     bl_cleanup_pipefs();
> >  out:
> >       return ret;
> >  }
> > @@ -913,8 +957,9 @@ static void __exit nfs4blocklayout_exit(void)
> >       dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
> >              __func__);
> >
> > -     bl_cleanup_pipefs();
> > +     pnfs_unregister_layoutdriver(&scsilayout_type);
> >       pnfs_unregister_layoutdriver(&blocklayout_type);
> > +     bl_cleanup_pipefs();
> >  }
> >
> >  MODULE_ALIAS("nfs-layouttype4-3");
> > diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
> > index c556640..bc21205 100644
> > --- a/fs/nfs/blocklayout/blocklayout.h
> > +++ b/fs/nfs/blocklayout/blocklayout.h
> > @@ -55,7 +55,6 @@ struct pnfs_block_dev;
> >   */
> >  #define PNFS_BLOCK_UUID_LEN  128
> >
> > -
> >  struct pnfs_block_volume {
> >       enum pnfs_block_volume_type     type;
> >       union {
> > @@ -82,6 +81,13 @@ struct pnfs_block_volume {
> >                       u32             volumes_count;
> >                       u32             volumes[PNFS_BLOCK_MAX_DEVICES];
> >               } stripe;
> > +             struct {
> > +                     enum scsi_code_set              code_set;
> > +                     enum scsi_designator_type       designator_type;
> > +                     int                             designator_len;
> > +                     u8                              designator[256];
> > +                     u64                             pr_key;
> > +             } scsi;
> >       };
> >  };
> >
> > @@ -106,6 +112,9 @@ struct pnfs_block_dev {
> >       struct block_device             *bdev;
> >       u64                             disk_offset;
> >
> > +     u64                             pr_key;
> > +     bool                            pr_registered;
> > +
> >       bool (*map)(struct pnfs_block_dev *dev, u64 offset,
> >                       struct pnfs_block_dev_map *map);
> >  };
> > @@ -131,6 +140,7 @@ struct pnfs_block_layout {
> >       struct rb_root          bl_ext_rw;
> >       struct rb_root          bl_ext_ro;
> >       spinlock_t              bl_ext_lock;   /* Protects list manipulation */
> > +     bool                    bl_scsi_layout;
> >  };
> >
> >  static inline struct pnfs_block_layout *
> > @@ -182,6 +192,6 @@ void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
> >  dev_t bl_resolve_deviceid(struct nfs_server *server,
> >               struct pnfs_block_volume *b, gfp_t gfp_mask);
> >  int __init bl_init_pipefs(void);
> > -void __exit bl_cleanup_pipefs(void);
> > +void bl_cleanup_pipefs(void);
> >
> >  #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
> > diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
> > index a861bbd..31b0f6b 100644
> > --- a/fs/nfs/blocklayout/dev.c
> > +++ b/fs/nfs/blocklayout/dev.c
> > @@ -1,11 +1,12 @@
> >  /*
> > - * Copyright (c) 2014 Christoph Hellwig.
> > + * Copyright (c) 2014-2016 Christoph Hellwig.
> >   */
> >  #include <linux/sunrpc/svc.h>
> >  #include <linux/blkdev.h>
> >  #include <linux/nfs4.h>
> >  #include <linux/nfs_fs.h>
> >  #include <linux/nfs_xdr.h>
> > +#include <linux/pr.h>
> >
> >  #include "blocklayout.h"
> >
> > @@ -21,6 +22,17 @@ bl_free_device(struct pnfs_block_dev *dev)
> >                       bl_free_device(&dev->children[i]);
> >               kfree(dev->children);
> >       } else {
> > +             if (dev->pr_registered) {
> > +                     const struct pr_ops *ops =
> > +                             dev->bdev->bd_disk->fops->pr_ops;
> > +                     int error;
> > +
> > +                     error = ops->pr_register(dev->bdev, dev->pr_key, 0,
> > +                             false);
> > +                     if (error)
> > +                             pr_err("failed to unregister PR key.\n");
> > +             }
> > +
> >               if (dev->bdev)
> >                       blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE);
> >       }
> > @@ -113,6 +125,24 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
> >               for (i = 0; i < b->stripe.volumes_count; i++)
> >                       b->stripe.volumes[i] = be32_to_cpup(p++);
> >               break;
> > +     case PNFS_BLOCK_VOLUME_SCSI:
> > +             p = xdr_inline_decode(xdr, 4 + 4 + 4);
> > +             if (!p)
> > +                     return -EIO;
> > +             b->scsi.code_set = be32_to_cpup(p++);
> > +             b->scsi.designator_type = be32_to_cpup(p++);
> > +             b->scsi.designator_len = be32_to_cpup(p++);
> > +             p = xdr_inline_decode(xdr, b->scsi.designator_len);
> > +             if (!p)
> > +                     return -EIO;
> > +             if (b->scsi.designator_len > 256)
> > +                     return -EIO;
> > +             memcpy(&b->scsi.designator, p, b->scsi.designator_len);
> > +             p = xdr_inline_decode(xdr, 8);
> > +             if (!p)
> > +                     return -EIO;
> > +             p = xdr_decode_hyper(p, &b->scsi.pr_key);
> > +             break;
> >       default:
> >               dprintk("unknown volume type!\n");
> >               return -EIO;
> > @@ -216,6 +246,116 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
> >       return 0;
> >  }
> >
> > +static bool
> > +bl_validate_designator(struct pnfs_block_volume *v)
> > +{
> > +     switch (v->scsi.designator_type) {
> > +     case PS_DESIGNATOR_EUI64:
> > +             if (v->scsi.code_set != PS_CODE_SET_BINARY)
> > +                     return false;
> > +
> > +             if (v->scsi.designator_len != 8 &&
> > +                 v->scsi.designator_len != 10 &&
> > +                 v->scsi.designator_len != 16)
> > +                     return false;
> > +
> > +             return true;
> > +     case PS_DESIGNATOR_NAA:
> > +             if (v->scsi.code_set != PS_CODE_SET_BINARY)
> > +                     return false;
> > +
> > +             if (v->scsi.designator_len != 8 &&
> > +                 v->scsi.designator_len != 16)
> > +                     return false;
> > +
> > +             return true;
> > +     case PS_DESIGNATOR_T10:
> > +     case PS_DESIGNATOR_NAME:
> > +             pr_err("pNFS: unsupported designator "
> > +                     "(code set %d, type %d, len %d.\n",
> > +                     v->scsi.code_set,
> > +                     v->scsi.designator_type,
> > +                     v->scsi.designator_len);
> > +             return false;
> > +     default:
> > +             pr_err("pNFS: invalid designator "
> > +                     "(code set %d, type %d, len %d.\n",
> > +                     v->scsi.code_set,
> > +                     v->scsi.designator_type,
> > +                     v->scsi.designator_len);
> > +             return false;
> > +     }
> > +}
> > +
> > +static int
> > +bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
> > +             struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
> > +{
> > +     struct pnfs_block_volume *v = &volumes[idx];
> > +     const struct pr_ops *ops;
> > +     const char *devname;
> > +     int error;
> > +
> > +     if (!bl_validate_designator(v))
> > +             return -EINVAL;
> > +
> > +     switch (v->scsi.designator_len) {
> > +     case 8:
> > +             devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%8phN",
> > +                             v->scsi.designator);
> > +             break;
> > +     case 12:
> > +             devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%12phN",
> > +                             v->scsi.designator);
> > +             break;
> > +     case 16:
> > +             devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%16phN",
> > +                             v->scsi.designator);
> > +             break;
> > +     default:
> > +             return -EINVAL;
> > +     }
> > +
> > +     d->bdev = blkdev_get_by_path(devname, FMODE_READ, NULL);
> > +     if (IS_ERR(d->bdev)) {
> > +             pr_warn("pNFS: failed to open device %s (%ld)\n",
> > +                     devname, PTR_ERR(d->bdev));
> > +             kfree(devname);
> > +             return PTR_ERR(d->bdev);
> > +     }
> > +
> > +     kfree(devname);
> > +
> > +     d->len = i_size_read(d->bdev->bd_inode);
> > +     d->map = bl_map_simple;
> > +     d->pr_key = v->scsi.pr_key;
> > +
> > +     pr_info("pNFS: using block device %s (reservation key 0x%llx)\n",
> > +             d->bdev->bd_disk->disk_name, d->pr_key);
> > +
> > +     ops = d->bdev->bd_disk->fops->pr_ops;
> > +     if (!ops) {
> > +             pr_err("pNFS: block device %s does not support reservations.",
> > +                             d->bdev->bd_disk->disk_name);
> > +             error = -EINVAL;
> > +             goto out_blkdev_put;
> > +     }
> > +
> > +     error = ops->pr_register(d->bdev, 0, d->pr_key, true);
> > +     if (error) {
> > +             pr_err("pNFS: failed to register key for block device %s.",
> > +                             d->bdev->bd_disk->disk_name);
> > +             goto out_blkdev_put;
> > +     }
> > +
> > +     d->pr_registered = true;
> > +     return 0;
> > +
> > +out_blkdev_put:
> > +     blkdev_put(d->bdev, FMODE_READ);
> > +     return error;
> > +}
> > +
> >  static int
> >  bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
> >               struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
> > @@ -303,6 +443,8 @@ bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
> >               return bl_parse_concat(server, d, volumes, idx, gfp_mask);
> >       case PNFS_BLOCK_VOLUME_STRIPE:
> >               return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
> > +     case PNFS_BLOCK_VOLUME_SCSI:
> > +             return bl_parse_scsi(server, d, volumes, idx, gfp_mask);
> >       default:
> >               dprintk("unsupported volume type: %d\n", volumes[idx].type);
> >               return -EIO;
> > diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
> > index c59a59c..df366fc 100644
> > --- a/fs/nfs/blocklayout/extent_tree.c
> > +++ b/fs/nfs/blocklayout/extent_tree.c
> > @@ -1,5 +1,5 @@
> >  /*
> > - * Copyright (c) 2014 Christoph Hellwig.
> > + * Copyright (c) 2014-2016 Christoph Hellwig.
> >   */
> >
> >  #include <linux/vmalloc.h>
> > @@ -462,10 +462,12 @@ out:
> >       return err;
> >  }
> >
> > -static size_t ext_tree_layoutupdate_size(size_t count)
> > +static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout *bl, size_t count)
> >  {
> > -     return sizeof(__be32) /* number of entries */ +
> > -             PNFS_BLOCK_EXTENT_SIZE * count;
> > +     if (bl->bl_scsi_layout)
> > +             return sizeof(__be32) + PNFS_SCSI_RANGE_SIZE * count;
> > +     else
> > +             return sizeof(__be32) + PNFS_BLOCK_EXTENT_SIZE * count;
> >  }
> >
> >  static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
> > @@ -482,6 +484,23 @@ static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
> >       }
> >  }
> >
> > +static __be32 *encode_block_extent(struct pnfs_block_extent *be, __be32 *p)
> > +{
> > +     p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
> > +                     NFS4_DEVICEID4_SIZE);
> > +     p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
> > +     p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
> > +     p = xdr_encode_hyper(p, 0LL);
> > +     *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
> > +     return p;
> > +}
> > +
> > +static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
> > +{
> > +     p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
> > +     return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
> > +}
> > +
> >  static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
> >               size_t buffer_size, size_t *count)
> >  {
> > @@ -495,19 +514,16 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
> >                       continue;
> >
> >               (*count)++;
> > -             if (ext_tree_layoutupdate_size(*count) > buffer_size) {
> > +             if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) {
> >                       /* keep counting.. */
> >                       ret = -ENOSPC;
> >                       continue;
> >               }
> >
> > -             p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
> > -                             NFS4_DEVICEID4_SIZE);
> > -             p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
> > -             p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
> > -             p = xdr_encode_hyper(p, 0LL);
> > -             *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
> > -
> > +             if (bl->bl_scsi_layout)
> > +                     p = encode_scsi_range(be, p);
> > +             else
> > +                     p = encode_block_extent(be, p);
> >               be->be_tag = EXTENT_COMMITTING;
> >       }
> >       spin_unlock(&bl->bl_ext_lock);
> > @@ -536,7 +552,7 @@ retry:
> >       if (unlikely(ret)) {
> >               ext_tree_free_commitdata(arg, buffer_size);
> >
> > -             buffer_size = ext_tree_layoutupdate_size(count);
> > +             buffer_size = ext_tree_layoutupdate_size(bl, count);
> >               count = 0;
> >
> >               arg->layoutupdate_pages =
> > @@ -555,7 +571,7 @@ retry:
> >       }
> >
> >       *start_p = cpu_to_be32(count);
> > -     arg->layoutupdate_len = ext_tree_layoutupdate_size(count);
> > +     arg->layoutupdate_len = ext_tree_layoutupdate_size(bl, count);
> >
> >       if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
> >               void *p = start_p, *end = p + arg->layoutupdate_len;
> > diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
> > index dbe5839..9fb067a6 100644
> > --- a/fs/nfs/blocklayout/rpc_pipefs.c
> > +++ b/fs/nfs/blocklayout/rpc_pipefs.c
> > @@ -281,7 +281,7 @@ out:
> >       return ret;
> >  }
> >
> > -void __exit bl_cleanup_pipefs(void)
> > +void bl_cleanup_pipefs(void)
> >  {
> >       rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
> >       unregister_pernet_subsys(&nfs4blocklayout_net_ops);
> > --
> > 2.1.4

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 4/4] nfsd: add SCSI layout support
  2016-03-08 22:15   ` J. Bruce Fields
@ 2016-03-09 14:48     ` Christoph Hellwig
  0 siblings, 0 replies; 17+ messages in thread
From: Christoph Hellwig @ 2016-03-09 14:48 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Christoph Hellwig, trond.myklebust, linux-nfs

> May as well document the server-side config there too, I guess; I'm
> editing that like:
> 
> 	On {+a server built with CONFIG_NFSD_SCSI,+} the[-server,-] pNFS
> 	SCSI volume support is

Fine with me!


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 4/4] nfsd: add SCSI layout support
  2016-03-04 19:46 ` [PATCH 4/4] nfsd: add SCSI layout support Christoph Hellwig
  2016-03-08 22:15   ` J. Bruce Fields
@ 2016-03-10 22:26   ` J. Bruce Fields
  2016-03-11  9:24     ` Christoph Hellwig
  2016-03-11 22:52   ` J. Bruce Fields
  2016-03-17 21:52   ` J. Bruce Fields
  3 siblings, 1 reply; 17+ messages in thread
From: J. Bruce Fields @ 2016-03-10 22:26 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: trond.myklebust, linux-nfs

On Fri, Mar 04, 2016 at 08:46:17PM +0100, Christoph Hellwig wrote:
> This is a simple extension to the block layout driver to use SCSI
> persistent reservations for access control and fencing, as well as
> SCSI VPD pages for device identification.
> 
> For this we need to pass the nfs4_client to the proc_getdeviceinfo method
> to generate the reservation key, and add a new fence_client method
> to allow for fence actions in the layout driver.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  Documentation/filesystems/nfs/pnfs-scsi-server.txt |  22 ++
>  fs/nfsd/Kconfig                                    |  13 +
>  fs/nfsd/Makefile                                   |   1 +
>  fs/nfsd/blocklayout.c                              | 298 ++++++++++++++++++---
>  fs/nfsd/blocklayoutxdr.c                           |  65 ++++-
>  fs/nfsd/blocklayoutxdr.h                           |  14 +
>  fs/nfsd/nfs4layouts.c                              |  27 +-
>  fs/nfsd/nfs4proc.c                                 |   6 +-
>  fs/nfsd/pnfs.h                                     |   6 +
>  fs/xfs/Makefile                                    |   1 +
>  fs/xfs/xfs_pnfs.h                                  |   2 +-
>  11 files changed, 407 insertions(+), 48 deletions(-)
>  create mode 100644 Documentation/filesystems/nfs/pnfs-scsi-server.txt
> 
> diff --git a/Documentation/filesystems/nfs/pnfs-scsi-server.txt b/Documentation/filesystems/nfs/pnfs-scsi-server.txt
> new file mode 100644
> index 0000000..4150979
> --- /dev/null
> +++ b/Documentation/filesystems/nfs/pnfs-scsi-server.txt
> @@ -0,0 +1,22 @@
> +
> +pNFS SCSI layout server user guide
> +==================================
> +
> +This document describes support for pNFS SCSI layouts in the Linux NFS server.
> +With pNFS SCSI layouts, the NFS server acts as Metadata Server (MDS) for pNFS,
> +which in addition to handling all the metadata access to the NFS export,
> +also hands out layouts to the clients so that they can directly access the
> +underlying SCSI LUNs that are shared with the client.
> +
> +To use pNFS SCSI layouts with the Linux NFS server, the exported file
> +system needs to support the pNFS SCSI layouts (currently just XFS), and the
> +file system must sit on a SCSI LUN that is accessible to the clients in
> +addition to the MDS.  As of now the file system needs to sit directly on the
> +exported LUN, striping or concatenation of LUNs on the MDS and clients
> +is not supported yet.
> +
> +On the server, pNFS SCSI volume support is automatically enabled if the
> +file system is exported using the "pnfs" option and the underlying SCSI
> +device supports persistent reservations.  On the client make sure the kernel
> +has the CONFIG_PNFS_BLOCK option enabled, and the file system is mounted
> +using the NFSv4.1 protocol version (mount -o vers=4.1).
> diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
> index eb70d91..a30a313 100644
> --- a/fs/nfsd/Kconfig
> +++ b/fs/nfsd/Kconfig
> @@ -98,6 +98,19 @@ config NFSD_BLOCKLAYOUT
>  
>  	  If unsure, say N.
>  
> +config NFSD_SCSILAYOUT
> +	bool "NFSv4.1 server support for pNFS SCSI layouts"
> +	depends on NFSD_V4
> +	select NFSD_PNFS
> +	help
> +	  This option enables support for exporting pNFS SCSI layouts
> +	  in the kernel's NFS server. The pNFS SCSI layout enables NFS
> +	  clients to directly perform I/O to SCSI devices accessible to both
> +	  the server and the clients.  See draft-ietf-nfsv4-scsi-layout for
> +	  more details.
> +
> +	  If unsure, say N.
> +
>  config NFSD_V4_SECURITY_LABEL
>  	bool "Provide Security Label support for NFSv4 server"
>  	depends on NFSD_V4 && SECURITY
> diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
> index 679cdc6..3ae5f3c 100644
> --- a/fs/nfsd/Makefile
> +++ b/fs/nfsd/Makefile
> @@ -19,3 +19,4 @@ nfsd-$(CONFIG_NFSD_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
>  			   nfs4acl.o nfs4callback.o nfs4recover.o
>  nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
>  nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
> +nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o
> diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
> index c29d942..0e87e3e 100644
> --- a/fs/nfsd/blocklayout.c
> +++ b/fs/nfsd/blocklayout.c
> @@ -1,11 +1,14 @@
>  /*
> - * Copyright (c) 2014 Christoph Hellwig.
> + * Copyright (c) 2014-2016 Christoph Hellwig.
>   */
>  #include <linux/exportfs.h>
>  #include <linux/genhd.h>
>  #include <linux/slab.h>
> +#include <linux/pr.h>
>  
>  #include <linux/nfsd/debug.h>
> +#include <scsi/scsi_proto.h>
> +#include <scsi/scsi_common.h>
>  
>  #include "blocklayoutxdr.h"
>  #include "pnfs.h"
> @@ -13,37 +16,6 @@
>  #define NFSDDBG_FACILITY	NFSDDBG_PNFS
>  
>  
> -static int
> -nfsd4_block_get_device_info_simple(struct super_block *sb,
> -		struct nfsd4_getdeviceinfo *gdp)
> -{
> -	struct pnfs_block_deviceaddr *dev;
> -	struct pnfs_block_volume *b;
> -
> -	dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
> -		      sizeof(struct pnfs_block_volume), GFP_KERNEL);
> -	if (!dev)
> -		return -ENOMEM;
> -	gdp->gd_device = dev;
> -
> -	dev->nr_volumes = 1;
> -	b = &dev->volumes[0];
> -
> -	b->type = PNFS_BLOCK_VOLUME_SIMPLE;
> -	b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
> -	return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
> -			&b->simple.offset);
> -}
> -
> -static __be32
> -nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
> -		struct nfsd4_getdeviceinfo *gdp)
> -{
> -	if (sb->s_bdev != sb->s_bdev->bd_contains)
> -		return nfserr_inval;
> -	return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
> -}
> -
>  static __be32
>  nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
>  		struct nfsd4_layoutget *args)
> @@ -141,20 +113,13 @@ out_layoutunavailable:
>  }
>  
>  static __be32
> -nfsd4_block_proc_layoutcommit(struct inode *inode,
> -		struct nfsd4_layoutcommit *lcp)
> +nfsd4_block_commit_blocks(struct inode *inode, struct nfsd4_layoutcommit *lcp,
> +		struct iomap *iomaps, int nr_iomaps)
>  {
>  	loff_t new_size = lcp->lc_last_wr + 1;
>  	struct iattr iattr = { .ia_valid = 0 };
> -	struct iomap *iomaps;
> -	int nr_iomaps;
>  	int error;
>  
> -	nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
> -			lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
> -	if (nr_iomaps < 0)
> -		return nfserrno(nr_iomaps);
> -
>  	if (lcp->lc_mtime.tv_nsec == UTIME_NOW ||
>  	    timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0)
>  		lcp->lc_mtime = current_fs_time(inode->i_sb);
> @@ -172,6 +137,54 @@ nfsd4_block_proc_layoutcommit(struct inode *inode,
>  	return nfserrno(error);
>  }
>  
> +#ifdef CONFIG_NFSD_BLOCKLAYOUT
> +static int
> +nfsd4_block_get_device_info_simple(struct super_block *sb,
> +		struct nfsd4_getdeviceinfo *gdp)
> +{
> +	struct pnfs_block_deviceaddr *dev;
> +	struct pnfs_block_volume *b;
> +
> +	dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
> +		      sizeof(struct pnfs_block_volume), GFP_KERNEL);
> +	if (!dev)
> +		return -ENOMEM;
> +	gdp->gd_device = dev;
> +
> +	dev->nr_volumes = 1;
> +	b = &dev->volumes[0];
> +
> +	b->type = PNFS_BLOCK_VOLUME_SIMPLE;
> +	b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
> +	return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
> +			&b->simple.offset);
> +}
> +
> +static __be32
> +nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
> +		struct nfs4_client *clp,
> +		struct nfsd4_getdeviceinfo *gdp)
> +{
> +	if (sb->s_bdev != sb->s_bdev->bd_contains)
> +		return nfserr_inval;
> +	return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
> +}
> +
> +static __be32
> +nfsd4_block_proc_layoutcommit(struct inode *inode,
> +		struct nfsd4_layoutcommit *lcp)
> +{
> +	struct iomap *iomaps;
> +	int nr_iomaps;
> +
> +	nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
> +			lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
> +	if (nr_iomaps < 0)
> +		return nfserrno(nr_iomaps);
> +
> +	return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
> +}
> +
>  const struct nfsd4_layout_ops bl_layout_ops = {
>  	/*
>  	 * Pretend that we send notification to the client.  This is a blatant
> @@ -190,3 +203,206 @@ const struct nfsd4_layout_ops bl_layout_ops = {
>  	.encode_layoutget	= nfsd4_block_encode_layoutget,
>  	.proc_layoutcommit	= nfsd4_block_proc_layoutcommit,
>  };
> +#endif /* CONFIG_NFSD_BLOCKLAYOUT */
> +
> +#ifdef CONFIG_NFSD_SCSILAYOUT
> +static int nfsd4_scsi_identify_device(struct block_device *bdev,
> +		struct pnfs_block_volume *b)
> +{
> +	struct request_queue *q = bdev->bd_disk->queue;
> +	struct request *rq;
> +	size_t bufflen = 252, len, id_len;
> +	u8 *buf, *d, type, assoc;
> +	int error;
> +
> +	buf = kzalloc(bufflen, GFP_KERNEL);
> +	if (!buf)
> +		return -ENOMEM;
> +
> +	rq = blk_get_request(q, READ, GFP_KERNEL);
> +	if (IS_ERR(rq)) {
> +		error = -ENOMEM;
> +		goto out_free_buf;
> +	}
> +	blk_rq_set_block_pc(rq);
> +
> +	error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL);
> +	if (error)
> +		goto out_put_request;
> +
> +	rq->cmd[0] = INQUIRY;
> +	rq->cmd[1] = 1;
> +	rq->cmd[2] = 0x83;
> +	rq->cmd[3] = bufflen >> 8;
> +	rq->cmd[4] = bufflen & 0xff;
> +	rq->cmd_len = COMMAND_SIZE(INQUIRY);
> +
> +	error = blk_execute_rq(rq->q, NULL, rq, 1);
> +	if (error) {
> +		pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
> +			rq->errors);
> +		

Did you mean to have a "goto out_put_request;" there?

Fixed up locally under that assumption....

--b.

> +	}
> +
> +	len = (buf[2] << 8) + buf[3] + 4;
> +	if (len > bufflen) {
> +		pr_err("pNFS: INQUIRY 0x83 response invalid (len = %zd)\n",
> +			len);
> +		goto out_put_request;
> +	}
> +
> +	d = buf + 4;
> +	for (d = buf + 4; d < buf + len; d += id_len + 4) {
> +		id_len = d[3];
> +		type = d[1] & 0xf;
> +		assoc = (d[1] >> 4) & 0x3;
> +
> +		/*
> +		 * We only care about a EUI-64 and NAA designator types
> +		 * with LU association.
> +		 */
> +		if (assoc != 0x00)
> +			continue;
> +		if (type != 0x02 && type != 0x03)
> +			continue;
> +		if (id_len != 8 && id_len != 12 && id_len != 16)
> +			continue;
> +
> +		b->scsi.code_set = PS_CODE_SET_BINARY;
> +		b->scsi.designator_type = type == 0x02 ?
> +			PS_DESIGNATOR_EUI64 : PS_DESIGNATOR_NAA;
> +		b->scsi.designator_len = id_len;
> +		memcpy(b->scsi.designator, d + 4, id_len);
> +
> +		/*
> +		 * If we found a 8 or 12 byte descriptor continue on to
> +		 * see if a 16 byte one is available.  If we find a
> +		 * 16 byte descriptor we're done.
> +		 */
> +		if (id_len == 16)
> +			break;
> +	}
> +
> +out_put_request:
> +	blk_put_request(rq);
> +out_free_buf:
> +	kfree(buf);
> +	return error;
> +}
> +
> +#define NFSD_MDS_PR_KEY		0x0100000000000000
> +
> +/*
> + * We use the client ID as a unique key for the reservations.
> + * This allows us to easily fence a client when recalls fail.
> + */
> +static u64 nfsd4_scsi_pr_key(struct nfs4_client *clp)
> +{
> +	return ((u64)clp->cl_clientid.cl_boot << 32) | clp->cl_clientid.cl_id;
> +}
> +
> +static int
> +nfsd4_block_get_device_info_scsi(struct super_block *sb,
> +		struct nfs4_client *clp,
> +		struct nfsd4_getdeviceinfo *gdp)
> +{
> +	struct pnfs_block_deviceaddr *dev;
> +	struct pnfs_block_volume *b;
> +	const struct pr_ops *ops;
> +	int error;
> +
> +	dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
> +		      sizeof(struct pnfs_block_volume), GFP_KERNEL);
> +	if (!dev)
> +		return -ENOMEM;
> +	gdp->gd_device = dev;
> +
> +	dev->nr_volumes = 1;
> +	b = &dev->volumes[0];
> +
> +	b->type = PNFS_BLOCK_VOLUME_SCSI;
> +	b->scsi.pr_key = nfsd4_scsi_pr_key(clp);
> +
> +	error = nfsd4_scsi_identify_device(sb->s_bdev, b);
> +	if (error)
> +		return error;
> +
> +	ops = sb->s_bdev->bd_disk->fops->pr_ops;
> +	if (!ops) {
> +		pr_err("pNFS: device %s does not support PRs.\n",
> +			sb->s_id);
> +		return -EINVAL;
> +	}
> +
> +	error = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true);
> +	if (error) {
> +		pr_err("pNFS: failed to register key for device %s.\n",
> +			sb->s_id);
> +		return -EINVAL;
> +	}
> +
> +	error = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY,
> +			PR_EXCLUSIVE_ACCESS_REG_ONLY, 0);
> +	if (error) {
> +		pr_err("pNFS: failed to reserve device %s.\n",
> +			sb->s_id);
> +		return -EINVAL;
> +	}
> +
> +	return 0;
> +}
> +
> +static __be32
> +nfsd4_scsi_proc_getdeviceinfo(struct super_block *sb,
> +		struct nfs4_client *clp,
> +		struct nfsd4_getdeviceinfo *gdp)
> +{
> +	if (sb->s_bdev != sb->s_bdev->bd_contains)
> +		return nfserr_inval;
> +	return nfserrno(nfsd4_block_get_device_info_scsi(sb, clp, gdp));
> +}
> +static __be32
> +nfsd4_scsi_proc_layoutcommit(struct inode *inode,
> +		struct nfsd4_layoutcommit *lcp)
> +{
> +	struct iomap *iomaps;
> +	int nr_iomaps;
> +
> +	nr_iomaps = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout,
> +			lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
> +	if (nr_iomaps < 0)
> +		return nfserrno(nr_iomaps);
> +
> +	return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
> +}
> +
> +static void
> +nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls)
> +{
> +	struct nfs4_client *clp = ls->ls_stid.sc_client;
> +	struct block_device *bdev = ls->ls_file->f_path.mnt->mnt_sb->s_bdev;
> +
> +	bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY,
> +			nfsd4_scsi_pr_key(clp), 0, true);
> +}
> +
> +const struct nfsd4_layout_ops scsi_layout_ops = {
> +	/*
> +	 * Pretend that we send notification to the client.  This is a blatant
> +	 * lie to force recent Linux clients to cache our device IDs.
> +	 * We rarely ever change the device ID, so the harm of leaking deviceids
> +	 * for a while isn't too bad.  Unfortunately RFC5661 is a complete mess
> +	 * in this regard, but I filed errata 4119 for this a while ago, and
> +	 * hopefully the Linux client will eventually start caching deviceids
> +	 * without this again.
> +	 */
> +	.notify_types		=
> +			NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE,
> +	.proc_getdeviceinfo	= nfsd4_scsi_proc_getdeviceinfo,
> +	.encode_getdeviceinfo	= nfsd4_block_encode_getdeviceinfo,
> +	.proc_layoutget		= nfsd4_block_proc_layoutget,
> +	.encode_layoutget	= nfsd4_block_encode_layoutget,
> +	.proc_layoutcommit	= nfsd4_scsi_proc_layoutcommit,
> +	.fence_client		= nfsd4_scsi_fence_client,
> +};
> +#endif /* CONFIG_NFSD_SCSILAYOUT */
> diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
> index 6d834dc..ca18836 100644
> --- a/fs/nfsd/blocklayoutxdr.c
> +++ b/fs/nfsd/blocklayoutxdr.c
> @@ -1,5 +1,5 @@
>  /*
> - * Copyright (c) 2014 Christoph Hellwig.
> + * Copyright (c) 2014-2016 Christoph Hellwig.
>   */
>  #include <linux/sunrpc/svc.h>
>  #include <linux/exportfs.h>
> @@ -53,6 +53,18 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
>  		p = xdr_encode_hyper(p, b->simple.offset);
>  		p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len);
>  		break;
> +	case PNFS_BLOCK_VOLUME_SCSI:
> +		len = 4 + 4 + 4 + 4 + b->scsi.designator_len + 8;
> +		p = xdr_reserve_space(xdr, len);
> +		if (!p)
> +			return -ETOOSMALL;
> +
> +		*p++ = cpu_to_be32(b->type);
> +		*p++ = cpu_to_be32(b->scsi.code_set);
> +		*p++ = cpu_to_be32(b->scsi.designator_type);
> +		p = xdr_encode_opaque(p, b->scsi.designator, b->scsi.designator_len);
> +		p = xdr_encode_hyper(p, b->scsi.pr_key);
> +		break;
>  	default:
>  		return -ENOTSUPP;
>  	}
> @@ -155,3 +167,54 @@ fail:
>  	kfree(iomaps);
>  	return -EINVAL;
>  }
> +
> +int
> +nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
> +		u32 block_size)
> +{
> +	struct iomap *iomaps;
> +	u32 nr_iomaps, expected, i;
> +
> +	if (len < sizeof(u32)) {
> +		dprintk("%s: extent array too small: %u\n", __func__, len);
> +		return -EINVAL;
> +	}
> +
> +	nr_iomaps = be32_to_cpup(p++);
> +	expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE;
> +	if (len != expected) {
> +		dprintk("%s: extent array size mismatch: %u/%u\n",
> +			__func__, len, expected);
> +		return -EINVAL;
> +	}
> +
> +	iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL);
> +	if (!iomaps) {
> +		dprintk("%s: failed to allocate extent array\n", __func__);
> +		return -ENOMEM;
> +	}
> +
> +	for (i = 0; i < nr_iomaps; i++) {
> +		u64 val;
> +
> +		p = xdr_decode_hyper(p, &val);
> +		if (val & (block_size - 1)) {
> +			dprintk("%s: unaligned offset 0x%llx\n", __func__, val);
> +			goto fail;
> +		}
> +		iomaps[i].offset = val;
> +
> +		p = xdr_decode_hyper(p, &val);
> +		if (val & (block_size - 1)) {
> +			dprintk("%s: unaligned length 0x%llx\n", __func__, val);
> +			goto fail;
> +		}
> +		iomaps[i].length = val;
> +	}
> +
> +	*iomapp = iomaps;
> +	return nr_iomaps;
> +fail:
> +	kfree(iomaps);
> +	return -EINVAL;
> +}
> diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
> index 6de925f..397bc75 100644
> --- a/fs/nfsd/blocklayoutxdr.h
> +++ b/fs/nfsd/blocklayoutxdr.h
> @@ -15,6 +15,11 @@ struct pnfs_block_extent {
>  	enum pnfs_block_extent_state	es;
>  };
>  
> +struct pnfs_block_range {
> +	u64				foff;
> +	u64				len;
> +};
> +
>  /*
>   * Random upper cap for the uuid length to avoid unbounded allocation.
>   * Not actually limited by the protocol.
> @@ -29,6 +34,13 @@ struct pnfs_block_volume {
>  			u32		sig_len;
>  			u8		sig[PNFS_BLOCK_UUID_LEN];
>  		} simple;
> +		struct {
> +			enum scsi_code_set		code_set;
> +			enum scsi_designator_type	designator_type;
> +			int				designator_len;
> +			u8				designator[256];
> +			u64				pr_key;
> +		} scsi;
>  	};
>  };
>  
> @@ -43,5 +55,7 @@ __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
>  		struct nfsd4_layoutget *lgp);
>  int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
>  		u32 block_size);
> +int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
> +		u32 block_size);
>  
>  #endif /* _NFSD_BLOCKLAYOUTXDR_H */
> diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
> index 4e4def7..cbd804e 100644
> --- a/fs/nfsd/nfs4layouts.c
> +++ b/fs/nfsd/nfs4layouts.c
> @@ -1,6 +1,7 @@
>  /*
>   * Copyright (c) 2014 Christoph Hellwig.
>   */
> +#include <linux/blkdev.h>
>  #include <linux/kmod.h>
>  #include <linux/file.h>
>  #include <linux/jhash.h>
> @@ -29,6 +30,9 @@ const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] =  {
>  #ifdef CONFIG_NFSD_BLOCKLAYOUT
>  	[LAYOUT_BLOCK_VOLUME]	= &bl_layout_ops,
>  #endif
> +#ifdef CONFIG_NFSD_SCSILAYOUT
> +	[LAYOUT_SCSI]		= &scsi_layout_ops,
> +#endif
>  };
>  
>  /* pNFS device ID to export fsid mapping */
> @@ -123,12 +127,24 @@ void nfsd4_setup_layout_type(struct svc_export *exp)
>  	if (!(exp->ex_flags & NFSEXP_PNFS))
>  		return;
>  
> +	/*
> +	 * Check if the file system supports exporting a block-like layout.
> +	 * If the block device supports reservations prefer the SCSI layout,
> +	 * else advertise the block layout.
> +	 */
>  #ifdef CONFIG_NFSD_BLOCKLAYOUT
>  	if (sb->s_export_op->get_uuid &&
>  	    sb->s_export_op->map_blocks &&
>  	    sb->s_export_op->commit_blocks)
>  		exp->ex_layout_type = LAYOUT_BLOCK_VOLUME;
>  #endif
> +#ifdef CONFIG_NFSD_SCSILAYOUT
> +	/* overwrite block layout selection if needed */
> +	if (sb->s_export_op->map_blocks &&
> +	    sb->s_export_op->commit_blocks &&
> +	    sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops)
> +		exp->ex_layout_type = LAYOUT_SCSI;
> +#endif
>  }
>  
>  static void
> @@ -594,8 +610,6 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
>  
>  	rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str));
>  
> -	trace_layout_recall_fail(&ls->ls_stid.sc_stateid);
> -
>  	printk(KERN_WARNING
>  		"nfsd: client %s failed to respond to layout recall. "
>  		"  Fencing..\n", addr_str);
> @@ -630,6 +644,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
>  		container_of(cb, struct nfs4_layout_stateid, ls_recall);
>  	struct nfsd_net *nn;
>  	ktime_t now, cutoff;
> +	const struct nfsd4_layout_ops *ops;
>  	LIST_HEAD(reaplist);
>  
>  
> @@ -665,7 +680,13 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
>  		/*
>  		 * Unknown error or non-responding client, we'll need to fence.
>  		 */
> -		nfsd4_cb_layout_fail(ls);
> +		trace_layout_recall_fail(&ls->ls_stid.sc_stateid);
> +
> +		ops = nfsd4_layout_ops[ls->ls_layout_type];
> +		if (ops->fence_client)
> +			ops->fence_client(ls);
> +		else
> +			nfsd4_cb_layout_fail(ls);
>  		return -1;
>  	}
>  }
> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> index 4cba786..629443e 100644
> --- a/fs/nfsd/nfs4proc.c
> +++ b/fs/nfsd/nfs4proc.c
> @@ -1269,8 +1269,10 @@ nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
>  		goto out;
>  
>  	nfserr = nfs_ok;
> -	if (gdp->gd_maxcount != 0)
> -		nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp);
> +	if (gdp->gd_maxcount != 0) {
> +		nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb,
> +					cstate->session->se_client, gdp);
> +	}
>  
>  	gdp->gd_notify_types &= ops->notify_types;
>  out:
> diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
> index ff50bfa..7d073b9 100644
> --- a/fs/nfsd/pnfs.h
> +++ b/fs/nfsd/pnfs.h
> @@ -21,6 +21,7 @@ struct nfsd4_layout_ops {
>  	u32		notify_types;
>  
>  	__be32 (*proc_getdeviceinfo)(struct super_block *sb,
> +			struct nfs4_client *clp,
>  			struct nfsd4_getdeviceinfo *gdevp);
>  	__be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr,
>  			struct nfsd4_getdeviceinfo *gdevp);
> @@ -32,12 +33,17 @@ struct nfsd4_layout_ops {
>  
>  	__be32 (*proc_layoutcommit)(struct inode *inode,
>  			struct nfsd4_layoutcommit *lcp);
> +
> +	void (*fence_client)(struct nfs4_layout_stateid *ls);
>  };
>  
>  extern const struct nfsd4_layout_ops *nfsd4_layout_ops[];
>  #ifdef CONFIG_NFSD_BLOCKLAYOUT
>  extern const struct nfsd4_layout_ops bl_layout_ops;
>  #endif
> +#ifdef CONFIG_NFSD_SCSILAYOUT
> +extern const struct nfsd4_layout_ops scsi_layout_ops;
> +#endif
>  
>  __be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
>  		struct nfsd4_compound_state *cstate, stateid_t *stateid,
> diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> index d68b62a..3542d94 100644
> --- a/fs/xfs/Makefile
> +++ b/fs/xfs/Makefile
> @@ -122,3 +122,4 @@ xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o
>  xfs-$(CONFIG_SYSCTL)		+= xfs_sysctl.o
>  xfs-$(CONFIG_COMPAT)		+= xfs_ioctl32.o
>  xfs-$(CONFIG_NFSD_BLOCKLAYOUT)	+= xfs_pnfs.o
> +xfs-$(CONFIG_NFSD_SCSILAYOUT)	+= xfs_pnfs.o
> diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
> index d85529c..93f7485 100644
> --- a/fs/xfs/xfs_pnfs.h
> +++ b/fs/xfs/xfs_pnfs.h
> @@ -1,7 +1,7 @@
>  #ifndef _XFS_PNFS_H
>  #define _XFS_PNFS_H 1
>  
> -#ifdef CONFIG_NFSD_BLOCKLAYOUT
> +#if defined(CONFIG_NFSD_BLOCKLAYOUT) || defined(CONFIG_NFSD_SCSILAYOUT)
>  int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset);
>  int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
>  		struct iomap *iomap, bool write, u32 *device_generation);
> -- 
> 2.1.4

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 4/4] nfsd: add SCSI layout support
  2016-03-10 22:26   ` J. Bruce Fields
@ 2016-03-11  9:24     ` Christoph Hellwig
  0 siblings, 0 replies; 17+ messages in thread
From: Christoph Hellwig @ 2016-03-11  9:24 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Christoph Hellwig, trond.myklebust, linux-nfs

On Thu, Mar 10, 2016 at 05:26:18PM -0500, J. Bruce Fields wrote:
> On Fri, Mar 04, 2016 at 08:46:17PM +0100, Christoph Hellwig wrote:
> > This is a simple extension to the block layout driver to use SCSI
> > persistent reservations for access control and fencing, as well as
> > SCSI VPD pages for device identification.
> > 
> > For this we need to pass the nfs4_client to the proc_getdeviceinfo method
> > to generate the reservation key, and add a new fence_client method
> > to allow for fence actions in the layout driver.
> > 
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > ---
> >  Documentation/filesystems/nfs/pnfs-scsi-server.txt |  22 ++
> >  fs/nfsd/Kconfig                                    |  13 +
> >  fs/nfsd/Makefile                                   |   1 +
> >  fs/nfsd/blocklayout.c                              | 298 ++++++++++++++++++---
> >  fs/nfsd/blocklayoutxdr.c                           |  65 ++++-
> >  fs/nfsd/blocklayoutxdr.h                           |  14 +
> >  fs/nfsd/nfs4layouts.c                              |  27 +-
> >  fs/nfsd/nfs4proc.c                                 |   6 +-
> >  fs/nfsd/pnfs.h                                     |   6 +
> >  fs/xfs/Makefile                                    |   1 +
> >  fs/xfs/xfs_pnfs.h                                  |   2 +-
> >  11 files changed, 407 insertions(+), 48 deletions(-)
> >  create mode 100644 Documentation/filesystems/nfs/pnfs-scsi-server.txt
> > 
> > diff --git a/Documentation/filesystems/nfs/pnfs-scsi-server.txt b/Documentation/filesystems/nfs/pnfs-scsi-server.txt
> > new file mode 100644
> > index 0000000..4150979
> > --- /dev/null
> > +++ b/Documentation/filesystems/nfs/pnfs-scsi-server.txt
> > @@ -0,0 +1,22 @@
> > +
> > +pNFS SCSI layout server user guide
> > +==================================
> > +
> > +This document describes support for pNFS SCSI layouts in the Linux NFS server.
> > +With pNFS SCSI layouts, the NFS server acts as Metadata Server (MDS) for pNFS,
> > +which in addition to handling all the metadata access to the NFS export,
> > +also hands out layouts to the clients so that they can directly access the
> > +underlying SCSI LUNs that are shared with the client.
> > +
> > +To use pNFS SCSI layouts with the Linux NFS server, the exported file
> > +system needs to support the pNFS SCSI layouts (currently just XFS), and the
> > +file system must sit on a SCSI LUN that is accessible to the clients in
> > +addition to the MDS.  As of now the file system needs to sit directly on the
> > +exported LUN, striping or concatenation of LUNs on the MDS and clients
> > +is not supported yet.
> > +
> > +On the server, pNFS SCSI volume support is automatically enabled if the
> > +file system is exported using the "pnfs" option and the underlying SCSI
> > +device supports persistent reservations.  On the client make sure the kernel
> > +has the CONFIG_PNFS_BLOCK option enabled, and the file system is mounted
> > +using the NFSv4.1 protocol version (mount -o vers=4.1).
> > diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
> > index eb70d91..a30a313 100644
> > --- a/fs/nfsd/Kconfig
> > +++ b/fs/nfsd/Kconfig
> > @@ -98,6 +98,19 @@ config NFSD_BLOCKLAYOUT
> >  
> >  	  If unsure, say N.
> >  
> > +config NFSD_SCSILAYOUT
> > +	bool "NFSv4.1 server support for pNFS SCSI layouts"
> > +	depends on NFSD_V4
> > +	select NFSD_PNFS
> > +	help
> > +	  This option enables support for the exporting pNFS SCSI layouts
> > +	  in the kernel's NFS server. The pNFS SCSI layout enables NFS
> > +	  clients to directly perform I/O to SCSI devices accessible to both
> > +	  the server and the clients.  See draft-ietf-nfsv4-scsi-layout for
> > +	  more details.
> > +
> > +	  If unsure, say N.
> > +
> >  config NFSD_V4_SECURITY_LABEL
> >  	bool "Provide Security Label support for NFSv4 server"
> >  	depends on NFSD_V4 && SECURITY
> > diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
> > index 679cdc6..3ae5f3c 100644
> > --- a/fs/nfsd/Makefile
> > +++ b/fs/nfsd/Makefile
> > @@ -19,3 +19,4 @@ nfsd-$(CONFIG_NFSD_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
> >  			   nfs4acl.o nfs4callback.o nfs4recover.o
> >  nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
> >  nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
> > +nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o
> > diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
> > index c29d942..0e87e3e 100644
> > --- a/fs/nfsd/blocklayout.c
> > +++ b/fs/nfsd/blocklayout.c
> > @@ -1,11 +1,14 @@
> >  /*
> > - * Copyright (c) 2014 Christoph Hellwig.
> > + * Copyright (c) 2014-2016 Christoph Hellwig.
> >   */
> >  #include <linux/exportfs.h>
> >  #include <linux/genhd.h>
> >  #include <linux/slab.h>
> > +#include <linux/pr.h>
> >  
> >  #include <linux/nfsd/debug.h>
> > +#include <scsi/scsi_proto.h>
> > +#include <scsi/scsi_common.h>
> >  
> >  #include "blocklayoutxdr.h"
> >  #include "pnfs.h"
> > @@ -13,37 +16,6 @@
> >  #define NFSDDBG_FACILITY	NFSDDBG_PNFS
> >  
> >  
> > -static int
> > -nfsd4_block_get_device_info_simple(struct super_block *sb,
> > -		struct nfsd4_getdeviceinfo *gdp)
> > -{
> > -	struct pnfs_block_deviceaddr *dev;
> > -	struct pnfs_block_volume *b;
> > -
> > -	dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
> > -		      sizeof(struct pnfs_block_volume), GFP_KERNEL);
> > -	if (!dev)
> > -		return -ENOMEM;
> > -	gdp->gd_device = dev;
> > -
> > -	dev->nr_volumes = 1;
> > -	b = &dev->volumes[0];
> > -
> > -	b->type = PNFS_BLOCK_VOLUME_SIMPLE;
> > -	b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
> > -	return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
> > -			&b->simple.offset);
> > -}
> > -
> > -static __be32
> > -nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
> > -		struct nfsd4_getdeviceinfo *gdp)
> > -{
> > -	if (sb->s_bdev != sb->s_bdev->bd_contains)
> > -		return nfserr_inval;
> > -	return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
> > -}
> > -
> >  static __be32
> >  nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
> >  		struct nfsd4_layoutget *args)
> > @@ -141,20 +113,13 @@ out_layoutunavailable:
> >  }
> >  
> >  static __be32
> > -nfsd4_block_proc_layoutcommit(struct inode *inode,
> > -		struct nfsd4_layoutcommit *lcp)
> > +nfsd4_block_commit_blocks(struct inode *inode, struct nfsd4_layoutcommit *lcp,
> > +		struct iomap *iomaps, int nr_iomaps)
> >  {
> >  	loff_t new_size = lcp->lc_last_wr + 1;
> >  	struct iattr iattr = { .ia_valid = 0 };
> > -	struct iomap *iomaps;
> > -	int nr_iomaps;
> >  	int error;
> >  
> > -	nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
> > -			lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
> > -	if (nr_iomaps < 0)
> > -		return nfserrno(nr_iomaps);
> > -
> >  	if (lcp->lc_mtime.tv_nsec == UTIME_NOW ||
> >  	    timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0)
> >  		lcp->lc_mtime = current_fs_time(inode->i_sb);
> > @@ -172,6 +137,54 @@ nfsd4_block_proc_layoutcommit(struct inode *inode,
> >  	return nfserrno(error);
> >  }
> >  
> > +#ifdef CONFIG_NFSD_BLOCKLAYOUT
> > +static int
> > +nfsd4_block_get_device_info_simple(struct super_block *sb,
> > +		struct nfsd4_getdeviceinfo *gdp)
> > +{
> > +	struct pnfs_block_deviceaddr *dev;
> > +	struct pnfs_block_volume *b;
> > +
> > +	dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
> > +		      sizeof(struct pnfs_block_volume), GFP_KERNEL);
> > +	if (!dev)
> > +		return -ENOMEM;
> > +	gdp->gd_device = dev;
> > +
> > +	dev->nr_volumes = 1;
> > +	b = &dev->volumes[0];
> > +
> > +	b->type = PNFS_BLOCK_VOLUME_SIMPLE;
> > +	b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
> > +	return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
> > +			&b->simple.offset);
> > +}
> > +
> > +static __be32
> > +nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
> > +		struct nfs4_client *clp,
> > +		struct nfsd4_getdeviceinfo *gdp)
> > +{
> > +	if (sb->s_bdev != sb->s_bdev->bd_contains)
> > +		return nfserr_inval;
> > +	return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
> > +}
> > +
> > +static __be32
> > +nfsd4_block_proc_layoutcommit(struct inode *inode,
> > +		struct nfsd4_layoutcommit *lcp)
> > +{
> > +	struct iomap *iomaps;
> > +	int nr_iomaps;
> > +
> > +	nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
> > +			lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
> > +	if (nr_iomaps < 0)
> > +		return nfserrno(nr_iomaps);
> > +
> > +	return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
> > +}
> > +
> >  const struct nfsd4_layout_ops bl_layout_ops = {
> >  	/*
> >  	 * Pretend that we send notification to the client.  This is a blatant
> > @@ -190,3 +203,206 @@ const struct nfsd4_layout_ops bl_layout_ops = {
> >  	.encode_layoutget	= nfsd4_block_encode_layoutget,
> >  	.proc_layoutcommit	= nfsd4_block_proc_layoutcommit,
> >  };
> > +#endif /* CONFIG_NFSD_BLOCKLAYOUT */
> > +
> > +#ifdef CONFIG_NFSD_SCSILAYOUT
> > +static int nfsd4_scsi_identify_device(struct block_device *bdev,
> > +		struct pnfs_block_volume *b)
> > +{
> > +	struct request_queue *q = bdev->bd_disk->queue;
> > +	struct request *rq;
> > +	size_t bufflen = 252, len, id_len;
> > +	u8 *buf, *d, type, assoc;
> > +	int error;
> > +
> > +	buf = kzalloc(bufflen, GFP_KERNEL);
> > +	if (!buf)
> > +		return -ENOMEM;
> > +
> > +	rq = blk_get_request(q, READ, GFP_KERNEL);
> > +	if (IS_ERR(rq)) {
> > +		error = -ENOMEM;
> > +		goto out_free_buf;
> > +	}
> > +	blk_rq_set_block_pc(rq);
> > +
> > +	error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL);
> > +	if (error)
> > +		goto out_put_request;
> > +
> > +	rq->cmd[0] = INQUIRY;
> > +	rq->cmd[1] = 1;
> > +	rq->cmd[2] = 0x83;
> > +	rq->cmd[3] = bufflen >> 8;
> > +	rq->cmd[4] = bufflen & 0xff;
> > +	rq->cmd_len = COMMAND_SIZE(INQUIRY);
> > +
> > +	error = blk_execute_rq(rq->q, NULL, rq, 1);
> > +	if (error) {
> > +		pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
> > +			rq->errors);
> > +		
> 
> Did you mean to have a "goto out_put_request;" there?
> 
> Fixed up locally under that assumption....

Yes, thanks!

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 4/4] nfsd: add SCSI layout support
  2016-03-04 19:46 ` [PATCH 4/4] nfsd: add SCSI layout support Christoph Hellwig
  2016-03-08 22:15   ` J. Bruce Fields
  2016-03-10 22:26   ` J. Bruce Fields
@ 2016-03-11 22:52   ` J. Bruce Fields
  2016-03-17 21:52   ` J. Bruce Fields
  3 siblings, 0 replies; 17+ messages in thread
From: J. Bruce Fields @ 2016-03-11 22:52 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: trond.myklebust, linux-nfs

On Fri, Mar 04, 2016 at 08:46:17PM +0100, Christoph Hellwig wrote:
> This is a simple extension to the block layout driver to use SCSI
> persistent reservations for access control and fencing, as well as
> SCSI VPD pages for device identification.
> 
> For this we need to pass the nfs4_client to the proc_getdeviceinfo method
> to generate the reservation key, and add a new fence_client method
> to allow for fence actions in the layout driver.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  Documentation/filesystems/nfs/pnfs-scsi-server.txt |  22 ++
>  fs/nfsd/Kconfig                                    |  13 +
>  fs/nfsd/Makefile                                   |   1 +
>  fs/nfsd/blocklayout.c                              | 298 ++++++++++++++++++---
>  fs/nfsd/blocklayoutxdr.c                           |  65 ++++-
>  fs/nfsd/blocklayoutxdr.h                           |  14 +
>  fs/nfsd/nfs4layouts.c                              |  27 +-
>  fs/nfsd/nfs4proc.c                                 |   6 +-
>  fs/nfsd/pnfs.h                                     |   6 +
>  fs/xfs/Makefile                                    |   1 +
>  fs/xfs/xfs_pnfs.h                                  |   2 +-
>  11 files changed, 407 insertions(+), 48 deletions(-)
>  create mode 100644 Documentation/filesystems/nfs/pnfs-scsi-server.txt
> 
> diff --git a/Documentation/filesystems/nfs/pnfs-scsi-server.txt b/Documentation/filesystems/nfs/pnfs-scsi-server.txt
> new file mode 100644
> index 0000000..4150979
> --- /dev/null
> +++ b/Documentation/filesystems/nfs/pnfs-scsi-server.txt
> @@ -665,7 +680,13 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
>  		/*
>  		 * Unknown error or non-responding client, we'll need to fence.
>  		 */
> -		nfsd4_cb_layout_fail(ls);
> +		trace_layout_recall_fail(&ls->ls_stid.sc_stateid);
> +
> +		ops = nfsd4_layout_ops[ls->ls_layout_type];
> +		if (ops->fence_client)
> +			ops->fence_client(ls);
> +		else
> +			nfsd4_cb_layout_fail(ls);
>  		return -1;

I'm still not happy about this fencing on callback failure.  I think we
discussed it before, but all I can find is a promise from me to finish
some pynfs tests for the right behavior.  Which I haven't done.

I'd have to think through the failure scenarios to figure out how likely
this is to bite us.

--b.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/4] nfs/blocklayout: add SCSI layout support
  2016-03-08 22:42     ` Trond Myklebust
@ 2016-03-17 21:01       ` J. Bruce Fields
  2016-03-18 14:18         ` Trond Myklebust
  0 siblings, 1 reply; 17+ messages in thread
From: J. Bruce Fields @ 2016-03-17 21:01 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: Christoph Hellwig, Linux NFS Mailing List

On Tue, Mar 08, 2016 at 05:42:43PM -0500, Trond Myklebust wrote:
> On Tue, Mar 8, 2016 at 5:07 PM, J. Bruce Fields <bfields@fieldses.org> wrote:
> >
> > Trond, OK if I take this through the nfsd tree?
> >
> > Or I'm OK with doing this however.
> 
> I'm fine with that. I don't think there should be any conflicts with
> what is queued up in my 'linux-next' branch.

I forgot to ask--OK if I interpret that as an "acked-by" from you?

--b.

> 
> >
> > --b.
> >
> > On Fri, Mar 04, 2016 at 08:46:15PM +0100, Christoph Hellwig wrote:
> > > This is a trivial extension to the block layout driver to support the
> > > new SCSI layouts draft.  There are three changes:
> > >
> > > >  - device identification through the SCSI VPD page.  This allows us to
> > > >    directly use the udev generated persistent device names instead of
> > > >    requiring an expensive lookup by crawling every block device node
> > > >    in /dev and reading a signature for it.
> > > >  - use of SCSI persistent reservations to protect device access and
> > > >    allow for robust fencing.  On the client side this just means
> > > >    registering and unregistering a server supplied key.
> > > >  - an optimized LAYOUTCOMMIT payload that doesn't send unnecessary
> > > >    fields to the server.
> > >
> > > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > > ---
> > >  fs/nfs/blocklayout/blocklayout.c |  59 ++++++++++++++--
> > >  fs/nfs/blocklayout/blocklayout.h |  14 +++-
> > >  fs/nfs/blocklayout/dev.c         | 144 ++++++++++++++++++++++++++++++++++++++-
> > >  fs/nfs/blocklayout/extent_tree.c |  44 ++++++++----
> > >  fs/nfs/blocklayout/rpc_pipefs.c  |   2 +-
> > >  5 files changed, 238 insertions(+), 25 deletions(-)
> > >
> > > diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
> > > index ddd0138..b27c409 100644
> > > --- a/fs/nfs/blocklayout/blocklayout.c
> > > +++ b/fs/nfs/blocklayout/blocklayout.c
> > > @@ -446,8 +446,8 @@ static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
> > >       kfree(bl);
> > >  }
> > >
> > > -static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
> > > -                                                gfp_t gfp_flags)
> > > +static struct pnfs_layout_hdr *__bl_alloc_layout_hdr(struct inode *inode,
> > > +             gfp_t gfp_flags, bool is_scsi_layout)
> > >  {
> > >       struct pnfs_block_layout *bl;
> > >
> > > @@ -460,9 +460,22 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
> > >       bl->bl_ext_ro = RB_ROOT;
> > >       spin_lock_init(&bl->bl_ext_lock);
> > >
> > > +     bl->bl_scsi_layout = is_scsi_layout;
> > >       return &bl->bl_layout;
> > >  }
> > >
> > > +static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
> > > +                                                gfp_t gfp_flags)
> > > +{
> > > +     return __bl_alloc_layout_hdr(inode, gfp_flags, false);
> > > +}
> > > +
> > > +static struct pnfs_layout_hdr *sl_alloc_layout_hdr(struct inode *inode,
> > > +                                                gfp_t gfp_flags)
> > > +{
> > > +     return __bl_alloc_layout_hdr(inode, gfp_flags, true);
> > > +}
> > > +
> > >  static void bl_free_lseg(struct pnfs_layout_segment *lseg)
> > >  {
> > >       dprintk("%s enter\n", __func__);
> > > @@ -888,22 +901,53 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
> > >       .sync                           = pnfs_generic_sync,
> > >  };
> > >
> > > +static struct pnfs_layoutdriver_type scsilayout_type = {
> > > +     .id                             = LAYOUT_SCSI,
> > > +     .name                           = "LAYOUT_SCSI",
> > > +     .owner                          = THIS_MODULE,
> > > +     .flags                          = PNFS_LAYOUTRET_ON_SETATTR |
> > > +                                       PNFS_READ_WHOLE_PAGE,
> > > +     .read_pagelist                  = bl_read_pagelist,
> > > +     .write_pagelist                 = bl_write_pagelist,
> > > +     .alloc_layout_hdr               = sl_alloc_layout_hdr,
> > > +     .free_layout_hdr                = bl_free_layout_hdr,
> > > +     .alloc_lseg                     = bl_alloc_lseg,
> > > +     .free_lseg                      = bl_free_lseg,
> > > +     .return_range                   = bl_return_range,
> > > +     .prepare_layoutcommit           = bl_prepare_layoutcommit,
> > > +     .cleanup_layoutcommit           = bl_cleanup_layoutcommit,
> > > +     .set_layoutdriver               = bl_set_layoutdriver,
> > > +     .alloc_deviceid_node            = bl_alloc_deviceid_node,
> > > +     .free_deviceid_node             = bl_free_deviceid_node,
> > > +     .pg_read_ops                    = &bl_pg_read_ops,
> > > +     .pg_write_ops                   = &bl_pg_write_ops,
> > > +     .sync                           = pnfs_generic_sync,
> > > +};
> > > +
> > > +
> > >  static int __init nfs4blocklayout_init(void)
> > >  {
> > >       int ret;
> > >
> > >       dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
> > >
> > > -     ret = pnfs_register_layoutdriver(&blocklayout_type);
> > > +     ret = bl_init_pipefs();
> > >       if (ret)
> > >               goto out;
> > > -     ret = bl_init_pipefs();
> > > +
> > > +     ret = pnfs_register_layoutdriver(&blocklayout_type);
> > >       if (ret)
> > > -             goto out_unregister;
> > > +             goto out_cleanup_pipe;
> > > +
> > > +     ret = pnfs_register_layoutdriver(&scsilayout_type);
> > > +     if (ret)
> > > +             goto out_unregister_block;
> > >       return 0;
> > >
> > > -out_unregister:
> > > +out_unregister_block:
> > >       pnfs_unregister_layoutdriver(&blocklayout_type);
> > > +out_cleanup_pipe:
> > > +     bl_cleanup_pipefs();
> > >  out:
> > >       return ret;
> > >  }
> > > @@ -913,8 +957,9 @@ static void __exit nfs4blocklayout_exit(void)
> > >       dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
> > >              __func__);
> > >
> > > -     bl_cleanup_pipefs();
> > > +     pnfs_unregister_layoutdriver(&scsilayout_type);
> > >       pnfs_unregister_layoutdriver(&blocklayout_type);
> > > +     bl_cleanup_pipefs();
> > >  }
> > >
> > >  MODULE_ALIAS("nfs-layouttype4-3");
> > > diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
> > > index c556640..bc21205 100644
> > > --- a/fs/nfs/blocklayout/blocklayout.h
> > > +++ b/fs/nfs/blocklayout/blocklayout.h
> > > @@ -55,7 +55,6 @@ struct pnfs_block_dev;
> > >   */
> > >  #define PNFS_BLOCK_UUID_LEN  128
> > >
> > > -
> > >  struct pnfs_block_volume {
> > >       enum pnfs_block_volume_type     type;
> > >       union {
> > > @@ -82,6 +81,13 @@ struct pnfs_block_volume {
> > >                       u32             volumes_count;
> > >                       u32             volumes[PNFS_BLOCK_MAX_DEVICES];
> > >               } stripe;
> > > +             struct {
> > > +                     enum scsi_code_set              code_set;
> > > +                     enum scsi_designator_type       designator_type;
> > > +                     int                             designator_len;
> > > +                     u8                              designator[256];
> > > +                     u64                             pr_key;
> > > +             } scsi;
> > >       };
> > >  };
> > >
> > > @@ -106,6 +112,9 @@ struct pnfs_block_dev {
> > >       struct block_device             *bdev;
> > >       u64                             disk_offset;
> > >
> > > +     u64                             pr_key;
> > > +     bool                            pr_registered;
> > > +
> > >       bool (*map)(struct pnfs_block_dev *dev, u64 offset,
> > >                       struct pnfs_block_dev_map *map);
> > >  };
> > > @@ -131,6 +140,7 @@ struct pnfs_block_layout {
> > >       struct rb_root          bl_ext_rw;
> > >       struct rb_root          bl_ext_ro;
> > >       spinlock_t              bl_ext_lock;   /* Protects list manipulation */
> > > +     bool                    bl_scsi_layout;
> > >  };
> > >
> > >  static inline struct pnfs_block_layout *
> > > @@ -182,6 +192,6 @@ void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
> > >  dev_t bl_resolve_deviceid(struct nfs_server *server,
> > >               struct pnfs_block_volume *b, gfp_t gfp_mask);
> > >  int __init bl_init_pipefs(void);
> > > -void __exit bl_cleanup_pipefs(void);
> > > +void bl_cleanup_pipefs(void);
> > >
> > >  #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
> > > diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
> > > index a861bbd..31b0f6b 100644
> > > --- a/fs/nfs/blocklayout/dev.c
> > > +++ b/fs/nfs/blocklayout/dev.c
> > > @@ -1,11 +1,12 @@
> > >  /*
> > > - * Copyright (c) 2014 Christoph Hellwig.
> > > + * Copyright (c) 2014-2016 Christoph Hellwig.
> > >   */
> > >  #include <linux/sunrpc/svc.h>
> > >  #include <linux/blkdev.h>
> > >  #include <linux/nfs4.h>
> > >  #include <linux/nfs_fs.h>
> > >  #include <linux/nfs_xdr.h>
> > > +#include <linux/pr.h>
> > >
> > >  #include "blocklayout.h"
> > >
> > > @@ -21,6 +22,17 @@ bl_free_device(struct pnfs_block_dev *dev)
> > >                       bl_free_device(&dev->children[i]);
> > >               kfree(dev->children);
> > >       } else {
> > > +             if (dev->pr_registered) {
> > > +                     const struct pr_ops *ops =
> > > +                             dev->bdev->bd_disk->fops->pr_ops;
> > > +                     int error;
> > > +
> > > +                     error = ops->pr_register(dev->bdev, dev->pr_key, 0,
> > > +                             false);
> > > +                     if (error)
> > > +                             pr_err("failed to unregister PR key.\n");
> > > +             }
> > > +
> > >               if (dev->bdev)
> > >                       blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE);
> > >       }
> > > @@ -113,6 +125,24 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
> > >               for (i = 0; i < b->stripe.volumes_count; i++)
> > >                       b->stripe.volumes[i] = be32_to_cpup(p++);
> > >               break;
> > > +     case PNFS_BLOCK_VOLUME_SCSI:
> > > +             p = xdr_inline_decode(xdr, 4 + 4 + 4);
> > > +             if (!p)
> > > +                     return -EIO;
> > > +             b->scsi.code_set = be32_to_cpup(p++);
> > > +             b->scsi.designator_type = be32_to_cpup(p++);
> > > +             b->scsi.designator_len = be32_to_cpup(p++);
> > > +             p = xdr_inline_decode(xdr, b->scsi.designator_len);
> > > +             if (!p)
> > > +                     return -EIO;
> > > +             if (b->scsi.designator_len > 256)
> > > +                     return -EIO;
> > > +             memcpy(&b->scsi.designator, p, b->scsi.designator_len);
> > > +             p = xdr_inline_decode(xdr, 8);
> > > +             if (!p)
> > > +                     return -EIO;
> > > +             p = xdr_decode_hyper(p, &b->scsi.pr_key);
> > > +             break;
> > >       default:
> > >               dprintk("unknown volume type!\n");
> > >               return -EIO;
> > > @@ -216,6 +246,116 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
> > >       return 0;
> > >  }
> > >
> > > +static bool
> > > +bl_validate_designator(struct pnfs_block_volume *v)
> > > +{
> > > +     switch (v->scsi.designator_type) {
> > > +     case PS_DESIGNATOR_EUI64:
> > > +             if (v->scsi.code_set != PS_CODE_SET_BINARY)
> > > +                     return false;
> > > +
> > > +             if (v->scsi.designator_len != 8 &&
> > > +                 v->scsi.designator_len != 10 &&
> > > +                 v->scsi.designator_len != 16)
> > > +                     return false;
> > > +
> > > +             return true;
> > > +     case PS_DESIGNATOR_NAA:
> > > +             if (v->scsi.code_set != PS_CODE_SET_BINARY)
> > > +                     return false;
> > > +
> > > +             if (v->scsi.designator_len != 8 &&
> > > +                 v->scsi.designator_len != 16)
> > > +                     return false;
> > > +
> > > +             return true;
> > > +     case PS_DESIGNATOR_T10:
> > > +     case PS_DESIGNATOR_NAME:
> > > +             pr_err("pNFS: unsupported designator "
> > > +                     "(code set %d, type %d, len %d.\n",
> > > +                     v->scsi.code_set,
> > > +                     v->scsi.designator_type,
> > > +                     v->scsi.designator_len);
> > > +             return false;
> > > +     default:
> > > +             pr_err("pNFS: invalid designator "
> > > +                     "(code set %d, type %d, len %d.\n",
> > > +                     v->scsi.code_set,
> > > +                     v->scsi.designator_type,
> > > +                     v->scsi.designator_len);
> > > +             return false;
> > > +     }
> > > +}
> > > +
> > > +static int
> > > +bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
> > > +             struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
> > > +{
> > > +     struct pnfs_block_volume *v = &volumes[idx];
> > > +     const struct pr_ops *ops;
> > > +     const char *devname;
> > > +     int error;
> > > +
> > > +     if (!bl_validate_designator(v))
> > > +             return -EINVAL;
> > > +
> > > +     switch (v->scsi.designator_len) {
> > > +     case 8:
> > > +             devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%8phN",
> > > +                             v->scsi.designator);
> > > +             break;
> > > +     case 12:
> > > +             devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%12phN",
> > > +                             v->scsi.designator);
> > > +             break;
> > > +     case 16:
> > > +             devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%16phN",
> > > +                             v->scsi.designator);
> > > +             break;
> > > +     default:
> > > +             return -EINVAL;
> > > +     }
> > > +
> > > +     d->bdev = blkdev_get_by_path(devname, FMODE_READ, NULL);
> > > +     if (IS_ERR(d->bdev)) {
> > > +             pr_warn("pNFS: failed to open device %s (%ld)\n",
> > > +                     devname, PTR_ERR(d->bdev));
> > > +             kfree(devname);
> > > +             return PTR_ERR(d->bdev);
> > > +     }
> > > +
> > > +     kfree(devname);
> > > +
> > > +     d->len = i_size_read(d->bdev->bd_inode);
> > > +     d->map = bl_map_simple;
> > > +     d->pr_key = v->scsi.pr_key;
> > > +
> > > +     pr_info("pNFS: using block device %s (reservation key 0x%llx)\n",
> > > +             d->bdev->bd_disk->disk_name, d->pr_key);
> > > +
> > > +     ops = d->bdev->bd_disk->fops->pr_ops;
> > > +     if (!ops) {
> > > +             pr_err("pNFS: block device %s does not support reservations.",
> > > +                             d->bdev->bd_disk->disk_name);
> > > +             error = -EINVAL;
> > > +             goto out_blkdev_put;
> > > +     }
> > > +
> > > +     error = ops->pr_register(d->bdev, 0, d->pr_key, true);
> > > +     if (error) {
> > > +             pr_err("pNFS: failed to register key for block device %s.",
> > > +                             d->bdev->bd_disk->disk_name);
> > > +             goto out_blkdev_put;
> > > +     }
> > > +
> > > +     d->pr_registered = true;
> > > +     return 0;
> > > +
> > > +out_blkdev_put:
> > > +     blkdev_put(d->bdev, FMODE_READ);
> > > +     return error;
> > > +}
> > > +
> > >  static int
> > >  bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
> > >               struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
> > > @@ -303,6 +443,8 @@ bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
> > >               return bl_parse_concat(server, d, volumes, idx, gfp_mask);
> > >       case PNFS_BLOCK_VOLUME_STRIPE:
> > >               return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
> > > +     case PNFS_BLOCK_VOLUME_SCSI:
> > > +             return bl_parse_scsi(server, d, volumes, idx, gfp_mask);
> > >       default:
> > >               dprintk("unsupported volume type: %d\n", volumes[idx].type);
> > >               return -EIO;
> > > diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
> > > index c59a59c..df366fc 100644
> > > --- a/fs/nfs/blocklayout/extent_tree.c
> > > +++ b/fs/nfs/blocklayout/extent_tree.c
> > > @@ -1,5 +1,5 @@
> > >  /*
> > > - * Copyright (c) 2014 Christoph Hellwig.
> > > + * Copyright (c) 2014-2016 Christoph Hellwig.
> > >   */
> > >
> > >  #include <linux/vmalloc.h>
> > > @@ -462,10 +462,12 @@ out:
> > >       return err;
> > >  }
> > >
> > > -static size_t ext_tree_layoutupdate_size(size_t count)
> > > +static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout *bl, size_t count)
> > >  {
> > > -     return sizeof(__be32) /* number of entries */ +
> > > -             PNFS_BLOCK_EXTENT_SIZE * count;
> > > +     if (bl->bl_scsi_layout)
> > > +             return sizeof(__be32) + PNFS_SCSI_RANGE_SIZE * count;
> > > +     else
> > > +             return sizeof(__be32) + PNFS_BLOCK_EXTENT_SIZE * count;
> > >  }
> > >
> > >  static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
> > > @@ -482,6 +484,23 @@ static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
> > >       }
> > >  }
> > >
> > > +static __be32 *encode_block_extent(struct pnfs_block_extent *be, __be32 *p)
> > > +{
> > > +     p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
> > > +                     NFS4_DEVICEID4_SIZE);
> > > +     p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
> > > +     p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
> > > +     p = xdr_encode_hyper(p, 0LL);
> > > +     *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
> > > +     return p;
> > > +}
> > > +
> > > +static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
> > > +{
> > > +     p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
> > > +     return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
> > > +}
> > > +
> > >  static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
> > >               size_t buffer_size, size_t *count)
> > >  {
> > > @@ -495,19 +514,16 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
> > >                       continue;
> > >
> > >               (*count)++;
> > > -             if (ext_tree_layoutupdate_size(*count) > buffer_size) {
> > > +             if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) {
> > >                       /* keep counting.. */
> > >                       ret = -ENOSPC;
> > >                       continue;
> > >               }
> > >
> > > -             p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
> > > -                             NFS4_DEVICEID4_SIZE);
> > > -             p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
> > > -             p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
> > > -             p = xdr_encode_hyper(p, 0LL);
> > > -             *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
> > > -
> > > +             if (bl->bl_scsi_layout)
> > > +                     p = encode_scsi_range(be, p);
> > > +             else
> > > +                     p = encode_block_extent(be, p);
> > >               be->be_tag = EXTENT_COMMITTING;
> > >       }
> > >       spin_unlock(&bl->bl_ext_lock);
> > > @@ -536,7 +552,7 @@ retry:
> > >       if (unlikely(ret)) {
> > >               ext_tree_free_commitdata(arg, buffer_size);
> > >
> > > -             buffer_size = ext_tree_layoutupdate_size(count);
> > > +             buffer_size = ext_tree_layoutupdate_size(bl, count);
> > >               count = 0;
> > >
> > >               arg->layoutupdate_pages =
> > > @@ -555,7 +571,7 @@ retry:
> > >       }
> > >
> > >       *start_p = cpu_to_be32(count);
> > > -     arg->layoutupdate_len = ext_tree_layoutupdate_size(count);
> > > +     arg->layoutupdate_len = ext_tree_layoutupdate_size(bl, count);
> > >
> > >       if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
> > >               void *p = start_p, *end = p + arg->layoutupdate_len;
> > > diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
> > > index dbe5839..9fb067a6 100644
> > > --- a/fs/nfs/blocklayout/rpc_pipefs.c
> > > +++ b/fs/nfs/blocklayout/rpc_pipefs.c
> > > @@ -281,7 +281,7 @@ out:
> > >       return ret;
> > >  }
> > >
> > > -void __exit bl_cleanup_pipefs(void)
> > > +void bl_cleanup_pipefs(void)
> > >  {
> > >       rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
> > >       unregister_pernet_subsys(&nfs4blocklayout_net_ops);
> > > --
> > > 2.1.4

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 4/4] nfsd: add SCSI layout support
  2016-03-04 19:46 ` [PATCH 4/4] nfsd: add SCSI layout support Christoph Hellwig
                     ` (2 preceding siblings ...)
  2016-03-11 22:52   ` J. Bruce Fields
@ 2016-03-17 21:52   ` J. Bruce Fields
  2016-03-22 18:56     ` J. Bruce Fields
  3 siblings, 1 reply; 17+ messages in thread
From: J. Bruce Fields @ 2016-03-17 21:52 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: trond.myklebust, linux-nfs

On Fri, Mar 04, 2016 at 08:46:17PM +0100, Christoph Hellwig wrote:
> +int
> +nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
> +		u32 block_size)
> +{
> +	struct iomap *iomaps;
> +	u32 nr_iomaps, expected, i;
> +
> +	if (len < sizeof(u32)) {
> +		dprintk("%s: extent array too small: %u\n", __func__, len);
> +		return -EINVAL;
> +	}
> +
> +	nr_iomaps = be32_to_cpup(p++);
> +	expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE;
> +	if (len != expected) {

You could add any multiple of 2^32/PNFS_SCSI_RANGE_SIZE to nr_iomaps and
still pass this check.  Then you'd probably fail the following kcalloc,
but best to be paranoid if this is from-the-wire data.

Maybe something like this?  (Untested)

diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index ca1883668810..6c3b316f932e 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -105,18 +105,22 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
 		u32 block_size)
 {
 	struct iomap *iomaps;
-	u32 nr_iomaps, expected, i;
+	u32 nr_iomaps, i;
 
 	if (len < sizeof(u32)) {
 		dprintk("%s: extent array too small: %u\n", __func__, len);
 		return -EINVAL;
 	}
+	len -= sizeof(u32);
+	if (len % PNFS_BLOCK_EXTENT_SIZE) {
+		dprintk("%s: extent array invalid: %u\n", __func__, len);
+		return -EINVAL;
+	}
 
 	nr_iomaps = be32_to_cpup(p++);
-	expected = sizeof(__be32) + nr_iomaps * PNFS_BLOCK_EXTENT_SIZE;
-	if (len != expected) {
+	if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE) {
 		dprintk("%s: extent array size mismatch: %u/%u\n",
-			__func__, len, expected);
+			__func__, len, nr_iomaps);
 		return -EINVAL;
 	}
 
--b.

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/4] nfs/blocklayout: add SCSI layout support
  2016-03-17 21:01       ` J. Bruce Fields
@ 2016-03-18 14:18         ` Trond Myklebust
  0 siblings, 0 replies; 17+ messages in thread
From: Trond Myklebust @ 2016-03-18 14:18 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Christoph Hellwig, Linux NFS Mailing List

On Thu, Mar 17, 2016 at 5:01 PM, J. Bruce Fields <bfields@fieldses.org> wrote:
> On Tue, Mar 08, 2016 at 05:42:43PM -0500, Trond Myklebust wrote:
>> On Tue, Mar 8, 2016 at 5:07 PM, J. Bruce Fields <bfields@fieldses.org> wrote:
>> >
>> > Trond, OK if I take this through the nfsd tree?
>> >
>> > Or I'm OK with doing this however.
>>
>> I'm fine with that. I don't think there should be any conflicts with
>> what is queued up in my 'linux-next' branch.
>
> I forgot to ask--OK if I interpret that as an "acked-by" from you?

Sorry. Yes, that was my intention.

Acked-by: Trond Myklebust <trond.myklebust@primarydata.com>

>
> --b.
>
>>
>> >
>> > --b.
>> >
>> > On Fri, Mar 04, 2016 at 08:46:15PM +0100, Christoph Hellwig wrote:
>> > > This is a trivial extension to the block layout driver to support the
>> > > new SCSI layouts draft.  There are three changes:
>> > >
>> > > >  - device identification through the SCSI VPD page.  This allows us to
>> > >    directly use the udev generated persistent device names instead of
>> > >    requiring an expensive lookup by crawling every block device node
>> > >    in /dev and reading a signature for it.
>> > >  - use of SCSI persistent reservations to protect device access and
>> > > >    allow for robust fencing.  On the client side this just means
>> > >    registering and unregistering a server supplied key.
>> > > >  - an optimized LAYOUTCOMMIT payload that doesn't send unnecessary
>> > >    fields to the server.
>> > >
>> > > Signed-off-by: Christoph Hellwig <hch@lst.de>
>> > > ---
>> > >  fs/nfs/blocklayout/blocklayout.c |  59 ++++++++++++++--
>> > >  fs/nfs/blocklayout/blocklayout.h |  14 +++-
>> > >  fs/nfs/blocklayout/dev.c         | 144 ++++++++++++++++++++++++++++++++++++++-
>> > >  fs/nfs/blocklayout/extent_tree.c |  44 ++++++++----
>> > >  fs/nfs/blocklayout/rpc_pipefs.c  |   2 +-
>> > >  5 files changed, 238 insertions(+), 25 deletions(-)
>> > >
>> > > diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
>> > > index ddd0138..b27c409 100644
>> > > --- a/fs/nfs/blocklayout/blocklayout.c
>> > > +++ b/fs/nfs/blocklayout/blocklayout.c
>> > > @@ -446,8 +446,8 @@ static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
>> > >       kfree(bl);
>> > >  }
>> > >
>> > > -static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
>> > > -                                                gfp_t gfp_flags)
>> > > +static struct pnfs_layout_hdr *__bl_alloc_layout_hdr(struct inode *inode,
>> > > +             gfp_t gfp_flags, bool is_scsi_layout)
>> > >  {
>> > >       struct pnfs_block_layout *bl;
>> > >
>> > > @@ -460,9 +460,22 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
>> > >       bl->bl_ext_ro = RB_ROOT;
>> > >       spin_lock_init(&bl->bl_ext_lock);
>> > >
>> > > +     bl->bl_scsi_layout = is_scsi_layout;
>> > >       return &bl->bl_layout;
>> > >  }
>> > >
>> > > +static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
>> > > +                                                gfp_t gfp_flags)
>> > > +{
>> > > +     return __bl_alloc_layout_hdr(inode, gfp_flags, false);
>> > > +}
>> > > +
>> > > +static struct pnfs_layout_hdr *sl_alloc_layout_hdr(struct inode *inode,
>> > > +                                                gfp_t gfp_flags)
>> > > +{
>> > > +     return __bl_alloc_layout_hdr(inode, gfp_flags, true);
>> > > +}
>> > > +
>> > >  static void bl_free_lseg(struct pnfs_layout_segment *lseg)
>> > >  {
>> > >       dprintk("%s enter\n", __func__);
>> > > @@ -888,22 +901,53 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
>> > >       .sync                           = pnfs_generic_sync,
>> > >  };
>> > >
>> > > +static struct pnfs_layoutdriver_type scsilayout_type = {
>> > > +     .id                             = LAYOUT_SCSI,
>> > > +     .name                           = "LAYOUT_SCSI",
>> > > +     .owner                          = THIS_MODULE,
>> > > +     .flags                          = PNFS_LAYOUTRET_ON_SETATTR |
>> > > +                                       PNFS_READ_WHOLE_PAGE,
>> > > +     .read_pagelist                  = bl_read_pagelist,
>> > > +     .write_pagelist                 = bl_write_pagelist,
>> > > +     .alloc_layout_hdr               = sl_alloc_layout_hdr,
>> > > +     .free_layout_hdr                = bl_free_layout_hdr,
>> > > +     .alloc_lseg                     = bl_alloc_lseg,
>> > > +     .free_lseg                      = bl_free_lseg,
>> > > +     .return_range                   = bl_return_range,
>> > > +     .prepare_layoutcommit           = bl_prepare_layoutcommit,
>> > > +     .cleanup_layoutcommit           = bl_cleanup_layoutcommit,
>> > > +     .set_layoutdriver               = bl_set_layoutdriver,
>> > > +     .alloc_deviceid_node            = bl_alloc_deviceid_node,
>> > > +     .free_deviceid_node             = bl_free_deviceid_node,
>> > > +     .pg_read_ops                    = &bl_pg_read_ops,
>> > > +     .pg_write_ops                   = &bl_pg_write_ops,
>> > > +     .sync                           = pnfs_generic_sync,
>> > > +};
>> > > +
>> > > +
>> > >  static int __init nfs4blocklayout_init(void)
>> > >  {
>> > >       int ret;
>> > >
>> > >       dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
>> > >
>> > > -     ret = pnfs_register_layoutdriver(&blocklayout_type);
>> > > +     ret = bl_init_pipefs();
>> > >       if (ret)
>> > >               goto out;
>> > > -     ret = bl_init_pipefs();
>> > > +
>> > > +     ret = pnfs_register_layoutdriver(&blocklayout_type);
>> > >       if (ret)
>> > > -             goto out_unregister;
>> > > +             goto out_cleanup_pipe;
>> > > +
>> > > +     ret = pnfs_register_layoutdriver(&scsilayout_type);
>> > > +     if (ret)
>> > > +             goto out_unregister_block;
>> > >       return 0;
>> > >
>> > > -out_unregister:
>> > > +out_unregister_block:
>> > >       pnfs_unregister_layoutdriver(&blocklayout_type);
>> > > +out_cleanup_pipe:
>> > > +     bl_cleanup_pipefs();
>> > >  out:
>> > >       return ret;
>> > >  }
>> > > @@ -913,8 +957,9 @@ static void __exit nfs4blocklayout_exit(void)
>> > >       dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
>> > >              __func__);
>> > >
>> > > -     bl_cleanup_pipefs();
>> > > +     pnfs_unregister_layoutdriver(&scsilayout_type);
>> > >       pnfs_unregister_layoutdriver(&blocklayout_type);
>> > > +     bl_cleanup_pipefs();
>> > >  }
>> > >
>> > >  MODULE_ALIAS("nfs-layouttype4-3");
>> > > diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
>> > > index c556640..bc21205 100644
>> > > --- a/fs/nfs/blocklayout/blocklayout.h
>> > > +++ b/fs/nfs/blocklayout/blocklayout.h
>> > > @@ -55,7 +55,6 @@ struct pnfs_block_dev;
>> > >   */
>> > >  #define PNFS_BLOCK_UUID_LEN  128
>> > >
>> > > -
>> > >  struct pnfs_block_volume {
>> > >       enum pnfs_block_volume_type     type;
>> > >       union {
>> > > @@ -82,6 +81,13 @@ struct pnfs_block_volume {
>> > >                       u32             volumes_count;
>> > >                       u32             volumes[PNFS_BLOCK_MAX_DEVICES];
>> > >               } stripe;
>> > > +             struct {
>> > > +                     enum scsi_code_set              code_set;
>> > > +                     enum scsi_designator_type       designator_type;
>> > > +                     int                             designator_len;
>> > > +                     u8                              designator[256];
>> > > +                     u64                             pr_key;
>> > > +             } scsi;
>> > >       };
>> > >  };
>> > >
>> > > @@ -106,6 +112,9 @@ struct pnfs_block_dev {
>> > >       struct block_device             *bdev;
>> > >       u64                             disk_offset;
>> > >
>> > > +     u64                             pr_key;
>> > > +     bool                            pr_registered;
>> > > +
>> > >       bool (*map)(struct pnfs_block_dev *dev, u64 offset,
>> > >                       struct pnfs_block_dev_map *map);
>> > >  };
>> > > @@ -131,6 +140,7 @@ struct pnfs_block_layout {
>> > >       struct rb_root          bl_ext_rw;
>> > >       struct rb_root          bl_ext_ro;
>> > >       spinlock_t              bl_ext_lock;   /* Protects list manipulation */
>> > > +     bool                    bl_scsi_layout;
>> > >  };
>> > >
>> > >  static inline struct pnfs_block_layout *
>> > > @@ -182,6 +192,6 @@ void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
>> > >  dev_t bl_resolve_deviceid(struct nfs_server *server,
>> > >               struct pnfs_block_volume *b, gfp_t gfp_mask);
>> > >  int __init bl_init_pipefs(void);
>> > > -void __exit bl_cleanup_pipefs(void);
>> > > +void bl_cleanup_pipefs(void);
>> > >
>> > >  #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
>> > > diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
>> > > index a861bbd..31b0f6b 100644
>> > > --- a/fs/nfs/blocklayout/dev.c
>> > > +++ b/fs/nfs/blocklayout/dev.c
>> > > @@ -1,11 +1,12 @@
>> > >  /*
>> > > - * Copyright (c) 2014 Christoph Hellwig.
>> > > + * Copyright (c) 2014-2016 Christoph Hellwig.
>> > >   */
>> > >  #include <linux/sunrpc/svc.h>
>> > >  #include <linux/blkdev.h>
>> > >  #include <linux/nfs4.h>
>> > >  #include <linux/nfs_fs.h>
>> > >  #include <linux/nfs_xdr.h>
>> > > +#include <linux/pr.h>
>> > >
>> > >  #include "blocklayout.h"
>> > >
>> > > @@ -21,6 +22,17 @@ bl_free_device(struct pnfs_block_dev *dev)
>> > >                       bl_free_device(&dev->children[i]);
>> > >               kfree(dev->children);
>> > >       } else {
>> > > +             if (dev->pr_registered) {
>> > > +                     const struct pr_ops *ops =
>> > > +                             dev->bdev->bd_disk->fops->pr_ops;
>> > > +                     int error;
>> > > +
>> > > +                     error = ops->pr_register(dev->bdev, dev->pr_key, 0,
>> > > +                             false);
>> > > +                     if (error)
>> > > +                             pr_err("failed to unregister PR key.\n");
>> > > +             }
>> > > +
>> > >               if (dev->bdev)
>> > >                       blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE);
>> > >       }
>> > > @@ -113,6 +125,24 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
>> > >               for (i = 0; i < b->stripe.volumes_count; i++)
>> > >                       b->stripe.volumes[i] = be32_to_cpup(p++);
>> > >               break;
>> > > +     case PNFS_BLOCK_VOLUME_SCSI:
>> > > +             p = xdr_inline_decode(xdr, 4 + 4 + 4);
>> > > +             if (!p)
>> > > +                     return -EIO;
>> > > +             b->scsi.code_set = be32_to_cpup(p++);
>> > > +             b->scsi.designator_type = be32_to_cpup(p++);
>> > > +             b->scsi.designator_len = be32_to_cpup(p++);
>> > > +             p = xdr_inline_decode(xdr, b->scsi.designator_len);
>> > > +             if (!p)
>> > > +                     return -EIO;
>> > > +             if (b->scsi.designator_len > 256)
>> > > +                     return -EIO;
>> > > +             memcpy(&b->scsi.designator, p, b->scsi.designator_len);
>> > > +             p = xdr_inline_decode(xdr, 8);
>> > > +             if (!p)
>> > > +                     return -EIO;
>> > > +             p = xdr_decode_hyper(p, &b->scsi.pr_key);
>> > > +             break;
>> > >       default:
>> > >               dprintk("unknown volume type!\n");
>> > >               return -EIO;
>> > > @@ -216,6 +246,116 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
>> > >       return 0;
>> > >  }
>> > >
>> > > +static bool
>> > > +bl_validate_designator(struct pnfs_block_volume *v)
>> > > +{
>> > > +     switch (v->scsi.designator_type) {
>> > > +     case PS_DESIGNATOR_EUI64:
>> > > +             if (v->scsi.code_set != PS_CODE_SET_BINARY)
>> > > +                     return false;
>> > > +
>> > > +             if (v->scsi.designator_len != 8 &&
>> > > +                 v->scsi.designator_len != 10 &&
>> > > +                 v->scsi.designator_len != 16)
>> > > +                     return false;
>> > > +
>> > > +             return true;
>> > > +     case PS_DESIGNATOR_NAA:
>> > > +             if (v->scsi.code_set != PS_CODE_SET_BINARY)
>> > > +                     return false;
>> > > +
>> > > +             if (v->scsi.designator_len != 8 &&
>> > > +                 v->scsi.designator_len != 16)
>> > > +                     return false;
>> > > +
>> > > +             return true;
>> > > +     case PS_DESIGNATOR_T10:
>> > > +     case PS_DESIGNATOR_NAME:
>> > > +             pr_err("pNFS: unsupported designator "
>> > > +                     "(code set %d, type %d, len %d.\n",
>> > > +                     v->scsi.code_set,
>> > > +                     v->scsi.designator_type,
>> > > +                     v->scsi.designator_len);
>> > > +             return false;
>> > > +     default:
>> > > +             pr_err("pNFS: invalid designator "
>> > > +                     "(code set %d, type %d, len %d.\n",
>> > > +                     v->scsi.code_set,
>> > > +                     v->scsi.designator_type,
>> > > +                     v->scsi.designator_len);
>> > > +             return false;
>> > > +     }
>> > > +}
>> > > +
>> > > +static int
>> > > +bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
>> > > +             struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
>> > > +{
>> > > +     struct pnfs_block_volume *v = &volumes[idx];
>> > > +     const struct pr_ops *ops;
>> > > +     const char *devname;
>> > > +     int error;
>> > > +
>> > > +     if (!bl_validate_designator(v))
>> > > +             return -EINVAL;
>> > > +
>> > > +     switch (v->scsi.designator_len) {
>> > > +     case 8:
>> > > +             devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%8phN",
>> > > +                             v->scsi.designator);
>> > > +             break;
>> > > +     case 12:
>> > > +             devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%12phN",
>> > > +                             v->scsi.designator);
>> > > +             break;
>> > > +     case 16:
>> > > +             devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%16phN",
>> > > +                             v->scsi.designator);
>> > > +             break;
>> > > +     default:
>> > > +             return -EINVAL;
>> > > +     }
>> > > +
>> > > +     d->bdev = blkdev_get_by_path(devname, FMODE_READ, NULL);
>> > > +     if (IS_ERR(d->bdev)) {
>> > > +             pr_warn("pNFS: failed to open device %s (%ld)\n",
>> > > +                     devname, PTR_ERR(d->bdev));
>> > > +             kfree(devname);
>> > > +             return PTR_ERR(d->bdev);
>> > > +     }
>> > > +
>> > > +     kfree(devname);
>> > > +
>> > > +     d->len = i_size_read(d->bdev->bd_inode);
>> > > +     d->map = bl_map_simple;
>> > > +     d->pr_key = v->scsi.pr_key;
>> > > +
>> > > +     pr_info("pNFS: using block device %s (reservation key 0x%llx)\n",
>> > > +             d->bdev->bd_disk->disk_name, d->pr_key);
>> > > +
>> > > +     ops = d->bdev->bd_disk->fops->pr_ops;
>> > > +     if (!ops) {
>> > > +             pr_err("pNFS: block device %s does not support reservations.",
>> > > +                             d->bdev->bd_disk->disk_name);
>> > > +             error = -EINVAL;
>> > > +             goto out_blkdev_put;
>> > > +     }
>> > > +
>> > > +     error = ops->pr_register(d->bdev, 0, d->pr_key, true);
>> > > +     if (error) {
>> > > +             pr_err("pNFS: failed to register key for block device %s.",
>> > > +                             d->bdev->bd_disk->disk_name);
>> > > +             goto out_blkdev_put;
>> > > +     }
>> > > +
>> > > +     d->pr_registered = true;
>> > > +     return 0;
>> > > +
>> > > +out_blkdev_put:
>> > > +     blkdev_put(d->bdev, FMODE_READ);
>> > > +     return error;
>> > > +}
>> > > +
>> > >  static int
>> > >  bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
>> > >               struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
>> > > @@ -303,6 +443,8 @@ bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
>> > >               return bl_parse_concat(server, d, volumes, idx, gfp_mask);
>> > >       case PNFS_BLOCK_VOLUME_STRIPE:
>> > >               return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
>> > > +     case PNFS_BLOCK_VOLUME_SCSI:
>> > > +             return bl_parse_scsi(server, d, volumes, idx, gfp_mask);
>> > >       default:
>> > >               dprintk("unsupported volume type: %d\n", volumes[idx].type);
>> > >               return -EIO;
>> > > diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
>> > > index c59a59c..df366fc 100644
>> > > --- a/fs/nfs/blocklayout/extent_tree.c
>> > > +++ b/fs/nfs/blocklayout/extent_tree.c
>> > > @@ -1,5 +1,5 @@
>> > >  /*
>> > > - * Copyright (c) 2014 Christoph Hellwig.
>> > > + * Copyright (c) 2014-2016 Christoph Hellwig.
>> > >   */
>> > >
>> > >  #include <linux/vmalloc.h>
>> > > @@ -462,10 +462,12 @@ out:
>> > >       return err;
>> > >  }
>> > >
>> > > -static size_t ext_tree_layoutupdate_size(size_t count)
>> > > +static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout *bl, size_t count)
>> > >  {
>> > > -     return sizeof(__be32) /* number of entries */ +
>> > > -             PNFS_BLOCK_EXTENT_SIZE * count;
>> > > +     if (bl->bl_scsi_layout)
>> > > +             return sizeof(__be32) + PNFS_SCSI_RANGE_SIZE * count;
>> > > +     else
>> > > +             return sizeof(__be32) + PNFS_BLOCK_EXTENT_SIZE * count;
>> > >  }
>> > >
>> > >  static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
>> > > @@ -482,6 +484,23 @@ static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
>> > >       }
>> > >  }
>> > >
>> > > +static __be32 *encode_block_extent(struct pnfs_block_extent *be, __be32 *p)
>> > > +{
>> > > +     p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
>> > > +                     NFS4_DEVICEID4_SIZE);
>> > > +     p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
>> > > +     p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
>> > > +     p = xdr_encode_hyper(p, 0LL);
>> > > +     *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
>> > > +     return p;
>> > > +}
>> > > +
>> > > +static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
>> > > +{
>> > > +     p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
>> > > +     return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
>> > > +}
>> > > +
>> > >  static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
>> > >               size_t buffer_size, size_t *count)
>> > >  {
>> > > @@ -495,19 +514,16 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
>> > >                       continue;
>> > >
>> > >               (*count)++;
>> > > -             if (ext_tree_layoutupdate_size(*count) > buffer_size) {
>> > > +             if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) {
>> > >                       /* keep counting.. */
>> > >                       ret = -ENOSPC;
>> > >                       continue;
>> > >               }
>> > >
>> > > -             p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
>> > > -                             NFS4_DEVICEID4_SIZE);
>> > > -             p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
>> > > -             p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
>> > > -             p = xdr_encode_hyper(p, 0LL);
>> > > -             *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
>> > > -
>> > > +             if (bl->bl_scsi_layout)
>> > > +                     p = encode_scsi_range(be, p);
>> > > +             else
>> > > +                     p = encode_block_extent(be, p);
>> > >               be->be_tag = EXTENT_COMMITTING;
>> > >       }
>> > >       spin_unlock(&bl->bl_ext_lock);
>> > > @@ -536,7 +552,7 @@ retry:
>> > >       if (unlikely(ret)) {
>> > >               ext_tree_free_commitdata(arg, buffer_size);
>> > >
>> > > -             buffer_size = ext_tree_layoutupdate_size(count);
>> > > +             buffer_size = ext_tree_layoutupdate_size(bl, count);
>> > >               count = 0;
>> > >
>> > >               arg->layoutupdate_pages =
>> > > @@ -555,7 +571,7 @@ retry:
>> > >       }
>> > >
>> > >       *start_p = cpu_to_be32(count);
>> > > -     arg->layoutupdate_len = ext_tree_layoutupdate_size(count);
>> > > +     arg->layoutupdate_len = ext_tree_layoutupdate_size(bl, count);
>> > >
>> > >       if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
>> > >               void *p = start_p, *end = p + arg->layoutupdate_len;
>> > > diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
>> > > index dbe5839..9fb067a6 100644
>> > > --- a/fs/nfs/blocklayout/rpc_pipefs.c
>> > > +++ b/fs/nfs/blocklayout/rpc_pipefs.c
>> > > @@ -281,7 +281,7 @@ out:
>> > >       return ret;
>> > >  }
>> > >
>> > > -void __exit bl_cleanup_pipefs(void)
>> > > +void bl_cleanup_pipefs(void)
>> > >  {
>> > >       rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
>> > >       unregister_pernet_subsys(&nfs4blocklayout_net_ops);
>> > > --
>> > > 2.1.4

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 4/4] nfsd: add SCSI layout support
  2016-03-17 21:52   ` J. Bruce Fields
@ 2016-03-22 18:56     ` J. Bruce Fields
  0 siblings, 0 replies; 17+ messages in thread
From: J. Bruce Fields @ 2016-03-22 18:56 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: trond.myklebust, linux-nfs

On Thu, Mar 17, 2016 at 05:52:18PM -0400, J. Bruce Fields wrote:
> On Fri, Mar 04, 2016 at 08:46:17PM +0100, Christoph Hellwig wrote:
> > +int
> > +nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
> > +		u32 block_size)
> > +{
> > +	struct iomap *iomaps;
> > +	u32 nr_iomaps, expected, i;
> > +
> > +	if (len < sizeof(u32)) {
> > +		dprintk("%s: extent array too small: %u\n", __func__, len);
> > +		return -EINVAL;
> > +	}
> > +
> > +	nr_iomaps = be32_to_cpup(p++);
> > +	expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE;
> > +	if (len != expected) {
> 
> You could add any multiple of 2^32/PNFS_SCSI_RANGE_SIZE to nr_iomaps and
> still pass this check.  Then you'd probably fail the following kcalloc,
> but best to be paranoid if this is from-the-wire data.
> 
> Maybe something like this?  (Untested)

OK, I've added this, and also did some minor edits on your patches
(comment typos, split out one bit of code movement into a separate
patch).  Results in

	git://linux-nfs.org/~bfields/linux.git for-next

I'll send that along soon.

--b.

> 
> diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
> index ca1883668810..6c3b316f932e 100644
> --- a/fs/nfsd/blocklayoutxdr.c
> +++ b/fs/nfsd/blocklayoutxdr.c
> @@ -105,18 +105,22 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
>  		u32 block_size)
>  {
>  	struct iomap *iomaps;
> -	u32 nr_iomaps, expected, i;
> +	u32 nr_iomaps, i;
>  
>  	if (len < sizeof(u32)) {
>  		dprintk("%s: extent array too small: %u\n", __func__, len);
>  		return -EINVAL;
>  	}
> +	len -= sizeof(u32);
> +	if (len % PNFS_BLOCK_EXTENT_SIZE) {
> +		dprintk("%s: extent array invalid: %u\n", __func__, len);
> +		return -EINVAL;
> +	}
>  
>  	nr_iomaps = be32_to_cpup(p++);
> -	expected = sizeof(__be32) + nr_iomaps * PNFS_BLOCK_EXTENT_SIZE;
> -	if (len != expected) {
> +	if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE) {
>  		dprintk("%s: extent array size mismatch: %u/%u\n",
> -			__func__, len, expected);
> +			__func__, len, nr_iomaps);
>  		return -EINVAL;
>  	}
>  
> --b.

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2016-03-22 18:56 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-03-04 19:46 pNFS SCSI layout support V2 Christoph Hellwig
2016-03-04 19:46 ` [PATCH 1/4] nfs4.h: add SCSI layout defintions Christoph Hellwig
2016-03-04 19:46 ` [PATCH 2/4] nfs/blocklayout: add SCSI layout support Christoph Hellwig
2016-03-08 22:07   ` J. Bruce Fields
2016-03-08 22:42     ` Trond Myklebust
2016-03-17 21:01       ` J. Bruce Fields
2016-03-18 14:18         ` Trond Myklebust
2016-03-08 22:09   ` J. Bruce Fields
2016-03-04 19:46 ` [PATCH 3/4] nfsd: add a new config option for the block layout driver Christoph Hellwig
2016-03-04 19:46 ` [PATCH 4/4] nfsd: add SCSI layout support Christoph Hellwig
2016-03-08 22:15   ` J. Bruce Fields
2016-03-09 14:48     ` Christoph Hellwig
2016-03-10 22:26   ` J. Bruce Fields
2016-03-11  9:24     ` Christoph Hellwig
2016-03-11 22:52   ` J. Bruce Fields
2016-03-17 21:52   ` J. Bruce Fields
2016-03-22 18:56     ` J. Bruce Fields

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.