All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/3] pNFS generic device ID cache
@ 2010-04-16 15:52 andros
  2010-04-16 15:52 ` [PATCH 1/3] SQUASHME pnfs_submit: " andros
  0 siblings, 1 reply; 11+ messages in thread
From: andros @ 2010-04-16 15:52 UTC (permalink / raw)
  To: pnfs; +Cc: linux-nfs

This patch set implements a shared RCU device ID cache servicing multiple
mounts of a single layout type per meta data server (struct nfs_client).

0001-pnfs_submit-generic-device-ID-cache.patch
0002-pnfs_submit-fix-multiple-mount-set_pnfs_layoutdriver.patch
0003-pnfs-submit-file-layout-driver-generic-device-ID-cac.patch

These patches apply to the 2.6.34-rc3 pnfs branch below the following patch
which is the head of the revert patches - otherwise known as the first patch
not included in the pnfs file layout driver submission.

9a75b7356ca22b10903b507383646201cdcc9020

pnfs: filelayout: CB_NOTIFY_DEVICE support

    This reverts commit 933b55c15ed2b7fd89d8e0342bd5b2825726201a
    "SQUASHME pnfs_submit: remove filelayout CB_NOTIFY_DEVICE support"

Testing:

CONFIG_NFS_V4_1 set:

NFSv4.1/pNFS mounts:
Connectathon tests pass against GFS2/pNFS with a single AUTH_SYS mount, a double
AUTH_SYS mount, and an AUTH_SYS and AUTH_GSS/KRB5 mount (which creates
two superblocks under a struct nfs_client and both share the device id cache).

NFSv4.0 mount;
Connectathon tests pass

Did not test with multiple device ID's. I will create a mulitple device ID
test with the pynfs file layout server.

CONFIG_NFS_V4_1 not set:

NFSv4.0 mount: Connectathon tests pass.


-->Andy


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache
  2010-04-16 15:52 [PATCH 0/3] pNFS generic device ID cache andros
@ 2010-04-16 15:52 ` andros
  2010-04-16 15:52   ` [PATCH 2/3] SQUASHME pnfs_submit: fix multiple mount set_pnfs_layoutdriver andros
  2010-04-16 16:04   ` [PATCH 1/3] SQUASHME pnfs_submit: " William A. (Andy) Adamson
  0 siblings, 2 replies; 11+ messages in thread
From: andros @ 2010-04-16 15:52 UTC (permalink / raw)
  To: pnfs-Xh+NVF5n0LLYtjvyW6yDsg; +Cc: linux-nfs, Andy Adamson

From: Andy Adamson <andros@netapp.com>

A shared RCU device ID cache servicing multiple mounts of a single layout type
per meta data server (struct nfs_client).

Device IDs of type deviceid4 are required by all layout types, long lived and
read at each I/O.  They are added to the deviceid cache at first reference by
a layout via GETDEVICEINFO and (currently) are only removed at umount.

Reference count the device ID cache for each mounted file system
in the initialize_mountpoint layoutdriver_io_operation.

Dereference the device id cache on file system in the uninitialize_mountpoint
layoutdriver_io_operation called at umount

Signed-off-by: Andy Adamson <andros@netapp.com>
---
 fs/nfs/pnfs.c             |  119 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/nfs4_pnfs.h |   27 ++++++++++
 include/linux/nfs_fs_sb.h |    1 +
 3 files changed, 147 insertions(+), 0 deletions(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 91572aa..8492aef 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -45,6 +45,7 @@
 #include <linux/nfs4.h>
 #include <linux/pnfs_xdr.h>
 #include <linux/nfs4_pnfs.h>
+#include <linux/rculist.h>
 
 #include "internal.h"
 #include "nfs4_fs.h"
@@ -2296,3 +2297,121 @@ struct pnfs_client_operations pnfs_ops = {
 
 EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
 EXPORT_SYMBOL(pnfs_register_layoutdriver);
+
+
+/* Device ID cache. Supports one layout type per struct nfs_client */
+int
+nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
+			 void (*free_callback)(struct rcu_head *))
+{
+	struct nfs4_deviceid_cache *c;
+
+	c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL);
+	if (!c)
+		return -ENOMEM;
+	spin_lock(&clp->cl_lock);
+	if (clp->cl_devid_cache != NULL) {
+		kref_get(&clp->cl_devid_cache->dc_kref);
+		spin_unlock(&clp->cl_lock);
+		dprintk("%s [kref [%d]]\n", __func__,
+			atomic_read(&clp->cl_devid_cache->dc_kref.refcount));
+		kfree(c);
+	} else {
+		spin_lock_init(&c->dc_lock);
+		INIT_LIST_HEAD(&c->dc_deviceids);
+		kref_init(&c->dc_kref);
+		c->dc_free_callback = free_callback;
+		c->dc_clp = clp;
+		clp->cl_devid_cache = c;
+		spin_unlock(&clp->cl_lock);
+		dprintk("%s [new]\n", __func__);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
+
+void
+nfs4_init_deviceid_node(struct nfs4_deviceid *d)
+{
+	INIT_LIST_HEAD(&d->de_node);
+	INIT_RCU_HEAD(&d->de_rcu);
+}
+EXPORT_SYMBOL(nfs4_init_deviceid_node);
+
+struct nfs4_deviceid *
+nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id)
+{
+	struct nfs4_deviceid *d;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
+		if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) {
+			rcu_read_unlock();
+			return d;
+		}
+	}
+	rcu_read_unlock();
+	return NULL;
+}
+EXPORT_SYMBOL(nfs4_find_deviceid);
+
+/*
+ * Add or kref_get a deviceid.
+ * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
+ */
+void
+nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new)
+{
+	struct nfs4_deviceid *d;
+
+	spin_lock(&c->dc_lock);
+	list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
+		if (!memcmp(&d->de_id, &new->de_id, NFS4_PNFS_DEVICEID4_SIZE)) {
+			spin_unlock(&c->dc_lock);
+			dprintk("%s [discard]\n", __func__);
+			c->dc_free_callback(&new->de_rcu);
+		}
+	}
+	list_add_rcu(&new->de_node, &c->dc_deviceids);
+	spin_unlock(&c->dc_lock);
+	dprintk("%s [new]\n", __func__);
+}
+EXPORT_SYMBOL(nfs4_add_deviceid);
+
+static int
+nfs4_remove_deviceid(struct nfs4_deviceid_cache *c)
+{
+	struct nfs4_deviceid *d;
+
+	spin_lock(&c->dc_lock);
+	list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
+		list_del_rcu(&d->de_node);
+		spin_unlock(&c->dc_lock);
+		synchronize_rcu();
+		c->dc_free_callback(&d->de_rcu);
+		return 1;
+	}
+	spin_unlock(&c->dc_lock);
+	return 0;
+}
+
+static void
+nfs4_free_deviceid_cache(struct kref *kref)
+{
+	struct nfs4_deviceid_cache *cache =
+		container_of(kref, struct nfs4_deviceid_cache, dc_kref);
+	int more = 1;
+
+	while (more)
+		more = nfs4_remove_deviceid(cache);
+	cache->dc_clp->cl_devid_cache = NULL;
+	kfree(cache);
+}
+
+void
+nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *c)
+{
+	dprintk("%s [%d]\n", __func__, atomic_read(&c->dc_kref.refcount));
+	kref_put(&c->dc_kref, nfs4_free_deviceid_cache);
+}
+EXPORT_SYMBOL(nfs4_put_deviceid_cache);
diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
index 1d521f4..2a88a06 100644
--- a/include/linux/nfs4_pnfs.h
+++ b/include/linux/nfs4_pnfs.h
@@ -281,6 +281,33 @@ struct pnfs_devicelist {
 	struct pnfs_deviceid	dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
 };
 
+/*
+ * Device ID RCU cache. A device ID is unique per client ID and layout type.
+ */
+struct nfs4_deviceid_cache {
+	spinlock_t		dc_lock;
+	struct list_head	dc_deviceids;
+	struct kref		dc_kref;
+	void			(*dc_free_callback)(struct rcu_head *);
+	struct nfs_client	*dc_clp;
+};
+
+/* Device ID cache node */
+struct nfs4_deviceid {
+	struct list_head	de_node;
+	struct rcu_head		de_rcu;
+	struct pnfs_deviceid	de_id;
+};
+
+extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
+				void (*free_callback)(struct rcu_head *));
+extern void nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *);
+extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
+extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_deviceid_cache *,
+				struct pnfs_deviceid *);
+extern void nfs4_add_deviceid(struct nfs4_deviceid_cache *,
+				struct nfs4_deviceid *);
+
 /* pNFS client callback functions.
  * These operations allow the layout driver to access pNFS client
  * specific information or call pNFS client->server operations.
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 8522461..ef2e18e 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -87,6 +87,7 @@ struct nfs_client {
 	u32			cl_exchange_flags;
 	struct nfs4_session	*cl_session; 	/* sharred session */
 	struct list_head	cl_lo_inodes;	/* Inodes having layouts */
+	struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
 #endif /* CONFIG_NFS_V4_1 */
 
 #ifdef CONFIG_NFS_FSCACHE
-- 
1.6.6


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 2/3] SQUASHME pnfs_submit: fix multiple mount set_pnfs_layoutdriver
  2010-04-16 15:52 ` [PATCH 1/3] SQUASHME pnfs_submit: " andros
@ 2010-04-16 15:52   ` andros
  2010-04-16 15:52     ` [PATCH 3/3] SQUASHME pnfs-submit: file layout driver generic device ID cache andros
  2010-04-16 16:04   ` [PATCH 1/3] SQUASHME pnfs_submit: " William A. (Andy) Adamson
  1 sibling, 1 reply; 11+ messages in thread
From: andros @ 2010-04-16 15:52 UTC (permalink / raw)
  To: pnfs-Xh+NVF5n0LLYtjvyW6yDsg; +Cc: linux-nfs, Andy Adamson

From: Andy Adamson <andros@netapp.com>

The same struct nfs_server can enter set_pnfs_layoutdriver for mounts that
share a super block.  Don't initialize a pnfs mountpoint more than once.

Don't set the pnfs_curr_ld until the pnfs mountpoint initialization succeeds

Signed-off-by: Andy Adamson <andros@netapp.com>
---
 fs/nfs/pnfs.c |   15 ++++++++++-----
 1 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 8492aef..1d903fd 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -215,20 +215,25 @@ set_pnfs_layoutdriver(struct super_block *sb, struct nfs_fh *fh, u32 id)
 	struct pnfs_mount_type *mt;
 	struct nfs_server *server = NFS_SB(sb);
 
+	if (server->pnfs_curr_ld)
+		return;
+
 	if (id > 0 && find_pnfs(id, &mod)) {
-		dprintk("%s: Setting pNFS module\n", __func__);
-		server->pnfs_curr_ld = mod->pnfs_ld_type;
-		mt = server->pnfs_curr_ld->ld_io_ops->initialize_mountpoint(
+		mt = mod->pnfs_ld_type->ld_io_ops->initialize_mountpoint(
 			sb, fh);
 		if (!mt) {
 			printk(KERN_ERR "%s: Error initializing mount point "
 			       "for layout driver %u. ", __func__, id);
 			goto out_err;
 		}
-		/* Layout driver succeeded in initializing mountpoint */
+		/*
+		 * Layout driver succeeded in initializing mountpoint
+		 * and has taken a reference on the nfs_client cl_devid_cache
+		 */
+		server->pnfs_curr_ld = mod->pnfs_ld_type;
 		server->pnfs_mountid = mt;
-		/* Set the rpc_ops */
 		server->nfs_client->rpc_ops = &pnfs_v4_clientops;
+		dprintk("%s: pNFS module for %u set\n", __func__, id);
 		return;
 	}
 
-- 
1.6.6


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 3/3] SQUASHME pnfs-submit: file layout driver generic device ID cache
  2010-04-16 15:52   ` [PATCH 2/3] SQUASHME pnfs_submit: fix multiple mount set_pnfs_layoutdriver andros
@ 2010-04-16 15:52     ` andros
  0 siblings, 0 replies; 11+ messages in thread
From: andros @ 2010-04-16 15:52 UTC (permalink / raw)
  To: pnfs-Xh+NVF5n0LLYtjvyW6yDsg; +Cc: linux-nfs, Andy Adamson

From: Andy Adamson <andros@netapp.com>

Replace the per superblock deviceid cache with the generic deviceid cache.

Embed struct nfs4_deviceid into struct nfs4_file_layout_dsaddr, the file layout
specific deviceid structure.  Provide a free_deviceid_callback.

Signed-off-by: Andy Adamson <andros@netapp.com>
---
 fs/nfs/client.c            |    1 +
 fs/nfs/nfs4filelayout.c    |   31 +++-----
 fs/nfs/nfs4filelayout.h    |   12 +--
 fs/nfs/nfs4filelayoutdev.c |  190 ++++++++------------------------------------
 4 files changed, 51 insertions(+), 183 deletions(-)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index e1d1646..82775b7 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -38,6 +38,7 @@
 #include <net/ipv6.h>
 #include <linux/nfs_xdr.h>
 #include <linux/sunrpc/bc_xprt.h>
+#include <linux/nfs4_pnfs.h>
 
 #include <asm/system.h>
 
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 0530b59..c155586 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -76,17 +76,11 @@ filelayout_initialize_mountpoint(struct super_block *sb, struct nfs_fh *fh)
 {
 	struct filelayout_mount_type *fl_mt;
 	struct pnfs_mount_type *mt;
-	int status;
 
 	fl_mt = kmalloc(sizeof(struct filelayout_mount_type), GFP_KERNEL);
 	if (!fl_mt)
 		goto error_ret;
 
-	/* Initialize nfs4 file layout specific device list structure */
-	fl_mt->hlist = kmalloc(sizeof(struct nfs4_pnfs_dev_hlist), GFP_KERNEL);
-	if (!fl_mt->hlist)
-		goto cleanup_fl_mt;
-
 	mt = kmalloc(sizeof(struct pnfs_mount_type), GFP_KERNEL);
 	if (!mt)
 		goto cleanup_fl_mt;
@@ -94,11 +88,11 @@ filelayout_initialize_mountpoint(struct super_block *sb, struct nfs_fh *fh)
 	fl_mt->fl_sb = sb;
 	mt->mountid = (void *)fl_mt;
 
-	status = nfs4_pnfs_devlist_init(fl_mt->hlist);
-	if (status)
+	if (nfs4_alloc_init_deviceid_cache(NFS_SB(sb)->nfs_client,
+					   nfs4_fl_free_deviceid_callback))
 		goto cleanup_mt;
 
-	dprintk("%s: device list has been initialized successfully\n",
+	dprintk("%s: deviceid cache has been initialized successfully\n",
 		__func__);
 	return mt;
 
@@ -106,11 +100,10 @@ cleanup_mt: ;
 	kfree(mt);
 
 cleanup_fl_mt: ;
-	kfree(fl_mt->hlist);
 	kfree(fl_mt);
 
 error_ret: ;
-	printk(KERN_WARNING "%s: device list could not be initialized\n",
+	printk(KERN_WARNING "%s: deviceid cache could not be initialized\n",
 		__func__);
 
 	return NULL;
@@ -123,13 +116,14 @@ filelayout_uninitialize_mountpoint(struct pnfs_mount_type *mountid)
 {
 	struct filelayout_mount_type *fl_mt = NULL;
 
+	dprintk("--> %s\n", __func__);
 	if (mountid) {
-		fl_mt = (struct filelayout_mount_type *)mountid->mountid;
+		struct nfs4_deviceid_cache *cache;
 
-		if (fl_mt != NULL) {
-			nfs4_pnfs_devlist_destroy(fl_mt->hlist);
-			kfree(fl_mt);
-		}
+		fl_mt = (struct filelayout_mount_type *)mountid->mountid;
+		cache = NFS_SB(fl_mt->fl_sb)->nfs_client->cl_devid_cache;
+		nfs4_put_deviceid_cache(cache);
+		kfree(fl_mt);
 		kfree(mountid);
 	}
 	return 0;
@@ -381,8 +375,7 @@ filelayout_check_layout(struct pnfs_layout_type *lo,
 	struct nfs_server *nfss = NFS_SERVER(PNFS_INODE(lo));
 
 	dprintk("--> %s\n", __func__);
-	dsaddr = nfs4_pnfs_device_item_find(FILE_MT(PNFS_INODE(lo))->hlist,
-					     &fl->dev_id);
+	dsaddr = nfs4_pnfs_device_item_find(nfss->nfs_client, &fl->dev_id);
 	if (dsaddr == NULL) {
 		dsaddr = get_device_info(PNFS_INODE(lo), &fl->dev_id);
 		if (dsaddr == NULL) {
@@ -618,7 +611,7 @@ filelayout_commit(struct pnfs_layout_type *layoutid, int sync,
 	stripesz = filelayout_get_stripesize(layoutid);
 	dprintk("%s stripesize %Zd\n", __func__, stripesz);
 
-	dsaddr = nfs4_pnfs_device_item_find(FILE_MT(data->inode)->hlist,
+	dsaddr = nfs4_pnfs_device_item_find(NFS_SERVER(data->inode)->nfs_client,
 					     &nfslay->dev_id);
 	if (dsaddr == NULL) {
 		data->pdata.pnfs_error = -EIO;
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 12498a2..2cb05bd 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -43,8 +43,7 @@ struct nfs4_pnfs_ds {
 };
 
 struct nfs4_file_layout_dsaddr {
-	struct hlist_node	hash_node;   /* nfs4_pnfs_dev_hlist dev_list */
-	struct pnfs_deviceid	dev_id;
+	struct nfs4_deviceid	deviceid;
 	u32 			stripe_count;
 	u8			*stripe_indices;
 	u32			ds_num;
@@ -86,15 +85,13 @@ struct nfs4_filelayout {
 
 struct filelayout_mount_type {
 	struct super_block *fl_sb;
-	struct nfs4_pnfs_dev_hlist *hlist;
 };
 
 extern struct pnfs_client_operations *pnfs_callback_ops;
 
+extern void nfs4_fl_free_deviceid_callback(struct rcu_head *);
 extern void print_ds(struct nfs4_pnfs_ds *ds);
 char *deviceid_fmt(const struct pnfs_deviceid *dev_id);
-int  nfs4_pnfs_devlist_init(struct nfs4_pnfs_dev_hlist *hlist);
-void nfs4_pnfs_devlist_destroy(struct nfs4_pnfs_dev_hlist *hlist);
 int nfs4_pnfs_dserver_get(struct pnfs_layout_segment *lseg,
 			  loff_t offset,
 			  size_t count,
@@ -102,9 +99,8 @@ int nfs4_pnfs_dserver_get(struct pnfs_layout_segment *lseg,
 u32 filelayout_dserver_get_index(loff_t offset,
 				 struct nfs4_file_layout_dsaddr *di,
 				 struct nfs4_filelayout_segment *layout);
-struct nfs4_file_layout_dsaddr *
-nfs4_pnfs_device_item_find(struct nfs4_pnfs_dev_hlist *hlist,
-			   struct pnfs_deviceid *dev_id);
+extern struct nfs4_file_layout_dsaddr *
+nfs4_pnfs_device_item_find(struct nfs_client *, struct pnfs_deviceid *dev_id);
 struct nfs4_file_layout_dsaddr *
 get_device_info(struct inode *inode, struct pnfs_deviceid *dev_id);
 
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 813ddbb..411ffcb 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -45,6 +45,7 @@
 
 #include <linux/utsname.h>
 #include <linux/vmalloc.h>
+#include <linux/nfs4_pnfs.h>
 #include <linux/pnfs_xdr.h>
 #include "nfs4filelayout.h"
 #include "internal.h"
@@ -98,42 +99,6 @@ deviceid_fmt(const struct pnfs_deviceid *dev_id)
 	return buf;
 }
 
-unsigned long
-_deviceid_hash(const struct pnfs_deviceid *dev_id)
-{
-	unsigned char *cptr = (unsigned char *)dev_id->data;
-	unsigned int nbytes = NFS4_PNFS_DEVICEID4_SIZE;
-	u64 x = 0;
-
-	while (nbytes--) {
-		x *= 37;
-		x += *cptr++;
-	}
-	return x & NFS4_PNFS_DEV_HASH_MASK;
-}
-
-/* Assumes lock is held */
-static inline struct nfs4_file_layout_dsaddr *
-_device_lookup(struct nfs4_pnfs_dev_hlist *hlist,
-	       const struct pnfs_deviceid *dev_id)
-{
-	unsigned long      hash;
-	struct hlist_node *np;
-
-	dprintk("_device_lookup: dev_id=%s\n", deviceid_fmt(dev_id));
-
-	hash = _deviceid_hash(dev_id);
-
-	hlist_for_each(np, &hlist->dev_list[hash]) {
-		struct nfs4_file_layout_dsaddr *dsaddr;
-		dsaddr = hlist_entry(np, struct nfs4_file_layout_dsaddr,
-				  hash_node);
-		if (!memcmp(&dsaddr->dev_id, dev_id, NFS4_PNFS_DEVICEID4_SIZE))
-			return dsaddr;
-	}
-	return NULL;
-}
-
 /* nfs4_ds_cache_lock is held */
 static inline struct nfs4_pnfs_ds *
 _data_server_lookup(u32 ip_addr, u32 port)
@@ -152,22 +117,6 @@ _data_server_lookup(u32 ip_addr, u32 port)
 	return NULL;
 }
 
-
-/* Assumes lock is held */
-static inline void
-_device_add(struct nfs4_pnfs_dev_hlist *hlist,
-	    struct nfs4_file_layout_dsaddr *dsaddr)
-{
-	unsigned long      hash;
-
-	dprintk("_device_add: dev_id=%s ds_list:\n",
-		deviceid_fmt(&dsaddr->dev_id));
-	print_ds_list(dsaddr);
-
-	hash = _deviceid_hash(&dsaddr->dev_id);
-	hlist_add_head(&dsaddr->hash_node, &hlist->dev_list[hash]);
-}
-
 /* Create an rpc to the data server defined in 'dev_list' */
 static int
 nfs4_pnfs_ds_create(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
@@ -269,118 +218,47 @@ out_put:
 static void
 destroy_ds(struct nfs4_pnfs_ds *ds)
 {
+	dprintk("--> %s\n", __func__);
+	print_ds(ds);
+
 	if (ds->ds_clp)
 		nfs_put_client(ds->ds_clp);
 	kfree(ds);
 }
 
-/* Assumes lock is NOT held */
 static void
-nfs4_pnfs_device_destroy(struct nfs4_file_layout_dsaddr *dsaddr,
-			 struct nfs4_pnfs_dev_hlist *hlist)
+nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
 {
 	struct nfs4_pnfs_ds *ds;
-	LIST_HEAD(release);
 	int i;
 
-	if (!dsaddr)
-		return;
-
-	dprintk("%s: dev_id=%s\ndev_list:\n", __func__,
-		deviceid_fmt(&dsaddr->dev_id));
-	print_ds_list(dsaddr);
-
-	write_lock(&hlist->dev_lock);
-	hlist_del_init(&dsaddr->hash_node);
+	dprintk("%s: device id=%s\n", __func__,
+		deviceid_fmt(&dsaddr->deviceid.de_id));
 
 	for (i = 0; i < dsaddr->ds_num; i++) {
 		ds = dsaddr->ds_list[i];
 		if (ds != NULL) {
-			/* if we are last user - move to release list */
 			if (atomic_dec_and_lock(&ds->ds_count,
 						&nfs4_ds_cache_lock)) {
 				list_del_init(&ds->ds_node);
 				spin_unlock(&nfs4_ds_cache_lock);
-				list_add(&ds->ds_node, &release);
+				destroy_ds(ds);
 			}
 		}
 	}
-	write_unlock(&hlist->dev_lock);
-	while (!list_empty(&release)) {
-		ds = list_entry(release.next, struct nfs4_pnfs_ds, ds_node);
-		list_del(&ds->ds_node);
-		destroy_ds(ds);
-	}
+	kfree(dsaddr->stripe_indices);
 	kfree(dsaddr);
 }
 
-int
-nfs4_pnfs_devlist_init(struct nfs4_pnfs_dev_hlist *hlist)
-{
-	int i;
-
-	rwlock_init(&hlist->dev_lock);
-
-	for (i = 0; i < NFS4_PNFS_DEV_HASH_SIZE; i++) {
-		INIT_HLIST_HEAD(&hlist->dev_list[i]);
-	}
-
-	return 0;
-}
-
-/* De-alloc all devices for a mount point.  This is called in
- * nfs4_kill_super.
- */
 void
-nfs4_pnfs_devlist_destroy(struct nfs4_pnfs_dev_hlist *hlist)
-{
-	int i;
-
-	if (hlist == NULL)
-		return;
-
-	/* No lock held, as synchronization should occur at upper levels */
-	for (i = 0; i < NFS4_PNFS_DEV_HASH_SIZE; i++) {
-		struct hlist_node *np, *next;
-
-		hlist_for_each_safe(np, next, &hlist->dev_list[i]) {
-			struct nfs4_file_layout_dsaddr *dsaddr;
-			dsaddr = hlist_entry(np,
-					     struct nfs4_file_layout_dsaddr,
-					     hash_node);
-			/* nfs4_pnfs_device_destroy grabs hlist->dev_lock */
-			nfs4_pnfs_device_destroy(dsaddr, hlist);
-		}
-	}
-}
-
-/*
- * Add the device to the list of available devices for this mount point.
- * The * rpc client is created during first I/O.
- */
-static int
-nfs4_pnfs_device_add(struct filelayout_mount_type *mt,
-		     struct nfs4_file_layout_dsaddr *dsaddr)
+nfs4_fl_free_deviceid_callback(struct rcu_head *rcu)
 {
-	struct nfs4_file_layout_dsaddr *tmp_dsaddr;
-	struct nfs4_pnfs_dev_hlist *hlist = mt->hlist;
-
-	dprintk("nfs4_pnfs_device_add\n");
-
-	/* Write lock, do lookup again, and then add device */
-	write_lock(&hlist->dev_lock);
-	tmp_dsaddr = _device_lookup(hlist, &dsaddr->dev_id);
-	if (tmp_dsaddr == NULL)
-		_device_add(hlist, dsaddr);
-	write_unlock(&hlist->dev_lock);
-
-	/* Cleanup, if device was recently added */
-	if (tmp_dsaddr != NULL) {
-		dprintk(" device found, not adding (after creation)\n");
-		nfs4_pnfs_device_destroy(dsaddr, hlist);
-	}
+	struct nfs4_deviceid *device =
+		container_of(rcu, struct nfs4_deviceid, de_rcu);
+	struct nfs4_file_layout_dsaddr *dsaddr =
+		container_of(device, struct nfs4_file_layout_dsaddr, deviceid);
 
-	return 0;
+	nfs4_fl_free_deviceid(dsaddr);
 }
 
 static void
@@ -514,7 +392,8 @@ decode_device(struct inode *ino, struct pnfs_device *pdev)
 	dsaddr->stripe_count = cnt;
 	dsaddr->ds_num = num;
 
-	memcpy(&dsaddr->dev_id, &pdev->dev_id, NFS4_PNFS_DEVICEID4_SIZE);
+	memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id,
+	       NFS4_PNFS_DEVICEID4_SIZE);
 
 	/* Go back an read stripe indices */
 	p = indicesp;
@@ -553,19 +432,20 @@ decode_device(struct inode *ino, struct pnfs_device *pdev)
 			}
 		}
 	}
+	nfs4_init_deviceid_node(&dsaddr->deviceid);
+
 	return dsaddr;
 
 out_err_free:
-	nfs4_pnfs_device_destroy(dsaddr, FILE_MT(ino)->hlist);
+	nfs4_fl_free_deviceid(dsaddr);
 out_err:
 	dprintk("%s ERROR: returning NULL\n", __func__);
 	return NULL;
 }
 
-/* Decode the opaque device specified in 'dev'
- * and add it to the list of available devices for this
- * mount point.
- * Must at some point be followed up with nfs4_pnfs_device_destroy
+/*
+ * Decode the opaque device specified in 'dev'
+ * and add it to the list of available devices
  */
 static struct nfs4_file_layout_dsaddr*
 decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
@@ -574,14 +454,13 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
 
 	dsaddr = decode_device(inode, dev);
 	if (!dsaddr) {
-		printk(KERN_WARNING "%s: Could not decode device\n",
+		printk(KERN_WARNING "%s: Could not decode or add device\n",
 			__func__);
-		nfs4_pnfs_device_destroy(dsaddr, FILE_MT(inode)->hlist);
 		return NULL;
 	}
 
-	if (nfs4_pnfs_device_add(FILE_MT(inode), dsaddr))
-		return NULL;
+	nfs4_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache,
+			 &dsaddr->deviceid);
 
 	return dsaddr;
 }
@@ -660,16 +539,15 @@ out_free:
 }
 
 struct nfs4_file_layout_dsaddr *
-nfs4_pnfs_device_item_find(struct nfs4_pnfs_dev_hlist *hlist,
-			   struct pnfs_deviceid *dev_id)
+nfs4_pnfs_device_item_find(struct nfs_client *clp, struct pnfs_deviceid *id)
 {
-	struct nfs4_file_layout_dsaddr *dsaddr;
-
-	read_lock(&hlist->dev_lock);
-	dsaddr = _device_lookup(hlist, dev_id);
-	read_unlock(&hlist->dev_lock);
+	struct nfs4_deviceid *d;
 
-	return dsaddr;
+	d = nfs4_find_deviceid(clp->cl_devid_cache, id);
+	dprintk("%s device id (%s) nfs4_deviceid %p\n", __func__,
+		deviceid_fmt(id), d);
+	return (d == NULL) ? NULL :
+		container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
 }
 
 /* Want res = ((offset / layout->stripe_unit) % dsaddr->stripe_count)
@@ -707,7 +585,7 @@ nfs4_pnfs_dserver_get(struct pnfs_layout_segment *lseg,
 	if (!layout)
 		return 1;
 
-	dsaddr = nfs4_pnfs_device_item_find(FILE_MT(inode)->hlist,
+	dsaddr = nfs4_pnfs_device_item_find(NFS_SERVER(inode)->nfs_client,
 					    &layout->dev_id);
 	if (dsaddr == NULL)
 		return 1;
-- 
1.6.6


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache
  2010-04-16 15:52 ` [PATCH 1/3] SQUASHME pnfs_submit: " andros
  2010-04-16 15:52   ` [PATCH 2/3] SQUASHME pnfs_submit: fix multiple mount set_pnfs_layoutdriver andros
@ 2010-04-16 16:04   ` William A. (Andy) Adamson
       [not found]     ` <u2n89c397151004160904m9e862360xcaf0e187640b0177-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  1 sibling, 1 reply; 11+ messages in thread
From: William A. (Andy) Adamson @ 2010-04-16 16:04 UTC (permalink / raw)
  To: pnfs; +Cc: linux-nfs, Andy Adamson

On Fri, Apr 16, 2010 at 11:52 AM,  <andros@netapp.com> wrote:
> From: Andy Adamson <andros@netapp.com>
>
> A shared RCU device ID cache servicing multiple mounts of a single la=
yout type
> per meta data server (struct nfs_client).
>
> Device IDs of type deviceid4 are required by all layout types, long l=
ived and
> read at each I/O. =A0They are added to the deviceid cache at first re=
ference by
> a layout via GETDEVICEINFO and (currently) are only removed at umount=
=2E
>
> Reference count the device ID cache for each mounted file system
> in the initialize_mountpoint layoutdriver_io_operation.
>
> Dereference the device id cache on file system in the uninitialize_mo=
untpoint
> layoutdriver_io_operation called at umount
>
> Signed-off-by: Andy Adamson <andros@netapp.com>
> ---
> =A0fs/nfs/pnfs.c =A0 =A0 =A0 =A0 =A0 =A0 | =A0119 +++++++++++++++++++=
++++++++++++++++++++++++++
> =A0include/linux/nfs4_pnfs.h | =A0 27 ++++++++++
> =A0include/linux/nfs_fs_sb.h | =A0 =A01 +
> =A03 files changed, 147 insertions(+), 0 deletions(-)
>
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index 91572aa..8492aef 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -45,6 +45,7 @@
> =A0#include <linux/nfs4.h>
> =A0#include <linux/pnfs_xdr.h>
> =A0#include <linux/nfs4_pnfs.h>
> +#include <linux/rculist.h>
>
> =A0#include "internal.h"
> =A0#include "nfs4_fs.h"
> @@ -2296,3 +2297,121 @@ struct pnfs_client_operations pnfs_ops =3D {
>
> =A0EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
> =A0EXPORT_SYMBOL(pnfs_register_layoutdriver);
> +
> +
> +/* Device ID cache. Supports one layout type per struct nfs_client *=
/
> +int
> +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0void (*free_callback=
)(struct rcu_head *))
> +{
> + =A0 =A0 =A0 struct nfs4_deviceid_cache *c;
> +
> + =A0 =A0 =A0 c =3D kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_K=
ERNEL);
> + =A0 =A0 =A0 if (!c)
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 return -ENOMEM;
> + =A0 =A0 =A0 spin_lock(&clp->cl_lock);
> + =A0 =A0 =A0 if (clp->cl_devid_cache !=3D NULL) {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kref_get(&clp->cl_devid_cache->dc_kref)=
;
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&clp->cl_lock);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [kref [%d]]\n", __func__,
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 atomic_read(&clp->cl_de=
vid_cache->dc_kref.refcount));
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kfree(c);
> + =A0 =A0 =A0 } else {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_lock_init(&c->dc_lock);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 INIT_LIST_HEAD(&c->dc_deviceids);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kref_init(&c->dc_kref);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback =3D free_callback;
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_clp =3D clp;
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 clp->cl_devid_cache =3D c;
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&clp->cl_lock);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [new]\n", __func__);
> + =A0 =A0 =A0 }
> + =A0 =A0 =A0 return 0;
> +}
> +EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
> +
> +void
> +nfs4_init_deviceid_node(struct nfs4_deviceid *d)
> +{
> + =A0 =A0 =A0 INIT_LIST_HEAD(&d->de_node);
> + =A0 =A0 =A0 INIT_RCU_HEAD(&d->de_rcu);
> +}
> +EXPORT_SYMBOL(nfs4_init_deviceid_node);
> +
> +struct nfs4_deviceid *
> +nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_device=
id *id)
> +{
> + =A0 =A0 =A0 struct nfs4_deviceid *d;
> +
> + =A0 =A0 =A0 rcu_read_lock();
> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 if (!memcmp(&d->de_id, id, NFS4_PNFS_DE=
VICEID4_SIZE)) {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 rcu_read_unlock();
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 return d;
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 }
> + =A0 =A0 =A0 }
> + =A0 =A0 =A0 rcu_read_unlock();
> + =A0 =A0 =A0 return NULL;
> +}
> +EXPORT_SYMBOL(nfs4_find_deviceid);
> +
> +/*
> + * Add or kref_get a deviceid.
> + * GETDEVICEINFOs for same deviceid can race. If deviceid is found, =
discard new
> + */
> +void
> +nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_devicei=
d *new)
> +{
> + =A0 =A0 =A0 struct nfs4_deviceid *d;
> +
> + =A0 =A0 =A0 spin_lock(&c->dc_lock);
> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 if (!memcmp(&d->de_id, &new->de_id, NFS=
4_PNFS_DEVICEID4_SIZE)) {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&c->dc_lock=
);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [discard]\n=
", __func__);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback(&ne=
w->de_rcu);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 }
> + =A0 =A0 =A0 }
> + =A0 =A0 =A0 list_add_rcu(&new->de_node, &c->dc_deviceids);
> + =A0 =A0 =A0 spin_unlock(&c->dc_lock);
> + =A0 =A0 =A0 dprintk("%s [new]\n", __func__);
> +}
> +EXPORT_SYMBOL(nfs4_add_deviceid);
> +
> +static int
> +nfs4_remove_deviceid(struct nfs4_deviceid_cache *c)
> +{
> + =A0 =A0 =A0 struct nfs4_deviceid *d;
> +
> + =A0 =A0 =A0 spin_lock(&c->dc_lock);
> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 list_del_rcu(&d->de_node);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&c->dc_lock);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 synchronize_rcu();
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback(&d->de_rcu);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 return 1;
> + =A0 =A0 =A0 }
> + =A0 =A0 =A0 spin_unlock(&c->dc_lock);
> + =A0 =A0 =A0 return 0;
> +}
> +
> +static void
> +nfs4_free_deviceid_cache(struct kref *kref)
> +{
> + =A0 =A0 =A0 struct nfs4_deviceid_cache *cache =3D
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 container_of(kref, struct nfs4_deviceid=
_cache, dc_kref);
> + =A0 =A0 =A0 int more =3D 1;
> +
> + =A0 =A0 =A0 while (more)
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 more =3D nfs4_remove_deviceid(cache);
> + =A0 =A0 =A0 cache->dc_clp->cl_devid_cache =3D NULL;

Need to take the cl_lock around this assignment

spin_lock(&cache->dc_clp->cl_lock);
cache->dc_clp->cl_devid_cache =3D NULL
spin_unlock(&cache->dc_clp->cl_lock);


> + =A0 =A0 =A0 kfree(cache);
> +}
> +
> +void
> +nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *c)
> +{
> + =A0 =A0 =A0 dprintk("%s [%d]\n", __func__, atomic_read(&c->dc_kref.=
refcount));
> + =A0 =A0 =A0 kref_put(&c->dc_kref, nfs4_free_deviceid_cache);
> +}
> +EXPORT_SYMBOL(nfs4_put_deviceid_cache);
> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
> index 1d521f4..2a88a06 100644
> --- a/include/linux/nfs4_pnfs.h
> +++ b/include/linux/nfs4_pnfs.h
> @@ -281,6 +281,33 @@ struct pnfs_devicelist {
> =A0 =A0 =A0 =A0struct pnfs_deviceid =A0 =A0dev_id[NFS4_PNFS_GETDEVLIS=
T_MAXNUM];
> =A0};
>
> +/*
> + * Device ID RCU cache. A device ID is unique per client ID and layo=
ut type.
> + */
> +struct nfs4_deviceid_cache {
> + =A0 =A0 =A0 spinlock_t =A0 =A0 =A0 =A0 =A0 =A0 =A0dc_lock;
> + =A0 =A0 =A0 struct list_head =A0 =A0 =A0 =A0dc_deviceids;
> + =A0 =A0 =A0 struct kref =A0 =A0 =A0 =A0 =A0 =A0 dc_kref;
> + =A0 =A0 =A0 void =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0(*dc_free_c=
allback)(struct rcu_head *);
> + =A0 =A0 =A0 struct nfs_client =A0 =A0 =A0 *dc_clp;
> +};
> +
> +/* Device ID cache node */
> +struct nfs4_deviceid {
> + =A0 =A0 =A0 struct list_head =A0 =A0 =A0 =A0de_node;
> + =A0 =A0 =A0 struct rcu_head =A0 =A0 =A0 =A0 de_rcu;
> + =A0 =A0 =A0 struct pnfs_deviceid =A0 =A0de_id;
> +};
> +
> +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 void (*=
free_callback)(struct rcu_head *));
> +extern void nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *);
> +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
> +extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_deviceid=
_cache *,
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 struct =
pnfs_deviceid *);
> +extern void nfs4_add_deviceid(struct nfs4_deviceid_cache *,
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 struct =
nfs4_deviceid *);
> +
> =A0/* pNFS client callback functions.
> =A0* These operations allow the layout driver to access pNFS client
> =A0* specific information or call pNFS client->server operations.
> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
> index 8522461..ef2e18e 100644
> --- a/include/linux/nfs_fs_sb.h
> +++ b/include/linux/nfs_fs_sb.h
> @@ -87,6 +87,7 @@ struct nfs_client {
> =A0 =A0 =A0 =A0u32 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 cl_exchang=
e_flags;
> =A0 =A0 =A0 =A0struct nfs4_session =A0 =A0 *cl_session; =A0 =A0/* sha=
rred session */
> =A0 =A0 =A0 =A0struct list_head =A0 =A0 =A0 =A0cl_lo_inodes; =A0 /* I=
nodes having layouts */
> + =A0 =A0 =A0 struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS dev=
iceid cache */
> =A0#endif /* CONFIG_NFS_V4_1 */
>
> =A0#ifdef CONFIG_NFS_FSCACHE
> --
> 1.6.6
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" =
in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at =A0http://vger.kernel.org/majordomo-info.html
>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [pnfs] [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache
       [not found]     ` <u2n89c397151004160904m9e862360xcaf0e187640b0177-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2010-04-21  5:59       ` Benny Halevy
  2010-04-21 15:22         ` William A. (Andy) Adamson
  0 siblings, 1 reply; 11+ messages in thread
From: Benny Halevy @ 2010-04-21  5:59 UTC (permalink / raw)
  To: William A. (Andy) Adamson; +Cc: pnfs, Andy Adamson, linux-nfs

On Apr. 16, 2010, 19:04 +0300, "William A. (Andy) Adamson" <androsadamson@gmail.com> wrote:
> On Fri, Apr 16, 2010 at 11:52 AM,  <andros@netapp.com> wrote:
>> From: Andy Adamson <andros@netapp.com>
>>
>> A shared RCU device ID cache servicing multiple mounts of a single layout type
>> per meta data server (struct nfs_client).
>>
>> Device IDs of type deviceid4 are required by all layout types, long lived and
>> read at each I/O.  They are added to the deviceid cache at first reference by
>> a layout via GETDEVICEINFO and (currently) are only removed at umount.
>>
>> Reference count the device ID cache for each mounted file system
>> in the initialize_mountpoint layoutdriver_io_operation.
>>
>> Dereference the device id cache on file system in the uninitialize_mountpoint
>> layoutdriver_io_operation called at umount
>>
>> Signed-off-by: Andy Adamson <andros@netapp.com>
>> ---
>>  fs/nfs/pnfs.c             |  119 +++++++++++++++++++++++++++++++++++++++++++++
>>  include/linux/nfs4_pnfs.h |   27 ++++++++++
>>  include/linux/nfs_fs_sb.h |    1 +
>>  3 files changed, 147 insertions(+), 0 deletions(-)
>>
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index 91572aa..8492aef 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -45,6 +45,7 @@
>>  #include <linux/nfs4.h>
>>  #include <linux/pnfs_xdr.h>
>>  #include <linux/nfs4_pnfs.h>
>> +#include <linux/rculist.h>
>>
>>  #include "internal.h"
>>  #include "nfs4_fs.h"
>> @@ -2296,3 +2297,121 @@ struct pnfs_client_operations pnfs_ops = {
>>
>>  EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
>>  EXPORT_SYMBOL(pnfs_register_layoutdriver);
>> +
>> +
>> +/* Device ID cache. Supports one layout type per struct nfs_client */
>> +int
>> +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
>> +                        void (*free_callback)(struct rcu_head *))
>> +{
>> +       struct nfs4_deviceid_cache *c;
>> +
>> +       c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL);
>> +       if (!c)
>> +               return -ENOMEM;
>> +       spin_lock(&clp->cl_lock);
>> +       if (clp->cl_devid_cache != NULL) {
>> +               kref_get(&clp->cl_devid_cache->dc_kref);
>> +               spin_unlock(&clp->cl_lock);
>> +               dprintk("%s [kref [%d]]\n", __func__,
>> +                       atomic_read(&clp->cl_devid_cache->dc_kref.refcount));
>> +               kfree(c);
>> +       } else {
>> +               spin_lock_init(&c->dc_lock);
>> +               INIT_LIST_HEAD(&c->dc_deviceids);
>> +               kref_init(&c->dc_kref);
>> +               c->dc_free_callback = free_callback;
>> +               c->dc_clp = clp;
>> +               clp->cl_devid_cache = c;
>> +               spin_unlock(&clp->cl_lock);
>> +               dprintk("%s [new]\n", __func__);
>> +       }
>> +       return 0;
>> +}
>> +EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
>> +
>> +void
>> +nfs4_init_deviceid_node(struct nfs4_deviceid *d)
>> +{
>> +       INIT_LIST_HEAD(&d->de_node);
>> +       INIT_RCU_HEAD(&d->de_rcu);
>> +}
>> +EXPORT_SYMBOL(nfs4_init_deviceid_node);
>> +
>> +struct nfs4_deviceid *
>> +nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id)
>> +{
>> +       struct nfs4_deviceid *d;
>> +
>> +       rcu_read_lock();
>> +       list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>> +               if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) {
>> +                       rcu_read_unlock();
>> +                       return d;
>> +               }
>> +       }
>> +       rcu_read_unlock();

I hope this is worth the added complexity...

Out of curiosity, do you have a benchmark comparing the cost
of rcu_read_lock (and unlock) vs. spin_lock(&c->dc_lock)?

>> +       return NULL;
>> +}
>> +EXPORT_SYMBOL(nfs4_find_deviceid);
>> +
>> +/*
>> + * Add or kref_get a deviceid.
>> + * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
>> + */
>> +void
>> +nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new)
>> +{
>> +       struct nfs4_deviceid *d;
>> +
>> +       spin_lock(&c->dc_lock);
>> +       list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>> +               if (!memcmp(&d->de_id, &new->de_id, NFS4_PNFS_DEVICEID4_SIZE)) {
>> +                       spin_unlock(&c->dc_lock);
>> +                       dprintk("%s [discard]\n", __func__);
>> +                       c->dc_free_callback(&new->de_rcu);
>> +               }
>> +       }
>> +       list_add_rcu(&new->de_node, &c->dc_deviceids);
>> +       spin_unlock(&c->dc_lock);
>> +       dprintk("%s [new]\n", __func__);
>> +}
>> +EXPORT_SYMBOL(nfs4_add_deviceid);
>> +
>> +static int
>> +nfs4_remove_deviceid(struct nfs4_deviceid_cache *c)
>> +{
>> +       struct nfs4_deviceid *d;
>> +
>> +       spin_lock(&c->dc_lock);
>> +       list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>> +               list_del_rcu(&d->de_node);
>> +               spin_unlock(&c->dc_lock);
>> +               synchronize_rcu();
>> +               c->dc_free_callback(&d->de_rcu);
>> +               return 1;
>> +       }
>> +       spin_unlock(&c->dc_lock);
>> +       return 0;
>> +}
>> +
>> +static void
>> +nfs4_free_deviceid_cache(struct kref *kref)
>> +{
>> +       struct nfs4_deviceid_cache *cache =
>> +               container_of(kref, struct nfs4_deviceid_cache, dc_kref);
>> +       int more = 1;
>> +
>> +       while (more)
>> +               more = nfs4_remove_deviceid(cache);
>> +       cache->dc_clp->cl_devid_cache = NULL;
> 
> Need to take the cl_lock around this assignment
> 
> spin_lock(&cache->dc_clp->cl_lock);
> cache->dc_clp->cl_devid_cache = NULL
> spin_unlock(&cache->dc_clp->cl_lock);
> 
> 

That must happen atomically before kref_put.
It's illegal to have cl_devid_cache be referenced by cache->dc_clp
without a reference count backing it up.
Otherwise, if accessed concurrently to this piece of code
someone might call kref_get while the refcount is zero.

Normally, you'd first clear the referencing pointer to prevent any
new reference to it and only then kref_put it, e.g.:

spin_lock(&cache->dc_clp->cl_lock);
tmp = cache->dc_clp->cl_devid_cache;
cache->dc_clp->cl_devid_cache = NULL
spin_unlock(&cache->dc_clp->cl_lock);
kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache);

Benny

>> +       kfree(cache);
>> +}
>> +
>> +void
>> +nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *c)
>> +{
>> +       dprintk("%s [%d]\n", __func__, atomic_read(&c->dc_kref.refcount));
>> +       kref_put(&c->dc_kref, nfs4_free_deviceid_cache);
>> +}
>> +EXPORT_SYMBOL(nfs4_put_deviceid_cache);
>> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
>> index 1d521f4..2a88a06 100644
>> --- a/include/linux/nfs4_pnfs.h
>> +++ b/include/linux/nfs4_pnfs.h
>> @@ -281,6 +281,33 @@ struct pnfs_devicelist {
>>        struct pnfs_deviceid    dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
>>  };
>>
>> +/*
>> + * Device ID RCU cache. A device ID is unique per client ID and layout type.
>> + */
>> +struct nfs4_deviceid_cache {
>> +       spinlock_t              dc_lock;
>> +       struct list_head        dc_deviceids;
>> +       struct kref             dc_kref;
>> +       void                    (*dc_free_callback)(struct rcu_head *);
>> +       struct nfs_client       *dc_clp;
>> +};
>> +
>> +/* Device ID cache node */
>> +struct nfs4_deviceid {
>> +       struct list_head        de_node;
>> +       struct rcu_head         de_rcu;
>> +       struct pnfs_deviceid    de_id;
>> +};
>> +
>> +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
>> +                               void (*free_callback)(struct rcu_head *));
>> +extern void nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *);
>> +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
>> +extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_deviceid_cache *,
>> +                               struct pnfs_deviceid *);
>> +extern void nfs4_add_deviceid(struct nfs4_deviceid_cache *,
>> +                               struct nfs4_deviceid *);
>> +
>>  /* pNFS client callback functions.
>>  * These operations allow the layout driver to access pNFS client
>>  * specific information or call pNFS client->server operations.
>> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
>> index 8522461..ef2e18e 100644
>> --- a/include/linux/nfs_fs_sb.h
>> +++ b/include/linux/nfs_fs_sb.h
>> @@ -87,6 +87,7 @@ struct nfs_client {
>>        u32                     cl_exchange_flags;
>>        struct nfs4_session     *cl_session;    /* sharred session */
>>        struct list_head        cl_lo_inodes;   /* Inodes having layouts */
>> +       struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
>>  #endif /* CONFIG_NFS_V4_1 */
>>
>>  #ifdef CONFIG_NFS_FSCACHE
>> --
>> 1.6.6
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
> _______________________________________________
> pNFS mailing list
> pNFS@linux-nfs.org
> http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [pnfs] [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache
  2010-04-21  5:59       ` [pnfs] " Benny Halevy
@ 2010-04-21 15:22         ` William A. (Andy) Adamson
       [not found]           ` <l2l89c397151004210822j8b43009o3a9e78ceed901fd9-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 11+ messages in thread
From: William A. (Andy) Adamson @ 2010-04-21 15:22 UTC (permalink / raw)
  To: Benny Halevy; +Cc: pnfs, linux-nfs

On Wed, Apr 21, 2010 at 1:59 AM, Benny Halevy <bhalevy@panasas.com> wro=
te:
> On Apr. 16, 2010, 19:04 +0300, "William A. (Andy) Adamson" <androsada=
mson@gmail.com> wrote:
>> On Fri, Apr 16, 2010 at 11:52 AM, =A0<andros@netapp.com> wrote:
>>> From: Andy Adamson <andros@netapp.com>
>>>
>>> A shared RCU device ID cache servicing multiple mounts of a single =
layout type
>>> per meta data server (struct nfs_client).
>>>
>>> Device IDs of type deviceid4 are required by all layout types, long=
 lived and
>>> read at each I/O. =A0They are added to the deviceid cache at first =
reference by
>>> a layout via GETDEVICEINFO and (currently) are only removed at umou=
nt.
>>>
>>> Reference count the device ID cache for each mounted file system
>>> in the initialize_mountpoint layoutdriver_io_operation.
>>>
>>> Dereference the device id cache on file system in the uninitialize_=
mountpoint
>>> layoutdriver_io_operation called at umount
>>>
>>> Signed-off-by: Andy Adamson <andros@netapp.com>
>>> ---
>>> =A0fs/nfs/pnfs.c =A0 =A0 =A0 =A0 =A0 =A0 | =A0119 +++++++++++++++++=
++++++++++++++++++++++++++++
>>> =A0include/linux/nfs4_pnfs.h | =A0 27 ++++++++++
>>> =A0include/linux/nfs_fs_sb.h | =A0 =A01 +
>>> =A03 files changed, 147 insertions(+), 0 deletions(-)
>>>
>>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>>> index 91572aa..8492aef 100644
>>> --- a/fs/nfs/pnfs.c
>>> +++ b/fs/nfs/pnfs.c
>>> @@ -45,6 +45,7 @@
>>> =A0#include <linux/nfs4.h>
>>> =A0#include <linux/pnfs_xdr.h>
>>> =A0#include <linux/nfs4_pnfs.h>
>>> +#include <linux/rculist.h>
>>>
>>> =A0#include "internal.h"
>>> =A0#include "nfs4_fs.h"
>>> @@ -2296,3 +2297,121 @@ struct pnfs_client_operations pnfs_ops =3D =
{
>>>
>>> =A0EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
>>> =A0EXPORT_SYMBOL(pnfs_register_layoutdriver);
>>> +
>>> +
>>> +/* Device ID cache. Supports one layout type per struct nfs_client=
 */
>>> +int
>>> +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0void (*free_callba=
ck)(struct rcu_head *))
>>> +{
>>> + =A0 =A0 =A0 struct nfs4_deviceid_cache *c;
>>> +
>>> + =A0 =A0 =A0 c =3D kzalloc(sizeof(struct nfs4_deviceid_cache), GFP=
_KERNEL);
>>> + =A0 =A0 =A0 if (!c)
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 return -ENOMEM;
>>> + =A0 =A0 =A0 spin_lock(&clp->cl_lock);
>>> + =A0 =A0 =A0 if (clp->cl_devid_cache !=3D NULL) {
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kref_get(&clp->cl_devid_cache->dc_kre=
f);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&clp->cl_lock);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [kref [%d]]\n", __func__,
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 atomic_read(&clp->cl_=
devid_cache->dc_kref.refcount));
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kfree(c);
>>> + =A0 =A0 =A0 } else {
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_lock_init(&c->dc_lock);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 INIT_LIST_HEAD(&c->dc_deviceids);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kref_init(&c->dc_kref);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback =3D free_callback=
;
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_clp =3D clp;
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 clp->cl_devid_cache =3D c;
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&clp->cl_lock);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [new]\n", __func__);
>>> + =A0 =A0 =A0 }
>>> + =A0 =A0 =A0 return 0;
>>> +}
>>> +EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
>>> +
>>> +void
>>> +nfs4_init_deviceid_node(struct nfs4_deviceid *d)
>>> +{
>>> + =A0 =A0 =A0 INIT_LIST_HEAD(&d->de_node);
>>> + =A0 =A0 =A0 INIT_RCU_HEAD(&d->de_rcu);
>>> +}
>>> +EXPORT_SYMBOL(nfs4_init_deviceid_node);
>>> +
>>> +struct nfs4_deviceid *
>>> +nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_devi=
ceid *id)
>>> +{
>>> + =A0 =A0 =A0 struct nfs4_deviceid *d;
>>> +
>>> + =A0 =A0 =A0 rcu_read_lock();
>>> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_node)=
 {
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 if (!memcmp(&d->de_id, id, NFS4_PNFS_=
DEVICEID4_SIZE)) {
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 rcu_read_unlock();
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 return d;
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 }
>>> + =A0 =A0 =A0 }
>>> + =A0 =A0 =A0 rcu_read_unlock();
>
> I hope this is worth the added complexity...
>
> Out of curiosity, do you have a benchmark comparing the cost
> of rcu_read_lock (and unlock) vs. spin_lock(&c->dc_lock)?

The deviceid cache is read at each I/O. If we use a spin_lock to
protect the deviceid cache, this would mean that all I/0 is serialized
behind the spin_lock even though the deviceid cache is changed
infrequently. The RCU allows readers to "run almost naked" and does
not serialize I/O behind reading the deviceid cache.

So I think it is worth it. I have not benchmarked the difference.


>
>>> + =A0 =A0 =A0 return NULL;
>>> +}
>>> +EXPORT_SYMBOL(nfs4_find_deviceid);
>>> +
>>> +/*
>>> + * Add or kref_get a deviceid.
>>> + * GETDEVICEINFOs for same deviceid can race. If deviceid is found=
, discard new
>>> + */
>>> +void
>>> +nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_devic=
eid *new)
>>> +{
>>> + =A0 =A0 =A0 struct nfs4_deviceid *d;
>>> +
>>> + =A0 =A0 =A0 spin_lock(&c->dc_lock);
>>> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_node)=
 {
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 if (!memcmp(&d->de_id, &new->de_id, N=
=46S4_PNFS_DEVICEID4_SIZE)) {
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&c->dc_lo=
ck);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [discard]=
\n", __func__);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback(&=
new->de_rcu);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 }
>>> + =A0 =A0 =A0 }
>>> + =A0 =A0 =A0 list_add_rcu(&new->de_node, &c->dc_deviceids);
>>> + =A0 =A0 =A0 spin_unlock(&c->dc_lock);
>>> + =A0 =A0 =A0 dprintk("%s [new]\n", __func__);
>>> +}
>>> +EXPORT_SYMBOL(nfs4_add_deviceid);
>>> +
>>> +static int
>>> +nfs4_remove_deviceid(struct nfs4_deviceid_cache *c)
>>> +{
>>> + =A0 =A0 =A0 struct nfs4_deviceid *d;
>>> +
>>> + =A0 =A0 =A0 spin_lock(&c->dc_lock);
>>> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_node)=
 {
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 list_del_rcu(&d->de_node);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&c->dc_lock);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 synchronize_rcu();
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback(&d->de_rcu);
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 return 1;
>>> + =A0 =A0 =A0 }
>>> + =A0 =A0 =A0 spin_unlock(&c->dc_lock);
>>> + =A0 =A0 =A0 return 0;
>>> +}
>>> +
>>> +static void
>>> +nfs4_free_deviceid_cache(struct kref *kref)
>>> +{
>>> + =A0 =A0 =A0 struct nfs4_deviceid_cache *cache =3D
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 container_of(kref, struct nfs4_device=
id_cache, dc_kref);
>>> + =A0 =A0 =A0 int more =3D 1;
>>> +
>>> + =A0 =A0 =A0 while (more)
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 more =3D nfs4_remove_deviceid(cache);
>>> + =A0 =A0 =A0 cache->dc_clp->cl_devid_cache =3D NULL;
>>
>> Need to take the cl_lock around this assignment
>>
>> spin_lock(&cache->dc_clp->cl_lock);
>> cache->dc_clp->cl_devid_cache =3D NULL
>> spin_unlock(&cache->dc_clp->cl_lock);
>>
>>
>
> That must happen atomically before kref_put.
> It's illegal to have cl_devid_cache be referenced by cache->dc_clp
> without a reference count backing it up.
> Otherwise, if accessed concurrently to this piece of code
> someone might call kref_get while the refcount is zero.
>
> Normally, you'd first clear the referencing pointer to prevent any
> new reference to it and only then kref_put it, e.g.:
>
> spin_lock(&cache->dc_clp->cl_lock);
> tmp =3D cache->dc_clp->cl_devid_cache;
> cache->dc_clp->cl_devid_cache =3D NULL
> spin_unlock(&cache->dc_clp->cl_lock);
> kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache);

Good point. Thanks for the review. I'll rethink and resend

-->Andy

>
> Benny
>
>>> + =A0 =A0 =A0 kfree(cache);
>>> +}
>>> +
>>> +void
>>> +nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *c)
>>> +{
>>> + =A0 =A0 =A0 dprintk("%s [%d]\n", __func__, atomic_read(&c->dc_kre=
f.refcount));
>>> + =A0 =A0 =A0 kref_put(&c->dc_kref, nfs4_free_deviceid_cache);
>>> +}
>>> +EXPORT_SYMBOL(nfs4_put_deviceid_cache);
>>> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
>>> index 1d521f4..2a88a06 100644
>>> --- a/include/linux/nfs4_pnfs.h
>>> +++ b/include/linux/nfs4_pnfs.h
>>> @@ -281,6 +281,33 @@ struct pnfs_devicelist {
>>> =A0 =A0 =A0 =A0struct pnfs_deviceid =A0 =A0dev_id[NFS4_PNFS_GETDEVL=
IST_MAXNUM];
>>> =A0};
>>>
>>> +/*
>>> + * Device ID RCU cache. A device ID is unique per client ID and la=
yout type.
>>> + */
>>> +struct nfs4_deviceid_cache {
>>> + =A0 =A0 =A0 spinlock_t =A0 =A0 =A0 =A0 =A0 =A0 =A0dc_lock;
>>> + =A0 =A0 =A0 struct list_head =A0 =A0 =A0 =A0dc_deviceids;
>>> + =A0 =A0 =A0 struct kref =A0 =A0 =A0 =A0 =A0 =A0 dc_kref;
>>> + =A0 =A0 =A0 void =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0(*dc_free=
_callback)(struct rcu_head *);
>>> + =A0 =A0 =A0 struct nfs_client =A0 =A0 =A0 *dc_clp;
>>> +};
>>> +
>>> +/* Device ID cache node */
>>> +struct nfs4_deviceid {
>>> + =A0 =A0 =A0 struct list_head =A0 =A0 =A0 =A0de_node;
>>> + =A0 =A0 =A0 struct rcu_head =A0 =A0 =A0 =A0 de_rcu;
>>> + =A0 =A0 =A0 struct pnfs_deviceid =A0 =A0de_id;
>>> +};
>>> +
>>> +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 void =
(*free_callback)(struct rcu_head *));
>>> +extern void nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *);
>>> +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
>>> +extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_device=
id_cache *,
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 struc=
t pnfs_deviceid *);
>>> +extern void nfs4_add_deviceid(struct nfs4_deviceid_cache *,
>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 struc=
t nfs4_deviceid *);
>>> +
>>> =A0/* pNFS client callback functions.
>>> =A0* These operations allow the layout driver to access pNFS client
>>> =A0* specific information or call pNFS client->server operations.
>>> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
>>> index 8522461..ef2e18e 100644
>>> --- a/include/linux/nfs_fs_sb.h
>>> +++ b/include/linux/nfs_fs_sb.h
>>> @@ -87,6 +87,7 @@ struct nfs_client {
>>> =A0 =A0 =A0 =A0u32 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 cl_excha=
nge_flags;
>>> =A0 =A0 =A0 =A0struct nfs4_session =A0 =A0 *cl_session; =A0 =A0/* s=
harred session */
>>> =A0 =A0 =A0 =A0struct list_head =A0 =A0 =A0 =A0cl_lo_inodes; =A0 /*=
 Inodes having layouts */
>>> + =A0 =A0 =A0 struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS d=
eviceid cache */
>>> =A0#endif /* CONFIG_NFS_V4_1 */
>>>
>>> =A0#ifdef CONFIG_NFS_FSCACHE
>>> --
>>> 1.6.6
>>>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-nfs=
" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at =A0http://vger.kernel.org/majordomo-info.htm=
l
>>>
>> _______________________________________________
>> pNFS mailing list
>> pNFS@linux-nfs.org
>> http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs
>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [pnfs] [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache
       [not found]           ` <l2l89c397151004210822j8b43009o3a9e78ceed901fd9-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2010-04-22 11:20             ` Benny Halevy
  2010-04-22 15:47               ` William A. (Andy) Adamson
  0 siblings, 1 reply; 11+ messages in thread
From: Benny Halevy @ 2010-04-22 11:20 UTC (permalink / raw)
  To: William A. (Andy) Adamson; +Cc: pnfs, linux-nfs

On Apr. 21, 2010, 18:22 +0300, "William A. (Andy) Adamson" <androsadamson@gmail.com> wrote:
> On Wed, Apr 21, 2010 at 1:59 AM, Benny Halevy <bhalevy@panasas.com> wrote:
>> On Apr. 16, 2010, 19:04 +0300, "William A. (Andy) Adamson" <androsadamson@gmail.com> wrote:
>>> On Fri, Apr 16, 2010 at 11:52 AM,  <andros@netapp.com> wrote:
>>>> From: Andy Adamson <andros@netapp.com>
>>>>
>>>> A shared RCU device ID cache servicing multiple mounts of a single layout type
>>>> per meta data server (struct nfs_client).
>>>>
>>>> Device IDs of type deviceid4 are required by all layout types, long lived and
>>>> read at each I/O.  They are added to the deviceid cache at first reference by
>>>> a layout via GETDEVICEINFO and (currently) are only removed at umount.
>>>>
>>>> Reference count the device ID cache for each mounted file system
>>>> in the initialize_mountpoint layoutdriver_io_operation.
>>>>
>>>> Dereference the device id cache on file system in the uninitialize_mountpoint
>>>> layoutdriver_io_operation called at umount
>>>>
>>>> Signed-off-by: Andy Adamson <andros@netapp.com>
>>>> ---
>>>>  fs/nfs/pnfs.c             |  119 +++++++++++++++++++++++++++++++++++++++++++++
>>>>  include/linux/nfs4_pnfs.h |   27 ++++++++++
>>>>  include/linux/nfs_fs_sb.h |    1 +
>>>>  3 files changed, 147 insertions(+), 0 deletions(-)
>>>>
>>>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>>>> index 91572aa..8492aef 100644
>>>> --- a/fs/nfs/pnfs.c
>>>> +++ b/fs/nfs/pnfs.c
>>>> @@ -45,6 +45,7 @@
>>>>  #include <linux/nfs4.h>
>>>>  #include <linux/pnfs_xdr.h>
>>>>  #include <linux/nfs4_pnfs.h>
>>>> +#include <linux/rculist.h>
>>>>
>>>>  #include "internal.h"
>>>>  #include "nfs4_fs.h"
>>>> @@ -2296,3 +2297,121 @@ struct pnfs_client_operations pnfs_ops = {
>>>>
>>>>  EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
>>>>  EXPORT_SYMBOL(pnfs_register_layoutdriver);
>>>> +
>>>> +
>>>> +/* Device ID cache. Supports one layout type per struct nfs_client */
>>>> +int
>>>> +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
>>>> +                        void (*free_callback)(struct rcu_head *))
>>>> +{
>>>> +       struct nfs4_deviceid_cache *c;
>>>> +
>>>> +       c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL);
>>>> +       if (!c)
>>>> +               return -ENOMEM;
>>>> +       spin_lock(&clp->cl_lock);
>>>> +       if (clp->cl_devid_cache != NULL) {
>>>> +               kref_get(&clp->cl_devid_cache->dc_kref);
>>>> +               spin_unlock(&clp->cl_lock);
>>>> +               dprintk("%s [kref [%d]]\n", __func__,
>>>> +                       atomic_read(&clp->cl_devid_cache->dc_kref.refcount));
>>>> +               kfree(c);
>>>> +       } else {
>>>> +               spin_lock_init(&c->dc_lock);
>>>> +               INIT_LIST_HEAD(&c->dc_deviceids);
>>>> +               kref_init(&c->dc_kref);
>>>> +               c->dc_free_callback = free_callback;
>>>> +               c->dc_clp = clp;
>>>> +               clp->cl_devid_cache = c;
>>>> +               spin_unlock(&clp->cl_lock);
>>>> +               dprintk("%s [new]\n", __func__);
>>>> +       }
>>>> +       return 0;
>>>> +}
>>>> +EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
>>>> +
>>>> +void
>>>> +nfs4_init_deviceid_node(struct nfs4_deviceid *d)
>>>> +{
>>>> +       INIT_LIST_HEAD(&d->de_node);
>>>> +       INIT_RCU_HEAD(&d->de_rcu);
>>>> +}
>>>> +EXPORT_SYMBOL(nfs4_init_deviceid_node);
>>>> +
>>>> +struct nfs4_deviceid *
>>>> +nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id)
>>>> +{
>>>> +       struct nfs4_deviceid *d;
>>>> +
>>>> +       rcu_read_lock();
>>>> +       list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>>>> +               if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) {
>>>> +                       rcu_read_unlock();
>>>> +                       return d;
>>>> +               }
>>>> +       }
>>>> +       rcu_read_unlock();
>>
>> I hope this is worth the added complexity...
>>
>> Out of curiosity, do you have a benchmark comparing the cost
>> of rcu_read_lock (and unlock) vs. spin_lock(&c->dc_lock)?
> 
> The deviceid cache is read at each I/O. If we use a spin_lock to

Yeah, I see where this goes...
In the objects layout driver we get a reference on the device structure
at alloc_lseg time and keep a pointer to it throughout the lseg's life time.
This saves the deviceid lookup on every I/O.

Benny

> protect the deviceid cache, this would mean that all I/0 is serialized
> behind the spin_lock even though the deviceid cache is changed
> infrequently. The RCU allows readers to "run almost naked" and does
> not serialize I/O behind reading the deviceid cache.
> 
> So I think it is worth it. I have not benchmarked the difference.
> 
> 
>>
>>>> +       return NULL;
>>>> +}
>>>> +EXPORT_SYMBOL(nfs4_find_deviceid);
>>>> +
>>>> +/*
>>>> + * Add or kref_get a deviceid.
>>>> + * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
>>>> + */
>>>> +void
>>>> +nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new)
>>>> +{
>>>> +       struct nfs4_deviceid *d;
>>>> +
>>>> +       spin_lock(&c->dc_lock);
>>>> +       list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>>>> +               if (!memcmp(&d->de_id, &new->de_id, NFS4_PNFS_DEVICEID4_SIZE)) {
>>>> +                       spin_unlock(&c->dc_lock);
>>>> +                       dprintk("%s [discard]\n", __func__);
>>>> +                       c->dc_free_callback(&new->de_rcu);
>>>> +               }
>>>> +       }
>>>> +       list_add_rcu(&new->de_node, &c->dc_deviceids);
>>>> +       spin_unlock(&c->dc_lock);
>>>> +       dprintk("%s [new]\n", __func__);
>>>> +}
>>>> +EXPORT_SYMBOL(nfs4_add_deviceid);
>>>> +
>>>> +static int
>>>> +nfs4_remove_deviceid(struct nfs4_deviceid_cache *c)
>>>> +{
>>>> +       struct nfs4_deviceid *d;
>>>> +
>>>> +       spin_lock(&c->dc_lock);
>>>> +       list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>>>> +               list_del_rcu(&d->de_node);
>>>> +               spin_unlock(&c->dc_lock);
>>>> +               synchronize_rcu();
>>>> +               c->dc_free_callback(&d->de_rcu);
>>>> +               return 1;
>>>> +       }
>>>> +       spin_unlock(&c->dc_lock);
>>>> +       return 0;
>>>> +}
>>>> +
>>>> +static void
>>>> +nfs4_free_deviceid_cache(struct kref *kref)
>>>> +{
>>>> +       struct nfs4_deviceid_cache *cache =
>>>> +               container_of(kref, struct nfs4_deviceid_cache, dc_kref);
>>>> +       int more = 1;
>>>> +
>>>> +       while (more)
>>>> +               more = nfs4_remove_deviceid(cache);
>>>> +       cache->dc_clp->cl_devid_cache = NULL;
>>>
>>> Need to take the cl_lock around this assignment
>>>
>>> spin_lock(&cache->dc_clp->cl_lock);
>>> cache->dc_clp->cl_devid_cache = NULL
>>> spin_unlock(&cache->dc_clp->cl_lock);
>>>
>>>
>>
>> That must happen atomically before kref_put.
>> It's illegal to have cl_devid_cache be referenced by cache->dc_clp
>> without a reference count backing it up.
>> Otherwise, if accessed concurrently to this piece of code
>> someone might call kref_get while the refcount is zero.
>>
>> Normally, you'd first clear the referencing pointer to prevent any
>> new reference to it and only then kref_put it, e.g.:
>>
>> spin_lock(&cache->dc_clp->cl_lock);
>> tmp = cache->dc_clp->cl_devid_cache;
>> cache->dc_clp->cl_devid_cache = NULL
>> spin_unlock(&cache->dc_clp->cl_lock);
>> kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache);
> 
> Good point. Thanks for the review. I'll rethink and resend
> 
> -->Andy
> 
>>
>> Benny
>>
>>>> +       kfree(cache);
>>>> +}
>>>> +
>>>> +void
>>>> +nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *c)
>>>> +{
>>>> +       dprintk("%s [%d]\n", __func__, atomic_read(&c->dc_kref.refcount));
>>>> +       kref_put(&c->dc_kref, nfs4_free_deviceid_cache);
>>>> +}
>>>> +EXPORT_SYMBOL(nfs4_put_deviceid_cache);
>>>> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
>>>> index 1d521f4..2a88a06 100644
>>>> --- a/include/linux/nfs4_pnfs.h
>>>> +++ b/include/linux/nfs4_pnfs.h
>>>> @@ -281,6 +281,33 @@ struct pnfs_devicelist {
>>>>        struct pnfs_deviceid    dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
>>>>  };
>>>>
>>>> +/*
>>>> + * Device ID RCU cache. A device ID is unique per client ID and layout type.
>>>> + */
>>>> +struct nfs4_deviceid_cache {
>>>> +       spinlock_t              dc_lock;
>>>> +       struct list_head        dc_deviceids;
>>>> +       struct kref             dc_kref;
>>>> +       void                    (*dc_free_callback)(struct rcu_head *);
>>>> +       struct nfs_client       *dc_clp;
>>>> +};
>>>> +
>>>> +/* Device ID cache node */
>>>> +struct nfs4_deviceid {
>>>> +       struct list_head        de_node;
>>>> +       struct rcu_head         de_rcu;
>>>> +       struct pnfs_deviceid    de_id;
>>>> +};
>>>> +
>>>> +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
>>>> +                               void (*free_callback)(struct rcu_head *));
>>>> +extern void nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *);
>>>> +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
>>>> +extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_deviceid_cache *,
>>>> +                               struct pnfs_deviceid *);
>>>> +extern void nfs4_add_deviceid(struct nfs4_deviceid_cache *,
>>>> +                               struct nfs4_deviceid *);
>>>> +
>>>>  /* pNFS client callback functions.
>>>>  * These operations allow the layout driver to access pNFS client
>>>>  * specific information or call pNFS client->server operations.
>>>> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
>>>> index 8522461..ef2e18e 100644
>>>> --- a/include/linux/nfs_fs_sb.h
>>>> +++ b/include/linux/nfs_fs_sb.h
>>>> @@ -87,6 +87,7 @@ struct nfs_client {
>>>>        u32                     cl_exchange_flags;
>>>>        struct nfs4_session     *cl_session;    /* sharred session */
>>>>        struct list_head        cl_lo_inodes;   /* Inodes having layouts */
>>>> +       struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
>>>>  #endif /* CONFIG_NFS_V4_1 */
>>>>
>>>>  #ifdef CONFIG_NFS_FSCACHE
>>>> --
>>>> 1.6.6
>>>>
>>>> --
>>>> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
>>>> the body of a message to majordomo@vger.kernel.org
>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>>
>>> _______________________________________________
>>> pNFS mailing list
>>> pNFS@linux-nfs.org
>>> http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs
>>


-- 
Benny Halevy
Software Architect
Panasas, Inc.
bhalevy@panasas.com
Tel/Fax: +972-3-647-8340
Mobile: +972-54-802-8340

Panasas: The Leader in Parallel Storage
www.panasas.com

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [pnfs] [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache
  2010-04-22 11:20             ` Benny Halevy
@ 2010-04-22 15:47               ` William A. (Andy) Adamson
       [not found]                 ` <v2h89c397151004220847v3a31c493s4089d0cd53cf3e19-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 11+ messages in thread
From: William A. (Andy) Adamson @ 2010-04-22 15:47 UTC (permalink / raw)
  To: Benny Halevy; +Cc: pnfs, linux-nfs

On Thu, Apr 22, 2010 at 7:20 AM, Benny Halevy <bhalevy@panasas.com> wro=
te:
> On Apr. 21, 2010, 18:22 +0300, "William A. (Andy) Adamson" <androsada=
mson@gmail.com> wrote:
>> On Wed, Apr 21, 2010 at 1:59 AM, Benny Halevy <bhalevy@panasas.com> =
wrote:
>>> On Apr. 16, 2010, 19:04 +0300, "William A. (Andy) Adamson" <androsa=
damson@gmail.com> wrote:
>>>> On Fri, Apr 16, 2010 at 11:52 AM, =A0<andros@netapp.com> wrote:
>>>>> From: Andy Adamson <andros@netapp.com>
>>>>>
>>>>> A shared RCU device ID cache servicing multiple mounts of a singl=
e layout type
>>>>> per meta data server (struct nfs_client).
>>>>>
>>>>> Device IDs of type deviceid4 are required by all layout types, lo=
ng lived and
>>>>> read at each I/O. =A0They are added to the deviceid cache at firs=
t reference by
>>>>> a layout via GETDEVICEINFO and (currently) are only removed at um=
ount.
>>>>>
>>>>> Reference count the device ID cache for each mounted file system
>>>>> in the initialize_mountpoint layoutdriver_io_operation.
>>>>>
>>>>> Dereference the device id cache on file system in the uninitializ=
e_mountpoint
>>>>> layoutdriver_io_operation called at umount
>>>>>
>>>>> Signed-off-by: Andy Adamson <andros@netapp.com>
>>>>> ---
>>>>> =A0fs/nfs/pnfs.c =A0 =A0 =A0 =A0 =A0 =A0 | =A0119 +++++++++++++++=
++++++++++++++++++++++++++++++
>>>>> =A0include/linux/nfs4_pnfs.h | =A0 27 ++++++++++
>>>>> =A0include/linux/nfs_fs_sb.h | =A0 =A01 +
>>>>> =A03 files changed, 147 insertions(+), 0 deletions(-)
>>>>>
>>>>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>>>>> index 91572aa..8492aef 100644
>>>>> --- a/fs/nfs/pnfs.c
>>>>> +++ b/fs/nfs/pnfs.c
>>>>> @@ -45,6 +45,7 @@
>>>>> =A0#include <linux/nfs4.h>
>>>>> =A0#include <linux/pnfs_xdr.h>
>>>>> =A0#include <linux/nfs4_pnfs.h>
>>>>> +#include <linux/rculist.h>
>>>>>
>>>>> =A0#include "internal.h"
>>>>> =A0#include "nfs4_fs.h"
>>>>> @@ -2296,3 +2297,121 @@ struct pnfs_client_operations pnfs_ops =3D=
 {
>>>>>
>>>>> =A0EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
>>>>> =A0EXPORT_SYMBOL(pnfs_register_layoutdriver);
>>>>> +
>>>>> +
>>>>> +/* Device ID cache. Supports one layout type per struct nfs_clie=
nt */
>>>>> +int
>>>>> +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0void (*free_call=
back)(struct rcu_head *))
>>>>> +{
>>>>> + =A0 =A0 =A0 struct nfs4_deviceid_cache *c;
>>>>> +
>>>>> + =A0 =A0 =A0 c =3D kzalloc(sizeof(struct nfs4_deviceid_cache), G=
=46P_KERNEL);
>>>>> + =A0 =A0 =A0 if (!c)
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 return -ENOMEM;
>>>>> + =A0 =A0 =A0 spin_lock(&clp->cl_lock);
>>>>> + =A0 =A0 =A0 if (clp->cl_devid_cache !=3D NULL) {
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kref_get(&clp->cl_devid_cache->dc_k=
ref);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&clp->cl_lock);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [kref [%d]]\n", __func_=
_,
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 atomic_read(&clp->c=
l_devid_cache->dc_kref.refcount));
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kfree(c);
>>>>> + =A0 =A0 =A0 } else {
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_lock_init(&c->dc_lock);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 INIT_LIST_HEAD(&c->dc_deviceids);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kref_init(&c->dc_kref);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback =3D free_callba=
ck;
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_clp =3D clp;
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 clp->cl_devid_cache =3D c;
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&clp->cl_lock);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [new]\n", __func__);
>>>>> + =A0 =A0 =A0 }
>>>>> + =A0 =A0 =A0 return 0;
>>>>> +}
>>>>> +EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
>>>>> +
>>>>> +void
>>>>> +nfs4_init_deviceid_node(struct nfs4_deviceid *d)
>>>>> +{
>>>>> + =A0 =A0 =A0 INIT_LIST_HEAD(&d->de_node);
>>>>> + =A0 =A0 =A0 INIT_RCU_HEAD(&d->de_rcu);
>>>>> +}
>>>>> +EXPORT_SYMBOL(nfs4_init_deviceid_node);
>>>>> +
>>>>> +struct nfs4_deviceid *
>>>>> +nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_de=
viceid *id)
>>>>> +{
>>>>> + =A0 =A0 =A0 struct nfs4_deviceid *d;
>>>>> +
>>>>> + =A0 =A0 =A0 rcu_read_lock();
>>>>> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_nod=
e) {
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 if (!memcmp(&d->de_id, id, NFS4_PNF=
S_DEVICEID4_SIZE)) {
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 rcu_read_unlock();
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 return d;
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 }
>>>>> + =A0 =A0 =A0 }
>>>>> + =A0 =A0 =A0 rcu_read_unlock();
>>>
>>> I hope this is worth the added complexity...
>>>
>>> Out of curiosity, do you have a benchmark comparing the cost
>>> of rcu_read_lock (and unlock) vs. spin_lock(&c->dc_lock)?
>>
>> The deviceid cache is read at each I/O. If we use a spin_lock to
>
> Yeah, I see where this goes...
> In the objects layout driver we get a reference on the device structu=
re
> at alloc_lseg time and keep a pointer to it throughout the lseg's lif=
e time.
> This saves the deviceid lookup on every I/O.

Perhaps that is a better way to go. How many deviceid's is 'normal'
for the object driver?

-->Andy


>
> Benny
>
>> protect the deviceid cache, this would mean that all I/0 is serializ=
ed
>> behind the spin_lock even though the deviceid cache is changed
>> infrequently. The RCU allows readers to "run almost naked" and does
>> not serialize I/O behind reading the deviceid cache.
>>
>> So I think it is worth it. I have not benchmarked the difference.
>>
>>
>>>
>>>>> + =A0 =A0 =A0 return NULL;
>>>>> +}
>>>>> +EXPORT_SYMBOL(nfs4_find_deviceid);
>>>>> +
>>>>> +/*
>>>>> + * Add or kref_get a deviceid.
>>>>> + * GETDEVICEINFOs for same deviceid can race. If deviceid is fou=
nd, discard new
>>>>> + */
>>>>> +void
>>>>> +nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_dev=
iceid *new)
>>>>> +{
>>>>> + =A0 =A0 =A0 struct nfs4_deviceid *d;
>>>>> +
>>>>> + =A0 =A0 =A0 spin_lock(&c->dc_lock);
>>>>> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_nod=
e) {
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 if (!memcmp(&d->de_id, &new->de_id,=
 NFS4_PNFS_DEVICEID4_SIZE)) {
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&c->dc_=
lock);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [discar=
d]\n", __func__);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback=
(&new->de_rcu);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 }
>>>>> + =A0 =A0 =A0 }
>>>>> + =A0 =A0 =A0 list_add_rcu(&new->de_node, &c->dc_deviceids);
>>>>> + =A0 =A0 =A0 spin_unlock(&c->dc_lock);
>>>>> + =A0 =A0 =A0 dprintk("%s [new]\n", __func__);
>>>>> +}
>>>>> +EXPORT_SYMBOL(nfs4_add_deviceid);
>>>>> +
>>>>> +static int
>>>>> +nfs4_remove_deviceid(struct nfs4_deviceid_cache *c)
>>>>> +{
>>>>> + =A0 =A0 =A0 struct nfs4_deviceid *d;
>>>>> +
>>>>> + =A0 =A0 =A0 spin_lock(&c->dc_lock);
>>>>> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_nod=
e) {
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 list_del_rcu(&d->de_node);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&c->dc_lock);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 synchronize_rcu();
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback(&d->de_rcu);
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 return 1;
>>>>> + =A0 =A0 =A0 }
>>>>> + =A0 =A0 =A0 spin_unlock(&c->dc_lock);
>>>>> + =A0 =A0 =A0 return 0;
>>>>> +}
>>>>> +
>>>>> +static void
>>>>> +nfs4_free_deviceid_cache(struct kref *kref)
>>>>> +{
>>>>> + =A0 =A0 =A0 struct nfs4_deviceid_cache *cache =3D
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 container_of(kref, struct nfs4_devi=
ceid_cache, dc_kref);
>>>>> + =A0 =A0 =A0 int more =3D 1;
>>>>> +
>>>>> + =A0 =A0 =A0 while (more)
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 more =3D nfs4_remove_deviceid(cache=
);
>>>>> + =A0 =A0 =A0 cache->dc_clp->cl_devid_cache =3D NULL;
>>>>
>>>> Need to take the cl_lock around this assignment
>>>>
>>>> spin_lock(&cache->dc_clp->cl_lock);
>>>> cache->dc_clp->cl_devid_cache =3D NULL
>>>> spin_unlock(&cache->dc_clp->cl_lock);
>>>>
>>>>
>>>
>>> That must happen atomically before kref_put.
>>> It's illegal to have cl_devid_cache be referenced by cache->dc_clp
>>> without a reference count backing it up.
>>> Otherwise, if accessed concurrently to this piece of code
>>> someone might call kref_get while the refcount is zero.
>>>
>>> Normally, you'd first clear the referencing pointer to prevent any
>>> new reference to it and only then kref_put it, e.g.:
>>>
>>> spin_lock(&cache->dc_clp->cl_lock);
>>> tmp =3D cache->dc_clp->cl_devid_cache;
>>> cache->dc_clp->cl_devid_cache =3D NULL
>>> spin_unlock(&cache->dc_clp->cl_lock);
>>> kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache);
>>
>> Good point. Thanks for the review. I'll rethink and resend
>>
>> -->Andy
>>
>>>
>>> Benny
>>>
>>>>> + =A0 =A0 =A0 kfree(cache);
>>>>> +}
>>>>> +
>>>>> +void
>>>>> +nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *c)
>>>>> +{
>>>>> + =A0 =A0 =A0 dprintk("%s [%d]\n", __func__, atomic_read(&c->dc_k=
ref.refcount));
>>>>> + =A0 =A0 =A0 kref_put(&c->dc_kref, nfs4_free_deviceid_cache);
>>>>> +}
>>>>> +EXPORT_SYMBOL(nfs4_put_deviceid_cache);
>>>>> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.=
h
>>>>> index 1d521f4..2a88a06 100644
>>>>> --- a/include/linux/nfs4_pnfs.h
>>>>> +++ b/include/linux/nfs4_pnfs.h
>>>>> @@ -281,6 +281,33 @@ struct pnfs_devicelist {
>>>>> =A0 =A0 =A0 =A0struct pnfs_deviceid =A0 =A0dev_id[NFS4_PNFS_GETDE=
VLIST_MAXNUM];
>>>>> =A0};
>>>>>
>>>>> +/*
>>>>> + * Device ID RCU cache. A device ID is unique per client ID and =
layout type.
>>>>> + */
>>>>> +struct nfs4_deviceid_cache {
>>>>> + =A0 =A0 =A0 spinlock_t =A0 =A0 =A0 =A0 =A0 =A0 =A0dc_lock;
>>>>> + =A0 =A0 =A0 struct list_head =A0 =A0 =A0 =A0dc_deviceids;
>>>>> + =A0 =A0 =A0 struct kref =A0 =A0 =A0 =A0 =A0 =A0 dc_kref;
>>>>> + =A0 =A0 =A0 void =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0(*dc_fr=
ee_callback)(struct rcu_head *);
>>>>> + =A0 =A0 =A0 struct nfs_client =A0 =A0 =A0 *dc_clp;
>>>>> +};
>>>>> +
>>>>> +/* Device ID cache node */
>>>>> +struct nfs4_deviceid {
>>>>> + =A0 =A0 =A0 struct list_head =A0 =A0 =A0 =A0de_node;
>>>>> + =A0 =A0 =A0 struct rcu_head =A0 =A0 =A0 =A0 de_rcu;
>>>>> + =A0 =A0 =A0 struct pnfs_deviceid =A0 =A0de_id;
>>>>> +};
>>>>> +
>>>>> +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 voi=
d (*free_callback)(struct rcu_head *));
>>>>> +extern void nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *=
);
>>>>> +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
>>>>> +extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_devi=
ceid_cache *,
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 str=
uct pnfs_deviceid *);
>>>>> +extern void nfs4_add_deviceid(struct nfs4_deviceid_cache *,
>>>>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 str=
uct nfs4_deviceid *);
>>>>> +
>>>>> =A0/* pNFS client callback functions.
>>>>> =A0* These operations allow the layout driver to access pNFS clie=
nt
>>>>> =A0* specific information or call pNFS client->server operations.
>>>>> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.=
h
>>>>> index 8522461..ef2e18e 100644
>>>>> --- a/include/linux/nfs_fs_sb.h
>>>>> +++ b/include/linux/nfs_fs_sb.h
>>>>> @@ -87,6 +87,7 @@ struct nfs_client {
>>>>> =A0 =A0 =A0 =A0u32 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 cl_exc=
hange_flags;
>>>>> =A0 =A0 =A0 =A0struct nfs4_session =A0 =A0 *cl_session; =A0 =A0/*=
 sharred session */
>>>>> =A0 =A0 =A0 =A0struct list_head =A0 =A0 =A0 =A0cl_lo_inodes; =A0 =
/* Inodes having layouts */
>>>>> + =A0 =A0 =A0 struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS=
 deviceid cache */
>>>>> =A0#endif /* CONFIG_NFS_V4_1 */
>>>>>
>>>>> =A0#ifdef CONFIG_NFS_FSCACHE
>>>>> --
>>>>> 1.6.6
>>>>>
>>>>> --
>>>>> To unsubscribe from this list: send the line "unsubscribe linux-n=
fs" in
>>>>> the body of a message to majordomo@vger.kernel.org
>>>>> More majordomo info at =A0http://vger.kernel.org/majordomo-info.h=
tml
>>>>>
>>>> _______________________________________________
>>>> pNFS mailing list
>>>> pNFS@linux-nfs.org
>>>> http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs
>>>
>
>
> --
> Benny Halevy
> Software Architect
> Panasas, Inc.
> bhalevy@panasas.com
> Tel/Fax: +972-3-647-8340
> Mobile: +972-54-802-8340
>
> Panasas: The Leader in Parallel Storage
> www.panasas.com
>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [pnfs] [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache
       [not found]                 ` <v2h89c397151004220847v3a31c493s4089d0cd53cf3e19-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2010-04-22 15:51                   ` Benny Halevy
  0 siblings, 0 replies; 11+ messages in thread
From: Benny Halevy @ 2010-04-22 15:51 UTC (permalink / raw)
  To: William A. (Andy) Adamson; +Cc: pnfs, linux-nfs

On Apr. 22, 2010, 18:47 +0300, "William A. (Andy) Adamson" <androsadamson@gmail.com> wrote:
> On Thu, Apr 22, 2010 at 7:20 AM, Benny Halevy <bhalevy@panasas.com> wrote:
>> On Apr. 21, 2010, 18:22 +0300, "William A. (Andy) Adamson" <androsadamson@gmail.com> wrote:
>>> On Wed, Apr 21, 2010 at 1:59 AM, Benny Halevy <bhalevy@panasas.com> wrote:
>>>> On Apr. 16, 2010, 19:04 +0300, "William A. (Andy) Adamson" <androsadamson@gmail.com> wrote:
>>>>> On Fri, Apr 16, 2010 at 11:52 AM,  <andros@netapp.com> wrote:
>>>>>> From: Andy Adamson <andros@netapp.com>
>>>>>>
>>>>>> A shared RCU device ID cache servicing multiple mounts of a single layout type
>>>>>> per meta data server (struct nfs_client).
>>>>>>
>>>>>> Device IDs of type deviceid4 are required by all layout types, long lived and
>>>>>> read at each I/O.  They are added to the deviceid cache at first reference by
>>>>>> a layout via GETDEVICEINFO and (currently) are only removed at umount.
>>>>>>
>>>>>> Reference count the device ID cache for each mounted file system
>>>>>> in the initialize_mountpoint layoutdriver_io_operation.
>>>>>>
>>>>>> Dereference the device id cache on file system in the uninitialize_mountpoint
>>>>>> layoutdriver_io_operation called at umount
>>>>>>
>>>>>> Signed-off-by: Andy Adamson <andros@netapp.com>
>>>>>> ---
>>>>>>  fs/nfs/pnfs.c             |  119 +++++++++++++++++++++++++++++++++++++++++++++
>>>>>>  include/linux/nfs4_pnfs.h |   27 ++++++++++
>>>>>>  include/linux/nfs_fs_sb.h |    1 +
>>>>>>  3 files changed, 147 insertions(+), 0 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>>>>>> index 91572aa..8492aef 100644
>>>>>> --- a/fs/nfs/pnfs.c
>>>>>> +++ b/fs/nfs/pnfs.c
>>>>>> @@ -45,6 +45,7 @@
>>>>>>  #include <linux/nfs4.h>
>>>>>>  #include <linux/pnfs_xdr.h>
>>>>>>  #include <linux/nfs4_pnfs.h>
>>>>>> +#include <linux/rculist.h>
>>>>>>
>>>>>>  #include "internal.h"
>>>>>>  #include "nfs4_fs.h"
>>>>>> @@ -2296,3 +2297,121 @@ struct pnfs_client_operations pnfs_ops = {
>>>>>>
>>>>>>  EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
>>>>>>  EXPORT_SYMBOL(pnfs_register_layoutdriver);
>>>>>> +
>>>>>> +
>>>>>> +/* Device ID cache. Supports one layout type per struct nfs_client */
>>>>>> +int
>>>>>> +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
>>>>>> +                        void (*free_callback)(struct rcu_head *))
>>>>>> +{
>>>>>> +       struct nfs4_deviceid_cache *c;
>>>>>> +
>>>>>> +       c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL);
>>>>>> +       if (!c)
>>>>>> +               return -ENOMEM;
>>>>>> +       spin_lock(&clp->cl_lock);
>>>>>> +       if (clp->cl_devid_cache != NULL) {
>>>>>> +               kref_get(&clp->cl_devid_cache->dc_kref);
>>>>>> +               spin_unlock(&clp->cl_lock);
>>>>>> +               dprintk("%s [kref [%d]]\n", __func__,
>>>>>> +                       atomic_read(&clp->cl_devid_cache->dc_kref.refcount));
>>>>>> +               kfree(c);
>>>>>> +       } else {
>>>>>> +               spin_lock_init(&c->dc_lock);
>>>>>> +               INIT_LIST_HEAD(&c->dc_deviceids);
>>>>>> +               kref_init(&c->dc_kref);
>>>>>> +               c->dc_free_callback = free_callback;
>>>>>> +               c->dc_clp = clp;
>>>>>> +               clp->cl_devid_cache = c;
>>>>>> +               spin_unlock(&clp->cl_lock);
>>>>>> +               dprintk("%s [new]\n", __func__);
>>>>>> +       }
>>>>>> +       return 0;
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
>>>>>> +
>>>>>> +void
>>>>>> +nfs4_init_deviceid_node(struct nfs4_deviceid *d)
>>>>>> +{
>>>>>> +       INIT_LIST_HEAD(&d->de_node);
>>>>>> +       INIT_RCU_HEAD(&d->de_rcu);
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(nfs4_init_deviceid_node);
>>>>>> +
>>>>>> +struct nfs4_deviceid *
>>>>>> +nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id)
>>>>>> +{
>>>>>> +       struct nfs4_deviceid *d;
>>>>>> +
>>>>>> +       rcu_read_lock();
>>>>>> +       list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>>>>>> +               if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) {
>>>>>> +                       rcu_read_unlock();
>>>>>> +                       return d;
>>>>>> +               }
>>>>>> +       }
>>>>>> +       rcu_read_unlock();
>>>>
>>>> I hope this is worth the added complexity...
>>>>
>>>> Out of curiosity, do you have a benchmark comparing the cost
>>>> of rcu_read_lock (and unlock) vs. spin_lock(&c->dc_lock)?
>>>
>>> The deviceid cache is read at each I/O. If we use a spin_lock to
>>
>> Yeah, I see where this goes...
>> In the objects layout driver we get a reference on the device structure
>> at alloc_lseg time and keep a pointer to it throughout the lseg's life time.
>> This saves the deviceid lookup on every I/O.
> 
> Perhaps that is a better way to go. How many deviceid's is 'normal'
> for the object driver?
> 

I'd say that order of 10's to a few 100's is normal.
1000 and up is a big installation.

Benny


> -->Andy
> 
> 
>>
>> Benny
>>
>>> protect the deviceid cache, this would mean that all I/0 is serialized
>>> behind the spin_lock even though the deviceid cache is changed
>>> infrequently. The RCU allows readers to "run almost naked" and does
>>> not serialize I/O behind reading the deviceid cache.
>>>
>>> So I think it is worth it. I have not benchmarked the difference.
>>>
>>>
>>>>
>>>>>> +       return NULL;
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(nfs4_find_deviceid);
>>>>>> +
>>>>>> +/*
>>>>>> + * Add or kref_get a deviceid.
>>>>>> + * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
>>>>>> + */
>>>>>> +void
>>>>>> +nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new)
>>>>>> +{
>>>>>> +       struct nfs4_deviceid *d;
>>>>>> +
>>>>>> +       spin_lock(&c->dc_lock);
>>>>>> +       list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>>>>>> +               if (!memcmp(&d->de_id, &new->de_id, NFS4_PNFS_DEVICEID4_SIZE)) {
>>>>>> +                       spin_unlock(&c->dc_lock);
>>>>>> +                       dprintk("%s [discard]\n", __func__);
>>>>>> +                       c->dc_free_callback(&new->de_rcu);
>>>>>> +               }
>>>>>> +       }
>>>>>> +       list_add_rcu(&new->de_node, &c->dc_deviceids);
>>>>>> +       spin_unlock(&c->dc_lock);
>>>>>> +       dprintk("%s [new]\n", __func__);
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(nfs4_add_deviceid);
>>>>>> +
>>>>>> +static int
>>>>>> +nfs4_remove_deviceid(struct nfs4_deviceid_cache *c)
>>>>>> +{
>>>>>> +       struct nfs4_deviceid *d;
>>>>>> +
>>>>>> +       spin_lock(&c->dc_lock);
>>>>>> +       list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>>>>>> +               list_del_rcu(&d->de_node);
>>>>>> +               spin_unlock(&c->dc_lock);
>>>>>> +               synchronize_rcu();
>>>>>> +               c->dc_free_callback(&d->de_rcu);
>>>>>> +               return 1;
>>>>>> +       }
>>>>>> +       spin_unlock(&c->dc_lock);
>>>>>> +       return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static void
>>>>>> +nfs4_free_deviceid_cache(struct kref *kref)
>>>>>> +{
>>>>>> +       struct nfs4_deviceid_cache *cache =
>>>>>> +               container_of(kref, struct nfs4_deviceid_cache, dc_kref);
>>>>>> +       int more = 1;
>>>>>> +
>>>>>> +       while (more)
>>>>>> +               more = nfs4_remove_deviceid(cache);
>>>>>> +       cache->dc_clp->cl_devid_cache = NULL;
>>>>>
>>>>> Need to take the cl_lock around this assignment
>>>>>
>>>>> spin_lock(&cache->dc_clp->cl_lock);
>>>>> cache->dc_clp->cl_devid_cache = NULL
>>>>> spin_unlock(&cache->dc_clp->cl_lock);
>>>>>
>>>>>
>>>>
>>>> That must happen atomically before kref_put.
>>>> It's illegal to have cl_devid_cache be referenced by cache->dc_clp
>>>> without a reference count backing it up.
>>>> Otherwise, if accessed concurrently to this piece of code
>>>> someone might call kref_get while the refcount is zero.
>>>>
>>>> Normally, you'd first clear the referencing pointer to prevent any
>>>> new reference to it and only then kref_put it, e.g.:
>>>>
>>>> spin_lock(&cache->dc_clp->cl_lock);
>>>> tmp = cache->dc_clp->cl_devid_cache;
>>>> cache->dc_clp->cl_devid_cache = NULL
>>>> spin_unlock(&cache->dc_clp->cl_lock);
>>>> kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache);
>>>
>>> Good point. Thanks for the review. I'll rethink and resend
>>>
>>> -->Andy
>>>
>>>>
>>>> Benny
>>>>
>>>>>> +       kfree(cache);
>>>>>> +}
>>>>>> +
>>>>>> +void
>>>>>> +nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *c)
>>>>>> +{
>>>>>> +       dprintk("%s [%d]\n", __func__, atomic_read(&c->dc_kref.refcount));
>>>>>> +       kref_put(&c->dc_kref, nfs4_free_deviceid_cache);
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(nfs4_put_deviceid_cache);
>>>>>> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
>>>>>> index 1d521f4..2a88a06 100644
>>>>>> --- a/include/linux/nfs4_pnfs.h
>>>>>> +++ b/include/linux/nfs4_pnfs.h
>>>>>> @@ -281,6 +281,33 @@ struct pnfs_devicelist {
>>>>>>        struct pnfs_deviceid    dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
>>>>>>  };
>>>>>>
>>>>>> +/*
>>>>>> + * Device ID RCU cache. A device ID is unique per client ID and layout type.
>>>>>> + */
>>>>>> +struct nfs4_deviceid_cache {
>>>>>> +       spinlock_t              dc_lock;
>>>>>> +       struct list_head        dc_deviceids;
>>>>>> +       struct kref             dc_kref;
>>>>>> +       void                    (*dc_free_callback)(struct rcu_head *);
>>>>>> +       struct nfs_client       *dc_clp;
>>>>>> +};
>>>>>> +
>>>>>> +/* Device ID cache node */
>>>>>> +struct nfs4_deviceid {
>>>>>> +       struct list_head        de_node;
>>>>>> +       struct rcu_head         de_rcu;
>>>>>> +       struct pnfs_deviceid    de_id;
>>>>>> +};
>>>>>> +
>>>>>> +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
>>>>>> +                               void (*free_callback)(struct rcu_head *));
>>>>>> +extern void nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *);
>>>>>> +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
>>>>>> +extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_deviceid_cache *,
>>>>>> +                               struct pnfs_deviceid *);
>>>>>> +extern void nfs4_add_deviceid(struct nfs4_deviceid_cache *,
>>>>>> +                               struct nfs4_deviceid *);
>>>>>> +
>>>>>>  /* pNFS client callback functions.
>>>>>>  * These operations allow the layout driver to access pNFS client
>>>>>>  * specific information or call pNFS client->server operations.
>>>>>> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
>>>>>> index 8522461..ef2e18e 100644
>>>>>> --- a/include/linux/nfs_fs_sb.h
>>>>>> +++ b/include/linux/nfs_fs_sb.h
>>>>>> @@ -87,6 +87,7 @@ struct nfs_client {
>>>>>>        u32                     cl_exchange_flags;
>>>>>>        struct nfs4_session     *cl_session;    /* sharred session */
>>>>>>        struct list_head        cl_lo_inodes;   /* Inodes having layouts */
>>>>>> +       struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
>>>>>>  #endif /* CONFIG_NFS_V4_1 */
>>>>>>
>>>>>>  #ifdef CONFIG_NFS_FSCACHE
>>>>>> --
>>>>>> 1.6.6
>>>>>>
>>>>>> --
>>>>>> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
>>>>>> the body of a message to majordomo@vger.kernel.org
>>>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>>>>
>>>>> _______________________________________________
>>>>> pNFS mailing list
>>>>> pNFS@linux-nfs.org
>>>>> http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs
>>>>
>>
>>
>> --
>> Benny Halevy
>> Software Architect
>> Panasas, Inc.
>> bhalevy@panasas.com
>> Tel/Fax: +972-3-647-8340
>> Mobile: +972-54-802-8340
>>
>> Panasas: The Leader in Parallel Storage
>> www.panasas.com
>>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache
  2010-04-26 16:18 [PATCH 0/3] pNFS generic device ID cache version 3 andros
@ 2010-04-26 16:18 ` andros
  0 siblings, 0 replies; 11+ messages in thread
From: andros @ 2010-04-26 16:18 UTC (permalink / raw)
  To: pnfs; +Cc: linux-nfs, Andy Adamson

From: Andy Adamson <andros@netapp.com>

A shared RCU device ID cache servicing multiple mounts of a single layout type
per meta data server (struct nfs_client).

Device IDs of type deviceid4 are required by all layout types, long lived and
read at each I/O.  They are added to the deviceid cache at first reference by
a layout via GETDEVICEINFO and (currently) are only removed at umount.

Reference count the device ID cache for each mounted file system
in the initialize_mountpoint layoutdriver_io_operation.

Dereference the device id cache on file system in the uninitialize_mountpoint
layoutdriver_io_operation called at umount

Each layoutsegment assigns a pointer and takes a reference to the
nfs4_deviceid structure identified by the layout deviceid.
This is so that there are no deviceid lookups for the normal I/O path.

Even thought required by all layouttypes, the deviceid is not exposed in the
LAYOUTGET4res but is instead hidden in the opaque layouttype4.

Therefore, each layout type alloc_lseg calls nfs4_set_layout_deviceid,
and free_lseg calls nfs4_unset_layout_deviceid.

While the file layout driver will not cache very many deviceid's, the object
and block layout drivers could cache 100's for a large installation.
Use an hlist.

Signed-off-by: Andy Adamson <andros@netapp.com>
---
 fs/nfs/pnfs.c             |  167 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/nfs4_pnfs.h |   50 +++++++++++++
 include/linux/nfs_fs_sb.h |    1 +
 3 files changed, 218 insertions(+), 0 deletions(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 91572aa..bf906cc 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -45,6 +45,7 @@
 #include <linux/nfs4.h>
 #include <linux/pnfs_xdr.h>
 #include <linux/nfs4_pnfs.h>
+#include <linux/rculist.h>
 
 #include "internal.h"
 #include "nfs4_fs.h"
@@ -2296,3 +2297,169 @@ struct pnfs_client_operations pnfs_ops = {
 
 EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
 EXPORT_SYMBOL(pnfs_register_layoutdriver);
+
+
+/* Device ID cache. Supports one layout type per struct nfs_client */
+int
+nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
+			 void (*free_callback)(struct kref *))
+{
+	struct nfs4_deviceid_cache *c;
+
+	c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL);
+	if (!c)
+		return -ENOMEM;
+	spin_lock(&clp->cl_lock);
+	if (clp->cl_devid_cache != NULL) {
+		kref_get(&clp->cl_devid_cache->dc_kref);
+		spin_unlock(&clp->cl_lock);
+		dprintk("%s [kref [%d]]\n", __func__,
+			atomic_read(&clp->cl_devid_cache->dc_kref.refcount));
+		kfree(c);
+	} else {
+		int i;
+
+		spin_lock_init(&c->dc_lock);
+		for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE ; i++)
+			INIT_HLIST_HEAD(&c->dc_deviceids[i]);
+		kref_init(&c->dc_kref);
+		c->dc_free_callback = free_callback;
+		clp->cl_devid_cache = c;
+		spin_unlock(&clp->cl_lock);
+		dprintk("%s [new]\n", __func__);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
+
+void
+nfs4_init_deviceid_node(struct nfs4_deviceid *d)
+{
+	INIT_HLIST_NODE(&d->de_node);
+	kref_init(&d->de_kref);
+}
+EXPORT_SYMBOL(nfs4_init_deviceid_node);
+
+/* Called from layoutdriver_io_operations->alloc_lseg */
+void
+nfs4_set_layout_deviceid(struct pnfs_layout_segment *l, struct nfs4_deviceid *d)
+{
+	dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.refcount));
+	l->deviceid = d;
+	kref_get(&d->de_kref);
+}
+EXPORT_SYMBOL(nfs4_set_layout_deviceid);
+
+/* Called from layoutdriver_io_operations->free_lseg */
+void
+nfs4_unset_layout_deviceid(struct pnfs_layout_segment *l,
+			   struct nfs4_deviceid *d,
+			   void (*free_callback)(struct kref *))
+{
+	dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.refcount));
+	l->deviceid = NULL;
+	kref_put(&d->de_kref, free_callback);
+}
+EXPORT_SYMBOL(nfs4_unset_layout_deviceid);
+
+struct nfs4_deviceid *
+nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id)
+{
+	struct nfs4_deviceid *d;
+	struct hlist_node *n;
+	long hash = nfs4_deviceid_hash(id);
+
+	dprintk("--> %s hash %ld\n", __func__, hash);
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
+		if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) {
+			rcu_read_unlock();
+			return d;
+		}
+	}
+	rcu_read_unlock();
+	return NULL;
+}
+EXPORT_SYMBOL(nfs4_find_deviceid);
+
+/*
+ * Add or kref_get a deviceid.
+ * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
+ */
+struct nfs4_deviceid *
+nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new)
+{
+	struct nfs4_deviceid *d;
+	struct hlist_node *n;
+	long hash = nfs4_deviceid_hash(&new->de_id);
+
+	dprintk("--> %s hash %ld\n", __func__, hash);
+	spin_lock(&c->dc_lock);
+	hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
+		if (!memcmp(&d->de_id, &new->de_id, NFS4_PNFS_DEVICEID4_SIZE)) {
+			spin_unlock(&c->dc_lock);
+			dprintk("%s [discard]\n", __func__);
+			c->dc_free_callback(&new->de_kref);
+			return d;
+		}
+	}
+	hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
+	spin_unlock(&c->dc_lock);
+	dprintk("%s [new]\n", __func__);
+	return new;
+}
+EXPORT_SYMBOL(nfs4_add_deviceid);
+
+static int
+nfs4_remove_deviceid(struct nfs4_deviceid_cache *c, long hash)
+{
+	struct nfs4_deviceid *d;
+	struct hlist_node *n;
+
+	dprintk("--> %s hash %ld\n", __func__, hash);
+	spin_lock(&c->dc_lock);
+	hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
+		hlist_del_rcu(&d->de_node);
+		spin_unlock(&c->dc_lock);
+		synchronize_rcu();
+		dprintk("%s [%d]\n", __func__,
+			atomic_read(&d->de_kref.refcount));
+		kref_put(&d->de_kref, c->dc_free_callback);
+		return 1;
+	}
+	spin_unlock(&c->dc_lock);
+	return 0;
+}
+
+static void
+nfs4_free_deviceid_cache(struct kref *kref)
+{
+	struct nfs4_deviceid_cache *cache =
+		container_of(kref, struct nfs4_deviceid_cache, dc_kref);
+	int more;
+	long i;
+
+	for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) {
+		more = 1;
+		while (more)
+			more = nfs4_remove_deviceid(cache, i);
+	}
+	kfree(cache);
+}
+
+void
+nfs4_put_deviceid_cache(struct nfs_client *clp)
+{
+	struct nfs4_deviceid_cache *tmp = clp->cl_devid_cache;
+	int refcount;
+
+	dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache);
+	spin_lock(&clp->cl_lock);
+	refcount = atomic_read(&clp->cl_devid_cache->dc_kref.refcount);
+	if (refcount == 1)
+		clp->cl_devid_cache = NULL;
+	spin_unlock(&clp->cl_lock);
+	dprintk("%s [%d]\n", __func__, refcount);
+	kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache);
+}
+EXPORT_SYMBOL(nfs4_put_deviceid_cache);
diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
index 3caac60..3b7aeb7 100644
--- a/include/linux/nfs4_pnfs.h
+++ b/include/linux/nfs4_pnfs.h
@@ -106,6 +106,7 @@ struct pnfs_layout_segment {
 	struct kref kref;
 	bool valid;
 	struct pnfs_layout_type *layout;
+	struct nfs4_deviceid *deviceid;
 	u8 ld_data[];			/* layout driver private data */
 };
 
@@ -275,6 +276,55 @@ struct pnfs_devicelist {
 	struct pnfs_deviceid	dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
 };
 
+/*
+ * Device ID RCU cache. A device ID is unique per client ID and layout type.
+ */
+#define NFS4_DEVICE_ID_HASH_BITS	5
+#define NFS4_DEVICE_ID_HASH_SIZE	(1 << NFS4_DEVICE_ID_HASH_BITS)
+#define NFS4_DEVICE_ID_HASH_MASK	(NFS4_DEVICE_ID_HASH_SIZE - 1)
+
+static inline u32
+nfs4_deviceid_hash(struct pnfs_deviceid *id)
+{
+	unsigned char *cptr = (unsigned char *)id->data;
+	unsigned int nbytes = NFS4_PNFS_DEVICEID4_SIZE;
+	u32 x = 0;
+
+	while (nbytes--) {
+		x *= 37;
+		x += *cptr++;
+	}
+	return x & NFS4_DEVICE_ID_HASH_MASK;
+}
+
+struct nfs4_deviceid_cache {
+	spinlock_t		dc_lock;
+	struct kref		dc_kref;
+	void			(*dc_free_callback)(struct kref *);
+	struct hlist_head	dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE];
+};
+
+/* Device ID cache node */
+struct nfs4_deviceid {
+	struct hlist_node	de_node;
+	struct pnfs_deviceid	de_id;
+	struct kref		de_kref;
+};
+
+extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
+				void (*free_callback)(struct kref *));
+extern void nfs4_put_deviceid_cache(struct nfs_client *);
+extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
+extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_deviceid_cache *,
+				struct pnfs_deviceid *);
+extern struct nfs4_deviceid *nfs4_add_deviceid(struct nfs4_deviceid_cache *,
+				struct nfs4_deviceid *);
+extern void nfs4_set_layout_deviceid(struct pnfs_layout_segment *,
+				struct nfs4_deviceid *);
+extern void nfs4_unset_layout_deviceid(struct pnfs_layout_segment *,
+				struct nfs4_deviceid *,
+				void (*free_callback)(struct kref *));
+
 /* pNFS client callback functions.
  * These operations allow the layout driver to access pNFS client
  * specific information or call pNFS client->server operations.
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 8522461..ef2e18e 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -87,6 +87,7 @@ struct nfs_client {
 	u32			cl_exchange_flags;
 	struct nfs4_session	*cl_session; 	/* sharred session */
 	struct list_head	cl_lo_inodes;	/* Inodes having layouts */
+	struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
 #endif /* CONFIG_NFS_V4_1 */
 
 #ifdef CONFIG_NFS_FSCACHE
-- 
1.6.6


^ permalink raw reply related	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2010-04-26 16:18 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-04-16 15:52 [PATCH 0/3] pNFS generic device ID cache andros
2010-04-16 15:52 ` [PATCH 1/3] SQUASHME pnfs_submit: " andros
2010-04-16 15:52   ` [PATCH 2/3] SQUASHME pnfs_submit: fix multiple mount set_pnfs_layoutdriver andros
2010-04-16 15:52     ` [PATCH 3/3] SQUASHME pnfs-submit: file layout driver generic device ID cache andros
2010-04-16 16:04   ` [PATCH 1/3] SQUASHME pnfs_submit: " William A. (Andy) Adamson
     [not found]     ` <u2n89c397151004160904m9e862360xcaf0e187640b0177-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2010-04-21  5:59       ` [pnfs] " Benny Halevy
2010-04-21 15:22         ` William A. (Andy) Adamson
     [not found]           ` <l2l89c397151004210822j8b43009o3a9e78ceed901fd9-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2010-04-22 11:20             ` Benny Halevy
2010-04-22 15:47               ` William A. (Andy) Adamson
     [not found]                 ` <v2h89c397151004220847v3a31c493s4089d0cd53cf3e19-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2010-04-22 15:51                   ` Benny Halevy
2010-04-26 16:18 [PATCH 0/3] pNFS generic device ID cache version 3 andros
2010-04-26 16:18 ` [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache andros

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.